In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import pickle

In [2]:
def getSeqDB(filename, ftype="fasta"):
    geneDB = []
    with open(filename, 'r') as allseqs:
        geneDB = list(SeqIO.parse(allseqs, ftype))
    return geneDB

def createKmers(k, geneDB):
    kmers = {}
    for seqrec in geneDB:
        seq = seqrec.seq
        for i in range(0, len(seq)-k+1):
            ind = str(seq[i:i+k])
            try: 
                kmers[ind] = kmers[ind] + [(seqrec.id, i)]
            except:
                kmers[ind] = [(seqrec.id , i)]
    return kmers

def saveKmers(kmers, k, filename):
    with open(str(k)+'_kmers_'+filename+'_kb.pickle', 'wb') as kfile:
        pickle.dump(kmers, kfile)
        
def loadKmers(k, filename):
    with open(str(k)+'_kmers_'+filename+'_kb.pickle', 'rb') as kfile:
        return pickle.load(kfile)
    
def getKmers(k, filename):
    try:
        return loadKmers(k, filename)
    except:
        print("constructing kmers database")
        kmers = createKmers(k, getSeqDB(filename))
        saveKmers(kmers, str(k), filename)
        return kmers

In [3]:
def matchSeq(test, kmers, k):
    row = {'Kmer': '_', 'SeqID':'None', 'Kmer Ind':-1, 'Seq Ind':-1}
    seqDB = []
    
    for i in range(0, len(test)-k+1):
        ind = test[i:i+k]
        try:
            row['Kmer'] = ind
            row['Kmer Ind'] = i
            for seq in kmers[ind]:
                row['SeqID'] = seq[0]
                row['Seq Ind'] = seq[1]
                seqDB = seqDB+[dict(row)]
        except:
            pass
    return pd.DataFrame(seqDB)

def rankSeq(seqDB, t, rCount=True):
    seqDB["Distance"] = seqDB.apply(lambda row: abs(row["Kmer Ind"]-row["Seq Ind"]), axis=1)
    seqRank = seqDB[['SeqID', 'Kmer']].groupby(by="SeqID").count().join(seqDB[['SeqID', 'Distance']].groupby(by="SeqID").sum())
    seqRank["Freq"] = seqRank.apply(lambda row: row["Distance"]/row["Kmer"], axis=1)
    seqRank = seqRank[seqRank['Kmer'].map(int)>t]
    if (rCount):
        return seqRank.sort_values(by="Kmer", ascending=False), seqDB
    else:
        return seqRank.sort_values(by="Freq").iloc[0:10], seqDB

In [4]:
def main(filename, testseq, k=11, threshold=2):

    kmers = getKmers(k, filename) 
    seqDB = matchSeq(testseq, kmers, k)
    seqRank = rankSeq(seqDB, threshold)
    
    return seqRank, seqDB

seqRank, seqDB = main("small.fasta", """MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
    IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
    AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
    KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
    DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
    VMFFVAGAVLVAILISTVRW""", 5, 13)
seqDB

Unnamed: 0,Kmer,Kmer Ind,Seq Ind,SeqID,Distance
0,MSIIG,0,0,sp|Q6GZX3|002L_FRG3G,0
1,SIIGA,1,1,sp|Q6GZX3|002L_FRG3G,0
2,SIIGA,1,49,sp|P38720|6PGD1_YEAST,48
3,SIIGA,1,52,sp|P53319|6PGD2_YEAST,51
4,SIIGA,1,441,sp|Q9C8G9|AB1C_ARATH,440
5,IIGAT,2,2,sp|Q6GZX3|002L_FRG3G,0
6,IIGAT,2,50,sp|P38720|6PGD1_YEAST,48
7,IIGAT,2,53,sp|P53319|6PGD2_YEAST,51
8,IIGAT,2,460,sp|Q8T9W4|ABCB3_DICDI,458
9,IIGAT,2,267,sp|Q9P8Q7|ACEA_CANAX,265


In [None]:
k = 5
allIncSS = {}
for s in seqDB["SeqID"].unique()[0:1]:
    tempDB = seqDB[seqDB["SeqID"]==s]
    tempDB['Kmer2'] = tempDB['Kmer']
    tempDB['Kmer Diff'] = tempDB['Kmer Ind'] - tempDB.shift(1)['Kmer Ind']
    tempDB['Seq Diff'] = tempDB['Seq Ind'] - tempDB.shift(1)['Seq Ind']
    incSeq = []
    temp2 = tempDB.groupby(by='Kmer2').min()[['Kmer', 'Seq Diff', 'Kmer Diff']]
#     temp2 = str(temp2['Kmer'])
#     tempDB = str(tempDB['Kmer'])
#     print(tempDB.drop(['Seq Diff', 'Kmer Diff', 'Distance', 'Kmer2'], axis=1).join(temp2, on='Kmer', how='inner', lsuffix='left', rsuffix='right'))
    print(tempDB[].join(temp2, on=['Kmer']))
#     for ind, row in tempDB.iterrows():
#         if np.isnan(row['Kmer Diff']):
#             count = 1
#             start = row['Kmer Ind']
#             end = start + k - 1
#         elif row['Kmer Diff'] <= 1.0 :
#             end = row['Kmer Ind']+ k -1
#             count = count+1
#         else:
#             incSeq = incSeq + [(start, end, count)]
#             count = 0
#             start = row['Kmer Ind']
#             end = start + k - 1
#     incSeq = incSeq + [(start, end, count)]
#     allIncSS[s] = incSeq
# allIncSS

In [49]:
k = 5
subseq = []
for s in seqDB["SeqID"].unique()[0:1]:
    tempDB = seqDB[seqDB["SeqID"]==s]
    tempDB = tempDB.sort_values(by=['Kmer Ind', 'Seq Ind'])
    s_seq = e_seq = []
    temp = []
#     start_k = end_k = 0
#     start_s = end_s = 0
    for j in range(0, len(tempDB)): 
        row = tempDB.iloc[j]
        rem = []
        toAdd = True

        for i in range(0, len(e_seq)):
            if row['Kmer Ind'] - e_seq[i][0] > 1:
                rem = rem + [i]
            elif row['Kmer Ind'] == e_seq[i][0]:
                if row['Seq Ind'] == e_seq[i][1]:
                    toAdd = False
            elif row['Kmer Ind'] - e_seq[i][0] == 1:
                if row['Seq Ind'] - e_seq[i][1] == 1:
                    e_seq[i] = (row['Kmer Ind'], row['Seq Ind'])
                    toAdd = False
                elif row['Seq Ind'] == e_seq[i][1]:
                    toAdd = False
            else:
                print("HELP: ERROR")
        if toAdd:
            e_seq = e_seq + [(row['Kmer Ind'], row['Seq Ind'])]
            s_seq = s_seq + [(row['Kmer Ind'], row['Seq Ind'])]
        
        for i in range(0, len(rem)):
            temp = temp+[((s_seq[rem[i]-i][0], e_seq[rem[i]-i][0]), (s_seq[rem[i]-i][1], e_seq[rem[i]-i][1]))]
#             print(s_seq, e_seq)
            del s_seq[rem[i]-i]
            del e_seq[rem[i]-i]
#             print(s_seq, e_seq)
#             print(temp)
#             input()
    for sS, eS in zip(s_seq, e_seq):
        temp = temp + [((sS[0], eS[0]), (sS[1], eS[1]))]
    subseq = subseq + [(temp, s)]
#         if (len(prev)<=0):
#             start_k = row['Kmer Ind']
#         elif (row['Kmer Ind']-prev['Kmer Ind']) > 1 or (row['Seq Ind']-prev['Seq Ind']) > 1:
#             end_k = prev['Kmer Ind'] + k - 1
#             end_s = prev['Seq Ind'] + k - 1
#             i_subseq = i_subseq + [((start_k, end_k), (start_s, end_s))]
#             start_s = row['Seq Ind']
#             start_k = row['Kmer Ind']
#     end_k = prev['Kmer Ind'] + k - 1
#     end_s = prev['Seq Ind'] + k -1
#     i_subseq = i_subseq + [((start_k, end_k), (start_s, end_s))]
#     subseq = subseq + [(i_subseq, s)]

for i in subseq[0][0]:
    print(i, '\t', i[0][1]-i[0][0]-i[1][1]+i[1][0])

[([((0, 55), (0, 55)),
   ((65, 120), (60, 115)),
   ((130, 185), (120, 175)),
   ((195, 250), (180, 235)),
   ((282, 282), (263, 263)),
   ((289, 290), (287, 288)),
   ((289, 293), (284, 288)),
   ((289, 296), (281, 288)),
   ((289, 299), (278, 288)),
   ((289, 302), (275, 288)),
   ((289, 305), (272, 288)),
   ((292, 308), (269, 285)),
   ((295, 308), (269, 282)),
   ((298, 308), (269, 279)),
   ((301, 308), (269, 276)),
   ((304, 308), (269, 273)),
   ((307, 308), (269, 270)),
   ((260, 315), (240, 295)),
   ((325, 340), (300, 315))],
  'sp|Q6GZX3|002L_FRG3G')]

In [44]:
tempDB.iloc[224:251]

Unnamed: 0,Kmer,Kmer Ind,Seq Ind,SeqID,Distance
464,DDGSV,260,240,sp|Q6GZX3|002L_FRG3G,20
465,DGSVT,261,241,sp|Q6GZX3|002L_FRG3G,20
468,GSVTM,262,242,sp|Q6GZX3|002L_FRG3G,20
469,SVTMD,263,243,sp|Q6GZX3|002L_FRG3G,20
470,VTMDD,264,244,sp|Q6GZX3|002L_FRG3G,20
476,TMDDV,265,245,sp|Q6GZX3|002L_FRG3G,20
477,MDDVK,266,246,sp|Q6GZX3|002L_FRG3G,20
479,DDVKN,267,247,sp|Q6GZX3|002L_FRG3G,20
481,DVKNT,268,248,sp|Q6GZX3|002L_FRG3G,20
483,VKNTI,269,249,sp|Q6GZX3|002L_FRG3G,20
