In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import pickle

In [2]:
def getSeqDB(filename, ftype="fasta"):
    """Read every record of a sequence file into a list.

    Parameters
    ----------
    filename : path of the sequence file, opened in text mode.
    ftype : format name handed to ``SeqIO.parse`` (defaults to "fasta").

    Returns
    -------
    list of the records yielded by ``SeqIO.parse``, in file order.
    """
    with open(filename, 'r') as handle:
        # Materialise the parser's iterator while the file is still open.
        records = [record for record in SeqIO.parse(handle, ftype)]
    return records

def createKmers(k, geneDB):
    """Index every length-``k`` substring of every sequence in ``geneDB``.

    Parameters
    ----------
    k : length of the substrings (k-mers) to index.
    geneDB : iterable of records exposing ``.seq`` (sliceable, with ``len``)
        and ``.id``.

    Returns
    -------
    dict mapping each k-mer (as ``str``) to a list of ``(record id, offset)``
    tuples, in the order the occurrences were encountered. Sequences shorter
    than ``k`` contribute nothing.
    """
    kmers = {}
    for seqrec in geneDB:
        seq = seqrec.seq
        for i in range(len(seq) - k + 1):
            # setdefault + append grows each bucket in place; no exception is
            # used for control flow and no list is copied per occurrence.
            kmers.setdefault(str(seq[i:i + k]), []).append((seqrec.id, i))
    return kmers

def saveKmers(kmers, k, filename):
    """Pickle ``kmers`` to ``<k>_kmers_<filename>_kb.pickle``.

    The target path is built by plain string concatenation, so it is relative
    to the current working directory unless ``k`` itself carries a path.
    """
    cache_path = "".join([str(k), '_kmers_', filename, '_kb.pickle'])
    with open(cache_path, 'wb') as handle:
        pickle.dump(kmers, handle)
        
def loadKmers(k, filename):
    """Load the object pickled in ``<k>_kmers_<filename>_kb.pickle``.

    Raises whatever ``open`` or ``pickle.load`` raise (for example
    ``FileNotFoundError`` when the file does not exist).
    """
    cache_path = "".join([str(k), '_kmers_', filename, '_kb.pickle'])
    with open(cache_path, 'rb') as handle:
        # NOTE(review): unpickling can execute arbitrary code; only load
        # cache files from a trusted source.
        index = pickle.load(handle)
    return index
    
def getKmers(k, filename):
    """Return the k-mer index for ``filename``, using the pickle cache if possible.

    The cached index is loaded with ``loadKmers``. If that fails for any
    ordinary reason (cache missing, unreadable, or not unpicklable) the index
    is rebuilt from the sequence file with ``createKmers(k, getSeqDB(filename))``,
    written back with ``saveKmers`` and returned.

    Parameters
    ----------
    k : k-mer length.
    filename : sequence file; also used to derive the cache file name.
    """
    try:
        return loadKmers(k, filename)
    except Exception:
        # Still a best-effort fallback, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate instead of triggering a
        # rebuild of the whole index.
        print("constructing kmers database")
        kmers = createKmers(k, getSeqDB(filename))
        saveKmers(kmers, str(k), filename)
        return kmers

In [3]:
def matchSeq(test, kmers, k):
    """Look up every length-``k`` window of ``test`` in the k-mer index.

    Parameters
    ----------
    test : query sequence (sliceable; each slice is used as a key of ``kmers``).
    kmers : mapping from k-mer to a list of ``(sequence id, offset)`` entries,
        as produced by ``createKmers``.
    k : window length.

    Returns
    -------
    pandas.DataFrame with one row per (query window, index entry) pair and the
    columns 'Kmer', 'SeqID', 'Kmer Ind' (offset in ``test``) and 'Seq Ind'
    (offset in the matched sequence). When nothing matches, the frame is empty
    but still carries these four columns.
    """
    columns = ['Kmer', 'SeqID', 'Kmer Ind', 'Seq Ind']
    hits = []

    for i in range(len(test) - k + 1):
        kmer = test[i:i + k]
        # A window that is absent from the index simply yields no rows; any
        # other error (e.g. a malformed index entry) is no longer hidden.
        for entry in kmers.get(kmer, ()):
            hits.append({'Kmer': kmer, 'SeqID': entry[0], 'Kmer Ind': i, 'Seq Ind': entry[1]})

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(hits, columns=columns)

def rankSeq(seqDB, t, rCount=True):
    """Rank matched sequences by their k-mer hits.

    Parameters
    ----------
    seqDB : DataFrame of hits with columns 'SeqID', 'Kmer', 'Kmer Ind' and
        'Seq Ind' (the layout returned by ``matchSeq``). A 'Distance' column,
        ``abs(Kmer Ind - Seq Ind)``, is added to this frame IN PLACE.
    t : only sequences with strictly more than ``t`` hits are kept.
    rCount : if true, sort by hit count (descending) and return every kept
        sequence; otherwise sort by 'Freq' (ascending) and return the first
        10 rows only.

    Returns
    -------
    (seqRank, seqDB) where ``seqRank`` is indexed by 'SeqID' with columns
    'Kmer' (number of hits), 'Distance' (sum of distances) and 'Freq'
    (Distance / Kmer, i.e. the mean distance per hit), and ``seqDB`` is the
    input frame including the new 'Distance' column.
    """
    # Column arithmetic instead of a row-wise apply: same values, one pass.
    seqDB["Distance"] = (seqDB["Kmer Ind"] - seqDB["Seq Ind"]).abs()

    hit_counts = seqDB[['SeqID', 'Kmer']].groupby(by="SeqID").count()
    distance_sums = seqDB[['SeqID', 'Distance']].groupby(by="SeqID").sum()
    seqRank = hit_counts.join(distance_sums)

    seqRank["Freq"] = seqRank["Distance"] / seqRank["Kmer"]
    seqRank = seqRank[seqRank['Kmer'] > t]

    if rCount:
        return seqRank.sort_values(by="Kmer", ascending=False), seqDB
    return seqRank.sort_values(by="Freq").iloc[0:10], seqDB

In [62]:
def main(filename, testseq, k=11, threshold=2):
    """Run the k-mer search for ``testseq`` against the sequences in ``filename``.

    Parameters
    ----------
    filename : sequence file to index (the index is cached by ``getKmers``).
    testseq : query sequence.
    k : k-mer length (default 11).
    threshold : sequences need strictly more than this many k-mer hits to be
        ranked (default 2).

    Returns
    -------
    (seqRank, seqDB): the ranking table and the per-hit table, both as
    returned by ``rankSeq``.
    """
    kmers = getKmers(k, filename)
    hits = matchSeq(testseq, kmers, k)
    # rankSeq returns a (ranking, hits) pair; unpack it so that the first
    # element returned here is the ranking DataFrame rather than that tuple.
    seqRank, seqDB = rankSeq(hits, threshold)

    return seqRank, seqDB

# Search small.fasta for the query protein below using 5-mers; only database
# sequences with more than 13 k-mer hits are kept in the ranking.
seqRank, seqDB = main(
    "small.fasta",
    """MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW""",
    k=5,
    threshold=13,
)
# Show the per-hit table.
seqDB

Unnamed: 0,Kmer,Kmer Ind,Seq Ind,SeqID,Distance
0,MSIIG,0,0,sp|Q6GZX3|002L_FRG3G,0
1,SIIGA,1,1,sp|Q6GZX3|002L_FRG3G,0
2,SIIGA,1,49,sp|P38720|6PGD1_YEAST,48
3,SIIGA,1,52,sp|P53319|6PGD2_YEAST,51
4,SIIGA,1,441,sp|Q9C8G9|AB1C_ARATH,440
5,IIGAT,2,2,sp|Q6GZX3|002L_FRG3G,0
6,IIGAT,2,50,sp|P38720|6PGD1_YEAST,48
7,IIGAT,2,53,sp|P53319|6PGD2_YEAST,51
8,IIGAT,2,460,sp|Q8T9W4|ABCB3_DICDI,458
9,IIGAT,2,267,sp|Q9P8Q7|ACEA_CANAX,265


In [67]:
# Merge the individual k-mer hits in seqDB into contiguous runs, one list of
# runs per matched sequence.
# Result: subseq is a list of (runs, SeqID) tuples; each run has the form
#   ((first 'Kmer Ind', last 'Kmer Ind'), (first 'Seq Ind', last 'Seq Ind')).
k = 5  # assigned here but not read anywhere in this cell
subseq = []
# Process hits in order of their offset in the query; the branch logic below
# relies on 'Kmer Ind' never decreasing from one row to the next.
seqDB = seqDB.sort_values(by='Kmer Ind')

for s in seqDB["SeqID"].unique():
    tempDB = seqDB[seqDB["SeqID"]==s]
    # s_seq[i] / e_seq[i] hold the first and the most recent (Kmer Ind, Seq Ind)
    # pair of open run i. Both names start out bound to the same empty list,
    # but each is rebound with `+` before any in-place edit, so they do not
    # stay aliased.
    s_seq = e_seq = []
    temp = []  # runs already closed for this sequence

    for j in range(0, len(tempDB)): 
        row = tempDB.iloc[j]
        rem = []       # indices of open runs this row shows to be finished
        toAdd = True   # whether this row should open a new run

        for i in range(0, len(e_seq)):
            if row['Kmer Ind'] - e_seq[i][0] > 1:
                # Run i ended more than one query position ago: close it.
                rem = rem + [i]
            elif row['Kmer Ind'] == e_seq[i][0]:
                # Same query position as the run's end; an identical
                # 'Seq Ind' means the hit is already represented.
                if row['Seq Ind'] == e_seq[i][1]:
                    toAdd = False
            elif row['Kmer Ind'] - e_seq[i][0] == 1:
                if row['Seq Ind'] - e_seq[i][1] == 1:
                    # One step further in both query and sequence: extend run i.
                    e_seq[i] = (row['Kmer Ind'], row['Seq Ind'])
                    toAdd = False
                elif row['Seq Ind'] == e_seq[i][1]:
                    # Query advanced by one but 'Seq Ind' did not: the hit is
                    # dropped (it neither extends run i nor starts a new run).
                    toAdd = False
            else:
                # Only reachable when this row's 'Kmer Ind' is smaller than the
                # run's end, which the sort above is meant to rule out.
                print("HELP: ERROR")
        if toAdd:
            # Open a new run whose start and end are both this hit.
            e_seq = e_seq + [(row['Kmer Ind'], row['Seq Ind'])]
            s_seq = s_seq + [(row['Kmer Ind'], row['Seq Ind'])]
        
        # Move finished runs to temp. rem is in ascending order, so subtracting
        # i compensates for the entries already deleted in this loop; a run
        # appended just above sits at the end and is not affected.
        for i in range(0, len(rem)):
            temp = temp+[((s_seq[rem[i]-i][0], e_seq[rem[i]-i][0]), (s_seq[rem[i]-i][1], e_seq[rem[i]-i][1]))]
            del s_seq[rem[i]-i]
            del e_seq[rem[i]-i]

    # Flush the runs that were still open after the last hit.
    for sS, eS in zip(s_seq, e_seq):
        temp = temp + [((sS[0], eS[0]), (sS[1], eS[1]))]
    subseq = subseq + [(temp, s)]

# For the first sequence only: print each run followed by
# (query span) - (sequence span), computed from the run's start/end offsets.
for i in subseq[0][0]:
    print(i, '\t', i[0][1]-i[0][0]-i[1][1]+i[1][0])

((262, 262), (263, 263)) 	 0
((263, 263), (262, 262)) 	 0
((269, 270), (287, 288)) 	 0
((269, 273), (284, 288)) 	 0
((269, 276), (281, 288)) 	 0
((269, 279), (278, 288)) 	 0
((269, 282), (275, 288)) 	 0
((269, 285), (272, 288)) 	 0
((272, 288), (269, 285)) 	 0
((275, 288), (269, 282)) 	 0
((278, 288), (269, 279)) 	 0
((281, 288), (269, 276)) 	 0
((284, 288), (269, 273)) 	 0
((287, 288), (269, 270)) 	 0
((0, 315), (0, 315)) 	 0


In [71]:
# Turn the runs collected in subseq into one table row per run.
# k is added to the offset of a run's last k-mer to get the end offset of the
# run (one past its last covered position); the run-building cell also sets
# k = 5 and the search above was run with k = 5.
k = 5
match_columns = ['Seq ID', 'matching #', 'inp start', 'inp end', 'inp length', 'seq start', 'seq end', 'seq length']
matchDF = []

for runs, seq_id in subseq:
    for (inp_first, inp_last), (seq_first, seq_last) in runs:
        # A fresh dict per run (no shared template to copy), appended in place
        # rather than rebuilding the whole list on every iteration.
        row = {
            'Seq ID': seq_id,
            'matching #': inp_last - inp_first + 1,  # number of consecutive k-mers in the run
            'inp start': inp_first,
            'inp end': inp_last + k,
            'inp length': inp_last + k - inp_first,
            'seq start': seq_first,
            'seq end': seq_last + k,
            'seq length': seq_last + k - seq_first,
        }
        matchDF.append(row)

# Passing columns= fixes the column order and keeps the expression valid when
# subseq produced no runs at all (selecting columns from an empty frame fails).
pd.DataFrame(matchDF, columns=match_columns).sort_values(by='matching #', ascending=False).reset_index(drop=True)

Unnamed: 0,Seq ID,matching #,inp start,inp end,inp length,seq start,seq end,seq length
0,sp|Q6GZX3|002L_FRG3G,316,0,320,320,0,320,320
1,sp|Q6GZX3|002L_FRG3G,17,269,290,21,272,293,21
2,sp|Q6GZX3|002L_FRG3G,17,272,293,21,269,290,21
3,sp|Q6GZX3|002L_FRG3G,14,269,287,18,275,293,18
4,sp|Q6GZX3|002L_FRG3G,14,275,293,18,269,287,18
5,sp|Q6GZX3|002L_FRG3G,11,269,284,15,278,293,15
6,sp|Q6GZX3|002L_FRG3G,11,278,293,15,269,284,15
7,sp|Q6GZX3|002L_FRG3G,8,269,281,12,281,293,12
8,sp|Q6GZX3|002L_FRG3G,8,281,293,12,269,281,12
9,sp|Q91G63|034R_IIV6,6,271,281,10,32,42,10


In [51]:
# Display the raw (runs, SeqID) list built by the run-merging cell.
# NOTE(review): this cell's execution count (51) is lower than that of the cell
# which builds subseq (67), so the stored output presumably comes from an
# earlier run with different input. Re-run after Restart & Run All.
subseq

[([((0, 55), (0, 55)),
   ((65, 120), (60, 115)),
   ((130, 185), (120, 175)),
   ((195, 250), (180, 235)),
   ((282, 282), (263, 263)),
   ((289, 290), (287, 288)),
   ((289, 293), (284, 288)),
   ((289, 296), (281, 288)),
   ((289, 299), (278, 288)),
   ((289, 302), (275, 288)),
   ((289, 305), (272, 288)),
   ((292, 308), (269, 285)),
   ((295, 308), (269, 282)),
   ((298, 308), (269, 279)),
   ((301, 308), (269, 276)),
   ((304, 308), (269, 273)),
   ((307, 308), (269, 270)),
   ((260, 315), (240, 295)),
   ((325, 340), (300, 315))],
  'sp|Q6GZX3|002L_FRG3G')]