In [1]:
from Bio import SeqIO
import pandas as pd
import pickle

In [2]:
def getSeqDB(filename, ftype="fasta"):
    geneDB = []
    with open(filename, 'r') as allseqs:
        geneDB = list(SeqIO.parse(allseqs, ftype))
    return geneDB

def createKmers(k, geneDB):
    kmers = {}
    for seqrec in geneDB:
        seq = seqrec.seq
        for i in range(0, len(seq)-k+1):
            ind = str(seq[i:i+k])
            try: 
                kmers[ind] = kmers[ind] + [(seqrec.id, i)]
            except:
                kmers[ind] = [(seqrec.id , i)]
    return kmers

def saveKmers(kmers, k, filename):
    with open(str(k)+'_kmers_'+filename+'_kb.pickle', 'wb') as kfile:
        pickle.dump(kmers, kfile)
        
def loadKmers(k, filename):
    with open(str(k)+'_kmers_'+filename+'_kb.pickle', 'rb') as kfile:
        return pickle.load(kfile)
    
def getKmers(k, filename):
    try:
        return loadKmers(k, filename)
    except:
        print("constructing kmers database")
        kmers = createKmers(k, getSeqDB(filename))
        saveKmers(kmers, str(k), filename)
        return kmers

In [3]:
def matchSeq(test, kmers, k):
    row = {'Kmer': '_', 'SeqID':'None', 'Kmer Ind':-1, 'Seq Ind':-1}
    seqDB = []
    
    for i in range(0, len(test)-k+1):
        ind = test[i:i+k]
        try:
            row['Kmer'] = ind
            row['Kmer Ind'] = i
            for seq in kmers[ind]:
                row['SeqID'] = seq[0]
                row['Seq Ind'] = seq[1]
                seqDB = seqDB+[dict(row)]
        except:
            pass
    return pd.DataFrame(seqDB)

def rankSeq(seqDB, t, rCount=True):
    seqDB["Distance"] = seqDB.apply(lambda row: abs(row["Kmer Ind"]-row["Seq Ind"]), axis=1)
    seqRank = seqDB[['SeqID', 'Kmer']].groupby(by="SeqID").count().join(seqDB[['SeqID', 'Distance']].groupby(by="SeqID").sum())
    seqRank["Freq"] = seqRank.apply(lambda row: row["Distance"]/row["Kmer"], axis=1)
    seqRank = seqRank[seqRank['Kmer'].map(int)>t]
    if (rCount):
        return seqRank.sort_values(by="Kmer", ascending=False), seqDB
    else:
        return seqRank.sort_values(by="Freq").iloc[0:10], seqDB

In [4]:
def main(filename, testseq, k=11, threshold=2):

    kmers = getKmers(k, filename) 
    seqRank = rankSeq(matchSeq(testseq, kmers, k), threshold)
    
    return (seqRank)
    

seqRank, seqDB = main("small.fasta", """MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
    IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
    AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
    KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
    DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
    VMFFVAGAVLVAILISTVRW""", 5, 13)
seqRank

Unnamed: 0_level_0,Kmer,Distance,Freq
SeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sp|Q6GZX3|002L_FRG3G,412,5520,13.398058
sp|Q8R5A3|AB1IP_MOUSE,41,10510,256.341463
sp|Q91G63|034R_IIV6,40,10558,263.95
sp|Q6PFT9|AB1IP_DANRE,36,9418,261.611111
sp|Q7Z5R6|AB1IP_HUMAN,25,6943,277.72
sp|Q55FT9|ABIA_DICDI,17,460,27.058824
sp|Q8CBW3|ABI1_MOUSE,16,1436,89.75
sp|Q8IZP0|ABI1_HUMAN,16,1868,116.75
sp|Q9QZM5|ABI1_RAT,16,1356,84.75
sp|Q6DCV1|AB1IP_XENLA,15,4030,268.666667


In [150]:
seqDB

Unnamed: 0,Kmer,Kmer Ind,Seq Ind,SeqID,Distance
0,MSIIG,0,0,sp|Q6GZX3|002L_FRG3G,0
1,SIIGA,1,1,sp|Q6GZX3|002L_FRG3G,0
2,SIIGA,1,49,sp|P38720|6PGD1_YEAST,48
3,SIIGA,1,52,sp|P53319|6PGD2_YEAST,51
4,SIIGA,1,441,sp|Q9C8G9|AB1C_ARATH,440
5,IIGAT,2,2,sp|Q6GZX3|002L_FRG3G,0
6,IIGAT,2,50,sp|P38720|6PGD1_YEAST,48
7,IIGAT,2,53,sp|P53319|6PGD2_YEAST,51
8,IIGAT,2,460,sp|Q8T9W4|ABCB3_DICDI,458
9,IIGAT,2,267,sp|Q9P8Q7|ACEA_CANAX,265


In [152]:
seqDB

Unnamed: 0,Kmer,Kmer Ind,Seq Ind,SeqID,Distance
0,MSIIG,0,0,sp|Q6GZX3|002L_FRG3G,0
1,SIIGA,1,1,sp|Q6GZX3|002L_FRG3G,0
2,SIIGA,1,49,sp|P38720|6PGD1_YEAST,48
3,SIIGA,1,52,sp|P53319|6PGD2_YEAST,51
4,SIIGA,1,441,sp|Q9C8G9|AB1C_ARATH,440
5,IIGAT,2,2,sp|Q6GZX3|002L_FRG3G,0
6,IIGAT,2,50,sp|P38720|6PGD1_YEAST,48
7,IIGAT,2,53,sp|P53319|6PGD2_YEAST,51
8,IIGAT,2,460,sp|Q8T9W4|ABCB3_DICDI,458
9,IIGAT,2,267,sp|Q9P8Q7|ACEA_CANAX,265
