In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import tensorflow as tf
import pandas as pd
import datetime
from tqdm.notebook import tqdm

load_dotenv(override=True)

DATA_PATH = os.getenv('DATA_PATH')
print(DATA_PATH)

# subontology (CCO, BPO or BPO)
SOs = ['CCO', 'MFO', 'BPO']

## Reading fasta, obo and tsv files

In [None]:
from Bio import SeqIO

sequences = [rec.seq for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]
ids = [rec.id for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]

In [None]:
print("There are {} sequences in the dataset.".format(len(sequences)))

In [None]:
import networkx
import obonet

# Read the taxrank ontology
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)



## Label encoding

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle 


with open(os.path.join(DATA_PATH,'MLB_CCO.pkl'), 'rb') as f:
    mlbCco = pickle.load(f)
with open(os.path.join(DATA_PATH,'MLB_MFO.pkl'), 'rb') as f:
    mlbMfo = pickle.load(f)
with open(os.path.join(DATA_PATH,'MLB_BPO.pkl'), 'rb') as f:
    mlbBpo = pickle.load(f)

print(len(mlbCco.classes_))
print(len(mlbMfo.classes_))
print(len(mlbBpo.classes_))

## Amino acids encoding

In [None]:
aminos_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'X']

In [None]:
aa_dict = {'A': 1, 'B':24, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22, 'V': 18, 'W': 19, 'Y': 20, 'X':30, 'Z':23}

## Build Dataset

In [None]:
maxLen = 35375

In [None]:
import warnings
from itertools import product

k = 3

allAA = list(aa_dict.keys())
allAA.sort()
allCombinations= list(product(*(allAA for i in range(k))))
allCombinations=np.array([''.join(el) for el in allCombinations])

positionDict = dict(zip(allCombinations, np.arange(0,allCombinations.size).T))

mapping = lambda x: aa_dict[x]
vectMapping = np.vectorize(mapping)


def generator():
    for i,seq in enumerate(sequences):
        kmers = [seq[j:j+k] if j < len(seq)-(k-1) else 0 for j,el in enumerate(seq)]
        kmers = kmers[0:-(k-1)]
        kmers = [str(el) for el in kmers]
        values, counts = np.unique(kmers, return_counts=True)
        freqVector=np.zeros(allCombinations.shape)
        for j,v in enumerate(values):
            freqVector[positionDict[v]] = counts[j]
        yield ids[i], freqVector



In [None]:
g = generator()
test = next(g)
print("The first sample sequence: {}".format(test))

## Tensorflow Classification

In [None]:
import tensorflow as tf


dataset = tf.data.Dataset.from_generator(generator, output_signature=(tf.TensorSpec(shape=(),dtype=tf.dtypes.string),
         tf.TensorSpec(shape=(allCombinations.size,), dtype=tf.int32)))
print(list(dataset.take(1)))


In [None]:
CCOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_CCO_epoch_20_valF1Score0.738"))
MFOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_MFO_epoch_16_valF1Score0.853"))
# BPOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_BPO_epoch_9_valF1Score0.568"))

In [None]:

# probs= CCOmodel.predict(tf.expand_dims(list(dataset.take(64))[0][1], 0))
# prediction= [1 if p > 0.5 else 0 for p in probs[0]]
# probabilities= probs[probs>0.5]
# # classes = np.argwhere(prediction)
# print(mlb.inverse_transform(np.array([prediction])))
# print(probabilities)


batchedDataset = dataset.batch(512)
tableData=[]

for entries, data in tqdm(batchedDataset):

    probsCCO= CCOmodel.predict_on_batch(data)
    probsMFO= MFOmodel.predict_on_batch(data)
    # probsBPO= BPOmodel.predict_on_batch(data)

    for i,prob in enumerate(probsCCO):
        prediction = np.where(probsCCO[i] > 0.5, 1, 0)
        # prediction= [1 if p > 0.5 else 0 for p in prob]
        probabilities= prob[prob>0.5]
        entry = entries[i]
        GOs = mlbCco.inverse_transform(np.array([prediction]))
        for j,g in enumerate(GOs[0]):
            tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

    for i,prob in enumerate(probsMFO):
        prediction = np.where(probsMFO[i] > 0.5, 1, 0)
        # prediction= [1 if p > 0.5 else 0 for p in prob]
        probabilities= prob[prob>0.5]
        entry = entries[i]
        GOs = mlbMfo.inverse_transform(np.array([prediction]))
        for j,g in enumerate(GOs[0]):
            tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

    # for i,prob in enumerate(probsBPO):
    #     prediction = np.where(probsBPO[i] > 0.5, 1, 0)
    #     # prediction= [1 if p > 0.5 else 0 for p in prob]
    #     probabilities= prob[prob>0.5]
    #     entry = entries[i]
    #     GOs = mlbBpo.inverse_transform(np.array([prediction]))
    #     for j,g in enumerate(GOs[0]):
    #         tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

        
# results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])


## 1vR Classifiers

In [None]:

with open(os.path.join(DATA_PATH,"ClassifierArray_"+"MFO"+".bin"), "rb") as f: 
    classifiersMFO = pickle.load(f)
with open(os.path.join(DATA_PATH,"ClassifierArray_"+"BPO"+".bin"), "rb") as f: 
    classifiersBPO = pickle.load(f)
with open(os.path.join(DATA_PATH,"ClassifierArray_"+"CCO"+".bin"), "rb") as f: 
    classifiersCCO = pickle.load(f)
 

Split classifiers for multiprocessing

In [None]:
NoClassifiersMFO = len(classifiersMFO)
classifiersMFO1 = [c if m>=0 and m<NoClassifiersMFO//3 else None for m,c in enumerate(classifiersMFO)]
classifiersMFO2 = [c if m>=NoClassifiersMFO//3 and m<2*NoClassifiersMFO//3 else None for m,c in enumerate(classifiersMFO)]
classifiersMFO3 = [c if m>=2*NoClassifiersMFO//3 and m<NoClassifiersMFO else None for m,c in enumerate(classifiersMFO)]

NoClassifiersBPO = len(classifiersBPO)
classifiersBPO1 = [c if m>=0 and m<NoClassifiersBPO//3 else None for m,c in enumerate(classifiersBPO)]
classifiersBPO2 = [c if m>=NoClassifiersBPO//3 and m<2*NoClassifiersBPO//3 else None for m,c in enumerate(classifiersBPO)]
classifiersBPO3 = [c if m>=2*NoClassifiersBPO//3 and m<NoClassifiersBPO else None for m,c in enumerate(classifiersBPO)]

NoClassifiersCCO = len(classifiersCCO)
classifiersCCO1 = [c if m>=0 and m<NoClassifiersCCO//3 else None for m,c in enumerate(classifiersCCO)]
classifiersCCO2 = [c if m>=NoClassifiersCCO//3 and m<2*NoClassifiersCCO//3 else None for m,c in enumerate(classifiersCCO)]
classifiersCCO3 = [c if m>=2*NoClassifiersCCO//3 and m<NoClassifiersCCO else None for m,c in enumerate(classifiersCCO)]

In [18]:
from multiprocessing import Process, Pool
import multiprocessing


def task(classifiers, mlb, freqVectors, entryIds):
    temp=[]
    for j,c in tqdm(enumerate(classifiers), total=len(classifiers), position=1, leave=False):
        if(c is None):
            continue
        resArr = c.predict_proba(freqVectors)
        probas=[res[1] for res in resArr]
        for m,p in enumerate(probas):
            if(p>0.99):
                temp.append([entryIds[m] , mlb.classes_[j], p])
    return temp



tableData=[]
g = generator()
entryIds=[]
freqVectors=[]

for i,(entryId, fVec) in tqdm(enumerate(g), smoothing=0.05, total=len(sequences), position=0, leave=False):
    entryIds.append(entryId)
    freqVectors.append(fVec)
        
    if(len(freqVectors)>1) or i==len(sequences)-1:
        # res = classifiers[0].predict(freqVectors[0:1000])
        # results = np.concatenate((results,res))
        # p1=Process(target=task, args=(classifiersCCO, freqVectors, entryIds))
        # p2=Process(target=task, args=(classifiersMFO, freqVectors, entryIds))
        # p3=Process(target=task, args=(classifiersBPO, freqVectors, entryIds))
        with Pool(processes=12) as pool:
            iterable=[(classifiersCCO1, mlbCco, freqVectors, entryIds), (classifiersCCO2, mlbCco, freqVectors, entryIds), (classifiersCCO3, mlbCco, freqVectors, entryIds),
                      (classifiersBPO1, mlbBpo, freqVectors, entryIds), (classifiersBPO2, mlbBpo, freqVectors, entryIds), (classifiersBPO3, mlbBpo, freqVectors, entryIds),
                      (classifiersMFO1, mlbMfo, freqVectors, entryIds), (classifiersMFO2, mlbMfo, freqVectors, entryIds), (classifiersMFO3, mlbMfo, freqVectors, entryIds)]
            result = pool.starmap(task, iterable)
            # print(result)
            for r in result:
                tableData = tableData+r
            freqVectors=[]
            entryIds=[]

  0%|          | 0/141865 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])

In [20]:
results

Unnamed: 0,Entry ID,GO,Probability
0,P62259,GO:0005871,0.997291
1,P62259,GO:0005875,0.997826
2,P62259,GO:0005930,0.992776
3,Q9CQV8,GO:0043226,0.992121
4,P62259,GO:0045171,0.993526
...,...,...,...
188,P61982,GO:0051219,0.995546
189,P68510,GO:0099106,0.995813
190,P68510,GO:0140313,0.996503
191,P61982,GO:0140313,0.996049


In [None]:
results.to_csv(os.path.join(DATA_PATH, "submission.tsv"), sep="\t", header=False, index=False)