In [1]:
import os
from dotenv import load_dotenv
import numpy as np
import tensorflow as tf
import pandas as pd
import datetime

# Load variables from a .env file, letting them override the existing environment.
load_dotenv(override=True)

# Root directory that every later cell joins its input/output paths onto
# (FASTA, OBO, pickled binarizers, saved models, submission file).
DATA_PATH = os.getenv('DATA_PATH')
if DATA_PATH is None:
    # Fail here with a clear message instead of a TypeError from os.path.join later on.
    raise RuntimeError("DATA_PATH is not set; define it in the environment or in a .env file.")
print(DATA_PATH)

# Subontologies handled by this notebook (one binarizer and one model each).
SOs = ['CCO', 'MFO', 'BPO']

2023-06-08 22:00:23.211546: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


/mnt/e/ML/cafa-5-protein-function-prediction


## Reading FASTA, OBO and TSV files

In [38]:
from Bio import SeqIO

# Parse the FASTA file a single time, filling both lists in the same pass so
# that ids[i] always belongs to sequences[i].
sequences = []
ids = []
for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"), "fasta"):
    sequences.append(rec.seq)
    ids.append(rec.id)

In [39]:
# Report how many sequences were read from the FASTA file.
sequence_count = len(sequences)
print("There are {} sequences in the dataset.".format(sequence_count))

There are 141865 sequences in the dataset.


In [40]:
import networkx
import obonet

# Read the ontology graph from the go-basic.obo file under DATA_PATH/Train.
# NOTE: despite its name, `url` holds a local filesystem path here.
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)



## Label encoding

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle 


def _load_pickled(filename):
    """Unpickle and return the object stored in DATA_PATH/filename."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


# One pickled label binarizer per subontology.
mlbCco = _load_pickled('MLB_CCO.pkl')
mlbMfo = _load_pickled('MLB_MFO.pkl')
mlbBpo = _load_pickled('MLB_BPO.pkl')

# Number of classes known to each binarizer, in CCO / MFO / BPO order.
for binarizer in (mlbCco, mlbMfo, mlbBpo):
    print(len(binarizer.classes_))

2957
7224
21285


## Amino acid encoding

In [6]:
# One-letter amino-acid symbols, kept in the original order (note: 'X' comes last).
aminos_list = list('ABCDEFGHIKLMNOPQRSTUVWYX')

In [7]:
# Integer code for every one-letter amino-acid symbol.
# Codes 1-20 follow alphabetical order over A, C-I, K-N, P-T, V, W, Y;
# the remaining symbols use O=21, U=22, Z=23, B=24 and X=30.
aa_dict = {
    'A': 1, 'B': 24, 'C': 2, 'D': 3, 'E': 4,
    'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9,
    'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13,
    'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22,
    'V': 18, 'W': 19, 'Y': 20, 'X': 30, 'Z': 23,
}

## Build Dataset

In [8]:
# NOTE(review): presumably the maximum sequence length in the data — TODO confirm.
# Not referenced by any of the cells visible in this notebook section.
maxLen = 35375

In [51]:
import warnings
from itertools import product

# Length of the k-mers counted for every sequence.
k = 3

# All k-letter words over the sorted symbol alphabet, in lexicographic order.
allAA = sorted(aa_dict.keys())
allCombinations = np.array([''.join(letters) for letters in product(allAA, repeat=k)])

# Maps each k-mer to its index in the frequency vector.
positionDict = dict(zip(allCombinations, np.arange(allCombinations.size)))


def mapping(x):
    """Return the integer code that aa_dict assigns to symbol x."""
    return aa_dict[x]


# Element-wise version of `mapping` for arrays of symbols.
vectMapping = np.vectorize(mapping)


def generator():
    """Yield (id, k-mer count vector) for every sequence, in input order.

    Reads the module-level `sequences`, `ids`, `k`, `allCombinations` and
    `positionDict`.

    Yields:
        tuple: `(ids[n], freqVector)` where `freqVector` is a float array with
        the shape of `allCombinations`; entry `positionDict[kmer]` holds the
        number of (overlapping) occurrences of `kmer` in the sequence.
        Sequences shorter than `k` yield an all-zero vector.

    Raises:
        KeyError: if a k-mer of a sequence is not a key of `positionDict`.
    """
    for seq_id, seq in zip(ids, sequences):
        # Work on a plain string so every window is a cheap str slice.
        residues = str(seq)
        # Sliding window: one k-mer per start position; the range is empty when
        # the sequence is shorter than k and is also correct for k == 1.
        kmers = [residues[start:start + k] for start in range(len(residues) - k + 1)]
        values, counts = np.unique(kmers, return_counts=True)
        freqVector = np.zeros(allCombinations.shape)
        for kmer, count in zip(values, counts):
            freqVector[positionDict[kmer]] = count
        yield seq_id, freqVector



In [43]:
g = generator()
test = next(g)
print("The first sample sequence: {}".format(test))

The first sample sequence: ('Q9CQV8', array([1., 0., 0., ..., 0., 0., 0.]))


## TensorFlow Classification

In [53]:
import tensorflow as tf


# Each generator item is a scalar string id plus a fixed-length count vector
# declared as int32 with one entry per k-mer in allCombinations.
id_spec = tf.TensorSpec(shape=(), dtype=tf.dtypes.string)
counts_spec = tf.TensorSpec(shape=(allCombinations.size,), dtype=tf.int32)

dataset = tf.data.Dataset.from_generator(
    generator,
    output_signature=(id_spec, counts_spec),
)

# Materialise the first element as a quick sanity check.
print(list(dataset.take(1)))


[(<tf.Tensor: shape=(), dtype=string, numpy=b'Q9CQV8'>, <tf.Tensor: shape=(15625,), dtype=int32, numpy=array([1, 0, 0, ..., 0, 0, 0], dtype=int32)>)]


2023-06-08 22:33:59.433574: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


In [49]:
def _load_saved_model(dirname):
    """Load the Keras model saved under DATA_PATH/dirname."""
    return tf.keras.saving.load_model(os.path.join(DATA_PATH, dirname))


# One saved model per subontology.
CCOmodel = _load_saved_model("model_CCO_epoch_20_valF1Score0.738")
MFOmodel = _load_saved_model("model_MFO_epoch_16_valF1Score0.853")
BPOmodel = _load_saved_model("model_BPO_epoch_9_valF1Score0.568")



In [54]:
from tqdm import  tqdm
# probs= CCOmodel.predict(tf.expand_dims(list(dataset.take(64))[0][1], 0))
# prediction= [1 if p > 0.5 else 0 for p in probs[0]]
# probabilities= probs[probs>0.5]
# # classes = np.argwhere(prediction)
# print(mlb.inverse_transform(np.array([prediction])))
# print(probabilities)


BATCH_SIZE = 512
# A term is reported only when its score is strictly greater than this value.
PROBABILITY_THRESHOLD = 0.5


def _rows_for_batch(entry_ids, probs, mlb, threshold=PROBABILITY_THRESHOLD):
    """Convert one batch of model scores into [entry id, term, score] rows.

    Args:
        entry_ids: decoded entry ids, one per row of `probs`.
        probs: 2-D array of scores, one row per entry and one column per
            class of `mlb`.
        mlb: label binarizer; `mlb.classes_[j]` is the label of column j.
        threshold: a term is kept when its score is strictly greater than this.

    Returns:
        list: `[entry_id, term, score]` lists, entries in batch order and
        terms in `mlb.classes_` order.
    """
    rows = []
    for entry_id, prob in zip(entry_ids, probs):
        # Boolean mask over the classes; indexing classes_ and prob with the
        # same mask keeps every label paired with its own score.
        keep = prob > threshold
        rows.extend([entry_id, term, score]
                    for term, score in zip(mlb.classes_[keep], prob[keep]))
    return rows


batchedDataset = dataset.batch(BATCH_SIZE)
tableData = []

# One (model, binarizer) pair per subontology; within every batch the rows are
# emitted in this order (CCO, then MFO, then BPO).
predictors = [(CCOmodel, mlbCco), (MFOmodel, mlbMfo), (BPOmodel, mlbBpo)]

# tqdm gets no length from the dataset (the progress bar otherwise shows
# "?it/s" with no total), so pass the number of batches explicitly.
n_batches = -(-len(sequences) // BATCH_SIZE)  # ceiling division

for entries, data in tqdm(batchedDataset, total=n_batches):
    # Decode the ids once per batch instead of once per emitted row.
    entry_ids = [raw.decode("utf-8") for raw in entries.numpy()]
    for model, mlb in predictors:
        probs = model.predict_on_batch(data)
        tableData.extend(_rows_for_batch(entry_ids, probs, mlb))


results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])


0it [00:00, ?it/s]2023-06-08 22:34:04.616989: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-06-08 22:34:05.177981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [512,15625]
	 [[{{node Placeholder/_0}}]]
2023-06-08 22:34:12.759388: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32 and shape [5

In [55]:
# Write the predictions as a tab-separated file without header or index column.
submission_path = os.path.join(DATA_PATH, "submission.tsv")
results.to_csv(submission_path, sep="\t", header=False, index=False)