In [1]:
import os
from dotenv import load_dotenv
import numpy as np
import tensorflow as tf
import pandas as pd
import datetime
from tqdm.notebook import tqdm

load_dotenv(override=True)

DATA_PATH = os.getenv('DATA_PATH')
DATA_PATH_INTERPRO = os.getenv('DATA_PATH_INTERPRO')
print(DATA_PATH)
print(DATA_PATH_INTERPRO)

# subontology (CCO, BPO or BPO)
SOs = ['CCO', 'MFO', 'BPO']

2023-08-09 17:15:31.209773: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


/mnt/e/ML/cafa-5-protein-function-prediction
/mnt/e/ML/output


## Reading fasta, obo and tsv files

In [2]:
from Bio import SeqIO

sequences = [rec.seq for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]
ids = [rec.id for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]

In [3]:
print("There are {} sequences in the dataset.".format(len(sequences)))

There are 141865 sequences in the dataset.


In [4]:
import networkx
import obonet

# Read the taxrank ontology
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)



## Label encoding

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle 


with open(os.path.join(DATA_PATH,'MLB_CCO.pkl'), 'rb') as f:
    mlbCco = pickle.load(f)
with open(os.path.join(DATA_PATH,'MLB_MFO.pkl'), 'rb') as f:
    mlbMfo = pickle.load(f)
with open(os.path.join(DATA_PATH,'MLB_BPO.pkl'), 'rb') as f:
    mlbBpo = pickle.load(f)

print(len(mlbCco.classes_))
print(len(mlbMfo.classes_))
print(len(mlbBpo.classes_))

2055
4282
11807


## Amino acids encoding

In [6]:
aminos_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'X']

In [7]:
aa_dict = {'A': 1, 'B':24, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22, 'V': 18, 'W': 19, 'Y': 20, 'X':30, 'Z':23}

## T5 Embeds

In [8]:
train_embeddings = np.load(os.path.join(DATA_PATH, "t5/test_embeds.npy"))

column_num = train_embeddings.shape[1]
t5df = pd.DataFrame(
    train_embeddings, columns=["Column_" + str(i) for i in range(1, column_num + 1)]
)
t5Dimension = t5df.shape[1]

train_protein_ids = np.load(os.path.join(DATA_PATH, "t5/test_ids.npy"))
t5df["ids"] = train_protein_ids
print(train_protein_ids.shape)
print(t5df.shape)
t5df = t5df.drop_duplicates(subset=["ids"])
print(t5df.shape)
t5df.set_index("ids", inplace=True)
t5df.head()

(141865,)
(141865, 1025)
(141864, 1025)


Unnamed: 0_level_0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9CQV8,0.054705,0.06342,-0.01532,-0.016506,0.042195,0.021592,-0.118535,-0.063298,-0.046146,-0.102311,...,-0.019581,-0.043712,-0.072322,0.002404,0.018459,-0.047278,0.012195,-0.043319,0.036009,0.063093
P62259,0.090373,0.089842,-0.023887,-0.011446,0.051465,0.020982,-0.110989,-0.066646,-0.041259,-0.087551,...,-0.024399,-0.041957,-0.066329,0.006856,0.028449,-0.053758,0.009699,-0.05335,0.019644,0.07963
P68510,0.043588,0.039572,-0.014332,-0.011769,0.045109,0.015847,-0.103339,-0.047735,-0.02273,-0.091452,...,-0.029648,-0.037944,-0.046043,0.003603,0.018028,-0.030746,0.003671,-0.044464,0.030974,0.040322
P61982,0.055668,0.04956,-0.019646,-0.006977,0.039897,0.021177,-0.108079,-0.047191,-0.031517,-0.100057,...,-0.02321,-0.041704,-0.04844,0.006088,0.02011,-0.046751,-0.006635,-0.041455,0.016683,0.05703
O70456,0.022637,0.014306,-0.002696,-0.034456,0.034854,0.020822,-0.114046,-0.050019,-0.026491,-0.097928,...,-0.019185,-0.032108,-0.051394,0.008448,0.015208,-0.037987,0.030977,-0.042407,0.041232,0.047161


## Interpro Data

In [9]:
import pickle


with open(os.path.join(DATA_PATH,'MLB_InterPro_'+"BPO"+'.pkl'), 'rb') as f:
    mlbInterpro= pickle.load(f)

In [10]:
import json

allInterproData =[]

for root,dirs,files in os.walk(os.path.join(DATA_PATH_INTERPRO, "test")):
    for f in files:
        if f.endswith(".json"):
            print("Processing ", f)
            with open(os.path.join(root, f)) as inputFile:
                iprData = json.load(inputFile)
            allInterproData=[*allInterproData, *iprData["results"]]

Processing  testsuperset1.fasta.json
Processing  testsuperset2.fasta.json


In [11]:
len(allInterproData)

139946

In [12]:

iprIds = {}


for entry in tqdm(allInterproData):
    entryId = entry["xref"][0]["id"]
    matches=[]
    for match in entry["matches"]:
        sigEntry = match["signature"]["entry"]
        if(sigEntry):
            type = sigEntry["type"]
            if type=="DOMAIN" or type=="REPEAT" or type=="FAMILY" or type=="HOMOLOGOUS_SUPERFAMILY":
                iprId = match["signature"]["entry"]["accession"]
                matches.append(iprId)
    iprIds[entryId] = matches

  0%|          | 0/139946 [00:00<?, ?it/s]

In [13]:
testInput = mlbInterpro.transform([iprIds["Q55G04"]])
np.count_nonzero(testInput)

4

## Physiochemical Properties

In [14]:

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Seq import MutableSeq, Seq
from tqdm.notebook import tqdm


if os.path.exists(os.path.join(DATA_PATH, "PCDictTest"+".pkl")):
    print("Loading presaved data")
    with open(os.path.join(DATA_PATH, "PCDictTest"+".pkl"), 'rb') as f:
        PCDict = pickle.load(f)
else:
    PCDict = {}

    for i,seq in enumerate(tqdm(sequences)):

        index = ids[i]
        
        X =ProteinAnalysis(seq)

        if "X" in seq or "U" in seq or "O" in seq or "B" in seq or "Z" in seq:
            cleanedSeq = seq.replace("X", "A")
            cleanedSeq = cleanedSeq.replace("U", "A")
            cleanedSeq = cleanedSeq.replace("O", "A")
            cleanedSeq = cleanedSeq.replace("B", "A")
            cleanedSeq = cleanedSeq.replace("Z", "A")
            XClean =ProteinAnalysis(cleanedSeq)
            flex = XClean.flexibility()
            molW = XClean.molecular_weight()
            instabIdx = XClean.instability_index()
            gravy = XClean.gravy()
        else:
            flex= X.flexibility()
            molW = X.molecular_weight()
            instabIdx = X.instability_index()
            gravy = X.gravy()

        if len(flex)>10:
            idx = np.round(np.linspace(0, len(flex) - 1, 10)).astype(int)
            flex = np.array(flex)[idx]
        elif len(flex)<10:
            flex = np.pad(flex, (0,10-len(flex)))

        protS= X.protein_scale(aa_dict,100)
        if len(protS)>10:
            idx = np.round(np.linspace(0, len(protS) - 1, 10)).astype(int)
            protS = np.array(protS)[idx]
        elif len(protS)<10:
            protS = np.pad(protS, (0,10-len(protS)))

        #Adding all the physiochemical properties (N = 53)
        PCDict[index] = [ molW, X.aromaticity(), instabIdx, *list(X.get_amino_acids_percent().values()),
                *flex, gravy, *protS, X.isoelectric_point(), X.charge_at_pH(7), X.charge_at_pH(3), X.charge_at_pH(10), *X.molar_extinction_coefficient(),
                *X.secondary_structure_fraction()]
        
    with open(os.path.join(DATA_PATH, "PCDictTest"+".pkl"), 'wb') as f:
        pickle.dump(PCDict, f)
    


Loading presaved data


## Build Dataset

In [15]:
from itertools import product
from tqdm import tqdm
import os
import warnings

TRAIN_VAL_SPLIT = 0.7
k = 3
PCLength = len(PCDict[ids[0]])

allAA = list(aa_dict.keys())
allAA.sort()
allCombinations= list(product(*(allAA for i in range(k))))
allCombinations=np.array([''.join(el) for el in allCombinations])

positionDict = dict(zip(allCombinations, np.arange(0,allCombinations.size).T))

#Use numpy vectorize to speed up the mapping (hopefully)
mapping = lambda x: aa_dict[x]
vectMapping = np.vectorize(mapping)



def generator():
  for idxTrain,seqTrain in enumerate(sequences):
      entryIdTrain = ids[idxTrain]

      kmersTrain = [seqTrain[j:j+k] if j < len(seqTrain)-(k-1) else 0 for j,el in enumerate(seqTrain)]
      kmersTrain = kmersTrain[0:-(k-1)]
      kmersTrain = [str(el) for el in kmersTrain]
      valuesTrain, countsTrain = np.unique(kmersTrain, return_counts=True)
      freqVectorTrain=np.zeros(allCombinations.shape)
      for lTrain,vTrain in enumerate(valuesTrain):
          freqVectorTrain[positionDict[vTrain]] = countsTrain[lTrain]

      with warnings.catch_warnings():
        #supress the warnings for unknown classes
        warnings.simplefilter("ignore")
        if entryIdTrain in iprIds:
          xTrain  = mlbInterpro.transform([iprIds[entryIdTrain]])
        else:
          xTrain  = mlbInterpro.transform([[]])
      

      #Adding all the physiochemical properties (N = 53)
      pcPropsTrain = PCDict[entryIdTrain]

      t5data = t5df.loc[entryIdTrain].to_numpy()
     
      yield (entryIdTrain, np.array(pcPropsTrain),xTrain[0],freqVectorTrain, t5data)




In [16]:
g = generator()
test = next(g)
print("Seq ID: \n{}\n".format(test[0]))
print("PC Input: \n{}\n{}\n".format(test[1].shape, test[1][0:10]))
print("Interpro Input: \n{}\n{}\n".format(test[2].shape, test[2][0:10]))
print("kMer Input: \n{}\n{}\n".format(test[3].shape, test[3][0:20]))
print("t5 Input: \n{}\n{}\n".format(test[4].shape, test[4][0:20]))
print("The first sample has {} Interpro input classes".format(np.count_nonzero(test[1])))
print("The first sample has {} kMer input classes".format(np.count_nonzero(test[2])))

Seq ID: 
Q9CQV8

PC Input: 
(53,)
[2.80861080e+04 7.72357724e-02 4.63687805e+01 8.94308943e-02
 8.13008130e-03 5.28455285e-02 1.26016260e-01 2.43902439e-02
 4.06504065e-02 8.13008130e-03]

Interpro Input: 
(38293,)
[0 0 0 0 0 0 0 0 0 0]

kMer Input: 
(15625,)
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]

t5 Input: 
(1024,)
[ 0.05470492  0.06342026 -0.01531996 -0.01650625  0.04219466  0.02159161
 -0.11853468 -0.06329785 -0.04614646 -0.10231141 -0.05789618 -0.00766989
 -0.04101066  0.04766249  0.04528048 -0.01139968  0.02377699 -0.08064673
 -0.06053895 -0.04856941]

The first sample has 53 Interpro input classes
The first sample has 2 kMer input classes


## Tensorflow Prediction

In [17]:
import tensorflow as tf


dataset = tf.data.Dataset.from_generator(generator, output_signature=(
    tf.TensorSpec(shape=(),dtype=tf.dtypes.string),
    tf.TensorSpec(shape=(PCLength,), dtype=tf.float32),                 #Physiochemical properties
    tf.TensorSpec(shape=(len(mlbInterpro.classes_),), dtype=tf.int32),  #Interpro Classes
    tf.TensorSpec(shape=(allCombinations.shape[0],), dtype=tf.int32),   #kMers
    tf.TensorSpec(shape=(t5Dimension,), dtype=tf.float32)))              #t5

print(list(dataset.take(1)))


2023-08-09 17:16:03.565450: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-09 17:16:03.596570: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-09 17:16:03.596914: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-09 17:16:03.598644: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-09 17:16:03.598963: I tensorflow/compile

[(<tf.Tensor: shape=(), dtype=string, numpy=b'Q9CQV8'>, <tf.Tensor: shape=(53,), dtype=float32, numpy=
array([ 2.80861074e+04,  7.72357732e-02,  4.63687820e+01,  8.94308910e-02,
        8.13008100e-03,  5.28455302e-02,  1.26016259e-01,  2.43902430e-02,
        4.06504050e-02,  8.13008100e-03,  4.06504050e-02,  8.13008100e-02,
        1.01626016e-01,  3.25203240e-02,  5.69105707e-02,  1.21951215e-02,
        6.09756112e-02,  4.06504050e-02,  7.72357732e-02,  4.87804860e-02,
        4.47154455e-02,  8.13008100e-03,  4.47154455e-02,  1.02064288e+00,
        1.01961899e+00,  9.99678552e-01,  1.04596424e+00,  9.68892872e-01,
        1.03008330e+00,  1.05335712e+00,  1.01052380e+00,  1.02227378e+00,
        1.05423808e+00, -6.93495929e-01,  9.85148525e+00,  9.86138630e+00,
        1.03762379e+01,  1.08811884e+01,  1.02277231e+01,  1.03267326e+01,
        1.03663368e+01,  1.01089106e+01,  1.01881189e+01,  9.28712845e+00,
        4.77250862e+00, -1.42744741e+01,  3.06518860e+01, -3.24159012e+0

In [18]:
CCOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_CCO_epoch_13_valF1Score0.5785"))
MFOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_MFO_epoch_18_valF1Score0.5756"))
BPOmodel = tf.keras.saving.load_model(os.path.join(DATA_PATH, "model_BPO_epoch_18_valF1Score0.3447"))



In [19]:

# probs= CCOmodel.predict(tf.expand_dims(list(dataset.take(64))[0][1], 0))
# prediction= [1 if p > 0.5 else 0 for p in probs[0]]
# probabilities= probs[probs>0.5]
# # classes = np.argwhere(prediction)
# print(mlb.inverse_transform(np.array([prediction])))
# print(probabilities)

BATCHSIZE=512

batchedDataset = dataset.batch(BATCHSIZE)
tableData=[]

# for entries, data in tqdm(batchedDataset):
for entries, x_batch_trainPC, x_batch_trainIP, x_batch_trainKmer, x_batch_trainT5 in tqdm(batchedDataset, total=int(np.ceil(len(sequences)/BATCHSIZE))):

    probsCCO = CCOmodel.predict_on_batch((x_batch_trainPC, x_batch_trainIP, x_batch_trainKmer, x_batch_trainT5 ))
    probsMFO = MFOmodel.predict_on_batch((x_batch_trainPC, x_batch_trainIP, x_batch_trainKmer, x_batch_trainT5 ))
    probsBPO = BPOmodel.predict_on_batch((x_batch_trainPC, x_batch_trainIP, x_batch_trainKmer, x_batch_trainT5 ))

    for i,prob in enumerate(probsCCO):
        prediction = np.where(probsCCO[i] > 0.5, 1, 0)
        # prediction= [1 if p > 0.5 else 0 for p in prob]
        probabilities= prob[prob>0.5]
        entry = entries[i]
        GOs = mlbCco.inverse_transform(np.array([prediction]))
        for j,g in enumerate(GOs[0]):
            tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

    for i,prob in enumerate(probsMFO):
        prediction = np.where(probsMFO[i] > 0.5, 1, 0)
        # prediction= [1 if p > 0.5 else 0 for p in prob]
        probabilities= prob[prob>0.5]
        entry = entries[i]
        GOs = mlbMfo.inverse_transform(np.array([prediction]))
        for j,g in enumerate(GOs[0]):
            tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

    for i,prob in enumerate(probsBPO):
        prediction = np.where(probsBPO[i] > 0.5, 1, 0)
        # prediction= [1 if p > 0.5 else 0 for p in prob]
        probabilities= prob[prob>0.5]
        entry = entries[i]
        GOs = mlbBpo.inverse_transform(np.array([prediction]))
        for j,g in enumerate(GOs[0]):
            tableData.append([entry.numpy().decode("utf-8") , g, probabilities[j]])

        
results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])


100%|██████████| 278/278.0 [05:43<00:00,  1.24s/it]


In [20]:
results.to_csv(os.path.join(DATA_PATH, "submissionMultiModal.tsv"), sep="\t", header=False, index=False)