In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import tensorflow as tf
import pandas as pd
import datetime

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
# Optional (disabled): enable on-demand GPU memory growth for the first GPU.
# try:
#   tf.config.experimental.set_memory_growth(physical_devices[0], True)
# except:
#   # Invalid device or cannot modify virtual devices once initialized.
#   pass

# Load variables from a .env file; override=True lets the file win over
# variables that are already present in the process environment.
load_dotenv(override=True)

# Root directory of the data set. Every later cell joins paths onto it, so fail
# here with a clear message instead of a TypeError inside os.path.join later on.
DATA_PATH = os.getenv('DATA_PATH')
if not DATA_PATH:
    raise RuntimeError("DATA_PATH is not set; define it in the environment or in a .env file")
print(DATA_PATH)

# Choose subontology (CCO, MFO or BPO)
SO = 'BPO'

## Reading fasta, obo and tsv files

In [None]:
from Bio import SeqIO

# Parse the FASTA file once and derive both lists from the same records, so that
# sequences[i] and ids[i] always describe the same entry and the file is not read twice.
fastaPath = os.path.join(DATA_PATH, "Train/train_sequences.fasta")
fastaRecords = list(SeqIO.parse(fastaPath, "fasta"))
sequences = [rec.seq for rec in fastaRecords]
ids = [rec.id for rec in fastaRecords]

In [None]:
import networkx
import obonet

# Read the Gene Ontology graph from the local go-basic.obo file.
# Despite its name, `url` is a filesystem path below DATA_PATH, not a web address.
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)



In [None]:
# Load the full annotation table and keep only the rows of the selected subontology.
df = pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')
dfSO = df.loc[df["aspect"]==SO]

# Distinct GO terms in order of first appearance, and the term of every row.
uniqueTerms = dfSO["term"].unique()
termsArr = dfSO["term"].tolist()

# Position of each distinct term inside uniqueTerms.
uniqueTermsDict = dict(zip(uniqueTerms, range(len(uniqueTerms))))

print(dfSO.shape)

# From here on `df` is the subontology table (same object as dfSO), keyed by EntryID.
df = dfSO
df.set_index("EntryID", inplace=True)

In [None]:
# First EntryID of the subontology table; used as a quick example further down.
testID = df.index[0]

## GO analysis

In [None]:
# Number of annotation rows per GO term, most frequent term first.
# The order of this index is reused later to align terms with their label weights.
item_counts = df["term"].value_counts()

In [None]:
# Lookup tables between GO ids and their human-readable names.
# id_to_name covers every node (value is None when a node has no 'name');
# name_to_id only covers nodes that actually carry a 'name' attribute.
id_to_name = dict((node, attrs.get('name')) for node, attrs in graph.nodes(data=True))
name_to_id = dict((attrs['name'], node) for node, attrs in graph.nodes(data=True) if 'name' in attrs)

## Label encoding

The task is a multilabel classification: each protein can carry several target labels (Gene Ontology terms), and each label is either 1 (present) or 0 (absent).

Extract the label weight of every GO term from the information accretion file (`IA.txt`).

In [None]:
# IA.txt has no header row: column 0 holds the GO term id and becomes the index,
# the remaining column holds the weight of that term.
dfIa = pd.read_csv(os.path.join(DATA_PATH, "IA.txt"), sep='\t', header=None)
dfIa.set_index(0, inplace=True)

allIndices = dfIa.index.tolist()

# Hash-based term -> weight table. Testing membership against the plain list above
# would rescan all IA entries for every term (quadratic overall).
iaByTerm = dict(zip(dfIa.index, dfIa.iloc[:, 0]))

# Collect the weights in the order of item_counts so that they stay aligned with
# the term list derived from the same index.
labelWeights = []
notFound = 0
for go in item_counts.index:
    if go in iaByTerm:
        labelWeights.append(iaByTerm[go])
    else:
        # Terms without an IA entry get weight 0.
        notFound += 1
        labelWeights.append(0)

print("Not found GOs: {} (set to 0)".format(notFound))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

# All GO terms of the subontology, most frequent first. labelWeights was built by
# iterating the same index, so position i of both sequences refers to the same term.
topGOs= item_counts
topGOs=topGOs.index.to_list()

#Reduce possible GOs by label weight
threshold=0
labelWeights=np.array(labelWeights)
if labelWeights.shape[0] != len(topGOs):
    # Happens when this cell is re-run: labelWeights was already filtered below.
    raise RuntimeError("labelWeights no longer lines up with item_counts; re-run the IA weights cell first")
selection = labelWeights>threshold
topGOs=np.array(topGOs)[selection]
labelWeights=labelWeights[selection]

# Fit the binarizer on the surviving terms; each term becomes one output column.
mlb = MultiLabelBinarizer()
mlb.fit([topGOs])

# Smoke test on one protein. Selecting with a list always yields a DataFrame, even
# when the protein has a single annotation (a scalar label would yield a Series and
# make ["term"] a plain string without .to_numpy()).
dftest=df.loc[[testID]]
indices = dftest["term"].to_numpy()
print(indices)
print(mlb.transform([indices]))
print(len(mlb.classes_))

# Persist the fitted binarizer next to the data, one file per subontology.
with open(os.path.join(DATA_PATH,'MLB_'+SO+'.pkl'), 'wb') as f:
    pickle.dump(mlb, f)

## Amino acids encoding

In [None]:
# Integer code per amino-acid letter: the 20 standard residues map to 1-20,
# the remaining letters (O, U, Z, B) to 21-24 and X to 30.
# In the cells visible here only the keys are used, as the alphabet for the k-mer features.
aa_dict = {'A': 1, 'B':24, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22, 'V': 18, 'W': 19, 'Y': 20, 'X':30, 'Z':23}

## Build Dataset

In [None]:
# Length of every training sequence; the maximum is printed for reference.
seqLengths = list(map(len, sequences))
maxLen = max(seqLengths)
print("The max. length of the sequences is {}".format(maxLen))

In [None]:

# Reload the complete annotation table (all aspects).
dfAll = pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')

# EntryIDs that have at least one annotation in the selected subontology.
aspectMask = dfAll["aspect"] == SO
soEntries = dfAll.loc[aspectMask]
soEntryIds = soEntries["EntryID"].unique()

print(soEntryIds)

# Key the full table by EntryID for later lookups.
dfAll = dfAll.set_index("EntryID")

In [None]:
import warnings
from itertools import product
from tqdm import tqdm
import os

# Fraction of the shuffled sequences that goes into the training part.
TRAIN_VAL_SPLIT = 0.7
# Length of the k-mers used as features.
k = 3

# Every possible k-mer over the sorted amino-acid alphabet, joined into strings.
allAA = sorted(aa_dict.keys())
allCombinations = np.array([''.join(combo) for combo in product(allAA, repeat=k)])

# k-mer string -> position of that k-mer in the frequency vector.
positionDict = dict(zip(allCombinations, np.arange(0,allCombinations.size)))

#Use numpy vectorize to speed up the mapping (hopefully)
mapping = lambda x: aa_dict[x]
vectMapping = np.vectorize(mapping)

# Shuffle the data (sequences and ids together, with a fixed seed for reproducibility)
import random
random.seed(516213)
c = list(zip(sequences, ids))
random.shuffle(c)
sequencesShuffle, idsShuffle = zip(*c)


#Train Validation Split
split = int(np.floor(len(sequencesShuffle)*TRAIN_VAL_SPLIT))
print(split)
# The validation part starts exactly at `split`; starting at split+1 would silently
# drop the sample at index `split` from both parts.
trainSeq = sequencesShuffle[0:split]
valSeq = sequencesShuffle[split:]
trainIds = idsShuffle[0:split]
valIds = idsShuffle[split:]


def _kmerCounts(seq):
    """Return the k-mer count vector of ``seq``, laid out like ``allCombinations``."""
    # range(len(seq) - k + 1) enumerates exactly the complete windows. Unlike trimming
    # the tail with a negative slice, this also works for k == 1 and for short sequences.
    kmers = [str(seq[start:start + k]) for start in range(len(seq) - k + 1)]
    freqVector = np.zeros(allCombinations.shape)
    values, counts = np.unique(kmers, return_counts=True)
    for value, count in zip(values, counts):
        freqVector[positionDict[value]] = count
    return freqVector


def _termsOf(entryId, annotatedIds):
    """Return the GO terms annotated to ``entryId`` (empty list if it has none)."""
    if entryId not in annotatedIds:
        return []
    # A list selector always returns one value per matching row, even when the
    # protein has a single annotation (a scalar label would return a bare string).
    return df.loc[[entryId], "term"].to_numpy()


def _samples(seqs, entryIds):
    """Yield ``(k-mer count vector, multi-hot label vector)`` for each sequence."""
    # Set lookup instead of scanning the numpy array of ids for every sequence.
    annotatedIds = set(soEntryIds)
    for seq, entryId in zip(seqs, entryIds):
        indices = _termsOf(entryId, annotatedIds)
        with warnings.catch_warnings():
            #supress the warnings for unknown classes
            warnings.simplefilter("ignore")
            y = mlb.transform([indices])
        yield (_kmerCounts(seq), y[0])


def generator():
    """Yield the training samples as ``(freqVector, labels)`` tuples."""
    yield from _samples(trainSeq, trainIds)


def generatorVal():
    """Yield the validation samples as ``(freqVector, labels)`` tuples."""
    yield from _samples(valSeq, valIds)
        

In [None]:
# Pull a single sample from the training generator and inspect it.
g = generator()
test = next(g)
firstFeatures, firstLabels = test[0], test[1]
print("The first sample: \n{}\n{}".format(firstFeatures.shape, firstFeatures[0:100]))
print("The first sample has {} classes".format(np.count_nonzero(firstLabels)))

## One-vs-Rest Classification

In [None]:
# GO term used as the positive class when building X/y in this section.
# NOTE(review): 671 is a hard-coded position in mlb.classes_ with no rationale visible
# here, and it raises IndexError if fewer classes survive the weight filter — confirm.
target = mlb.classes_[671]

def getKmers(seq, kmerSize=None, kmerIndex=None):
    """Count the overlapping k-mers of a sequence.

    Parameters
    ----------
    seq : sequence supporting ``len`` and slicing (str or Bio.Seq)
        The sequence to featurise.
    kmerSize : int, optional
        Window length; defaults to the notebook-level ``k``.
    kmerIndex : mapping of str to int, optional
        Position of every k-mer in the output vector; defaults to the
        notebook-level ``positionDict``.

    Returns
    -------
    numpy.ndarray
        Float vector with one slot per entry of ``kmerIndex`` holding the number
        of occurrences of that k-mer in ``seq``.

    Raises
    ------
    ValueError
        If ``kmerSize`` is smaller than 1.
    KeyError
        If ``seq`` contains a k-mer that is missing from ``kmerIndex``.
    """
    if kmerSize is None:
        kmerSize = k
    if kmerIndex is None:
        kmerIndex = positionDict
    if kmerSize < 1:
        raise ValueError("kmerSize must be at least 1")

    # range(len(seq) - kmerSize + 1) enumerates exactly the complete windows. The
    # previous tail trim `kmers[0:-(k-1)]` emptied the list for k == 1 (slice 0:0).
    windows = [str(seq[start:start + kmerSize]) for start in range(len(seq) - kmerSize + 1)]

    freqVector = np.zeros(len(kmerIndex))
    values, counts = np.unique(windows, return_counts=True)
    for value, count in zip(values, counts):
        freqVector[kmerIndex[value]] = count
    return freqVector


# Build a binary training set for `target`: every positive sequence plus a capped
# number of negatives.
X=[]
y=[]
positiveClassCount=0

# Set lookup instead of scanning the numpy array of ids for every sequence.
annotatedTrainIds = set(soEntryIds)

for seq, entryId in zip(tqdm(trainSeq), trainIds):
    if entryId in annotatedTrainIds:
        # A list selector returns one value per matching row even for a protein with a
        # single annotation (a scalar label would return a bare string there).
        indices = df.loc[[entryId], "term"].to_numpy()
    else:
        indices = []

    if target in indices:
        X.append(getKmers(seq))
        y.append(1)
        positiveClassCount += 1
    elif 0.2*len(y) < positiveClassCount:
        # Negatives are only admitted while positives make up more than 20% of the
        # collected samples, i.e. fewer than four negatives per positive seen so far.
        X.append(getKmers(seq))
        y.append(0)

X=np.array(X)
y=np.array(y)

    

In [None]:

# Build the binary validation set for `target`: every positive sequence plus a capped
# number of negatives.
Xval=[]
yval=[]
positiveClassCountVal=0

# Set lookup instead of scanning the numpy array of ids for every sequence.
annotatedValIds = set(soEntryIds)

for seq, entryId in zip(tqdm(valSeq), valIds):
    if entryId in annotatedValIds:
        # A list selector returns one value per matching row even for a protein with a
        # single annotation (a scalar label would return a bare string there).
        indices = df.loc[[entryId], "term"].to_numpy()
    else:
        indices = []

    if target in indices:
        Xval.append(getKmers(seq))
        yval.append(1)
        positiveClassCountVal += 1
    elif len(yval)/2 < positiveClassCountVal:
        # Negatives are only admitted while positives are the majority of the collected
        # samples, i.e. fewer negatives than positives seen so far.
        # NOTE(review): the training cell uses a 20% rule instead — confirm the
        # different class balance is intended.
        Xval.append(getKmers(seq))
        yval.append(0)

Xval=np.array(Xval)
yval=np.array(yval)

In [None]:
# Shapes and number of positive labels, first for the training and then for the validation set.
for featureMatrix, labelVector in ((X, y), (Xval, yval)):
    print(featureMatrix.shape)
    print(labelVector.shape)
    print(np.count_nonzero(labelVector))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

# Standardise the k-mer counts, then fit a liblinear logistic regression on the
# binary problem built above. Alternatives tried earlier (plain LogisticRegression,
# svm.SVC(probability=True), a small MLPClassifier) are not used.
scaler = preprocessing.StandardScaler()
logisticModel = LogisticRegression(solver="liblinear")
clf = make_pipeline(scaler, logisticModel)
clf.fit(X, y)

# Predicted validation labels, followed by the score reported by the pipeline.
valPredictions = clf.predict(Xval)
print(valPredictions)
print(clf.score(Xval, yval))

In [None]:
def f1Score(yTrue, yPred):
    """Compute precision, recall and F1 for binary labels.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels; any value that is truthy counts as positive.
    yPred : array-like
        Predicted labels, same length as ``yTrue``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. Each value is 0.0 when its denominator
        would be zero (a tiny epsilon keeps the division defined).
    """
    actual = np.asarray(yTrue, dtype="bool")
    predicted = np.asarray(yPred, dtype="bool")

    tp = np.count_nonzero(actual & predicted)
    # False positive: predicted positive but actually negative.
    fp = np.count_nonzero(~actual & predicted)
    # False negative: actually positive but predicted negative.
    fn = np.count_nonzero(actual & ~predicted)

    prec = tp/(tp+fp+1e-20)
    rec = tp/(tp+fn+1e-20)
    f1 = 2*(prec*rec)/(prec+rec+1e-20)
    return prec, rec, f1

In [None]:
# Evaluate the fitted pipeline on the validation set; f1Score returns a (prec, rec, f1) tuple.
f1Score(yval, clf.predict(Xval))

----

In [None]:
# Annotation table restricted to the selected subontology, this time keyed by GO term.
dfGo = pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')
dfGo = dfGo.loc[dfGo["aspect"]==SO]

# Distinct GO terms in order of first appearance, and the term of every row.
termColumn = dfGo["term"]
uniqueTerms = termColumn.unique()
termsArr = termColumn.tolist()

# Position of each distinct term inside uniqueTerms.
uniqueTermsDict = dict(zip(uniqueTerms, range(len(uniqueTerms))))

print(dfGo.shape)

dfGo = dfGo.set_index("term")

In [None]:
# Train one binary classifier per GO term. The three result lists are parallel to
# mlb.classes_: position i always belongs to mlb.classes_[i].
seqDict = dict(zip(ids, sequences))
classifiers = []
scores=[]
f1Scores=[]
# (term, error) pairs of classifiers that could not be fitted.
failedTerms = []

# One seeded generator for the whole loop: the negative samples are reproducible
# across runs but still differ from term to term.
negativeSampler = np.random.RandomState(42)

for goTerm in tqdm(mlb.classes_):
    X=[]
    y=[]

    # A list selector always returns a DataFrame, also for a term with a single row.
    termRows = dfGo.loc[[goTerm]]
    if len(termRows) < 2:
        # Not enough annotated proteins to train on: record a placeholder.
        # NOTE(review): placeholders store a scalar 0 in f1Scores while fitted terms
        # store the (prec, rec, f1) tuple — kept as before, confirm consumers cope.
        classifiers.append(None)
        f1Scores.append(0)
        scores.append(0)
        continue

    # Positives: every sequence annotated with this term.
    relevantSequenceIds = termRows["EntryID"].unique()
    relevantLookup = set(relevantSequenceIds)
    for seqId in relevantSequenceIds:
        X.append(getKmers(seqDict[seqId]))
        y.append(1)

    # Negatives: sequences from randomly drawn annotation rows that are not positives.
    nonRelSeqIds = dfGo.sample(n=len(X), random_state=negativeSampler)["EntryID"].unique()
    for seqId in nonRelSeqIds:
        if seqId in relevantLookup:
            continue
        X.append(getKmers(seqDict[seqId]))
        y.append(0)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    try:
        clf = make_pipeline(preprocessing.StandardScaler(), LogisticRegression(solver="liblinear"))
        clf = clf.fit(X_train, y_train)
    except Exception as err:
        # Fitting can fail, e.g. when the training split contains a single class.
        # Record one placeholder and move on; without the `continue` a second entry
        # would be appended below and the lists would drift out of step with mlb.classes_.
        failedTerms.append((goTerm, repr(err)))
        classifiers.append(None)
        f1Scores.append(0)
        scores.append(0)
        continue

    classifiers.append(clf)
    f1Scores.append(f1Score(y_test, clf.predict(X_test)))
    scores.append(clf.score(X_test, y_test))

print("{} classifiers could not be fitted".format(len(failedTerms)))



In [None]:
import matplotlib.pyplot as plt
# Distribution of the per-term scores on the held-out split; terms that were skipped
# or failed to fit were recorded with a score of 0.
fig, ax = plt.subplots()
ax.hist(scores)
ax.set(
    title="Per-term classifier scores ({})".format(SO),
    xlabel="Score on the held-out split",
    ylabel="Number of GO terms",
)
plt.show()