In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import tensorflow as tf
import pandas as pd

# Load environment variables from a .env file before reading the configuration.
load_dotenv(override=True)

# Root directory of the dataset; later cells join relative file names onto it.
DATA_PATH = os.getenv('DATA_PATH')
if DATA_PATH is None:
    # Fail here with a clear message instead of a TypeError from
    # os.path.join(None, ...) in a later cell.
    raise RuntimeError(
        "DATA_PATH is not set; define it in the environment or in a .env file."
    )
print(DATA_PATH)

## Reading fasta, obo and tsv files

In [None]:
from Bio import SeqIO

# Parse the training FASTA file once and collect sequences and record ids in
# parallel lists (same order, same length), instead of reading the file twice.
train_fasta_path = os.path.join(DATA_PATH, "Train/train_sequences.fasta")
sequences = []
ids = []
for rec in SeqIO.parse(train_fasta_path, "fasta"):
    sequences.append(rec.seq)
    ids.append(rec.id)

In [None]:
# Report how many sequence records were collected from the FASTA file.
print("There are {} sequences in the dataset.".format(len(sequences)))

In [None]:
import networkx
import obonet

# Load the ontology file (go-basic.obo) from the data directory into a graph.
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)

# Size of the ontology: node count first, then edge count.
print(graph.number_of_nodes())
print(graph.number_of_edges())

# Verify that the ontology graph is a directed acyclic graph.
print(networkx.is_directed_acyclic_graph(graph))


In [None]:
# Annotation table read from train_terms.tsv (tab separated); the cells below
# use its "EntryID" and "term" columns.
df = pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')

term_column = df["term"]
uniqueTerms = term_column.unique()
termsArr = term_column.to_numpy().tolist()

# Integer token per distinct term, numbered in order of first appearance.
uniqueTermsDict = {term: token for token, term in enumerate(uniqueTerms)}

# Token of every annotation row, stored alongside the term itself.
termToken = list(map(uniqueTermsDict.__getitem__, termsArr))
df["termToken"] = termToken
df.head(10)

In [None]:
# Dimensions of the annotation table as (rows, columns).
df.shape

Test for the first entry:

In [None]:
# All annotation rows belonging to the example protein.
df[df["EntryID"].eq("A0A009IHW8")]

## GO analysis

In [None]:
# Number of annotation rows per GO term.
item_counts = df["term"].value_counts()
print(item_counts)

In [None]:
# Build both lookup tables in one pass over the ontology nodes.
# id_to_name covers every node (value is None when a node has no 'name');
# name_to_id only contains nodes that actually carry a 'name'.
id_to_name = {}
name_to_id = {}
for node_id, node_attrs in graph.nodes(data=True):
    id_to_name[node_id] = node_attrs.get('name')
    if 'name' in node_attrs:
        name_to_id[node_attrs['name']] = node_id

# Show the names of three example terms.
for example_id in ('GO:0005575', 'GO:0008150', 'GO:0110165'):
    print(id_to_name[example_id])

In [None]:
# Inspect a single term: its name, the nodes that can reach it
# (networkx "ancestors") and the nodes reachable from it ("descendants").
# NOTE(review): these are graph-direction notions; presumably edges point
# from a term to its parent term, so "descendants" would be the more general
# terms — confirm against the obonet documentation.
print(id_to_name['GO:0042324'])
print(networkx.ancestors(graph, 'GO:0042324'))
print(networkx.descendants(graph, 'GO:0042324'))

# Every simple path from the term to the node named 'molecular_function'.
paths = networkx.all_simple_paths(
    graph, 'GO:0042324', name_to_id['molecular_function']
)

for path in paths:
    names_on_path = [id_to_name[node] for node in path]
    print('•', ' ⟶ '.join(names_on_path))

In [None]:
# Terms annotated for the example protein, printed as [name, id] pairs.
allGOs = df.loc[df['EntryID'] == "A0A009IHW8", "term"].to_numpy()
print([[id_to_name[go_id], go_id] for go_id in allGOs])

### Find GOs without ancestors

In [None]:
# Topologically ordered list of all ontology nodes.
sortedGOs = list(networkx.topological_sort(graph))

# Nodes without incoming edges, i.e. exactly the nodes for which
# networkx.ancestors() is empty. Filtering on in-degree does not rely on all
# such nodes appearing first in the topological order, which a topological
# sort does not guarantee. Topological order is kept for the result.
# NOTE(review): "ancestors" here is the graph-direction notion; if edges point
# from a term to its parent, these are the most specific terms rather than the
# ontology roots — confirm which one is intended.
rootGOs = [go_id for go_id in sortedGOs if graph.in_degree(go_id) == 0]

print(rootGOs)
print(len(rootGOs))

In [None]:
# Spot-check one node further down the topological order:
# first its ancestor set, then its name.
sample_go = sortedGOs[1000]
print(networkx.ancestors(graph, sample_go))
print(id_to_name[sample_go])

### How many of them are used in our dataset?

In [None]:
# Terms that occur both in the dataset and in rootGOs (sorted, de-duplicated).
dataRootGOs = np.intersect1d(uniqueTerms, rootGOs)
print(dataRootGOs.size)

## Label encoding

The task is a multilabel classification problem: each protein can have several targets (Gene Ontology terms), and each target is either 1 (present) or 0 (absent).

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on a single "sample" holding every term token, so that it
# learns the complete set of label classes.
mlb = MultiLabelBinarizer().fit([termToken])

# Encode the labels of the example protein as a multi-hot row.
dftest = df[df['EntryID'] == "A0A009IHW8"]
indices = dftest["termToken"].to_numpy()
print(indices)
print(mlb.transform([indices]))

## Amino acids coding

In [None]:
# One-letter amino-acid codes used in this notebook, with 'X' (unknown) last.
aminos_list = list('ACDEFGHIKLMNOPQRSTUVWYX')

- A: Alanine
- C: Cysteine
- D: Aspartic acid
- E: Glutamic acid
- F: Phenylalanine
- G: Glycine
- H: Histidine
- I: Isoleucine
- K: Lysine
- L: Leucine
- M: Methionine
- N: Asparagine
- O: Pyrrolysine
- P: Proline
- Q: Glutamine
- R: Arginine
- S: Serine
- T: Threonine
- U: Selenocysteine
- V: Valine
- W: Tryptophan
- Y: Tyrosine
- X: unknown

In [None]:
# Integer code for each one-letter amino-acid symbol. Codes start at 1, so 0
# is not assigned to any residue. 'O', 'U' and 'X' carry the codes 21-23.
aa_dict = {
    'A': 1, 'C': 2, 'D': 3, 'E': 4,
    'F': 5, 'G': 6, 'H': 7, 'I': 8,
    'K': 9, 'L': 10, 'M': 11, 'N': 12,
    'O': 21,
    'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17,
    'U': 22,
    'V': 18, 'W': 19, 'Y': 20,
    'X': 23,
}

## Build Dataset

In [None]:
# Length of every sequence; the maximum defines the padded input width.
seqLengths = list(map(len, sequences))
maxLen = max(seqLengths)
print("The max. length of the sequences is {}".format(maxLen))

In [None]:
#Use numpy vectorize to speed up the mapping (hopefully)
# Per-residue lookup into aa_dict, wrapped with np.vectorize so that it can be
# applied to a whole character array in a single call. A symbol that is not a
# key of aa_dict raises KeyError.
mapping = lambda x: aa_dict[x]
vectMapping = np.vectorize(mapping)


def generator():
    """Yield one ``(padded_sequence, label_vector)`` pair per training sequence.

    Reads the notebook globals ``sequences``, ``ids``, ``df``, ``mlb`` and
    ``maxLen``.

    Yields:
        tuple: ``(paddedArr, labels)`` where ``paddedArr`` is the
        integer-encoded sequence, zero-padded on the right to ``maxLen``, and
        ``labels`` is the single row produced by ``mlb.transform`` for the
        term tokens of that entry. An entry without any annotation rows is
        passed to ``mlb.transform`` with an empty token list.
    """
    # Group the row positions of the annotation table by EntryID once per
    # pass, instead of scanning the whole table for every single sequence.
    rows_by_entry = df.groupby('EntryID').indices
    all_tokens = df["termToken"].to_numpy()
    no_rows = np.empty(0, dtype=np.intp)

    for entryId, seq in zip(ids, sequences):
        indices = all_tokens[rows_by_entry.get(entryId, no_rows)]

        y = mlb.transform([indices])

        # NOTE(review): assumes np.array(seq) yields one character per
        # residue — confirm with the installed Biopython version.
        arr = np.array(seq)
        mappedArr = vectMapping(arr)
        # Right-pad with zeros up to the longest sequence length.
        padWidth = maxLen - arr.size
        paddedArr = np.pad(mappedArr, (0, padWidth))
        yield paddedArr, y[0]

        

In [None]:
# Pull the first sample from a fresh generator and summarise it.
g = generator()
test = next(g)
first_sequence, first_labels = test
print("The first (padded) sample sequence: {}".format(first_sequence))
print("The first sample has {} classes".format(np.count_nonzero(first_labels)))

## Basic classification

In [None]:
# Take the next 11 samples from the already-started generator `g`
# as a tiny dataset for a quick baseline.
X, y = [], []
for i, (features, labels) in zip(range(11), g):
    X.append(features)
    y.append(labels)

In [None]:
# Stack the collected samples into arrays and show their shapes.
X, y = np.array(X), np.array(y)
print(X.shape, y.shape, sep="\n")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default settings on the small subset.
clf = RandomForestClassifier().fit(X, y)

# Score on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X, y))

In [None]:
# Decision path through the forest for the first sample (passed as one row).
clf.decision_path(X[:1])

In [None]:
# Predicted label vector for the first sample (passed as one row).
clf.predict(X[:1])

## TensorFlow Classification

In [None]:
import tensorflow as tf


# Stream samples from `generator`: one padded sequence of length maxLen and
# one label vector with an entry per distinct term, both declared as int32.
sequence_spec = tf.TensorSpec(shape=(maxLen,), dtype=tf.int32)
label_spec = tf.TensorSpec(shape=(uniqueTerms.size,), dtype=tf.int32)

dataset = tf.data.Dataset.from_generator(
    generator,
    output_signature=(sequence_spec, label_spec),
)
list(dataset.take(1))

In [None]:
# 1D convolutional network over the padded sequence: one stride-1 convolution,
# then twelve stride-2 convolutions (the last eleven are identical), followed
# by a dense output with one unit per distinct term.
#
# The targets are multi-hot vectors (several terms can be active for one
# sample), so each output unit gets its own sigmoid and the loss is binary
# cross-entropy. Softmax with categorical cross-entropy would force the
# outputs to compete for a total probability of 1, which only fits
# single-label problems.
model = tf.keras.Sequential(
    [
        tf.keras.Input(shape=(maxLen, 1)),
        tf.keras.layers.Conv1D(3, 5, activation=tf.keras.activations.relu),
        tf.keras.layers.Conv1D(5, 5, strides=2, activation=tf.keras.activations.relu),
        *[
            tf.keras.layers.Conv1D(7, 5, strides=2, activation=tf.keras.activations.relu)
            for _ in range(11)
        ],
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(uniqueTerms.size, activation="sigmoid"),
    ]
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
)
model.summary()

In [None]:
# Report how many GPUs TensorFlow can see before starting the training run.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Train for a single epoch on batches of 32 samples.
batchedDataset = dataset.batch(batch_size=32)
model.fit(batchedDataset, epochs=1)