In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.spatial.distance import pdist, squareform

In [33]:
def resolve_data_dir(environ=os.environ, os_name=os.name):
    """Return the directory that holds the prot2vec data files.

    Parameters
    ----------
    environ : mapping
        Environment to consult; defaults to the process environment.
    os_name : str
        Value of ``os.name`` to branch on; defaults to the running platform.

    Returns
    -------
    str
        The value of ``PROT2VEC_DATA_DIR`` when it is set and non-empty,
        otherwise the original per-platform default location.
    """
    # An explicit override lets the notebook run on machines other than the
    # one whose user-specific paths are hard-coded below.
    override = environ.get("PROT2VEC_DATA_DIR")
    if override:
        return override
    if os_name == 'nt':
        return "D:/OneDrive - FGV/prot2vec"
    return "/Users/torresmateo/OneDrive - FGV/prot2vec"


data_dir = resolve_data_dir()
    

In [34]:
# Settings identifying which trained model's representations are loaded.
dim = 256
activation = 'sigmoid'
model_name = f'SiameseSimilarityNet-{activation}-{dim}'
model_filename =  f"[SiameseSimilarityNet-{activation}-{dim}]-200_epochs.pt-representations.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
representations_path = os.path.join(data_dir, model_filename)
vecs = pd.read_pickle(representations_path)

# One column name per vector component; the component count is taken to be
# dim // 4 (the DataFrame constructor below fails if the vectors disagree).
n_components = dim // 4
features = [f'prot2vec_{i}' for i in range(n_components)]

# Spread each entry of the 'vector' column over the feature columns, reusing
# the index of `vecs` so rows stay aligned, then keep identifiers + features.
vector_columns = pd.DataFrame(vecs['vector'].to_list(), index=vecs.index, columns=features)
vecs = pd.concat([vecs, vector_columns], axis=1)[['protein', 'set', *features]]

In [35]:
# Load the annotation table and attach each protein's split label.
annotations_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.BP')
annotations = pd.read_table(annotations_path)
# Replace whatever column labels read_table inferred with fixed names.
annotations.columns = ['protein', 'goterm']
# Inner join on the shared 'protein' column: only proteins present in `vecs`
# survive, and each row gains the matching 'set' value.
annotations = pd.merge(annotations, vecs[['protein', 'set']], on='protein')

In [36]:
# Number of distinct proteins left after joining annotations with `vecs`.
annotations['protein'].unique().shape

(2252,)

In [37]:
# NOTE: `condition` is not used in this cell; it is reassigned from `dataset`
# before its first use further down. Kept so the notebook namespace is unchanged.
condition = annotations['set'].isin(['train', 'validation'])

# Build a protein x GO-term indicator table: 1 where the pair is annotated,
# 0 elsewhere, with 'protein' restored as an ordinary column.
#  - drop_duplicates(): pivot raises on repeated (protein, goterm) pairs.
#  - assign(): adds the marker column on a fresh frame instead of writing
#    into a slice of `annotations` (avoids SettingWithCopyWarning).
#  - keyword arguments: positional pivot arguments are rejected by pandas 2.x.
y = (
    annotations[['protein', 'goterm']]
    .drop_duplicates()
    .assign(value=1)
    .pivot(index='protein', columns='goterm', values='value')
    .fillna(0)
    .reset_index()
)

In [38]:
# Label columns are every column of `y` other than the 'protein' identifier.
is_label_column = y.columns != 'protein'
classes = y.columns[is_label_column].to_numpy()

In [39]:
# Combine features and labels into one table; 'protein' is the only column
# the two frames share, so it is named explicitly as the join key.
dataset = pd.merge(vecs, y, on='protein')

In [40]:
# Show the merged table: protein id, split label, feature columns, then one
# indicator column per GO term.
dataset

Unnamed: 0,protein,set,prot2vec_0,prot2vec_1,prot2vec_2,prot2vec_3,prot2vec_4,prot2vec_5,prot2vec_6,prot2vec_7,...,GO:2001023,GO:2001057,GO:2001060,GO:2001061,GO:2001124,GO:2001125,GO:2001141,GO:2001251,GO:2001313,GO:2001315
0,C1P607,train,4.822112e-10,2.327805e-05,3.053414e-01,4.172926e-03,1.265453e-05,2.597197e-06,1.881742e-04,5.898568e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1P615,train,4.822112e-10,2.327805e-05,3.053414e-01,4.172926e-03,1.265453e-05,2.597197e-06,1.881742e-04,5.898568e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O32528,train,3.741143e-07,1.090778e-05,1.076940e-02,5.383392e-08,1.313031e-06,1.197079e-06,3.520212e-05,4.684681e-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P00350,train,7.609263e-03,1.258004e-07,7.506846e-06,1.446274e-01,1.699214e-01,1.007646e-01,1.744034e-01,4.666566e-09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P00363,train,4.521328e-11,1.419386e-01,1.295210e-01,4.913647e-03,2.015503e-01,1.182718e-05,2.046343e-08,4.569843e-12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Q47149,test,1.575696e-09,2.104914e-01,1.272206e-04,2.393070e-02,1.542201e-03,2.149793e-01,8.950282e-03,1.345122e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2248,Q47538,test,2.770949e-05,2.457063e-08,7.028402e-08,1.087332e-06,4.894130e-07,3.883532e-07,3.917785e-01,1.630807e-09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2249,Q47689,test,5.096305e-04,2.753295e-01,9.723391e-06,7.553197e-09,2.383339e-07,1.564173e-05,3.526196e-04,5.370395e-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2250,Q57261,test,1.486540e-09,1.102134e-07,3.935691e-04,8.539056e-04,3.082948e-05,3.037853e-06,7.858851e-08,2.436093e-04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Pairwise cosine distances between all rows of the feature matrix, expanded
# from pdist's condensed form into a full square matrix.
feature_matrix = dataset[features].to_numpy()
condensed_distances = pdist(feature_matrix, metric='cosine')
distances = squareform(condensed_distances)

In [42]:
# Train-vs-train block of the distance matrix: the same train/validation mask
# selects both the rows and the columns.
train_rows = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_train = distances[train_rows][:, train_rows]

In [43]:
# Test-vs-train block of the distance matrix: rows are the test proteins,
# columns are the train/validation proteins.
test_rows = dataset['set'].isin(['test']).to_numpy()
train_cols = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_test = distances[test_rows][:, train_cols]

In [44]:
# Sanity check: both blocks must have one column per training protein.
tuple(block.shape for block in (X_dists_train, X_dists_test))

((1803, 1803), (449, 1803))

In [45]:
# Row masks over `dataset`. The test mask is spelled out as set == 'test'
# (instead of "everything that is not train/validation") so that the rows of
# X_test / y_test are exactly the rows used to build X_dists_test; with any
# other label in the 'set' column the two selections would otherwise differ
# and y_test would no longer line up with the predictions.
condition = dataset['set'].isin(['train', 'validation'])
is_test = dataset['set'].isin(['test'])

# Feature matrices and integer label matrices for each split.
X_train = dataset.loc[condition, features].values
y_train = dataset.loc[condition, classes].values.astype(int)
X_test = dataset.loc[is_test, features].values
y_test = dataset.loc[is_test, classes].values.astype(int)

In [46]:
# Sanity check: row counts must agree between each X / y pair.
tuple(arr.shape for arr in (X_train, X_test, y_train, y_test))

((1803, 64), (449, 64), (1803, 2701), (449, 2701))

In [47]:
from models import MultiLabelKNeighborsClassifier

In [48]:
# Project-local multi-label k-NN classifier. It is fitted and queried below
# with distance matrices rather than raw feature vectors.
# NOTE(review): metric='precomputed' presumably follows the scikit-learn
# convention for precomputed distances; confirm in models.knn.mlknn.
m = MultiLabelKNeighborsClassifier(metric='precomputed')

In [49]:
# Fit on the train-vs-train distance block and the matching label matrix.
# The value returned by fit() is displayed as the cell output.
m.fit(X_dists_train, y_train)

<models.knn.mlknn.MultiLabelKNeighborsClassifier at 0x24108cc8400>

In [50]:
# Query the fitted classifier with the test-vs-train distance block.
# NOTE(review): presumably predict gives hard label assignments and
# predict_proba gives per-label scores; both results expose .todense(),
# which the next cell relies on.
y_pred = m.predict(X_dists_test)
y_proba = m.predict_proba(X_dists_test)

In [51]:
def _dense_ndarray(result):
    """Return `result` as a plain numpy ndarray.

    Accepts a sparse container exposing ``toarray()`` or ``todense()``, a
    ``numpy.matrix``, or an ndarray. An ndarray passes through unchanged, so
    the cell can be re-run without raising AttributeError on the already
    converted arrays.
    """
    if hasattr(result, 'toarray'):
        # Goes straight to an ndarray, skipping the np.matrix intermediate.
        return result.toarray()
    if hasattr(result, 'todense'):
        return np.asarray(result.todense())
    return np.asarray(result)


# Convert the classifier outputs to dense ndarrays for the metric code.
y_pred = _dense_ndarray(y_pred)
y_proba = _dense_ndarray(y_proba)

In [52]:
# Load the information-content table stored next to the annotation file.
ic_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.IC')
information_content = pd.read_table(ic_path)

In [53]:
# Index the information-content table by GO term so rows can be selected by
# label. The index-name check plus rebinding (instead of inplace=True) keeps
# the cell safe to re-run: a second in-place set_index would raise KeyError
# because 'goterm' is no longer a column. A missing 'goterm' column on the
# first run still raises, as before.
if information_content.index.name != 'goterm':
    information_content = information_content.set_index('goterm')

In [54]:
# Pick the information-content rows in the same order as the label columns
# in `classes`, then flatten the selection to a 1-D array.
ic_for_classes = information_content.loc[classes]
ic_sorted = ic_for_classes.to_numpy().flatten()

In [55]:
from evaluation.metrics import get_metrics

In [56]:
# Project helper from evaluation.metrics, called with the true test labels,
# the dense predicted scores and the information-content vector ordered like
# `classes`. The result is used below as a DataFrame with a 'metric_type'
# column.
metrics = get_metrics(y_test, y_proba, ic_sorted)

Output()

Output()

In [57]:
# Scores to report, averaged within each 'metric_type' group.
eval_metrics = ['f_max', 's_min', 'auc_roc', 'auc_pr']
selected_columns = [*eval_metrics, 'metric_type']
evaluation = metrics[selected_columns]
ev = evaluation.groupby(by='metric_type').mean()

In [58]:
# Tag every summary row with the model it belongs to.
ev = ev.assign(model=model_name)

In [59]:
# Start an empty collection of per-model summaries. Running this cell again
# discards anything collected so far.
evaluations = list()

In [60]:
# Add this model's summary to the collection and rebuild the comparison table.
# Re-running the cell adds the same summary a second time.
evaluations += [ev]
df = pd.concat(evaluations, axis=0)

In [61]:
# Show the comparison table: mean of each reported score per metric_type,
# labelled with the model name.
df

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.3922,0.075591,0.89503,0.258772,SiameseSimilarityNet-sigmoid-256
per_term,0.02311,13.413958,0.318273,0.005677,SiameseSimilarityNet-sigmoid-256
