In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.spatial.distance import pdist, squareform

In [2]:
# Root directory holding the prot2vec data files.
# Set the PROT2VEC_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset (or empty) the original per-OS defaults are used.
data_dir = os.environ.get("PROT2VEC_DATA_DIR")
if not data_dir:
    if os.name == 'nt':
        data_dir = "D:/OneDrive - FGV/prot2vec"
    else:
        data_dir = "/Users/torresmateo/OneDrive - FGV/prot2vec"
    

In [3]:
# Load the stored protein representations and expand the per-row 'vector'
# entry into one named column per embedding dimension.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
representations_path = os.path.join(data_dir, "[SiameseSimilarityNet-relu]-200_epochs.pt-representations.pkl")
vecs = pd.read_pickle(representations_path)

# Column names for the 256 embedding dimensions.
features = [f'prot2vec_{i}' for i in range(256)]

# Build the expanded columns on the same index so the concat aligns row by row,
# then keep only the identifier columns plus the embedding columns.
embedding_frame = pd.DataFrame(vecs['vector'].to_list(), index=vecs.index, columns=features)
vecs = pd.concat([vecs, embedding_frame], axis=1)[['protein', 'set'] + features]

In [4]:
# Inspect the reshaped table: 'protein' and 'set' followed by the embedding columns.
vecs

Unnamed: 0,protein,set,prot2vec_0,prot2vec_1,prot2vec_2,prot2vec_3,prot2vec_4,prot2vec_5,prot2vec_6,prot2vec_7,...,prot2vec_246,prot2vec_247,prot2vec_248,prot2vec_249,prot2vec_250,prot2vec_251,prot2vec_252,prot2vec_253,prot2vec_254,prot2vec_255
0,C1P607,train,0.000000,0.000000,0.028082,0.000000,0.000000,0.000000,0.050795,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.044362,0.000000,0.000000
1,C1P615,train,0.000000,0.000000,0.028082,0.000000,0.000000,0.000000,0.050795,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.044362,0.000000,0.000000
2,O32528,train,0.002485,0.006133,0.055946,0.002672,0.019455,0.014974,0.029565,0.000080,...,0.019311,0.000000,0.0,0.001157,0.000000,0.000000,0.000000,0.079693,0.000385,0.000000
3,P00350,train,0.000000,0.003082,0.036483,0.002623,0.000000,0.000000,0.008351,0.004073,...,0.000000,0.001257,0.0,0.001385,0.029654,0.000000,0.000000,0.000000,0.000790,0.000000
4,P00363,train,0.000000,0.006020,0.039784,0.008944,0.006469,0.000000,0.000000,0.009950,...,0.000000,0.010622,0.0,0.003226,0.070484,0.000000,0.000000,0.000000,0.003497,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,Q47149,test,0.000831,0.008667,0.000000,0.013156,0.018391,0.000000,0.000000,0.006815,...,0.000000,0.011543,0.0,0.004399,0.000000,0.000000,0.000000,0.010090,0.005265,0.000000
2268,Q47538,test,0.004285,0.011408,0.000000,0.008088,0.052936,0.066988,0.000000,0.018088,...,0.000000,0.026626,0.0,0.013497,0.062536,0.015671,0.022751,0.000000,0.023180,0.000000
2269,Q47689,test,0.000943,0.005582,0.011060,0.006834,0.004057,0.034293,0.009787,0.008432,...,0.005284,0.000000,0.0,0.005779,0.018987,0.000000,0.038761,0.000456,0.001749,0.000000
2270,Q57261,test,0.000405,0.003913,0.027630,0.005108,0.004517,0.081702,0.014784,0.000000,...,0.027928,0.000000,0.0,0.000597,0.000000,0.000000,0.000000,0.062300,0.000000,0.000000


In [5]:
# Read the protein -> GO term annotation pairs and attach each protein's
# split label ('set') from the representation table.
# NOTE(review): read_table consumes the first line as a header before the
# column names are overwritten; confirm the file really has a header row,
# otherwise its first annotation is silently dropped.
annotations_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.BP')
annotations = pd.read_table(annotations_path)
annotations.columns = ['protein', 'goterm']

# Inner join on the shared 'protein' column: annotations for proteins without
# a representation are discarded.
protein_sets = vecs[['protein', 'set']]
annotations = annotations.merge(protein_sets, on='protein', how='inner')

In [6]:
# Shape of the array of distinct proteins that still have annotations after the merge.
annotations.protein.unique().shape

(2252,)

In [7]:
# Build the protein x GO-term indicator table: 1 where the protein is
# annotated with the term, 0 elsewhere. 'protein' ends up as a regular column.
#
# - .assign() works on a fresh frame, so no value is written into a slice of
#   `annotations` (the original `y['value'] = 1` raised SettingWithCopyWarning).
# - pivot() is called with keyword arguments; pandas 2.x no longer accepts
#   them positionally.
# - drop_duplicates() keeps pivot() from failing on repeated (protein, goterm)
#   pairs.
# The train/validation mask that used to be computed here was never used in
# this cell; it is defined later, right where the split happens.
y = (
    annotations[['protein', 'goterm']]
    .drop_duplicates()
    .assign(value=1)
    .pivot(index='protein', columns='goterm', values='value')
    .fillna(0)
    .reset_index()
)

In [8]:
# Every column of the label table except the 'protein' identifier is a GO-term class.
classes = y.columns[y.columns != 'protein'].to_numpy()

In [9]:
# Join representations with their label rows on the shared 'protein' column.
# The join is inner, so proteins without any annotation are left out.
dataset = pd.merge(vecs, y, on='protein', how='inner')

In [10]:
# Inspect the merged table: identifier columns, embedding columns, then one column per GO term.
dataset

Unnamed: 0,protein,set,prot2vec_0,prot2vec_1,prot2vec_2,prot2vec_3,prot2vec_4,prot2vec_5,prot2vec_6,prot2vec_7,...,GO:2001023,GO:2001057,GO:2001060,GO:2001061,GO:2001124,GO:2001125,GO:2001141,GO:2001251,GO:2001313,GO:2001315
0,C1P607,train,0.000000,0.000000,0.028082,0.000000,0.000000,0.000000,0.050795,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1P615,train,0.000000,0.000000,0.028082,0.000000,0.000000,0.000000,0.050795,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O32528,train,0.002485,0.006133,0.055946,0.002672,0.019455,0.014974,0.029565,0.000080,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P00350,train,0.000000,0.003082,0.036483,0.002623,0.000000,0.000000,0.008351,0.004073,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P00363,train,0.000000,0.006020,0.039784,0.008944,0.006469,0.000000,0.000000,0.009950,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Q47149,test,0.000831,0.008667,0.000000,0.013156,0.018391,0.000000,0.000000,0.006815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2248,Q47538,test,0.004285,0.011408,0.000000,0.008088,0.052936,0.066988,0.000000,0.018088,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2249,Q47689,test,0.000943,0.005582,0.011060,0.006834,0.004057,0.034293,0.009787,0.008432,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2250,Q57261,test,0.000405,0.003913,0.027630,0.005108,0.004517,0.081702,0.014784,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Pairwise cosine distances between all rows of the dataset, as a full
# square matrix (pdist yields the condensed form, squareform expands it).
# NOTE(review): cosine distance is undefined (NaN) for an all-zero row; if the
# embeddings can be all zero, consider asserting
# `not np.isnan(distances).any()` here.
embedding_matrix = dataset[features].to_numpy()
condensed_distances = pdist(embedding_matrix, metric='cosine')
distances = squareform(condensed_distances)

In [12]:
# Train-vs-train block of the distance matrix: rows and columns are both
# restricted to proteins in the 'train' or 'validation' sets.
fit_rows = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_train = distances[np.ix_(fit_rows, fit_rows)]

In [13]:
# Test-vs-train block of the distance matrix: one row per 'test' protein,
# one column per 'train'/'validation' protein.
query_rows = dataset['set'].isin(['test']).to_numpy()
reference_cols = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_test = distances[np.ix_(query_rows, reference_cols)]

In [14]:
# Sanity check: the train block is square and the test block has the same number of columns.
X_dists_train.shape, X_dists_test.shape

((1803, 1803), (449, 1803))

In [15]:
# Split features and labels by the 'set' column.
# The test rows are selected with the same `isin(['test'])` rule used to build
# X_dists_test, so y_test stays row-aligned with the distance-based
# predictions even if 'set' ever holds a value other than
# train/validation/test (the previous `~condition` would have swept such rows
# into the test split).
condition = dataset['set'].isin(['train', 'validation'])
test_condition = dataset['set'].isin(['test'])

# .loc selects rows and columns in one step instead of chained indexing.
X_train = dataset.loc[condition, features].to_numpy()
y_train = dataset.loc[condition, classes].to_numpy().astype(int)
X_test = dataset.loc[test_condition, features].to_numpy()
y_test = dataset.loc[test_condition, classes].to_numpy().astype(int)

In [16]:
# Sanity check: row counts must agree between X_* and y_* within each split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1803, 256), (449, 256), (1803, 2701), (449, 2701))

In [17]:
from models import MultiLabelKNeighborsClassifier

In [18]:
# metric='precomputed': the model is given pairwise distance matrices rather than raw feature vectors.
# NOTE(review): how the project's classifier handles 'precomputed' is not visible here — confirm in the models package.
m = MultiLabelKNeighborsClassifier(metric='precomputed')

In [19]:
# Fit on the train/validation pairwise distances and the matching label rows.
m.fit(X_dists_train, y_train)

<models.knn.mlknn.MultiLabelKNeighborsClassifier at 0x148211f3820>

In [20]:
# Predict for the test proteins; each row of X_dists_test holds one test
# protein's distances to the train/validation proteins.
y_pred = m.predict(X_dists_test)
# NOTE(review): presumably per-label scores in the same column order as `classes` — confirm against the model.
y_proba = m.predict_proba(X_dists_test)

In [21]:
# Convert the model outputs to plain NumPy arrays.
# .toarray() returns an ndarray directly, with no np.matrix intermediate.
# The hasattr guard makes the cell safe to re-run: once the names already hold
# ndarrays they are passed through unchanged, where the previous version
# failed on the second run because ndarrays have no `.todense`.
y_pred = y_pred.toarray() if hasattr(y_pred, 'toarray') else np.asarray(y_pred)
y_proba = y_proba.toarray() if hasattr(y_proba, 'toarray') else np.asarray(y_proba)

In [23]:
# Sum over all entries of the dense prediction matrix
# (presumably the number of predicted positive labels — confirm predictions are 0/1).
y_pred.sum()

3432

In [25]:
# Shape check: should equal y_test.shape so scores and labels line up element-wise.
y_proba.shape

(449, 2701)

In [26]:
# Shape check: should equal y_proba.shape.
y_test.shape

(449, 2701)

In [28]:
# Load the information-content table that accompanies the annotations.
ic_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.IC')
information_content = pd.read_table(ic_path)

In [32]:
# Index the information-content table by GO term so it can be looked up by label.
# This reassigns rather than mutating in place, and skips the step when
# 'goterm' is already the index, so re-running the cell no longer raises KeyError.
if information_content.index.name != 'goterm':
    information_content = information_content.set_index('goterm')

In [42]:
# Information-content values reordered to match the label columns in `classes`.
# .loc raises KeyError if any class is missing from the table.
ic_sorted = information_content.loc[classes].to_numpy().flatten()

# Guard against silent misalignment: extra columns in the IC table or
# duplicated GO terms in its index would make the flattened vector longer
# than the number of classes.
if ic_sorted.shape != classes.shape:
    raise ValueError(
        f"information content has shape {ic_sorted.shape}, expected {classes.shape}: "
        "the IC table must provide exactly one value per GO term"
    )

In [45]:
# Inspect the information-content values, ordered like `classes`.
ic_sorted

array([12.55013846, 14.31567321, 16.6376013 , ..., 16.6376013 ,
       16.6376013 , 16.6376013 ])

In [52]:
from evaluation.metrics import get_metrics

In [53]:
# Score the predicted probabilities against the test labels, passing the per-term IC values in `classes` order.
# NOTE(review): get_metrics is a project helper; argument semantics are inferred from the names — confirm in evaluation.metrics.
metrics = get_metrics(y_test, y_proba, ic_sorted)

Output()

Output()

In [59]:
# List the metric columns available for the summary.
metrics.columns

Index(['tp', 'fp', 'tn', 'fn', 'qty_pos', 'qty_neg', 'rec', 'pre', 'f_max',
       'fpr', 'tpr', 'auc_roc', 'auc_pr', 'ru', 'mi', 's', 's_min',
       'metric_type'],
      dtype='object')

In [60]:
# Headline metrics to report, plus the column that says how each row was aggregated.
eval_metrics = ['f_max', 's_min', 'auc_roc', 'auc_pr']
summary_columns = [*eval_metrics, 'metric_type']
evaluation = metrics[summary_columns]

In [62]:
# Average each headline metric within every metric_type group.
# NOTE(review): in the recorded output the per_term auc_roc averages about 0.33,
# below the 0.5 of a random ranker — worth checking how get_metrics scores
# terms that have no positive (or no negative) examples in the test split.
evaluation.groupby('metric_type').agg('mean')

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
per_gene,0.403988,0.07549,0.898086,0.262003
per_term,0.025889,13.260362,0.3323,0.007022
