In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.spatial.distance import pdist, squareform
from Utils import Configuration
from models import SiameseSimilarityMultiTask
from tools.datasets import FastMultitaskSemanticSimilarityDataset

In [101]:
# Root folder holding the prot2vec artefacts read below (pickled
# representations and the '83333' annotation files).
# The location can be overridden without editing the notebook by setting the
# PROT2VEC_DATA_DIR environment variable; when it is unset or empty the
# original per-OS OneDrive locations are used.
_default_data_dir = (
    "D:/OneDrive - FGV/prot2vec"
    if os.name == 'nt'
    else "/Users/torresmateo/OneDrive - FGV/prot2vec"
)
data_dir = os.environ.get("PROT2VEC_DATA_DIR") or _default_data_dir
    

In [139]:
# Load the settings of the run under analysis from "run-test.ini" using the
# project's Configuration helper; the result is indexed below like a nested
# dict (config["model"][...]).
config = Configuration.load_run("run-test.ini")

In [140]:
# Display the loaded run configuration for inspection.
config

{'configuration': {'config_file': 'config.ini'},
 'model': {'dir_model_output': 'D:/OneDrive - FGV/prot2vec',
  'batch_size_train': 256,
  'batch_size_val': 256,
  'batch_size_test': 256,
  'dim_first_hidden_layer': 1024,
  'alias': 'test.pt'},
 'training': {'dir_training_out': 'D:/OneDrive - FGV/prot2vec',
  'num_epochs': 10},
 'dataset': {'dir_train': '../83333/train_data_not_homologous/',
  'dir_val': '../83333/val_data_not_homologous/',
  'dir_test': '../83333/test_data_not_homologous/',
  'string_columns': ['neighborhood',
   'fusion',
   'cooccurence',
   'coexpression',
   'experiments',
   'database',
   'textmining'],
  'negative_sampling': False,
  'combine_string': True,
  'include_homology': False,
  'include_biogrid': False},
 'representations': {'dir_representations_out': 'D:/OneDrive - FGV/prot2vec'},
 'optimizer': {'learning_rate': 0.0006},
 'loss': {'main_task_factor': 1.0}}

In [141]:
# Quick check of the value used further down as the number of embedding
# columns (the feature list is built with range(dim//2//2)).
config["model"]["dim_first_hidden_layer"] // 4

256

In [142]:
# --- Load the learned protein representations -------------------------------
dim = config["model"]["dim_first_hidden_layer"]
activation = 'sigmoid'

# File name and display name are both derived from the model alias.
model_filename =  f"{config['model']['alias']}-representations.pkl"
model_name = f"{config['model']['alias']}-{dim}"

# NOTE(review): read_pickle executes arbitrary code on load; only use it on
# files produced by a trusted training run.
vecs_raw = pd.read_pickle(os.path.join(data_dir, model_filename))

# One named column per embedding component; the component count is a quarter
# of the first hidden layer size.
features = [f'prot2vec_{i}' for i in range(dim//2//2)]
vector_columns = pd.DataFrame(
    vecs_raw['vector'].to_list(),
    index=vecs_raw.index,
    columns=features,
)

# Keep only the identifier, the split label and the expanded components.
vecs = pd.concat([vecs_raw, vector_columns], axis=1)[['protein', 'set'] + features]

In [143]:
# --- Load GO annotations and attach each protein's split label --------------
annotations_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.BP')
annotations = pd.read_table(annotations_path)
# The two columns are renamed positionally, whatever the file's header says.
annotations.columns = ['protein', 'goterm']

# Inner join on the only shared column ('protein'): annotations of proteins
# without a representation are dropped.
protein_sets = vecs[['protein', 'set']]
annotations = annotations.merge(protein_sets, on='protein')

In [144]:
# Number of distinct proteins left after joining annotations with `vecs`.
annotations.protein.unique().shape

(2132,)

In [145]:
# --- Build the protein x GO-term indicator matrix (multi-label targets) -----
# NOTE(review): this train/validation mask is computed but never applied in
# this cell, so the label matrix covers proteins from every split. It is kept
# only so the `condition` name stays defined as before.
condition = annotations['set'].isin(['train', 'validation'])

# .assign() returns a new frame, so the constant column is not written into a
# slice of `annotations` (the original `y['value'] = 1` triggered
# SettingWithCopyWarning).
y = annotations[['protein', 'goterm']].assign(value=1)

# Keyword arguments are required: positional arguments to DataFrame.pivot are
# deprecated in pandas 1.5 and rejected in pandas 2.0.
# Missing (protein, goterm) pairs become 0; 'protein' is turned back into a
# regular column so the table can be merged with `vecs`.
y = (
    y.pivot(index='protein', columns='goterm', values='value')
    .fillna(0)
    .reset_index()
)

In [146]:
# Every column of `y` except the 'protein' identifier is a GO-term label.
is_label_column = ~y.columns.isin(['protein'])
classes = y.columns[is_label_column].to_numpy()

In [147]:
# Join embeddings and label indicators into one table; the join key is
# inferred from the columns the two frames share.
dataset = pd.merge(vecs, y)

In [148]:
# Inspect the merged table: protein id, split label, embedding components,
# then one indicator column per GO term.
dataset

Unnamed: 0,protein,set,prot2vec_0,prot2vec_1,prot2vec_2,prot2vec_3,prot2vec_4,prot2vec_5,prot2vec_6,prot2vec_7,...,GO:2001023,GO:2001057,GO:2001060,GO:2001061,GO:2001124,GO:2001125,GO:2001141,GO:2001251,GO:2001313,GO:2001315
0,C1P607,train,3.818000e-22,0.016520,4.992979e-31,1.400548e-27,2.047985e-28,1.956636e-24,6.294710e-03,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1P617,train,3.818000e-22,0.016520,4.992979e-31,1.400548e-27,2.047985e-28,1.956636e-24,6.294710e-03,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P00350,train,1.350018e-24,0.000497,1.931030e-34,6.054573e-29,8.981612e-32,4.858820e-25,1.219443e-03,8.330125e-37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P00393,train,1.983889e-24,0.001665,2.517395e-34,1.033826e-28,1.184034e-31,7.410680e-25,1.800415e-03,1.532279e-36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P00448,train,3.668367e-27,0.072973,1.878670e-04,5.714374e-33,5.673670e-10,2.160612e-29,3.333220e-06,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,Q47005,test,7.764564e-28,0.071640,2.900220e-01,1.390259e-33,2.334782e-04,5.190790e-30,9.699301e-07,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2128,Q47129,test,4.603119e-24,0.983812,5.351386e-35,1.806901e-27,2.941678e-32,3.312039e-24,2.469603e-03,1.875379e-33,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2129,Q47155,test,7.246217e-27,0.086696,1.874109e-06,1.129421e-32,4.095852e-11,4.396507e-29,5.606224e-06,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2130,Q47536,test,4.764783e-23,0.081535,3.550844e-29,9.070165e-29,1.660844e-26,1.287589e-25,1.300501e-01,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
# Pairwise cosine distances between all protein embeddings, expanded from the
# condensed form returned by pdist into a full square matrix.
feature_matrix = np.stack(dataset[features].values)
condensed_distances = pdist(feature_matrix, metric='cosine')
distances = squareform(condensed_distances)

In [150]:
# Distances among the train+validation proteins (rows and columns both
# restricted to that subset).
in_train_or_val = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_train = distances[np.ix_(in_train_or_val, in_train_or_val)]

In [151]:
# Distances from each test protein (rows) to every train+validation protein
# (columns).
is_test_row = dataset['set'].isin(['test']).to_numpy()
is_reference_col = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_test = distances[np.ix_(is_test_row, is_reference_col)]

In [152]:
# Shape check: both matrices have one column per train+validation protein.
X_dists_train.shape, X_dists_test.shape

((1704, 1704), (428, 1704))

In [153]:
# --- Split features and labels by the precomputed 'set' column --------------
# Everything that is not train/validation is treated as test.
condition = dataset['set'].isin(['train', 'validation'])
train_rows = dataset.loc[condition]
test_rows = dataset.loc[~condition]

X_train, X_test = train_rows[features].values, test_rows[features].values
# Label indicators are stored as floats after the pivot/fillna; cast to int.
y_train = train_rows[classes].values.astype(int)
y_test = test_rows[classes].values.astype(int)

In [154]:
# Shape check of the feature and label matrices for both splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1704, 256), (428, 256), (1704, 2668), (428, 2668))

In [155]:
from models import MultiLabelKNeighborsClassifier
from evaluation.metrics import get_metrics

In [156]:
# --- 1-nearest-neighbour baseline on precomputed cosine distances -----------
m = MultiLabelKNeighborsClassifier(metric='precomputed', n_neighbors=1)
m.fit(X_dists_train, y_train)

# Predictions are densified into plain ndarrays before scoring.
y_pred = np.asarray(m.predict(X_dists_test).todense())
y_proba = np.asarray(m.predict_proba(X_dists_test).todense())

# Information content per GO term, reordered to match the label columns.
ic_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.IC')
information_content = pd.read_table(ic_path).set_index('goterm')
ic_sorted = information_content.loc[classes].to_numpy().flatten()

# Only the probability scores are passed to the metric computation.
metrics = get_metrics(y_test, y_proba, ic_sorted)

# Average each metric within its metric_type group and tag with the model.
eval_metrics = ['f_max', 's_min', 'auc_roc', 'auc_pr']
evaluation = metrics[[*eval_metrics, 'metric_type']]
ev = evaluation.groupby('metric_type').mean().assign(model=model_name)

# Collected in a list so that further evaluations can be concatenated.
evaluations = [ev]
df = pd.concat(evaluations)
df

Output()

Output()

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.392841,0.07705,0.901388,0.255996,test.pt-1024
per_term,0.016847,14.211468,0.343681,0.002854,test.pt-1024


In [157]:
# Re-display the evaluation summary table.
df

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.392841,0.07705,0.901388,0.255996,test.pt-1024
per_term,0.016847,14.211468,0.343681,0.002854,test.pt-1024
