In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.spatial.distance import pdist, squareform
from Utils import Configuration
from models import SiameseSimilarityMultiTask
from tools.datasets import FastMultitaskSemanticSimilarityDataset

In [217]:
# Root folder holding the prot2vec artefacts. The Windows machine
# (os.name == 'nt') keeps it on the D: drive; any other platform uses the
# path under /Users.
data_dir = (
    "D:/OneDrive - FGV/prot2vec"
    if os.name == 'nt'
    else "/Users/torresmateo/OneDrive - FGV/prot2vec"
)
    

In [218]:
# Load the settings for this run from `run-test.ini` through the project's
# `Configuration` helper (imported from `Utils`).
config = Configuration.load_run("run-test.ini")

In [219]:
# Show the loaded run configuration for inspection.
config

{'configuration': {'config_file': 'config.ini'},
 'model': {'dir_model_output': 'D:/OneDrive - FGV/prot2vec',
  'batch_size_train': 256,
  'batch_size_val': 256,
  'batch_size_test': 256,
  'dim_first_hidden_layer': 1024,
  'alias': 'test.pt'},
 'training': {'dir_training_out': 'D:/OneDrive - FGV/prot2vec',
  'num_epochs': 10},
 'dataset': {'dir_train': '../83333/train_data_not_homologous/',
  'dir_val': '../83333/val_data_not_homologous/',
  'dir_test': '../83333/test_data_not_homologous/',
  'string_columns': ['neighborhood',
   'fusion',
   'cooccurence',
   'coexpression',
   'experiments',
   'database',
   'textmining'],
  'negative_sampling': False,
  'combine_string': True,
  'include_homology': True,
  'include_biogrid': True},
 'representations': {'dir_representations_out': 'D:/OneDrive - FGV/prot2vec'},
 'optimizer': {'learning_rate': 0.0006},
 'loss': {'main_task_factor': 1.0}}

In [220]:
# Quarter of the configured first-hidden-layer width, shown as a quick check
# of the number of feature columns generated further down (`dim//2//2`).
config["model"]["dim_first_hidden_layer"] // 4

256

In [221]:
# Identify the trained model: the hidden-layer width and the file alias both
# come from the run configuration.
dim = config["model"]["dim_first_hidden_layer"]
activation = 'sigmoid'
model_filename = f"{config['model']['alias']}-representations.pkl"
model_name = f"{config['model']['alias']}-{dim}"

# Read the pickled per-protein representations produced for that model.
_raw_vecs = pd.read_pickle(os.path.join(data_dir, model_filename))

# One generated column name per embedding component; the number of components
# is taken to be a quarter of `dim`.
features = [f'prot2vec_{i}' for i in range(dim // 4)]

# Spread every entry of the `vector` column over the individual feature
# columns, then keep only the identifiers and the expanded features.
_components = pd.DataFrame(
    _raw_vecs['vector'].to_list(),
    index=_raw_vecs.index,
    columns=features,
)
vecs = pd.concat([_raw_vecs, _components], axis=1)[['protein', 'set'] + features]

In [222]:
# Read the annotation table for organism folder `83333` and give its two
# columns fixed names.
# NOTE(review): `read_table` treats the first line of the file as a header
# before the names are overwritten - confirm the file really has a header row.
_bp_path = os.path.join(data_dir, '83333', 'uppropagated-annotations.BP')
_bp_table = pd.read_table(_bp_path)
_bp_table.columns = ['protein', 'goterm']

# Inner join on `protein`: keeps only annotated proteins that also have a
# representation, and attaches the split (`set`) each one belongs to.
annotations = _bp_table.merge(vecs[['protein', 'set']], how='inner', on='protein')

In [223]:
# Number of distinct proteins left after joining annotations with `vecs`.
annotations.protein.unique().shape

(2132,)

In [224]:
# Build the protein x GO-term indicator table: one row per protein, one
# column per term, 1 where the annotation exists and 0 everywhere else.
# Proteins from every split are included; the train/test separation is applied
# later on the merged frame.
#
# - `.assign` returns a new frame, so the constant `value` column is never
#   written into a slice of `annotations` (avoids SettingWithCopyWarning).
# - `pivot` receives keyword arguments; pandas 2.0 no longer accepts the
#   positional form.
y = (
    annotations[['protein', 'goterm']]
    .assign(value=1)
    .pivot(index='protein', columns='goterm', values='value')
    .fillna(0)
    .reset_index()
)

In [225]:
# Label columns are every column of `y` except the `protein` identifier.
_is_label_column = y.columns != 'protein'
classes = y.columns[_is_label_column].to_numpy()

In [226]:
# Combine features and labels into one frame; the join runs on the columns the
# two frames share and keeps only proteins present in both.
dataset = pd.merge(vecs, y, how='inner')

In [227]:
# Preview the merged feature/label frame.
dataset

Unnamed: 0,protein,set,prot2vec_0,prot2vec_1,prot2vec_2,prot2vec_3,prot2vec_4,prot2vec_5,prot2vec_6,prot2vec_7,...,GO:2001023,GO:2001057,GO:2001060,GO:2001061,GO:2001124,GO:2001125,GO:2001141,GO:2001251,GO:2001313,GO:2001315
0,C1P607,train,0.090086,3.000233e-05,3.441443e-05,3.492409e-05,1.924738e-05,3.961688e-05,3.774117e-09,2.003084e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1P617,train,0.090086,3.000233e-05,3.441443e-05,3.492409e-05,1.924738e-05,3.961688e-05,3.774117e-09,2.003084e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P00350,train,0.122507,1.724854e-06,6.778773e-07,3.197486e-06,1.457238e-06,1.713173e-06,3.717905e-11,1.922022e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P00393,train,0.087682,5.043263e-07,2.727672e-07,1.199419e-06,5.747757e-07,6.452598e-07,1.432480e-11,8.161376e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P00448,train,0.115818,4.406590e-06,2.127144e-06,8.379537e-06,4.068766e-06,5.462230e-06,1.529636e-10,5.335748e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,Q47005,test,0.074553,2.893069e-08,1.236352e-08,5.854638e-08,3.216464e-08,3.661936e-08,1.111016e-12,7.202371e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2128,Q47129,test,0.047646,1.558651e-04,1.572403e-04,2.094675e-04,1.414709e-04,1.822573e-04,3.199842e-07,1.772449e-04,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2129,Q47155,test,0.113177,2.409832e-06,9.772025e-07,4.481728e-06,2.003780e-06,2.615029e-06,6.459221e-11,2.707995e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2130,Q47536,test,0.111407,1.238367e-06,5.351811e-07,2.455909e-06,1.156735e-06,1.262553e-06,3.047067e-11,1.461140e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [228]:
# Pairwise cosine distances between all rows of the feature matrix, expanded
# from SciPy's condensed form into a square matrix.
# `dataset[features].values` is already a 2-D array, so it is handed to
# `pdist` directly rather than being re-stacked row by row with `np.stack`.
_embeddings = dataset[features].values
distances = squareform(pdist(_embeddings, metric='cosine'))

In [229]:
# Square sub-matrix of distances among the train/validation proteins only.
_fit_mask = dataset['set'].isin(['train', 'validation']).values
X_dists_train = distances[np.ix_(_fit_mask, _fit_mask)]

In [230]:
# Rows: held-out proteins; columns: the train/validation proteins the
# classifier is fitted on.
# Held-out rows are selected as "not train/validation" - the same rule used
# when the feature/label arrays are split - so these rows stay aligned
# one-to-one with the test labels even if `set` ever holds other values.
_fit_mask = dataset['set'].isin(['train', 'validation']).values
X_dists_test = distances[np.ix_(~_fit_mask, _fit_mask)]

In [231]:
# Shape check: train is square, test has one column per train/validation protein.
X_dists_train.shape, X_dists_test.shape

((1704, 1704), (428, 1704))

In [232]:
# Boolean mask over `dataset`: True for rows used to fit the classifier
# (train + validation), False for the held-out rows.
condition = dataset['set'].isin(['train', 'validation'])

_fit_rows = dataset.loc[condition]
_held_out_rows = dataset.loc[~condition]

# Feature matrices and integer-typed label matrices for each side of the split.
X_train = _fit_rows[features].values
y_train = _fit_rows[classes].values.astype(int)
X_test = _held_out_rows[features].values
y_test = _held_out_rows[classes].values.astype(int)

In [233]:
# Shape check for the feature and label arrays on both sides of the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1704, 256), (428, 256), (1704, 2668), (428, 2668))

In [234]:
from models import MultiLabelKNeighborsClassifier
from evaluation.metrics import get_metrics

In [235]:
# Fit the project's multi-label nearest-neighbour classifier (one neighbour)
# on the precomputed train distance matrix.
m = MultiLabelKNeighborsClassifier(metric='precomputed', n_neighbors=1)
m.fit(X_dists_train, y_train)

# Predict for the held-out proteins and turn both results into plain ndarrays.
y_pred = np.asarray(m.predict(X_dists_test).todense())
y_proba = np.asarray(m.predict_proba(X_dists_test).todense())

# Per-term values from the `.IC` table, indexed by `goterm` and ordered to
# match the label columns in `classes`.
information_content = pd.read_table(
    os.path.join(data_dir, '83333', 'uppropagated-annotations.IC')
).set_index('goterm')
ic_sorted = information_content.loc[classes].values.flatten()

# Score the predicted probabilities against the held-out labels.
metrics = get_metrics(y_test, y_proba, ic_sorted)

# Average the selected metrics within each `metric_type` and tag the summary
# with the model it belongs to.
eval_metrics = ['f_max', 's_min', 'auc_roc', 'auc_pr']
evaluation = metrics[eval_metrics + ['metric_type']]
ev = evaluation.groupby('metric_type').mean()
ev['model'] = model_name

# Collect the summaries (a single one here) into the results table and show it.
evaluations = [ev]
df = pd.concat(evaluations)
df

Output()

Output()

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.388983,0.077619,0.899038,0.253242,test.pt-1024
per_term,0.016934,14.113027,0.337528,0.003041,test.pt-1024


In [236]:
# Show the evaluation summary table again.
df

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.388983,0.077619,0.899038,0.253242,test.pt-1024
per_term,0.016934,14.113027,0.337528,0.003041,test.pt-1024
