In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.spatial.distance import pdist, squareform
from Utils import Configuration
from models import SiameseSimilarityMultiTask
from tools.datasets import FastMultitaskSemanticSimilarityDataset

In [9]:
# Root directory of the prot2vec data files read by later cells
# (GO annotations and information-content tables under ALL_GOA/).
# The location can be overridden through the PROT2VEC_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original per-platform defaults are used unchanged.
_default_data_dir = "E:/prot2vec" if os.name == 'nt' else "/Users/torresmateo/OneDrive - FGV/prot2vec"
data_dir = os.environ.get("PROT2VEC_DATA_DIR", _default_data_dir)
    

In [2]:
# Load the settings of the "run-all_cafa3.ini" run through the project's Configuration helper.
config = Configuration.load_run("run-all_cafa3.ini")

In [3]:
# Show the loaded configuration for reference.
config

{'configuration': {'config_file': 'config.ini'},
 'model': {'dir_model_output': 'E:/prot2vec',
  'batch_size_train': 256,
  'batch_size_val': 256,
  'batch_size_test': 256,
  'dim_first_hidden_layer': 1024,
  'alias': 'cafa3.pt'},
 'training': {'dir_training_out': 'E:/prot2vec',
  'num_epochs': 1,
  'output_debug': False},
 'dataset': {'dir_train': '../CAFA3_P2V/train_data/',
  'dir_val': '../CAFA3_P2V/val_data/',
  'dir_test': '../CAFA3_P2V/test_data/',
  'string_columns': [''],
  'negative_sampling': False,
  'combine_string': False,
  'include_homology': False,
  'include_biogrid': False},
 'representations': {'dir_representations_out': 'E:/prot2vec'},
 'optimizer': {'learning_rate': 0.0006},
 'loss': {'main_task_factor': 1.0}}

In [4]:
# A quarter of the first hidden layer width; the same quantity (dim//2//2)
# is used further down as the number of embedding feature columns.
config["model"]["dim_first_hidden_layer"] // 4

256

In [5]:
# Load the pickled protein representations written for this run and expand the
# per-protein `vector` column into one numeric column per embedding dimension.
dim = config["model"]["dim_first_hidden_layer"]
activation = 'sigmoid'
model_filename = f"{config['model']['alias']}-representations.pkl"
model_name = f"{config['model']['alias']}-{dim}"

# NOTE(review): read_pickle unpickles the file, so only trusted files should be loaded here.
representations_path = os.path.join(
    config["representations"]["dir_representations_out"],
    model_filename,
)
vecs = pd.read_pickle(representations_path)

# One column name per embedding dimension (a quarter of the first hidden layer width).
features = [f'prot2vec_{i}' for i in range(dim // 4)]
embedding_columns = pd.DataFrame(
    vecs['vector'].to_list(),
    index=vecs.index,
    columns=features,
)

# Keep only the identifier, the split label and the expanded embedding columns.
vecs = pd.concat([vecs, embedding_columns], axis=1)[['protein', 'set'] + features]

In [16]:
# Read the protein -> GO term annotation table and attach each protein's split
# label by merging with the embeddings table on the shared `protein` column.
# NOTE(review): read_table takes the first line of the file as a header, which
# is then replaced by the names below — confirm the file really has a header row.
annotations = (
    pd.read_table(os.path.join(data_dir, 'ALL_GOA', 'uppropagated-annotations.BP'))
    .set_axis(['protein', 'goterm'], axis=1)
    .merge(vecs[['protein', 'set']])
)

In [17]:
# Number of distinct proteins left after joining the annotations with the embeddings table.
annotations.protein.unique().shape

(76273,)

In [20]:
# Build the binary label matrix: one row per protein, one column per GO term,
# 1 where the protein carries the annotation and 0 otherwise. Proteins from
# every split are included; the train/test separation happens later.
#
# `condition` is not read in this cell; the assignment is kept so the
# notebook-level name still exists for any cell that may rely on it.
condition = annotations['set'].isin(['train', 'validation'])
y = (
    annotations[['protein', 'goterm']]
    .drop_duplicates()
    .assign(value=1)
    # Keyword arguments are required here: pandas >= 2.0 made the parameters
    # of DataFrame.pivot keyword-only, so the positional form raises TypeError.
    .pivot(index='protein', columns='goterm', values='value')
    .fillna(0)
    .reset_index()
)

In [21]:
# Label columns of the pivoted table: every column except the `protein` identifier.
classes = y.columns[y.columns != 'protein'].to_numpy()

In [22]:
# Combine embeddings and labels into a single table (inner join on the
# columns the two frames have in common).
dataset = pd.merge(vecs, y)

In [1]:
# Inspect the merged table.
# NOTE(review): the NameError recorded below carries execution count 1, so it
# looks like a stale output from a fresh kernel in which `dataset` had not been
# created yet — re-run the notebook top to bottom to refresh it.
dataset

NameError: name 'dataset' is not defined

In [24]:
# Pairwise cosine distances between all protein embeddings, expanded from the
# condensed form returned by pdist into a square matrix.
# NOTE(review): this keeps the full n x n matrix in memory although the cells
# below only read its train x train and test x train blocks.
embedding_matrix = dataset[features].to_numpy()
distances = squareform(pdist(embedding_matrix, metric='cosine'))

In [25]:
# Train-vs-train block of the distance matrix (train and validation proteins).
# np.ix_ selects rows and columns in a single indexing step, which avoids the
# large intermediate (train rows x all columns) copy that chained boolean
# indexing would create first.
train_mask = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_train = distances[np.ix_(train_mask, train_mask)]

In [26]:
# Test-vs-train block of the distance matrix: rows are test proteins, columns
# are train/validation proteins. np.ix_ selects both axes in one indexing step,
# which avoids the intermediate (test rows x all columns) copy that chained
# boolean indexing would create first.
test_rows = dataset['set'].isin(['test']).to_numpy()
train_cols = dataset['set'].isin(['train', 'validation']).to_numpy()
X_dists_test = distances[np.ix_(test_rows, train_cols)]

In [27]:
# Shapes of the two distance blocks; both must have one column per train/validation protein.
X_dists_train.shape, X_dists_test.shape

((72741, 72741), (10738, 72741))

In [28]:
# Split the embeddings and the label matrix into train(+validation) and test parts.
#
# The test rows are selected with an explicit `set == 'test'` mask — the same
# criterion used for the rows of X_dists_test — instead of "everything that is
# not train/validation". This keeps y_test row-aligned with the predictions
# even if `set` ever contains a label other than train/validation/test.
# `.loc[mask, columns]` selects rows and columns in one step rather than
# copying the whole frame first.
condition = dataset['set'].isin(['train', 'validation'])
test_condition = dataset['set'].isin(['test'])
X_train = dataset.loc[condition, features].to_numpy()
y_train = dataset.loc[condition, classes].to_numpy().astype(int)
X_test = dataset.loc[test_condition, features].to_numpy()
y_test = dataset.loc[test_condition, classes].to_numpy().astype(int)

In [29]:
# Shapes of the feature and label matrices for both splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((72741, 256), (10738, 256), (72741, 21195), (10738, 21195))

In [30]:
from models import MultiLabelKNeighborsClassifier
from evaluation.metrics import get_metrics

In [None]:
# Fit a 1-nearest-neighbour multi-label classifier on the precomputed
# train-vs-train distances, score the test proteins and summarise the metrics.
m = MultiLabelKNeighborsClassifier(n_neighbors=1, metric='precomputed')
m.fit(X_dists_train, y_train)

# Convert the classifier outputs to plain dense ndarrays.
y_pred = np.asarray(m.predict(X_dists_test).todense())
y_proba = np.asarray(m.predict_proba(X_dists_test).todense())

# Information content per GO term, ordered like the label columns in `classes`.
ic_path = os.path.join(data_dir, 'ALL_GOA', 'uppropagated-annotations.IC')
information_content = pd.read_table(ic_path).set_index('goterm')
ic_sorted = information_content.loc[classes].to_numpy().flatten()

metrics = get_metrics(y_test, y_proba, ic_sorted)

# Average the selected metrics within each metric type and tag them with the model name.
eval_metrics = ['f_max', 's_min', 'auc_roc', 'auc_pr']
evaluation = metrics[[*eval_metrics, 'metric_type']]
ev = evaluation.groupby('metric_type').mean()
ev['model'] = model_name

evaluations = [ev]
df = pd.concat(evaluations)
df

Output()

In [33]:
# Show the evaluation summary (mean of each metric per metric type).
df

Unnamed: 0_level_0,f_max,s_min,auc_roc,auc_pr,model
metric_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
per_gene,0.373943,0.027254,0.954017,0.2607,cafa3.pt-1024
per_term,0.031308,17.290696,0.273233,0.005016,cafa3.pt-1024


In [45]:
# Shape of the true label matrix for the test proteins.
y_test.shape

(10738, 21195)

In [47]:
# Shape of the predicted label matrix; it should match y_test.shape.
y_pred.shape

(10738, 21195)

In [37]:
import joblib

In [38]:
# Persist the fitted classifier to disk; the file name is relative to the
# current working directory.
saved_model = "NearestNeighborsCAFA3.sav"
joblib.dump(m, saved_model)

['NearestNeighborsCAFA3.sav']

In [39]:
# Reload the persisted classifier. joblib.load unpickles the file, so only
# load files that come from a trusted source.
m2 = joblib.load(saved_model)

In [40]:
# Predict again with the reloaded model on the same precomputed test distances.
y_pred2 = m2.predict(X_dists_test)


In [43]:
# Round-trip check: the reloaded model must reproduce the original predictions.
(y_pred==y_pred2).all()

True