In [21]:
import os
import pandas as pd
import numpy as np

from src.models.lm_metrics import get_pred_metrics, add_psts_to_k

In [22]:
os.chdir('/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_logician/')
# os.chdir('/home/mhuebner/Desktop/bezzlab/research/projects/phospho_logician/')

In [23]:
# selecting cell line + search space
cline = 'HL60'  # HL60, MCF7, NTERA2

In [24]:
# INPUT
obs = pd.read_csv(f'data/raw/ctamdb_dpoa_{cline}.tsv', sep='\t')
# EDGES
enz_tprot = pd.read_csv(f'models/results/cantley/enz_tprot_omnipath_sub_{cline}.csv')
enz_sub = pd.read_csv(f'models/results/cantley/enz_sub_omnipath_sub_{cline}.csv')
# UNIVERSE
k_univ = pd.read_csv('data/external/kpas/kinases_final.csv')
# VALIDATION
pert_kin = pd.read_csv('data/processed/facts/pert_kin_rel.csv')

In [25]:
# perturbagens that can be validated
pk_true_df = pd.crosstab(index=pert_kin['kinase'], columns = pert_kin['perturbagen'])
perturbagens = pk_true_df.columns.to_list()

# subsetting data
# perturbed (affected) psts IN
obs_pert = obs.loc[(abs(obs['fc']) > 0) &
                   (obs['sid_score'] <= 0.05) &
                   (obs['perturbagen'].isin(perturbagens))].reset_index(drop=True)

In [26]:
obs_pert

Unnamed: 0,pst,perturbagen,fc,pval_eb,case,n_runs,n_ctr,n_trt,meansig_ctr,meansig_trt,sid_score
0,ABCA2(S50),AZD5438,1.298257,0.078150,cross_batch,271,32.0,4.0,29.128336,28.541017,6.709616e-03
1,ABI1(S222),AZD5438,0.574464,0.408446,cross_batch,268,32.0,4.0,31.120623,32.355194,2.185853e-02
2,ABI2(S216),AZD5438,3.210579,0.030975,cross_batch,84,8.0,2.0,26.371301,30.927407,0.000000e+00
3,ABI2(Y213),AZD5438,2.090733,0.117758,cross_batch,90,7.0,1.0,24.162828,25.568372,5.745007e-03
4,ABL1(T735),AZD5438,2.086637,0.017912,cross_batch,272,32.0,4.0,29.290022,30.548519,2.645500e-08
...,...,...,...,...,...,...,...,...,...,...,...
132735,NUCKS1(S73),Vemurafenib,1.826939,,ctr_missing,41,1.0,2.0,23.416910,26.792150,3.921529e-03
132736,PPP1R37(T588),Vemurafenib,1.705302,,ctr_missing,47,3.0,3.0,24.620114,24.931741,3.055342e-02
132737,RANBP1(S14),Vemurafenib,1.928449,,ctr_missing,7,1.0,2.0,25.603842,27.238934,1.329853e-03
132738,RANBP1(T13),Vemurafenib,1.928449,,ctr_missing,7,1.0,2.0,25.603842,27.238934,1.329853e-03


In [27]:
# PREDICT
pred_perform = []
pk_pred = dict()
k_predict = []

for pert_in in perturbagens:

    # edges_df
    enz_tprot_edges = np.array(enz_tprot.loc[enz_tprot['Pert'] == pert_in, ['Kpa', 'Tprot']])
    enz_sub_edges = np.array(enz_sub.loc[enz_sub['Pert'] == pert_in, ['Kpa', 'Pst']])
    edges_df = pd.DataFrame(np.concatenate((enz_tprot_edges, enz_sub_edges)),
                            columns=['from', 'to'])

    # sink kinases (i.e. predicted kinases)
    nodes_sink = set(edges_df['from']) - set(edges_df['to'])
    k_sink = list(set(nodes_sink) & set(k_univ['kinase']))

    # psts affected by perturbagen and contained in edges_df (i.e. predictive psts)
    psts_pred = obs_pert.loc[(obs_pert['perturbagen'] == pert_in) &
                             (obs_pert['pst'].isin(edges_df['to'])), 'pst'].tolist()

    # add psts to k_sinks
    k_sink_expl = add_psts_to_k(k_sink, edges_df, psts_pred)

    # kinases actually inhibited by perturbagen (i.e. true values)
    pk_true = dict(pk_true_df[pert_in])

    # determine confusion matrix and accuracy score
    cm, report = get_pred_metrics(k_sink, pk_true)

    # add results to dataframe
    pred_perform.append([pert_in, len(psts_pred), len(k_sink),
                         cm['tp'], cm['fp'], cm['tn'], cm['fn'], report['accuracy'],
                         report['1']['precision'], report['1']['recall'], report['1']['f1-score'], report['1']['support'],
                         report['0']['precision'], report['0']['recall'], report['0']['f1-score'], report['0']['support']])
    pk_pred.update({pert_in: k_sink_expl})
    k_predict = list(np.unique(k_predict + k_sink))

In [28]:
# model performance
pred_perform = pd.DataFrame(pred_perform, columns=['pert_in', 'psts_pred', 'k_pred', 'TP', 'FP', 'TN', 'FN', 'accuracy',
                                                   'precision_1', 'recall_1', 'f1-score_1', 'support_1',
                                                   'precision_0', 'recall_0', 'f1-score_0', 'support_0'])

# predicted pk_rel
pk_pred_df = []
[[pk_pred_df.append([p] + k) for k in pk_pred[p]] for p in pk_pred.keys()]
pk_pred_df = pd.DataFrame(pk_pred_df, columns=['pert_in', 'k_pred', 'psts_explained', 'expl_known_psts'])

In [29]:
pk_pred_df

Unnamed: 0,pert_in,k_pred,psts_explained,expl_known_psts
0,AT13148,NUAK2,32,0.055077
1,AT13148,DYRK4,24,0.041308
2,AT13148,PAK1,53,0.091222
3,AT13148,PIM1,56,0.096386
4,AT13148,NEK1,9,0.015491
...,...,...,...,...
4244,Vemurafenib,MAK,19,0.029008
4245,Vemurafenib,HIPK4,30,0.045802
4246,Vemurafenib,CLK4,29,0.044275
4247,Vemurafenib,CDK1,39,0.059542


In [30]:
# export to csv with:
pred_perform.to_csv(f'reports/cantley/predict_report_cantley_AND_lm_{cline}.csv', float_format='%.3f', index=False)
pk_pred_df.to_csv(f'reports/cantley/predict_pkrel_cantley_AND_lm_{cline}.csv', float_format='%.3f', index=False)