In [2]:
import os
import pandas as pd
import numpy as np
from src.data.ksea_ztest_og import kseaZtest, stackSamples

In [3]:
# Move into the project root so that the relative 'data/...' and 'models/...' paths
# used by the cells below resolve.
# To run on another machine, set the PHOSPHO_LOGICIAN_ROOT environment variable to the
# local checkout (e.g. '/home/mhuebner/Desktop/bezzlab/research/projects/phospho_logician/')
# instead of editing this cell; when it is unset the original OneDrive location is used.
os.chdir(os.environ.get(
    'PHOSPHO_LOGICIAN_ROOT',
    '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_logician/',
))

In [4]:
# Run configuration. Both values are concatenated into the input and output file names
# built in the cells below, so changing either one selects a different data set.
#   mode  : file-name infix; values used so far: '', 'nas_', 'rd_nas_'
#   cline : cell line to analyse; values used so far: 'HL60', 'MCF7', 'NTERA2'
mode = ''
cline = 'NTERA2'

In [5]:
# Read the input tables.
# The observation file name depends on `mode` and `cline`, so this cell has to be
# re-run for every cell line individually.
obs_path = 'data/raw/ctamdb_dpoa_' + mode + cline + '.tsv'
obs = pd.read_csv(obs_path, sep='\t')  # tab-separated observations

# Enzyme-substrate relationships plus the kinase and phosphatase lists (plain CSV files),
# read in this order.
enz_sub, kinases, phosphatases = (
    pd.read_csv(facts_path)
    for facts_path in (
        'data/processed/facts/enz_sub_omnipath.csv',
        'data/processed/facts/kinase.csv',
        'data/processed/facts/phosphatase.csv',
    )
)

In [6]:
# Data wrangling INPUT
# Enzyme -> phosphosite pairs: only the 'kpa' and 'pst' columns of enz_sub are needed.
kt = enz_sub[['kpa', 'pst']]

# Reshape the observations into a wide table:
#   1. keep the perturbagen / phosphosite / fold-change columns,
#   2. pivot so that rows are phosphosites ('pst') and columns are perturbagens,
#   3. replace missing fold changes with the string 'nan',
#   4. turn the 'pst' index back into an ordinary column.
obs = (
    obs[['perturbagen', 'pst', 'fc']]
    .pivot(index='pst', columns='perturbagen', values='fc')
    .replace(np.nan, 'nan')
    .reset_index()
)

In [14]:
# Run the KSEA Z-test on the enzyme -> phosphosite pairs (kt) and the wide
# phosphosite x perturbagen fold-change table (obs).
# NOTE(review): the stacked result displayed further down is keyed by enzyme ('kpa') and
# perturbagen ('pert') and carries 'z_score' and 'p_val' columns, so the scores appear to
# be per enzyme rather than per phosphosite - confirm against src.data.ksea_ztest_og.
# NOTE(review): judging by the long output below, the call prints the phosphosite
# identifiers it processes; consider clearing that output before sharing the notebook.
ksea_res = kseaZtest(kt, obs)

AAAS(S495)
AAGAB(S310)
AAGAB(S311)
AAK1(S14)
AAK1(S21)
AAK1(S624)
AAK1(S637)
AAK1(S678)
AAK1(T389)
AAK1(T606)
AAK1(T620)
AAK1(T653)
AARS2(S19)
AATF(S203)
AATF(S316)
AATF(S320)
AATF(S321)
ABCA2(S1238)
ABCA2(S50)
ABCA6(S1238)
ABCC1(S916)
ABCC4(T646)
ABCC5(S509)
ABCF1(S105)
ABCF1(S109)
ABCF1(S140)
ABCF1(S22)
ABCF1(S228)
ABCF1(T108)
ABI1(S183)
ABI1(S222)
ABI1(S231)
ABI1(S319)
ABI1(S323)
ABI1(Y213)
ABI2(S183)
ABI2(S216)
ABI2(S227)
ABI2(S242)
ABI2(S368)
ABI2(Y213)
ABL1(S569)
ABL1(S718)
ABL1(T735)
ABL2(S620)
ABL2(S631)
ABL2(S655)
ABL2(S936)
ABLIM1(S216)
ABLIM1(S353)
ABLIM1(S431)
ABLIM1(S452)
ABLIM1(S640)
ABLIM1(S655)
ABLIM1(S706)
ABLIM1(T433)
ABLIM1(T670)
ABLIM2(S476)
ABR(S53)
ABR(S632)
ABR(S72)
ABR(T74)
ABTB2(S365)
ACACA(S1259)
ACACA(S1263)
ACACA(S80)
ACAP1(S379)
ACAP2(S540)
ACAT1(S16)
ACBD5(T263)
ACD(S425)
ACIN1(S1004)
ACIN1(S132)
ACIN1(S216)
ACIN1(S243)
ACIN1(S386)
ACIN1(S388)
ACIN1(S410)
ACIN1(S478)
ACIN1(S481)
ACIN1(S490)
ACIN1(S561)
ACIN1(S605)
ACIN1(S657)
ACIN1(S710)
ACIN1(S729)
ACIN1(

In [15]:
# Data wrangling OUTPUT
# Stack the per-sample results into one long table and make the numeric columns float64.
ksea_df = stackSamples(ksea_res)
numeric_cols = ['z_score', 'p_val', 'mlog2fc']
ksea_df[numeric_cols] = ksea_df[numeric_cols].astype('float64')

# Building blocks for the activity rules
P_VAL_CUTOFF = 0.05
is_significant = ksea_df['p_val'] < P_VAL_CUTOFF
z_positive = ksea_df['z_score'] > 0
z_negative = ksea_df['z_score'] < 0
is_kinase = ksea_df['kpa'].isin(kinases['kinase'])
is_phosphatase = ksea_df['kpa'].isin(phosphatases['phosphatase'])

# Assign activity status.
# Every row starts as 'unaffected'; the rules are then applied top to bottom, so a later
# rule overwrites an earlier one wherever both match.
# Phosphatases use the opposite sign convention to kinases.
ksea_df['status'] = 'unaffected'
status_rules = [
    (ksea_df['tc'] == 0, 'no_t'),  # no observed targets
    (is_kinase & z_positive & is_significant, 'actv'),
    (is_kinase & z_negative & is_significant, 'inhb'),
    (is_phosphatase & z_negative & is_significant, 'actv'),
    (is_phosphatase & z_positive & is_significant, 'inhb'),
]
for rule_mask, rule_status in status_rules:
    ksea_df.loc[rule_mask, 'status'] = rule_status


In [16]:
# Display the annotated KSEA result table (one row per enzyme/perturbagen pair)
ksea_df

Unnamed: 0,kpa,tc_total,pert,mlog2fc,p_val,tc,z_score,status
0,AAK1,1,AC220,-0.468526,0.361953,1.0,-0.353245,unaffected
1,AAK1,1,AT13148,-0.330821,0.377049,1.0,-0.313240,unaffected
2,AAK1,1,AZ20,-3.382521,0.000566,1.0,-3.255322,inhb
3,AAK1,1,AZD1480,0.906704,0.165644,1.0,0.971524,unaffected
4,AAK1,1,AZD3759,-2.304175,0.017819,1.0,-2.101023,inhb
...,...,...,...,...,...,...,...,...
17380,WNK2,1,Torin,2.349496,0.008281,1.0,2.396286,actv
17381,WNK2,1,Trametinib,-1.028743,0.165881,1.0,-0.970570,unaffected
17382,WNK2,1,U73122,-4.061752,0.000029,1.0,-4.018570,inhb
17383,WNK2,1,Ulixertinib,-3.722612,0.000084,1.0,-3.762593,inhb


In [17]:
# Save the annotated KSEA table as CSV; the file name carries the mode and the cell line,
# and the DataFrame index is not written.
csv_path = 'data/processed/facts/ksea_' + mode + cline + '_mp.csv'
ksea_df.to_csv(csv_path, index=False)

In [18]:
# Write Prolog file
def _prolog_quoted(value):
    """Return `value` as a single-quoted Prolog atom.

    Backslashes and single quotes are backslash-escaped so that a perturbagen or enzyme
    name containing either character cannot terminate the atom early and corrupt the
    facts file. Names without those characters are written exactly as before.
    """
    escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
    return "'" + escaped + "'"


# One fact per row of ksea_df: ksea('<perturbagen>', '<enzyme>', <status>, <p-value>).
# The status labels are written unquoted, as in the original output.
# The columns are iterated directly instead of building a Series for every row with iterrows().
with open('models/facts/ksea_' + mode + cline + '_mp.pl', 'w') as file:
    fact_columns = zip(ksea_df['pert'], ksea_df['kpa'], ksea_df['status'], ksea_df['p_val'])
    for pert, kpa, status, p_val in fact_columns:
        fact = "ksea({}, {}, {}, {}).".format(
            _prolog_quoted(pert), _prolog_quoted(kpa), status, p_val)
        file.write(fact + '\n')