In [7]:
import pandas as pd
import numpy as np

In [4]:
# gene uniprot ID -> reactome pathway ID (pathway membership)
uni2rea             = pd.read_csv(f'../../../data/UniProt2Reactome_All_Levels.txt', sep='\t', header=None, low_memory=False, dtype=str)
uni2rea.columns     = ['uniprot', 'pathway', 'source', 'description', 'acc', 'species']
uni2rea             = uni2rea[lambda x: ~x.pathway.isna()]
uni2rea.pathway     = [str(p) for p in uni2rea.pathway]
uni2rea             = uni2rea[lambda x: (x.species == 'Homo sapiens')]
uni2rea             = {uni:rea for uni,rea in zip(uni2rea.uniprot.values, uni2rea.pathway.values)}

uni2id = pd.read_csv('../../extdata/omnipath_uniprot2geneid.tsv', sep='\t').rename({'From':'uniprot', 'To':'gene_id'}, axis=1)
uni2symb = pd.read_csv('../../extdata/omnipath_uniprot2genesymb.tsv', sep='\t').rename({'From':'uniprot', 'To':'gene_symbol'}, axis=1)
gene_map = uni2id.merge(uni2symb, on='uniprot', how='outer')
geneinfo = pd.read_csv(f'../../../data/geneinfo_beta.txt', sep='\t')

In [36]:
druginfo = pd.read_csv(f'../../../data/compoundinfo_beta.txt', sep='\t')

clue_drug_target_symbols = druginfo.target.unique()

gene2uni = gene_map[['uniprot', 'gene_symbol']]

druginfo = druginfo.merge(gene2uni, left_on='target', right_on='gene_symbol', how='inner') 

druginfo = druginfo[['pert_id', 'uniprot', 'moa']].rename({'uniprot':'target'}, axis=1)

druginfo = druginfo.assign(combined_score=1000, source='clue')

stitch = pd.read_csv('../../extdata/processed_stitch_targets.csv')
stitch = stitch.assign(source='stitch', moa='')

druginfo = pd.concat((druginfo, stitch), axis=0)

targ = pd.read_csv('../../extdata/targetome_with_broad_ids.csv')
targ = targ.rename({'Target_UniProt':'target'}, axis=1)
targ = targ.assign(combined_score = 1000, source='targetome', moa='')
targ = targ[['pert_id','target','combined_score','moa','source']]

druginfo = pd.concat((druginfo, targ), axis=0)

druginfo = druginfo.groupby(['pert_id', 'target']).agg({'combined_score' : np.mean, "moa" : lambda x: ' |AND| '.join(np.unique(x)), 'source':lambda x: '+'.join(np.unique(x))}).reset_index()



In [44]:
druginfo = druginfo.assign(in_stitch=lambda x: x.source.str.contains('stitch'), 
                in_targetome=lambda x: x.source.str.contains('targetome'),
                in_clue=lambda x: x.source.str.contains('clue'))

In [46]:
druginfo.to_csv('../../extdata/processed_targets.csv', index=False)

In [45]:
druginfo.head()

Unnamed: 0,pert_id,target,combined_score,moa,source,in_stitch,in_targetome,in_clue
0,BRD-A00077618,O60603,800.0,,stitch,True,False,False
1,BRD-A00077618,P00797,818.0,,stitch,True,False,False
2,BRD-A00077618,P01112,800.0,,stitch,True,False,False
3,BRD-A00077618,P03956,800.0,,stitch,True,False,False
4,BRD-A00077618,P05019,786.0,,stitch,True,False,False


In [38]:
druginfo.pert_id.unique().shape

(4681,)

In [39]:
druginfo.target.unique().shape

(8278,)

In [40]:
druginfo.groupby('source').count()

Unnamed: 0_level_0,pert_id,target,combined_score,moa
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clue,2179,2179,2179,2179
clue+stitch,3512,3512,3512,3512
clue+stitch+targetome,121,121,121,121
clue+targetome,11,11,11,11
stitch,87686,87686,87686,87686
stitch+targetome,436,436,436,436
targetome,2557,2557,2557,2557


In [41]:
druginfo.groupby('moa').count()

Unnamed: 0_level_0,pert_id,target,combined_score,source
moa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,90679,90679,90679,90679
|AND| 11-beta-HSD1 inhibitor,3,3,3,3
|AND| 5 alpha reductase inhibitor,3,3,3,3
|AND| ABC transporter expression enhancer |AND| LXR agonist,2,2,2,2
|AND| ABC transporter expression enhancer |AND| LXR agonist |AND| ROR inverse agonist,2,2,2,2
...,...,...,...,...
Vasopressin receptor agonist,3,3,3,3
Vesicular monoamine transporter inhibitor,6,6,6,6
Vitamin D receptor agonist,2,2,2,2
Voltage-gated sodium channel blocker,5,5,5,5
