In [1]:
import pandas as pd 
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sbn
import pickle as pkl 

pd.set_option('display.max_columns', None)

In [2]:
# Metadata for cell lines (note there is one entry per compound/moa/target/structure combination, so some compounds appear more than once)
druginfo = pd.read_csv('../data/raw/compoundinfo_beta.txt', sep='\t')
druginfo2 = druginfo[lambda x: ~x.target.isna()]
druginfo2.head()

Unnamed: 0,pert_id,cmap_name,target,moa,canonical_smiles,inchi_key,compound_aliases
602,BRD-A50311610,meclizine,NR1I3,CAR agonist,Cc1cccc(CN2CCN(CC2)C(c2ccccc2)c2ccc(Cl)cc2)c1,OCJYIGYOJCODJL-UHFFFAOYSA-N,meclozine
603,BRD-K30743633,TCPOBOP,NR1I3,CAR agonist,Clc1cnc(Oc2ccc(Oc3ncc(Cl)cc3Cl)cc2)c(Cl)c1,BAFKRPOFIYPKBQ-UHFFFAOYSA-N,tcpobop
604,BRD-K39381259,DMH1,ACVR1,ALK inhibitor,CC(C)Oc1ccc(cc1)-c1cnc2c(cnn2c1)-c1ccnc2ccccc12,JMIFGARJSWXZSH-UHFFFAOYSA-N,DMH-1
605,BRD-K43002773,GDC-0068,AKT3,Akt inhibitor,C[C@@H]1C[C@H](C2=C1C(=NC=N2)N3CCN(CC3)C(=O)[C...,GRZXWCHAXNAUHY-NSISKUIASA-N,ipatasertib
606,BRD-K43002773,GDC-0068,AKT1,Akt inhibitor,C[C@@H]1C[C@H](C2=C1C(=NC=N2)N3CCN(CC3)C(=O)[C...,GRZXWCHAXNAUHY-NSISKUIASA-N,ipatasertib


In [3]:

druginfo.cmap_name.unique().shape

(33627,)

In [4]:
druginfo.target.unique().shape

(891,)

In [4]:
fi = pd.read_csv('../../../REACTOME/data/raw/FIsInGene_020720_with_annotations.txt', sep='\t')
fi.head()

Unnamed: 0,Gene1,Gene2,Annotation,Direction,Score
0,16-5-5,CDC42,predicted,-,0.98
1,16-5-5,PARD3,predicted,-,1.0
2,16-5-5,PARD3B,predicted,-,1.0
3,A1CF,APOBEC1,catalyzed by; complex; input,<-,1.0
4,A1CF,EP300,expression regulated by,<-,1.0


In [40]:
fi.shape

(268857, 5)

In [41]:
fi.Annotation.unique().shape

(2059,)

In [6]:
unq_genes = [x.lower() for x in set(fi.Gene1.unique()).union(set(fi.Gene2.unique()))]
len( unq_genes )

14071

In [7]:
targs = [x.lower() for x in druginfo2.target.unique()]
len( targs )

890

In [8]:
overlap = set(targs).intersection(unq_genes)
len( overlap )

846

In [10]:
druginfo_fi = druginfo2[lambda x: x.target.str.lower().isin(overlap)]
drugs_with_fi_targs = druginfo_fi.cmap_name.unique()
drugs_with_fi_targs.shape

(2509,)

In [21]:
moa = np.array([x.split(' ')[-1] for x in druginfo_fi.moa.unique()])
unq_moa, cnts = np.unique(moa, return_counts=True)
for m,c in zip(unq_moa, cnts): 
    print(m, c)

Anti-HCVE2 1
Antiarrhythmic 1
Antifibrinolytic 1
Antihistamine 1
Antioxidant 1
Antiviral 1
B 1
Corrector 1
Diuretic 1
Immunosuppressant 1
Inhibitor 1
Neurotransmitter 1
Steroid 1
Sulfonylurea 1
Vasodilator 1
acid 1
activator 34
activity 1
agent 15
agonist 75
analog 1
anesthetic 1
antagonist 69
antidepressant 1
antiepileptic 5
antifolate 1
blocker 12
cells 1
compounds 1
diuretic 2
drug 1
effects 1
enhancer 4
hormone 1
inducer 2
inhibitor 344
intercalator 1
ligand 11
modulator 19
potentiator 1
precursor 2
progestin 1
scavenger 1
secretagogue 1
sensitizer 2
stabilizer 1
stimulant 14
sympatholytic 1


In [19]:
cnts

array([   3,    2,    1,    3,    6,    1,    7,    1,    4,   13,    1,
          9,    1,    8,    6,    3,  168,    1,   69, 1307,   11,    2,
       1618,   14,   26,    3,  286,    1,    1,    5,    3,    4,   14,
          8,    3, 3996,    1,   35,  135,    1,   10,    3,    3,   18,
         14,    1,   45,    1], dtype=int64)

In [25]:
targ = pd.read_csv('../../../TARGETOME/data/raw/Targetome_FullEvidence_070617.txt', sep='\t')
targ = targ.assign(drug=lambda x: x.Drug.str.lower())

targ = targ[targ.Target_Type =='Protein']
targ = targ[targ.Target_Species =='Homo sapiens']
targ.drug = [str(x).lower() for x in targ.drug.values]
targ.head()

Unnamed: 0,Drug,Target_Name,Target_Type,Target_UniProt,Target_Species,Database,Reference,Assay_Type,Assay_Relation,Assay_Value,EvidenceLevel_Assigned,drug
0,Vemurafenib,RAF1,Protein,P04049,Homo sapiens,BindingDB,22808911,IC50,=,48.0,III,vemurafenib
1,Vemurafenib,RAF1,Protein,P04049,Homo sapiens,IUPHAR,22808911,IC50,=,,III,vemurafenib
2,Vemurafenib,RAF1,Protein,P04049,Homo sapiens,IUPHAR,26343583,IC50,=,,III,vemurafenib
3,Vemurafenib,EGFR,Protein,P00533,Homo sapiens,BindingDB,24588073,IC50,>,10000.0,III,vemurafenib
4,Vemurafenib,BRAF,Protein,P15056,Homo sapiens,Therapeutic Target Database,NA_TTD,,,,I,vemurafenib


In [29]:
targetome_cmapfi_overlap = set(druginfo_fi.cmap_name.str.lower()).intersection(set(targ.Drug.str.lower()))
len(targetome_cmapfi_overlap)

61

In [31]:
cmap_targ = targ[lambda x: x.Drug.str.lower().isin(targetome_cmapfi_overlap)]
cmap_targ.Assay_Type.unique()

array(['IC50', nan, 'Ki', 'KD', 'EC50', 'Kd'], dtype=object)

In [33]:
withkd = cmap_targ[lambda x: x.Assay_Type.isin(['KD', 'Kd'])]
withkd.head()

Unnamed: 0,Drug,Target_Name,Target_Type,Target_UniProt,Target_Species,Database,Reference,Assay_Type,Assay_Relation,Assay_Value,EvidenceLevel_Assigned,drug
943,Vismodegib,ALB,Protein,P02768,Homo sapiens,BindingDB,21438527,KD,=,6000.0,III,vismodegib
944,Vismodegib,ALB,Protein,P02768,Homo sapiens,BindingDB,21438527,KD,=,130000.0,III,vismodegib
945,Vismodegib,ALB,Protein,P02768,Homo sapiens,BindingDB,21438527,KD,=,120000.0,III,vismodegib
946,Vismodegib,ALB,Protein,P02768,Homo sapiens,BindingDB,21438527,KD,=,5500.0,III,vismodegib
1426,Nilotinib,PIP4K2B,Protein,P78356,Homo sapiens,BindingDB,22037378,KD,>,10000.0,III,nilotinib


In [34]:
withkd.shape

(4353, 12)

In [36]:
withkd.Drug.unique().shape

(13,)

In [39]:
withkd.groupby('Drug').count()['Target_Name']

Drug
Axitinib         440
Bexarotene        15
Bosutinib        439
Crizotinib       439
Dasatinib        849
Dexamethasone      1
Docetaxel          1
Gefitinib        814
Hydroxyurea        1
Nilotinib        447
Vandetanib       897
Vismodegib         4
Vorinostat         6
Name: Target_Name, dtype: int64

In [9]:
# how many drugs in our drug-drug network have targets in cmap meta? 
druginfo2 = druginfo[lambda x: ~x.target.isna()]
drug_overlap = set(druginfo2.cmap_name.str.lower().unique()).intersection(set(drugs))
len(drug_overlap)

755

In [16]:
# how many of the targets overlap with genes in our network? 
gene_overlap = set(druginfo2.target.str.upper().map(all2symb).values).intersection(set(genes))
len(gene_overlap)

848

In [20]:
# if we filter to targets that are in our gene graph... how many drugs in our graph have protein targets?
druginfo3 = druginfo2.assign(target2 = lambda x: x.target.str.upper().map(all2symb))
druginfo3 = druginfo3[lambda x: x.target2.isin(genes)]
print('# dropped obs:', druginfo2.shape[0] - druginfo3.shape[0])
drug_overlap2 = set(druginfo3.cmap_name.str.lower().unique()).intersection(set(drugs))
len(drug_overlap2)

# dropped obs: 166


745

In [37]:
# what are the moa (mode of action) types we have? 
druginfo3.fillna('yourmom').groupby('moa').count()['target'].sort_values().tail(10)

moa
KIT inhibitor                        114
FLT3 inhibitor                       121
Acetylcholine receptor antagonist    138
Cyclooxygenase inhibitor             149
PDGFR inhibitor                      164
Adrenergic receptor antagonist       194
VEGFR inhibitor                      199
Dopamine receptor antagonist         216
Dopamine receptor agonist            216
Serotonin receptor antagonist        276
Name: target, dtype: int64

In [27]:
druginfo4 = druginfo3.assign(moa_type = lambda x: x.moa.str.lower().str.split(' '))
druginfo4.moa_type = [x[-1] for x in druginfo4.moa_type]
druginfo4.head()

Unnamed: 0,pert_id,cmap_name,target,moa,canonical_smiles,inchi_key,compound_aliases,target2,moa_type
602,BRD-A50311610,meclizine,NR1I3,CAR agonist,Cc1cccc(CN2CCN(CC2)C(c2ccccc2)c2ccc(Cl)cc2)c1,OCJYIGYOJCODJL-UHFFFAOYSA-N,meclozine,NR1I3,agonist
603,BRD-K30743633,TCPOBOP,NR1I3,CAR agonist,Clc1cnc(Oc2ccc(Oc3ncc(Cl)cc3Cl)cc2)c(Cl)c1,BAFKRPOFIYPKBQ-UHFFFAOYSA-N,tcpobop,NR1I3,agonist
604,BRD-K39381259,DMH1,ACVR1,ALK inhibitor,CC(C)Oc1ccc(cc1)-c1cnc2c(cnn2c1)-c1ccnc2ccccc12,JMIFGARJSWXZSH-UHFFFAOYSA-N,DMH-1,ACVR1,inhibitor
605,BRD-K43002773,GDC-0068,AKT3,Akt inhibitor,C[C@@H]1C[C@H](C2=C1C(=NC=N2)N3CCN(CC3)C(=O)[C...,GRZXWCHAXNAUHY-NSISKUIASA-N,ipatasertib,AKT3,inhibitor
606,BRD-K43002773,GDC-0068,AKT1,Akt inhibitor,C[C@@H]1C[C@H](C2=C1C(=NC=N2)N3CCN(CC3)C(=O)[C...,GRZXWCHAXNAUHY-NSISKUIASA-N,ipatasertib,AKT1,inhibitor


In [28]:
druginfo4.groupby('moa_type').count()['target'].sort_values()

moa_type
intercalator            1
cells                   1
sympatholytic           1
antiviral               1
antifibrinolytic        1
compounds               1
potentiator             1
corrector               1
stabilizer              1
activity                1
steroid                 1
antiarrhythmic          2
anesthetic              2
anti-hcve2              3
progestin               3
scavenger               3
antifolate              3
antihistamine           3
drug                    3
acid                    3
inducer                 4
effects                 4
vasodilator             6
antioxidant             6
b                       7
sulfonylurea            8
hormone                 8
neurotransmitter        9
diuretic                9
precursor              10
analog                 11
immunosuppressant      13
antidepressant         14
sensitizer             14
enhancer               14
secretagogue           18
antiepileptic          26
ligand                 35
sti

In [31]:
# what drugs have the most targets? 
druginfo4.groupby('cmap_name').count()['target'].sort_values().tail(25)

cmap_name
LY-294002         28
zonisamide        28
asenapine         28
ponatinib         30
BMS-777607        30
cabergoline       30
pentobarbital     32
ellagic-acid      35
yohimbine         35
amitriptyline     36
clozapine         40
orantinib         42
ginkgolide-b      42
ursolic-acid      44
serotonin         48
dovitinib         50
tozasertib        56
dasatinib         60
guggulsterone     60
topiramate        66
bromocriptine     70
sunitinib         80
regorafenib       96
etomidate         96
sorafenib        132
Name: target, dtype: int64

In [34]:
# what targets are shared by the most drugs? 
druginfo4.groupby('target').count()['cmap_name'].sort_values().tail(25)

target
EGFR       54
MTOR       55
FGFR1      57
ADRB2      57
GABRA1     58
AR         61
FLT4       63
CHRM1      64
HTR2C      65
SLC6A2     69
PGR        73
SLC6A4     74
FLT1       75
ESR1       76
PDGFRB     77
KIT        83
FLT3       84
HTR1A      86
HRH1       88
PTGS1      95
PTGS2      99
KDR       105
HTR2A     112
NR3C1     112
DRD2      118
Name: cmap_name, dtype: int64