# 7C_drug_gene_tables



Paul,
We have a good group of key IMA-associated faculty lined up for your presentation tomorrow. Look forward to a productive discussion on the therapeutic implications of your large-scale non-coding mutational analysis.


To facilitate discussion (and only if it is convenient for you), it would be helpful if you could organize your presentation along the following lines:
- Background
- Key findings (esp. the identification of ~1500 druggable targets across ~50 diseases)
- Implications of your findings for drug repurposing
- Implications of your findings for drug discovery

In particular, I suggest separate discussions on 3 and 4 above, because the f/u will be different in the two cases (both relevant to the IMA).
Thank you,
Chaitan


want list of gene names with 
- gene name
- gene evidence (MPRA eQTL,  coding, rare/mendelian)
- directionality of linked SNP (loss or gain)
- class of gene (i.e. GPCR)
- tissue distribution

In [2]:
import pandas as pd
import os, glob

# get drug data

In [3]:
data_df =  pd.read_csv('figure_tables/mpra_res_df_UPDATED_ABC.csv',index_col=0).fillna('')

data_df.columns

Index(['rowname', 'Linked_SNP', 'Chr', 'Position', 'Ref/Alt', 'Index_SNP',
       'ConsScore', 'ConsDetail', 'diseases', 'num_dz', 'gwas_pval', 'source',
       'gene_annon', 'ngene_haploreg', 'egene_alltissue_gtex',
       'egene_neurotissue_gtex', 'egene_neurotissue_mapgene_gtex',
       'egene_neurotissue_maptissue_gtex', 'atac_tissues', 'bool_in_atac_pk',
       'hichip_tissues', 'bool_in_hichip_pk', 'broken_motifs', 'gained_motifs',
       'drug_target_gene', 'drug_target_pertname', 'drug_target_moa',
       'drug_target_clinical_phase', 'drug_target_indication',
       'drug_cmap_gene_up', 'drug_cmap_pertname_up', 'drug_cmap_gene_dn',
       'drug_cmap_pertname_dn', 'tissue_asatac', 'bool_is_asatac',
       'tissue_ashichip', 'bool_is_ashichip', 'mpra_tissue', 'mpra_pval_str',
       'mpra_pval_mean', 'mpra_logfc_str', 'mpra_logfc_mean', 'bool_mpra_sig',
       'cgene_egene_agree', 'cgene_egene_agree_num', 'bool_cgene_egene_agree',
       'ukbb_drug', 'ukbb_phenotype', 'ukbb_phen

genes

In [38]:
# df = pd.DataFrame({'temp_c': [17.0, 25.0]},
#                   index=['Portland', 'Berkeley'])
# df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)

def explore_genelist_disease(column, df = data_df):
    df_sel = (df[['diseases',column, 'mpra_logfc_mean']]#,'drug_target_pertname']]
     .assign(diseases= lambda x: x.diseases.str.split('|'))
     .explode('diseases')
     .assign(gene= lambda x: x[column].apply(lambda x: sorted(set(x.split('|')))
                                                                        ))
     .explode('gene')
     .query('gene!=""')
     .drop(column, axis=1)
     .groupby('gene')
        .agg({'diseases': lambda arr: '|'.join(sorted(set(arr))),
              'mpra_logfc_mean': 'mean'})
     .reset_index()
    )
    return df_sel

In [42]:
# data_df_sel = (data_df[['diseases','drug_target_gene']]#,'drug_target_pertname']]
#  .assign(diseases= lambda x: x.diseases.str.split('|'))
#  .explode('diseases')
#  .assign(drug_target_gene= lambda x: x.drug_target_gene.apply(lambda x: sorted(set(x.split('|')))
#                                                                     ))
#  .explode('drug_target_gene')
#  .query('drug_target_gene!=""')
# #  .groupby
# )
# data_df_sel
target_info = explore_genelist_disease('drug_target_gene')
target_info['type'] = 'direct_target'
display(target_info)
cmap_up = explore_genelist_disease('drug_cmap_gene_up')
cmap_up['type'] = 'cmap_up'
print(len(cmap_up))
cmap_dn = explore_genelist_disease('drug_cmap_gene_dn')
cmap_dn['type'] = 'cmap_dn'
print(len(cmap_dn))


Unnamed: 0,gene,diseases,mpra_logfc_mean,type
0,ADH1C,MDD,-0.794781,direct_target
1,CHRNA2,BPD|SCZ,0.665546,direct_target
2,CYP2D6,BPD|MDD|SCZ,-0.383639,direct_target
3,DHODH,MDD,-1.03855,direct_target
4,ITPR3,SCZ,0.608879,direct_target
5,NFKB1,SCZ,-0.927891,direct_target
6,SV2A,BPD|SCZ,-0.555829,direct_target


236
135


In [45]:
all_drug_info = (pd.concat([target_info, cmap_up, cmap_dn])
                 .groupby(['gene','diseases','mpra_logfc_mean'])
                 .agg({'type':'|'.join})
                 .reset_index()
                 .sort_values('gene')
                )
print(all_drug_info.shape, all_drug_info.gene.unique().shape)
all_drug_info

(313, 4) (313,)


Unnamed: 0,gene,diseases,mpra_logfc_mean,type
0,ABCB9,BPD|SCZ,0.297346,cmap_up
1,ABCC8,BPD|SCZ,-0.933245,cmap_up|cmap_dn
2,ABT1,MDD|SCZ,-2.045156,cmap_up
3,ACO2,MDD,0.616614,cmap_dn
4,ADAMTS2,MDD,0.837381,cmap_up
...,...,...,...,...
308,ZNF276,SCZ,1.101667,cmap_up
309,ZNF391,BPD|MDD,0.961567,cmap_up
310,ZNF592,SCZ,-0.855839,cmap_up
311,ZSCAN31,BPD|MDD|SCZ,0.278635,cmap_up


## annotate with other drug information

In [28]:
# omim genes
omim_genes="""
ACO2
AKT1
ANK3
AP3B2
ARC
ARL3
ATXN7
BANF1
CDK10
CEP85L
CHRNA2
CLP1
CNNM2
CNOT1
DCC
DDHD2
EP300
ERBB2
FGFR1
FGFR3
FIBP
GNAI2
HARS
HLA-A
HLA-DQB1
IRF3
MADD
MARS
MFN2
MTHFD1
MTHFR
MYRF
NEK1
NT5C2
PARK7
PGAP3
PIGQ
PRMT7
RERE
RNASEH2C
RSRC1
SLC30A9
SPTLC1
SUFU
SZT2
TKT
TMX2
TPRKB
VARS2
WASF1
WDR73
""".split()

In [29]:
len(omim_genes)

51

In [30]:
coding_genes = """
C4A
CACNA1G
DAGLA
MAGI2
STAG1
SV2A
XPO7
""".split()

In [31]:
len(coding_genes)

7

In [47]:
all_drug_info['in_omim'] = all_drug_info.gene.isin(omim_genes)
all_drug_info['is_coding'] = all_drug_info.gene.isin(coding_genes)
print(all_drug_info.in_omim.value_counts())
print(all_drug_info.is_coding.value_counts())
print(all_drug_info.type.value_counts())


False    272
True      41
Name: in_omim, dtype: int64
False    312
True       1
Name: is_coding, dtype: int64
cmap_up                  171
cmap_dn                   76
cmap_up|cmap_dn           59
direct_target|cmap_up      6
direct_target              1
Name: type, dtype: int64


In [67]:
all_drug_info_filt = all_drug_info[( all_drug_info.type.str.contains('direct'))|all_drug_info.in_omim|all_drug_info.is_coding]
print(all_drug_info_filt.shape)
all_drug_info_filt

(47, 6)


Unnamed: 0,gene,diseases,mpra_logfc_mean,type,in_omim,is_coding
3,ACO2,MDD,0.616614,cmap_dn,True,False
5,ADH1C,MDD,-0.794781,direct_target|cmap_up,False,False
8,AKT1,SCZ,-0.266337,cmap_up|cmap_dn,True,False
10,AP3B2,PTSD,-0.951833,cmap_up,True,False
12,ARC,SCZ,-0.942587,cmap_up,True,False
14,ARL3,ADHD|BPD|SCZ,-0.103592,cmap_up|cmap_dn,True,False
20,ATXN7,SCZ,-0.488048,cmap_up|cmap_dn,True,False
24,BANF1,SCZ,0.687733,cmap_dn,True,False
37,CDK10,OCD,1.019088,cmap_up,True,False
43,CHRNA2,BPD|SCZ,0.665546,direct_target|cmap_up,True,False


# add in gene class

In [68]:
drug_df = pd.read_excel('/Users/mguo123/Google Drive/1_khavari/perturb_tf_project/pharos_targets_filtered2.xlsx',sheet_name='targets')


print(drug_df.shape)
drug_df[:5]


(3035, 15)


Unnamed: 0,id,gene,name,description,idgFamily,idgTDL,jensenScore,novelty,knowledgeAvailability,pubTatorScore,entrez_id,drug_list,interaction_list,source_list,raw_info
0,5,HTR1E,5-hydroxytryptamine receptor 1E,G-protein coupled receptor for 5-hydroxytrypta...,GPCR,Tchem,35.946685,-1.568858,24.464814,1.495159,3354,"['OLANZAPINE', '1-NAPHTHYLPIPERAZINE', 'NARATR...","[nan, 'antagonist', 'agonist', 'agonist', 'ago...","['TdgClinicalTrial', 'GuideToPharmacologyInter...","[{'gene_name': 'HTR1E', 'gene_claim_name': 'P2..."
1,9,EMP2,Epithelial membrane protein 2,Functions as a key regulator of cell membrane ...,Non-IDG,Tbio,210.602988,-2.308556,37.256058,1.59981,2013,['HORMONES'],[nan],['NCI'],"[{'gene_name': 'EMP2', 'gene_claim_name': 'EMP..."
2,11,DUSP1,Dual specificity protein phosphatase 1,Dual specificity phosphatase that dephosphoryl...,Enzyme,Tchem,810.925362,-2.882599,47.969158,2.515725,1843,"['HYDROXYUREA', 'VASOPRESSIN', 'ANTISENSE OLIG...","[nan, nan, nan]","['NCI', 'NCI', 'NCI']","[{'gene_name': 'DUSP1', 'gene_claim_name': 'DU..."
3,15,IL9,Interleukin-9,Supports IL-2 independent and IL-4 independent...,Non-IDG,Tbio,334.36139,-2.50811,20.035321,2.940012,3578,"['MEDI-528', 'ENOKIZUMAB', 'MEDI-528']","['inhibitor', 'inhibitor', nan]","['TTD', 'ChemblInteractions', 'TdgClinicalTrial']","[{'gene_name': 'IL9', 'gene_claim_name': 'TTDC..."
4,22,SELE,E-selectin,Cell-surface glycoprotein having a role in imm...,Non-IDG,Tchem,2672.113112,-3.449258,37.448817,3.350202,6401,"['BIMOSIAMOSE', 'BIMOSIAMOSE', 'BIMOSIAMOSE', ...","[nan, 'inhibitor', 'inhibitor', 'antagonist', ...","['TdgClinicalTrial', 'TTD', 'ChemblInteraction...","[{'gene_name': 'SELE', 'gene_claim_name': 'P16..."


In [69]:
all_drug_info_filt = all_drug_info_filt.merge(drug_df[['gene','name','description','idgFamily','idgTDL']], how='left',on='gene')
all_drug_info_filt

Unnamed: 0,gene,diseases,mpra_logfc_mean,type,in_omim,is_coding,name,description,idgFamily,idgTDL
0,ACO2,MDD,0.616614,cmap_dn,True,False,,,,
1,ADH1C,MDD,-0.794781,direct_target|cmap_up,False,False,Alcohol dehydrogenase 1C,,Enzyme,Tclin
2,AKT1,SCZ,-0.266337,cmap_up|cmap_dn,True,False,RAC-alpha serine/threonine-protein kinase,AKT1-specific substrates have been recently id...,Kinase,Tchem
3,AP3B2,PTSD,-0.951833,cmap_up,True,False,,,,
4,ARC,SCZ,-0.942587,cmap_up,True,False,,,,
...,...,...,...,...,...,...,...,...,...,...
62,SV2A,BPD|SCZ,-0.555829,direct_target|cmap_up,False,True,Synaptic vesicle glycoprotein 2A,Plays a role in the control of regulated secre...,Non-IDG,Tclin
63,SZT2,SCZ,-0.792680,cmap_up,True,False,,,,
64,TKT,BPD|SCZ,-0.837160,cmap_dn,True,False,,,,
65,TMX2,MDD|SCZ,-2.920013,cmap_dn,True,False,,,,


In [93]:
all_drug_info.to_csv('figure_tables/all_drug_info.csv')
all_drug_info_filt.to_csv('figure_tables/all_drug_info_filt.csv')

# GPCRs

manual add into spreadsheet


In [72]:
gpcr_df = pd.read_csv('/Users/mguo123/Google Drive/0_altman/db/gpcr/GPCRTargets.csv')
print(gpcr_df.shape)
print(gpcr_df.columns)


(415, 33)
Index(['Type', 'Family id', 'Family name', 'Target id', 'Target name',
       'Subunits', 'Target systematic name', 'Target abbreviated name',
       'synonyms', 'HGNC id', 'HGNC symbol', 'HGNC name',
       'Human genetic localisation', 'Human nucleotide RefSeq',
       'Human protein RefSeq', 'Human SwissProt', 'Human Entrez Gene',
       'RGD id', 'RGD symbol', 'RGD name', 'Rat genetic localisation',
       'Rat nucleotide RefSeq', 'Rat protein RefSeq', 'Rat SwissProt',
       'Rat Entrez Gene', 'MGI id', 'MGI symbol', 'MGI name',
       'Mouse genetic localisation', 'Mouse nucleotide RefSeq',
       'Mouse protein RefSeq', 'Mouse SwissProt', 'Mouse Entrez Gene'],
      dtype='object')


In [85]:
gpcr_genes = gpcr_df['HGNC symbol'].sort_values().values
print(gpcr_genes)

['ACKR1' 'ACKR2' 'ACKR3' 'ACKR4' 'ADCYAP1R1' 'ADGRA1' 'ADGRA2' 'ADGRA3'
 'ADGRB1' 'ADGRB2' 'ADGRB3' 'ADGRD1' 'ADGRD2' 'ADGRE1' 'ADGRE2' 'ADGRE3'
 'ADGRE4P' 'ADGRE5' 'ADGRF1' 'ADGRF2' 'ADGRF3' 'ADGRF4' 'ADGRF5' 'ADGRG1'
 'ADGRG2' 'ADGRG3' 'ADGRG4' 'ADGRG5' 'ADGRG6' 'ADGRG7' 'ADGRL1' 'ADGRL2'
 'ADGRL3' 'ADGRL4' 'ADGRV1' 'ADORA1' 'ADORA2A' 'ADORA2B' 'ADORA3' 'ADRA1A'
 'ADRA1B' 'ADRA1D' 'ADRA2A' 'ADRA2B' 'ADRA2C' 'ADRB1' 'ADRB2' 'ADRB3'
 'AGTR1' 'AGTR2' 'APLNR' 'AVPR1A' 'AVPR1B' 'AVPR2' 'BDKRB1' 'BDKRB2'
 'BRS3' 'BRS3' 'C3AR1' 'C5AR1' 'C5AR2' 'CALCR' 'CALCRL' 'CASR' 'CCKAR'
 'CCKBR' 'CCR1' 'CCR10' 'CCR2' 'CCR3' 'CCR4' 'CCR5' 'CCR6' 'CCR7' 'CCR8'
 'CCR9' 'CCRL2' 'CELSR1' 'CELSR2' 'CELSR3' 'CHRM1' 'CHRM2' 'CHRM3' 'CHRM4'
 'CHRM5' 'CMKLR1' 'CNR1' 'CNR2' 'CRHR1' 'CRHR2' 'CX3CR1' 'CXCR1' 'CXCR2'
 'CXCR3' 'CXCR4' 'CXCR5' 'CXCR6' 'CYSLTR1' 'CYSLTR2' 'DRD1' 'DRD2' 'DRD3'
 'DRD4' 'DRD5' 'EDNRA' 'EDNRB' 'F2R' 'F2RL1' 'F2RL2' 'F2RL3' 'FFAR1'
 'FFAR2' 'FFAR3' 'FFAR4' 'FPR1' 'FPR2' 'FPR2' 'FPR3' 'FSHR'

In [76]:
all_drug_info_filt[all_drug_info_filt.gene.isin(gpcr_df['HGNC symbol'].values)]

Unnamed: 0,gene,diseases,mpra_logfc_mean,type,in_omim,is_coding,name,description,idgFamily,idgTDL


In [88]:
all_genes = pd.read_csv('D_mpraanalyze_barcode_allelic/egene_gtex_psychencode/all.txt',header=None).iloc[:,0].values
for g in gpcr_genes:
    if g in all_genes:
        print(g)

LPAR2
MCHR1


In [90]:
data_df[data_df.egenes_all_sources.str.contains('LPAR2')].iloc[0,:]

rowname                                                                chr19_19447470
Linked_SNP                                                                  rs1989867
Chr                                                                             chr19
Position                                                                     19447470
Ref/Alt                                                                           A/G
Index_SNP                                                         rs1064395,rs2011503
ConsScore                                                                           2
ConsDetail                                                                     intron
diseases                                                                          BPD
num_dz                                                                              1
gwas_pval                                                                       2e-09
source                                                

MCHR1 is a Melanin-concentrating hormone receptor 1, but the new bipolar paper says it's targetted by haloperidol



In [92]:
data_df[data_df.egenes_all_sources.str.contains('MCHR1')].iloc[0,:]

rowname                                                          chr22_41024854
Linked_SNP                                                           rs73169089
Chr                                                                       chr22
Position                                                               41024854
Ref/Alt                                                                     T/C
Index_SNP                                                  rs17002034,rs6001982
ConsScore                                                                     4
ConsDetail                                                           regulatory
diseases                                                                BPD|SCZ
num_dz                                                                        2
gwas_pval                                                                 1e-34
source                                                                     gwas
gene_annon                              

# intersection all gene list with pharos drug targets

In [118]:
druggable_genes = drug_df[drug_df.gene.isin(all_genes)].iloc[:,1:6].drop_duplicates()
druggable_genes_filt = druggable_genes[~druggable_genes.gene.isin(all_drug_info_filt.gene)]#.gene.sort_values().values
print(druggable_genes_filt.columns)
for idx,row in druggable_genes_filt.iterrows():
    print('----------')
    print(row['gene'], row['idgFamily'],row['idgTDL'])
    print(row['description'])

Index(['gene', 'name', 'description', 'idgFamily', 'idgTDL'], dtype='object')
----------
CACNA2D2 Ion Channel Tclin
The alpha-2/delta subunit of voltage-dependent calcium channels regulates calcium current density and activation/inactivation kinetics of the calcium channel. Acts as a regulatory subunit for P/Q-type calcium channel (CACNA1A), N-type (CACNA1B), L-type (CACNA1C OR CACNA1D) and possibly T-type (CACNA1G). Overexpression induces apoptosis.
----------
TNFRSF13C Non-IDG Tbio
B-cell receptor specific for TNFSF13B/TALL1/BAFF/BLyS. Promotes the survival of mature B-cells and the B-cell response.
----------
PSMC3 Enzyme Tbio
Component of the 26S proteasome, a multiprotein complex involved in the ATP-dependent degradation of ubiquitinated proteins. This complex plays a key role in the maintenance of protein homeostasis by removing misfolded or damaged proteins, which could impair cellular functions, and by removing proteins whose functions are no longer required. Therefore, the pro

from reading above, added: 
- CACNA2D2
- CLCN3
- CLCN6
- PTK2B

b/c they have generally neural functions (ion channels are especially emphasied) and/or protein atlas states some brain-specific tissue specificity




In [124]:
# data_df[data_df.egenes_all_sources.str.contains('CACNA2D2')].iloc[0,:]
# data_df[data_df.egenes_all_sources.str.contains('CLCN3')].iloc[0,:]


Unnamed: 0,rowname,Linked_SNP,Chr,Position,Ref/Alt,Index_SNP,ConsScore,ConsDetail,diseases,num_dz,...,cgene_egene_agree_num,bool_cgene_egene_agree,ukbb_drug,ukbb_phenotype,ukbb_phenotype_beta,SNP_id,psychencode_egenes,egenes_all_sources,abc_genes,abc_gene_tissues
632,chr8_27317337,rs2280375,chr8,27317337,A/G,,1,downstream,SCZ,1,...,0.0,False,,,,8:27317337,,PTK2B|TRIM35,,
634,chr8_27318391,rs2292974,chr8,27318391,C/T,,1,downstream,SCZ,1,...,0.0,False,,depression history|worry,0.020|-0.009,8:27318391,,CHRNA2|PTK2B,,
635,chr8_27324844,rs2565061,chr8,27324844,G/A,,5,synonymous,SCZ,1,...,0.0,False,,,,8:27324844,,PTK2B|TRIM35,,


In [123]:
data_df[data_df.egenes_all_sources.str.contains('CLCN6')].mpra_logfc_mean.mean()#.iloc[0,:]


0.2546555795690967

In [125]:
data_df[data_df.egenes_all_sources.str.contains('PTK2B')].mpra_logfc_mean.mean()#.iloc[0,:]


0.3227474124252467

In [126]:
len(all_genes)

619

# which genes have previously been druged

In [129]:
pd.read_csv('/Users/mguo123/Google Drive/0_altman/db/cmap/repurposing_drugs_20200324.txt',header=9, sep='\t')

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,
