In [1]:
import pandas as pd
import numpy as np
import os, sys, math

# 

# Read Drug Target Data

## https://drugcentral.org/download

In [8]:
drugcentral_df = pd.read_csv('../data/drug.target.interaction.tsv', sep = '\t')
drugcentral_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,,,,Tclin,Homo sapiens
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.79,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens


# 

# Read FDA Approved

In [10]:
fda_df = pd.read_csv('../data/FDA_Approved.csv', header = None)
fda_df.columns = ['DRUG_NUMBER', 'DRUG_NAME']
fda_df.head()

Unnamed: 0,DRUG_NUMBER,DRUG_NAME
0,2104,perflutren
1,1834,monobenzone
2,2684,tobramycin
3,3051,butamben
4,3103,citrulline


# 

# Read EMA Approved

In [12]:
ema_df = pd.read_csv('../data/EMA_Approved.csv', header = None)
ema_df.columns = ['DRUG_NUMBER', 'DRUG_NAME']
ema_df.head()

Unnamed: 0,DRUG_NUMBER,DRUG_NAME
0,5405,belantamab mafodotin
1,5376,remdesivir
2,5415,lumasiran
3,5416,setmelanotide
4,5417,bulevirtide


# 

# Read PMDA Approved

In [14]:
pmda_df = pd.read_csv('../data/PMDA_Approved.csv', header = None)
pmda_df.columns = ['DRUG_NUMBER', 'DRUG_NAME']
pmda_df.head()

Unnamed: 0,DRUG_NUMBER,DRUG_NAME
0,5392,capmatinib
1,5400,remimazolam
2,5408,viltolarsen
3,5419,filgotinib
4,5420,vadadustat


# 

# Read Cancer Data

## https://cancer.sanger.ac.uk/census

In [16]:
gene_df = pd.read_csv('../data/Cosmic_CancerGeneCensus_v99_GRCh38.tsv', sep = '\t')
gene_df.head()

Unnamed: 0,GENE_SYMBOL,NAME,COSMIC_GENE_ID,CHROMOSOME,GENOME_START,GENOME_STOP,CHR_BAND,SOMATIC,GERMLINE,TUMOUR_TYPES_SOMATIC,...,CANCER_SYNDROME,TISSUE_TYPE,MOLECULAR_GENETICS,ROLE_IN_CANCER,MUTATION_TYPES,TRANSLOCATION_PARTNER,OTHER_GERMLINE_MUT,OTHER_SYNDROME,TIER,SYNONYMS
0,A1CF,APOBEC1 complementation factor,COSG68236,10,50799409.0,50885675.0,10q11.23,y,n,melanoma,...,,E,,oncogene,Mis,,n,,2,"A1CF,ENSG00000148584.14,29974,ACF,ACF64,ACF65,..."
1,ABI1,abl interactor 1,COSG100962,10,26746593.0,26861087.0,10p12.1,y,n,AML,...,,L,Dom,"TSG, fusion",T,KMT2A,n,,1,"ABI1,ENSG00000136754.17,Q8IZP0,10006,ABI-1,E3B1"
2,ABL1,"ABL proto-oncogene 1, non-receptor tyrosine ki...",COSG106650,9,130713946.0,130887675.0,9q34.12,y,n,"CML, ALL, T-ALL",...,,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",n,,1,"ABL1,ENSG00000097007.17,P00519,25,JTK7,c-ABL,p150"
3,ABL2,"ABL proto-oncogene 2, non-receptor tyrosine ki...",COSG93778,1,179099327.0,179229684.0,1q25.2,y,n,AML,...,,L,Dom,"oncogene, fusion",T,ETV6,n,,1,"ABL2,ENSG00000143322.19,P42684,27,ARG"
4,ACKR3,atypical chemokine receptor 3,COSG97311,2,236567787.0,236582358.0,2q37.3,y,n,lipoma,...,,M,Dom,"oncogene, fusion",T,HMGA2,n,,1,"ACKR3,ENSG00000144476.5,P25106,57007,GPR159,RDC1"


# 

# Append Approval Status to Drug Interaction Data

In [20]:
fda_ids = list(fda_df['DRUG_NAME'].unique())
ema_ids = list(ema_df['DRUG_NAME'].unique())
pmda_ids = list(pmda_df['DRUG_NAME'].unique())

In [22]:
len(fda_ids), len(ema_ids), len(pmda_ids)

(2331, 456, 435)

In [24]:
drugcentral_df['FDA_Approved'] = np.where(drugcentral_df['DRUG_NAME'].isin(fda_ids), 'Yes', 'No')
drugcentral_df['FDA_Approved'].value_counts()

FDA_Approved
Yes    15315
No      4063
Name: count, dtype: int64

In [26]:
drugcentral_df['EMA_Approved'] = np.where(drugcentral_df['DRUG_NAME'].isin(ema_ids), 'Yes', 'No')
drugcentral_df['EMA_Approved'].value_counts()

EMA_Approved
No     16228
Yes     3150
Name: count, dtype: int64

In [28]:
drugcentral_df['PMDA_Approved'] = np.where(drugcentral_df['DRUG_NAME'].isin(pmda_ids), 'Yes', 'No')
drugcentral_df['PMDA_Approved'].value_counts()

PMDA_Approved
No     17035
Yes     2343
Name: count, dtype: int64

In [30]:
drugcentral_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,FDA_Approved,EMA_Approved,PMDA_Approved
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,...,,,,,,Tclin,Homo sapiens,Yes,No,No
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.79,,IC50,...,,,,,,Tclin,Homo sapiens,Yes,No,No
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens,Yes,No,No
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,...,,,,,,Tclin,Homo sapiens,Yes,No,No
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,...,,,,,,Tclin,Homo sapiens,Yes,No,No


# 

# Parse Annotations for Cancer Types

In [34]:
def find_keyword(df, keys):

    key_labels = []

    type_somatic = df['TUMOUR_TYPES_SOMATIC']

    for i in range(len(type_somatic)):
        
        value = type_somatic[i]
        
        if pd.notna(value):
            if ',' in value:
                vals = type_somatic[i].split(',')
            else:
                vals = [value]

            key_found = False
                
            for j in range(len(keys)):
                if keys[j] in vals:
                    key_found = True

            key_labels.append(key_found)
            
        else:
            key_labels.append(False)

    return key_labels

In [36]:
colon_labels = find_keyword(gene_df, ['colon', 'colorectal', 'colorectal cancer', 'colon cancer', 'colon carcinoma'])
gene_df['Colon'] = colon_labels

In [38]:
nsclc_labels = find_keyword(gene_df, ['NSCLC'])
gene_df['NSCLC'] = nsclc_labels

In [40]:
bladder_labels = find_keyword(gene_df, ['bladder', 'bladder carcinoma'])
gene_df['Bladder'] = bladder_labels

In [42]:
ovarian_labels = find_keyword(gene_df, ['ovarian', 'ovarian cancer', 'clear cell ovarian carcinoma'])
gene_df['Ovarian'] = ovarian_labels

In [44]:
bladder_df = gene_df[gene_df['Bladder'] == True]
ovarian_df = gene_df[gene_df['Ovarian'] == True]
nsclc_df = gene_df[gene_df['NSCLC'] == True]
colon_df = gene_df[gene_df['Colon'] == True]

In [46]:
bladder_ids = list(bladder_df['GENE_SYMBOL'].unique())
colon_ids = list(colon_df['GENE_SYMBOL'].unique())
nsclc_ids = list(nsclc_df['GENE_SYMBOL'].unique())
ovarian_ids = list(ovarian_df['GENE_SYMBOL'].unique())

In [48]:
len(bladder_ids), len(colon_ids), len(nsclc_ids), len(ovarian_ids)

(2, 29, 17, 6)

In [50]:
drugcentral_df['Bladder'] = np.where(drugcentral_df['GENE'].isin(bladder_ids), 'Yes', 'No')
drugcentral_df['Colon'] = np.where(drugcentral_df['GENE'].isin(colon_ids), 'Yes', 'No')
drugcentral_df['NSCLC'] = np.where(drugcentral_df['GENE'].isin(nsclc_ids), 'Yes', 'No')
drugcentral_df['Ovarian'] = np.where(drugcentral_df['GENE'].isin(ovarian_ids), 'Yes', 'No')

In [52]:
drugcentral_df['Bladder'].value_counts()

Bladder
No     19362
Yes       16
Name: count, dtype: int64

In [54]:
drugcentral_df['Colon'].value_counts()

Colon
No     19308
Yes       70
Name: count, dtype: int64

In [56]:
drugcentral_df['NSCLC'].value_counts()

NSCLC
No     19312
Yes       66
Name: count, dtype: int64

In [58]:
drugcentral_df['Ovarian'].value_counts()

Ovarian
No     19374
Yes        4
Name: count, dtype: int64

In [60]:
drugcentral_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACTION_TYPE,TDL,ORGANISM,FDA_Approved,EMA_Approved,PMDA_Approved,Bladder,Colon,NSCLC,Ovarian
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,...,,Tclin,Homo sapiens,Yes,No,No,No,No,No,No
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.79,,IC50,...,,Tclin,Homo sapiens,Yes,No,No,No,No,No,No
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,BLOCKER,Tclin,Homo sapiens,Yes,No,No,No,No,No,No
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,...,,Tclin,Homo sapiens,Yes,No,No,No,No,No,No
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,...,,Tclin,Homo sapiens,Yes,No,No,No,No,No,No


# 

# Write Output File for Connecting Drug-Gene-Cancer Type Interactions

In [64]:
drugcentral_df.to_csv('../data/drug.target.interaction.fda.cosmic.cancer.type.tsv', sep = '\t', index = False)