In [36]:
import pandas as pd
import requests
import json
from tqdm import tqdm
# tqdm.pandas()

### Functions

In [79]:
# Normalize a given term and return match types/concept ids
def normalize(term):
    url = f'https://normalize.cancervariants.org/therapy/normalize?q={term}&infer_namespace=true' 

    r = requests.get(url)

    if r.status_code == 200:
        data = r.json()
        try:
            return([term,data['match_type'],data['therapy_descriptor']['therapy_id']])
        except:
            return([term,None, None])

    else:
        return([term,None, None])



In [80]:
# Check a dataframe cell for a normalization success/failure
def is_normalized(entry):
    if entry == None:
        return('Failure')
    else:
        return('Success')

### Data

In [61]:
# Repurposing Hub
# https://clue.io/repurposing

df = pd.read_csv('data/repurposing_drugs_20200324.txt',sep='\t',header=9)
df

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,


In [93]:
# ReDO Trials 
# https://www.anticancerfund.org/en/redo-trials-db

df2 = pd.read_csv('data/ReDO_Trials_DB.txt',sep='\t')
df2

Unnamed: 0,NCT Number,Title,Acronym,Status,Conditions,Interventions,Outcome Measures,Sponsors,Gender,Age,...,Multi-Arm,Pediatric,Country_PI,Cancer_Group,Cancer_Type,Drug_INN,Primary-EP,Phase,DrugBank,Removed
0,NCT03047837,"A Randomized, 2x2 Factorial Design Biomarker P...",ASAMET,Recruiting,Tertiary Prevention in Colon Cancer,Drug: Aspirin (ASA) + Metformin (MET); Drug: A...,"NF B; pS6K, p53, beta-catenin, PI3K; IL-6, CRP...",Ente Ospedaliero Ospedali Galliera,All,"18 Years to 80 Years (Adult, Older Adult)",...,N,N,Italy,GI,Colon Cancer; Rectal Cancer,Acetylsalicylic Acid; Metformin,Biomarker,Phase 2,DB00210; DB06800,N
1,NCT02969681,Vitamin C Intravenously With Chemotherapy in A...,Vitality,Recruiting,Colorectal Neoplasms,Drug: ascorbic acid; Drug: Chemotherapy,Progression Free Survival; Overall Survival; R...,Sun Yat-sen University,All,"18 Years to 75 Years (Adult, Older Adult)",...,N,N,China,GI,Colon Cancer,Ascorbic acid,PFS,Phase 3,DB00335,N
2,NCT02497638,LIpitor and biGuanide to Androgen Delay Trial,LIGAND,Not yet recruiting,Prostate Cancer,Drug: Metformin; Drug: Atorvastatin; Drug: Pla...,Time to disease progression (defined as PSA ri...,"University Health Network, Toronto",Male,"18 Years to 80 Years (Adult, Older Adult)",...,N,N,Canada,Urological,Prostate Cancer,Atorvastatin; Metformin,PFS,Phase 2,DB01117; DB06800,N
3,NCT03275376,Statin Combination Therapy in Patients Receivi...,,Recruiting,Overall Survival|Tumor Responses,Drug: Atorvastatin 10mg; Drug: Placebo Oral Ta...,Overall survival; Best tumor response; Progres...,Taichung Veterans General Hospital,All,"40 Years and older (Adult, Older Adult)",...,N,N,Taiwan,GI,Liver Cancer,Atorvastatin,OS,Phase 2,DB01117,N
4,NCT03980249,Anti-Cancer Effects of Carvedilol With Standar...,,Not yet recruiting,Glioblastoma|Glioblastoma Multiforme,Drug: Carvedilol,Survival curve of overall survival; Survival c...,West Virginia University; NovoCure Ltd.; West ...,All,"18 Years and older (Adult, Older Adult)",...,N,N,United States,CNS,Glioblastoma,Carvedilol,OS,Phase 1,DB00567,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,ChiCTR2200061789,Fasudil hydrochloride in the treatment of gene...,,Pending,Ovary cancer,first group:Fasudil Hydrochloride Injection;se...,Number of treatment courses received;Overall s...,Tianjin Medical University General Hospital,Female,-,...,N,N,China,Gynaecological,Ovarian Epithelial Cancer,Fasudil,PFS; OS,Not available/Missing,DB00800,N
903,ChiCTR2200066615,The effect of intraoperative intravenous lidoc...,,Pending,Lung cancer,Lidocaine group:Intraoperative intravenous lid...,Overall survival;Disease free survival;,"Zhongshan Hospital, Fudan University",All,-,...,N,N,China,Lung,Any lung cancers,Lidocaine,OS; DFS/RFS/EFS,Not available/Missing,Not found in DrugBank,N
904,ChiCTR2200065595,Exploratory Clinical Study on Mifepristone for...,,Pending,Recurrent Glioblastoma,treatment group:Administration of Mifepristone;,complete blood count;liver function test;kidne...,"Huashan Hospital, Fudan University",All,18 -,...,N,N,China,CNS,Glioblastoma,Mifepristone,Biomarker,Not available/Missing,DB00370,N
905,ChiCTR2300068631,Oral aminophylline combined with sunitinib mal...,,Recruiting,locally advanced unresectable renal clear cell...,Test group:Aminophylline combined with sunitin...,Progression-free survival;,Shanghai Sixth People's Hospital Affiliated to...,All,18 - 70,...,N,N,China,Urological,Renal Cell Carcinoma,Aminophylline,PFS,Not available/Missing,DB01118,N


In [103]:
# repoDB (Drug Repositioning Database)
# http://apps.chiragjpgroup.org/repoDB/

df3 = pd.read_csv('data/full-repodb.csv',sep=',',header=0)
df3

Unnamed: 0,drug_name,drug_id,ind_name,ind_id,NCT,status,phase,DetailedStatus
0,Lepirudin,DB00001,Heparin-induced thrombocytopenia with thrombosis,C0272275,,Approved,,
1,Cetuximab,DB00002,Squamous cell carcinoma of mouth,C0585362,,Approved,,
2,Cetuximab,DB00002,Squamous cell carcinoma of nose,C3163899,,Approved,,
3,Cetuximab,DB00002,Squamous cell carcinoma of pharynx,C1319317,,Approved,,
4,Cetuximab,DB00002,Laryngeal Squamous Cell Carcinoma,C0280324,,Approved,,
...,...,...,...,...,...,...,...,...
10557,Temozolomide,DB00853,CNS disorder,C0007682,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10558,Dasatinib,DB01254,CNS disorder,C0007682,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10559,Dacarbazine,DB00851,Brain Diseases,C0006111,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10560,Temozolomide,DB00853,Brain Diseases,C0006111,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...


### The Repurposing Hub

#### Normalization
NOTE: Chose to normalize this way to make only one set of therapy normalizer calls instead of dealing with apply errors or requests errors.

In [62]:
# Grab all relevant data from the therapy normalizer
results_list = []
input_list = df['pert_iname']

for drug in tqdm(input_list):
    results_list.append(normalize(drug))

results_list

100%|██████████| 6798/6798 [17:59<00:00,  6.29it/s] 


[['(R)-(-)-apomorphine', None, None],
 ['(R)-(-)-rolipram', 60, 'ncit:C72842'],
 ['(R)-baclofen', 60, 'rxcui:1292'],
 ['(S)-(+)-rolipram', None, None],
 ['[sar9,met(o2)11]-substance-p', None, None],
 ['A-1070722', None, None],
 ['A-1120', None, None],
 ['A-317491', 80, 'iuphar.ligand:4115'],
 ['A-33903', 60, 'chembl:CHEMBL204738'],
 ['A-366', 80, 'iuphar.ligand:8238'],
 ['A-381393', 80, 'iuphar.ligand:8441'],
 ['A-412997', 60, 'iuphar.ligand:3301'],
 ['A-438079', 60, 'iuphar.ligand:4118'],
 ['A-485', 80, 'chembl:CHEMBL4282264'],
 ['A-582941', 80, 'iuphar.ligand:3995'],
 ['A-61603', 60, 'iuphar.ligand:480'],
 ['A-674563', 80, 'drugbank:DB08568'],
 ['A-7', None, None],
 ['A-769662', 80, 'iuphar.ligand:10069'],
 ['A-784168', None, None],
 ['A-803467', 80, 'iuphar.ligand:5734'],
 ['A-804598', 60, 'iuphar.ligand:4121'],
 ['A-839977', 60, 'iuphar.ligand:4122'],
 ['A-867744', 80, 'iuphar.ligand:3986'],
 ['A-887826', None, None],
 ['A-922500', None, None],
 ['A-939572', None, None],
 ['A-96707

In [63]:
# Check that lengths agree
len(results_list) == len(df)

True

In [64]:
# Extract scores and concept IDs
all_scores = []
all_ids = []

for entry in tqdm(results_list):
    all_scores.append(entry[1]) # match score
    all_ids.append(entry[2]) # concept id

100%|██████████| 6798/6798 [00:00<00:00, 1310093.67it/s]


In [65]:
# Add to dataframe
df['match_score'] = all_scores
df['concept_id'] = all_ids

#### Quantify Normalization

In [66]:
df

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication,match_score,concept_id
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,,
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,,60.0,ncit:C72842
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,,60.0,rxcui:1292
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,,,
...,...,...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,,,
6794,80841-78-7,Preclinical,,,,,,
6795,9-aminoacridine,Preclinical,,,,,60.0,rxcui:645
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,,80.0,drugbank:DB12515


In [73]:
df['normalization'] = df['concept_id'].apply(is_normalized)
df

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication,match_score,concept_id,normalization
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,,,Failure
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,,60.0,ncit:C72842,Success
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,,60.0,rxcui:1292,Success
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,,,,Failure
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,,,,Failure
...,...,...,...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,,,,Failure
6794,80841-78-7,Preclinical,,,,,,,Failure
6795,9-aminoacridine,Preclinical,,,,,60.0,rxcui:645,Success
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,,80.0,drugbank:DB12515,Success


In [74]:
df['normalization'].value_counts()

Success    4966
Failure    1832
Name: normalization, dtype: int64

In [77]:
print('Success: ' + str(4966/(4966+1832)))
print('Failure: ' + str(1832/(4966+1832)))

Success: 0.7305089732274198
Failure: 0.2694910267725802


#### Disease Inspection

In [88]:
# If an entry has a disease_area it also has an indication
diseases = df[df['disease_area'].isnull()==False]


In [89]:
diseases

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication,match_score,concept_id,normalization
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,,,Failure
32,abacavir,Launched,nucleoside reverse transcriptase inhibitor,,infectious disease,human immunodeficiency virus (HIV-1),80.0,rxcui:190521,Success
34,abamectin,Launched,benzodiazepine receptor agonist,GABBR1|GABBR2,infectious disease,gastrointestinal parasites,80.0,ncit:C95196,Success
39,abemaciclib,Launched,CDK inhibitor,CDK4|CDK6,oncology,breast cancer,80.0,rxcui:1946825,Success
40,abiraterone,Launched,androgen biosynthesis inhibitor,CYP11B1|CYP17A1,oncology,prostate cancer,80.0,rxcui:1100071,Success
...,...,...,...,...,...,...,...,...,...
6697,3-(4-methylbenzylidene)camphor,Launched,endocrine disruptor,,dermatology,sunscreen lotion,60.0,rxcui:1311507,Success
6725,4-aminohippuric-acid,Launched,,SLC22A6,nephrology,renal diagnostic agent,,,Failure
6755,5-aminolevulinic-acid,Launched,oxidizing agent,ALAD,oncology|dermatology,glioma|actinic keratosis (AK),,,Failure
6760,5-fluorouracil,Launched,thymidylate synthase inhibitor,DPYD|TYMS,oncology,colorectal cancer|breast cancer|pancreatic can...,80.0,rxcui:4492,Success


In [90]:
diseases['indication'].value_counts()

hypertension                                                                                                                       70
diabetes mellitus                                                                                                                  31
schizophrenia                                                                                                                      29
pain relief                                                                                                                        29
gram-negative bacterial infections                                                                                                 28
                                                                                                                                   ..
non-small cell lung cancer (NSCLC)|pancreatic cancer                                                                                1
intra-abdominal infections|skin infections|pneumonia|urinary t

In [91]:
diseases['disease_area'].value_counts()

infectious disease                                             424
neurology/psychiatry                                           346
cardiology                                                     205
gastroenterology                                               124
endocrinology                                                  122
                                                              ... 
oncology|neurology/psychiatry|genetics|urology                   1
neurology/psychiatry|otolaryngology|endocrinology|pulmonary      1
ophthalmology|gastroenterology                                   1
neurology/psychiatry|endocrinology|otolaryngology                1
oncology|dermatology                                             1
Name: disease_area, Length: 216, dtype: int64

### ReDO Trials
Maybe this source? This does not have gene interactions necessarily, as far as I can tell. Perhaps within Outcome Measures? But it does have notions of diseases, phenotypes, clinical status, recruiting, drugbank associations

In [94]:
df2

Unnamed: 0,NCT Number,Title,Acronym,Status,Conditions,Interventions,Outcome Measures,Sponsors,Gender,Age,...,Multi-Arm,Pediatric,Country_PI,Cancer_Group,Cancer_Type,Drug_INN,Primary-EP,Phase,DrugBank,Removed
0,NCT03047837,"A Randomized, 2x2 Factorial Design Biomarker P...",ASAMET,Recruiting,Tertiary Prevention in Colon Cancer,Drug: Aspirin (ASA) + Metformin (MET); Drug: A...,"NF B; pS6K, p53, beta-catenin, PI3K; IL-6, CRP...",Ente Ospedaliero Ospedali Galliera,All,"18 Years to 80 Years (Adult, Older Adult)",...,N,N,Italy,GI,Colon Cancer; Rectal Cancer,Acetylsalicylic Acid; Metformin,Biomarker,Phase 2,DB00210; DB06800,N
1,NCT02969681,Vitamin C Intravenously With Chemotherapy in A...,Vitality,Recruiting,Colorectal Neoplasms,Drug: ascorbic acid; Drug: Chemotherapy,Progression Free Survival; Overall Survival; R...,Sun Yat-sen University,All,"18 Years to 75 Years (Adult, Older Adult)",...,N,N,China,GI,Colon Cancer,Ascorbic acid,PFS,Phase 3,DB00335,N
2,NCT02497638,LIpitor and biGuanide to Androgen Delay Trial,LIGAND,Not yet recruiting,Prostate Cancer,Drug: Metformin; Drug: Atorvastatin; Drug: Pla...,Time to disease progression (defined as PSA ri...,"University Health Network, Toronto",Male,"18 Years to 80 Years (Adult, Older Adult)",...,N,N,Canada,Urological,Prostate Cancer,Atorvastatin; Metformin,PFS,Phase 2,DB01117; DB06800,N
3,NCT03275376,Statin Combination Therapy in Patients Receivi...,,Recruiting,Overall Survival|Tumor Responses,Drug: Atorvastatin 10mg; Drug: Placebo Oral Ta...,Overall survival; Best tumor response; Progres...,Taichung Veterans General Hospital,All,"40 Years and older (Adult, Older Adult)",...,N,N,Taiwan,GI,Liver Cancer,Atorvastatin,OS,Phase 2,DB01117,N
4,NCT03980249,Anti-Cancer Effects of Carvedilol With Standar...,,Not yet recruiting,Glioblastoma|Glioblastoma Multiforme,Drug: Carvedilol,Survival curve of overall survival; Survival c...,West Virginia University; NovoCure Ltd.; West ...,All,"18 Years and older (Adult, Older Adult)",...,N,N,United States,CNS,Glioblastoma,Carvedilol,OS,Phase 1,DB00567,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,ChiCTR2200061789,Fasudil hydrochloride in the treatment of gene...,,Pending,Ovary cancer,first group:Fasudil Hydrochloride Injection;se...,Number of treatment courses received;Overall s...,Tianjin Medical University General Hospital,Female,-,...,N,N,China,Gynaecological,Ovarian Epithelial Cancer,Fasudil,PFS; OS,Not available/Missing,DB00800,N
903,ChiCTR2200066615,The effect of intraoperative intravenous lidoc...,,Pending,Lung cancer,Lidocaine group:Intraoperative intravenous lid...,Overall survival;Disease free survival;,"Zhongshan Hospital, Fudan University",All,-,...,N,N,China,Lung,Any lung cancers,Lidocaine,OS; DFS/RFS/EFS,Not available/Missing,Not found in DrugBank,N
904,ChiCTR2200065595,Exploratory Clinical Study on Mifepristone for...,,Pending,Recurrent Glioblastoma,treatment group:Administration of Mifepristone;,complete blood count;liver function test;kidne...,"Huashan Hospital, Fudan University",All,18 -,...,N,N,China,CNS,Glioblastoma,Mifepristone,Biomarker,Not available/Missing,DB00370,N
905,ChiCTR2300068631,Oral aminophylline combined with sunitinib mal...,,Recruiting,locally advanced unresectable renal clear cell...,Test group:Aminophylline combined with sunitin...,Progression-free survival;,Shanghai Sixth People's Hospital Affiliated to...,All,18 - 70,...,N,N,China,Urological,Renal Cell Carcinoma,Aminophylline,PFS,Not available/Missing,DB01118,N


In [96]:
df2.keys()

Index(['NCT Number', 'Title', 'Acronym', 'Status', 'Conditions',
       'Interventions', 'Outcome Measures', 'Sponsors', 'Gender', 'Age',
       'Enrollment', 'Funders', 'Study Type', 'Study Designs', 'Other IDs',
       'Start Date', 'Primary Completion Date', 'Completion Date',
       'Last Verified', 'First Submitted', 'First Posted',
       'Results First Submitted', 'Results First Posted',
       'Last Update Submitted', 'Last Update Posted', 'URL', 'Setting',
       'Stage', 'Sponsor_Type', 'Controlled', 'Multi-Arm', 'Pediatric',
       'Country_PI', 'Cancer_Group', 'Cancer_Type', 'Drug_INN', 'Primary-EP',
       'Phase', 'DrugBank', 'Removed'],
      dtype='object')

In [98]:
df2['Drug_INN'].value_counts()

Metformin                                                                                                                                                                                             113
Celecoxib                                                                                                                                                                                              43
Hydroxychloroquine                                                                                                                                                                                     41
Acetylsalicylic Acid                                                                                                                                                                                   32
Ascorbic acid                                                                                                                                                                                   

In [99]:
df2['Outcome Measures'].value_counts()

Progression Free Survival;                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  3
objective response rate   

### repoDB

In [104]:
df3

Unnamed: 0,drug_name,drug_id,ind_name,ind_id,NCT,status,phase,DetailedStatus
0,Lepirudin,DB00001,Heparin-induced thrombocytopenia with thrombosis,C0272275,,Approved,,
1,Cetuximab,DB00002,Squamous cell carcinoma of mouth,C0585362,,Approved,,
2,Cetuximab,DB00002,Squamous cell carcinoma of nose,C3163899,,Approved,,
3,Cetuximab,DB00002,Squamous cell carcinoma of pharynx,C1319317,,Approved,,
4,Cetuximab,DB00002,Laryngeal Squamous Cell Carcinoma,C0280324,,Approved,,
...,...,...,...,...,...,...,...,...
10557,Temozolomide,DB00853,CNS disorder,C0007682,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10558,Dasatinib,DB01254,CNS disorder,C0007682,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10559,Dacarbazine,DB00851,Brain Diseases,C0006111,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...
10560,Temozolomide,DB00853,Brain Diseases,C0006111,NCT02661113,Withdrawn,Phase 2,Sponsor withdrew support; Study did not progre...


In [106]:
df3['drug_name'].nunique()

1572

This data was built from DrugCentral 2016 release and AACT 2016 (association of clinical trials database) which I can't even find. This data feels fairly potentially out of date....I think more up-to-date relevant sources would probably be a better fit. COuld this be good for historic capture or perhaps looking for things which we don't have coverage?
  
https://www.nature.com/articles/sdata201729

repoDB spans 1,571 drugs and 2,051 UMLS disease concepts, accounting for 6,677 approved and 4,123 failed drug-indication pairs (see Table 2 and Fig. 1b for trial status breakdown).
  
Seems potentially useful for pulling in indication pairs? This potentially might have some already overlapped data though? Also still given that it's indication of use data, this is approach almost 10 years of age. Would be better to have a cool way to pull this in automatically, say from the FDA.