In [2]:
import pandas as pd
import numpy as np


# Ontological Expansion 
Given the results from this AI curation pipeline, evaluate the rate of ontological expansion of DGIdb. Specifically, observe how many new drug and gene concepts are imported into DGIdb (as corresponding to concept groups in the normalizers). Evaluate this for the BCL2 Associated Literature, and the DGIdb reviewed literature

### Drugs (BCL2-associated Literature)

In [25]:
df = pd.read_excel('data/final_results_test3.xlsx')
drug_concepts_in_dgidb = pd.read_csv('data/drugs.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80


In [29]:
new_concepts = []
old_concepts = []

def trim_therapy(therapy):
    return therapy.split('normalize.therapy.')[1]

df['drug_concept_trimmed'] = df['drug_concept'].apply(trim_therapy)
for drug in df['drug_concept_trimmed']:
    tdf = drug_concepts_in_dgidb[drug_concepts_in_dgidb['concept_id']==drug]
    if len(tdf)>0:
        old_concepts.append(drug)
    if len(tdf)==0:
        new_concepts.append(drug)

print(f'')
print(f'New Drug Concepts found in Curation Set: {len(set(new_concepts))}')    # new_concepts
print(f'Existing Drug Concepts in Curation Set: {len(set(old_concepts))}')
print(f'Total # of Existing Drug Concepts in DGIdb: {len(set(drug_concepts_in_dgidb['concept_id']))}')
concept_capture_increase = ((len(set(drug_concepts_in_dgidb)) + len(set(new_concepts))) / len(set(drug_concepts_in_dgidb['concept_id'])) * 100)#-100
print(f'% Increase in Drug Concepts into DGIdb: {concept_capture_increase}% ({len(set(new_concepts))} new concepts)')


New Drug Concepts found in Curation Set: 9
Existing Drug Concepts in Curation Set: 74
Total # of Existing Drug Concepts in DGIdb: 39581
% Increase in Drug Concepts into DGIdb: 0.03789697076880322% (9 new concepts)


In [31]:
for concept in set(new_concepts):
    print(concept)

drugbank:DB02070
chembl:CHEMBL1159652
drugbank:DB13172
chembl:CHEMBL1909423
rxcui:114202
chembl:CHEMBL510380
rxcui:72031
iuphar.ligand:4644
chembl:CHEMBL3348861


In [42]:
for concept in set(new_concepts):
    tdf = df[df['drug_concept'].str.contains(concept, na=False)]
    if not tdf.empty:
        drug_name = tdf['drug_name'].iloc[0]
        for gene, pmid in zip(tdf['gene_label'], tdf['pmid']):
            print(concept, drug_name, gene, pmid)


drugbank:DB02070 Kynurenine AHR 37004989
chembl:CHEMBL1159652 MELIBIOSE TFEB 30335591
drugbank:DB13172 Tunicamycin GOLGA2P10 32332695
drugbank:DB13172 Tunicamycin RIPK1 26018731
chembl:CHEMBL1909423 CORYNOXINE B SNCA 24178442
rxcui:114202 lactate LDHA 31078780
chembl:CHEMBL510380 VERTICILLIN A BCL2L1 29409480
rxcui:72031 pyruvate PDP1 37935978
iuphar.ligand:4644 sialic acid CD22 20038598
chembl:CHEMBL3348861 AMPELOPSIN PRKAA2 29250183


In [36]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type,drug_concept_trimmed
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80,rxcui:1747556
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80,rxcui:4890
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80,drugbank:DB02070
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80,iuphar.ligand:9978
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80,iuphar.ligand:9978


### Genes (BCL2-associated literature)

In [43]:
df = pd.read_excel('data/final_results_test3.xlsx')
gene_concepts_in_dgidb = pd.read_csv('data/genes.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80


In [44]:
new_concepts = []
old_concepts = []

def trim_gene(gene):
    return gene.split('normalize.gene.')[1]

df['gene_concept_trimmed'] = df['gene_concept'].apply(trim_gene)
for gene in df['gene_concept_trimmed']:
    tdf = gene_concepts_in_dgidb[gene_concepts_in_dgidb['concept_id']==gene]
    if len(tdf)>0:
        old_concepts.append(gene)
    if len(tdf)==0:
        new_concepts.append(gene)


print(f'New Gene Concepts found in Curation Set: {len(set(new_concepts))}')    # new_concepts
print(f'Existing Gene Concepts in Curation Set: {len(set(old_concepts))}')
print(f'Total # of Existing Gene Concepts in DGIdb: {len(set(gene_concepts_in_dgidb['concept_id']))}')
concept_capture_increase = ((len(set(gene_concepts_in_dgidb)) + len(set(new_concepts))) / len(set(gene_concepts_in_dgidb['concept_id'])) * 100)#-100
print(f'% Increase in Drug Concepts into DGIdb: {concept_capture_increase}% ({len(set(new_concepts))} new concepts)')

New Gene Concepts found in Curation Set: 8
Existing Gene Concepts in Curation Set: 79
Total # of Existing Gene Concepts in DGIdb: 12062
% Increase in Drug Concepts into DGIdb: 0.09948598905654121% (8 new concepts)


In [45]:
for concept in set(new_concepts):
    print(concept)

hgnc:26229
hgnc:44048
hgnc:30181
hgnc:33791
hgnc:9874
hgnc:3254
hgnc:11655
hgnc:31868


In [46]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type,gene_concept_trimmed
0,50,37726279,venetoclax,ABCC1,YES,INHIBITING,Genetic and pharmacologic ABCC1 inactivation p...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:1747556,venetoclax,80,hgnc:51
1,51,37726279,glutathione,ABCC1,YES,ACTIVATING,Consistent with ABCC1-specific export of gluta...,normalize.gene.hgnc:51,ABCC1,100,normalize.therapy.rxcui:4890,glutathione,80,hgnc:51
2,7,37004989,Kynurenine,AhR,YES,ACTIVATING,"An endogenous AhR ligand, kynurenine (Kyn), wa...",normalize.gene.hgnc:348,AHR,100,normalize.therapy.drugbank:DB02070,Kynurenine,80,hgnc:348
3,32,33932119,ONC201,AKT,YES,INHIBITING,"The compensatory, pro-survival PI3K/AKT/mTOR p...",normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80,hgnc:391
4,36,26884600,ONC201,AKT,YES,INHIBITING,ONC201 (also called TIC10) is a small molecule...,normalize.gene.hgnc:391,AKT1,60,normalize.therapy.iuphar.ligand:9978,ONC201,80,hgnc:391


In [47]:
for concept in set(new_concepts):
    tdf = df[df['gene_concept'].str.contains(concept, na=False)]
    if not tdf.empty:
        gene_name = tdf['gene_label'].iloc[0]
        for drug, pmid in zip(tdf['drug_name'], tdf['pmid']):
            print(concept, gene_name, drug, pmid)


hgnc:26229 GOLGA2P10 Tunicamycin 32332695
hgnc:26229 GOLGA2P10 Thapsigargin 32332695
hgnc:44048 PANDAR cisplatin 30375398
hgnc:44048 PANDAR doxorubicin hydrochloride 30375398
hgnc:44048 PANDAR paclitaxel 30375398
hgnc:44048 PANDAR platinum 30375398
hgnc:30181 CCDC106 alanine 30885251
hgnc:33791 LINC00173 cisplatin 36627670
hgnc:9874 RASAL2 platinum 39890967
hgnc:3254 EIF2A ESTROGEN 30655322
hgnc:3254 EIF2A SALUBRINAL 30655322
hgnc:11655 TCP1 Adriamycin 34750375
hgnc:31868 MIR375 enzalutamide 36042375


## Drugs (DGIdb Literature)

In [53]:
df = pd.read_excel('data/final_results_control_test3.xlsx')
drug_concepts_in_dgidb = pd.read_csv('data/drugs.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc...",normalize.gene.hgnc:592,XIAP,100,normalize.therapy.iuphar.ligand:5065,TRAIL,80
1,1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...,normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:5156,calphostin C,80
2,2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d...",normalize.gene.hgnc:30500,PRRT2,60,,,0
3,3,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w...",normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:2341,phorbol 12-myristate 13-acetate,80
4,4,24398428,vemurafenib,BRAF,YES,INHIBITING,"Selective BRAF(V600E) inhibitors, such as vemu...",normalize.gene.hgnc:1097,BRAF,100,normalize.therapy.rxcui:1147220,vemurafenib,80


In [60]:
new_concepts = []
old_concepts = []

def trim_therapy(therapy):
    if pd.isna(therapy):
        return 'No Concept'
    return therapy.split('normalize.therapy.')[1]

df['drug_concept_trimmed'] = df['drug_concept'].apply(trim_therapy)
for drug in df['drug_concept_trimmed']:
    tdf = drug_concepts_in_dgidb[drug_concepts_in_dgidb['concept_id']==drug]
    if len(tdf)>0:
        old_concepts.append(drug)
    if len(tdf)==0:
        new_concepts.append(drug)

print(f'')
print(f'New Drug Concepts found in Curation Set: {len(new_concepts)}')    # new_concepts
print(f'Existing Drug Concepts in Curation Set: {len(old_concepts)}')
print(f'Total # of Existing Drug Concepts in DGIdb: {len(drug_concepts_in_dgidb)}')
concept_capture_increase = ((len(drug_concepts_in_dgidb) + len(new_concepts)) / len(drug_concepts_in_dgidb) * 100)-100
print(f'% Increase in Drug Concepts into DGIdb: {concept_capture_increase}% ({len(new_concepts)} new concepts)')


New Drug Concepts found in Curation Set: 10
Existing Drug Concepts in Curation Set: 178
Total # of Existing Drug Concepts in DGIdb: 39581
% Increase in Drug Concepts into DGIdb: 0.025264647179199073% (10 new concepts)


In [61]:
for concept in new_concepts:
    print(concept)

No Concept
rxcui:1426598
rxcui:1546382
rxcui:10603
chembl:CHEMBL499815
chembl:CHEMBL4802231
ncit:C88791
chembl:CHEMBL50626
chembl:CHEMBL50626
rxcui:1244014


### Gene Concepts (DGIdb Literature)

In [62]:
df = pd.read_excel('data/final_results_control_test3.xlsx')
gene_concepts_in_dgidb = pd.read_csv('data/genes.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc...",normalize.gene.hgnc:592,XIAP,100,normalize.therapy.iuphar.ligand:5065,TRAIL,80
1,1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...,normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:5156,calphostin C,80
2,2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d...",normalize.gene.hgnc:30500,PRRT2,60,,,0
3,3,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w...",normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:2341,phorbol 12-myristate 13-acetate,80
4,4,24398428,vemurafenib,BRAF,YES,INHIBITING,"Selective BRAF(V600E) inhibitors, such as vemu...",normalize.gene.hgnc:1097,BRAF,100,normalize.therapy.rxcui:1147220,vemurafenib,80


In [63]:
new_concepts = []
old_concepts = []

def trim_gene(gene):
    return gene.split('normalize.gene.')[1]

df['gene_concept_trimmed'] = df['gene_concept'].apply(trim_gene)
for gene in df['gene_concept_trimmed']:
    tdf = gene_concepts_in_dgidb[gene_concepts_in_dgidb['concept_id']==gene]
    if len(tdf)>0:
        old_concepts.append(gene)
    if len(tdf)==0:
        new_concepts.append(gene)


print(f'New Gene Concepts found in Curation Set: {len(new_concepts)}')    # new_concepts
print(f'Existing Gene Concepts in Curation Set: {len(old_concepts)}')
print(f'Total # of Existing Gene Concepts in DGIdb: {len(gene_concepts_in_dgidb)}')
concept_capture_increase = ((len(gene_concepts_in_dgidb) + len(new_concepts)) / len(gene_concepts_in_dgidb) * 100)-100
print(f'% Increase in Drug Concepts into DGIdb: {concept_capture_increase}% ({len(new_concepts)} new concepts)')

New Gene Concepts found in Curation Set: 0
Existing Gene Concepts in Curation Set: 188
Total # of Existing Gene Concepts in DGIdb: 12062
% Increase in Drug Concepts into DGIdb: 0.0% (0 new concepts)
