In [3]:
import pandas as pd
import sys, os
import inflect
from tqdm import tqdm
import requests
import ast

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import score
import novel
import search_set
import indicator
import literature


# Surpress NLP Mask Warning for Apple Silicon
import warnings
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r".*encoder_attention_mask.*BertSdpaSelfAttention\.forward"
)

ModuleNotFoundError: Could not import module 'pipeline'. Are this object's requirements defined correctly?

## Curate Literature (DGIdb Control)

In [7]:
publications = pd.read_csv('data/dgidb/publications.csv')
pmids = publications['pmid'].tolist()
pmids = [str(pmid) for pmid in pmids]
abstracts = literature.fetch_abstracts(pmids)
abstracts[0:5]

13142 PMIDs found!
Fetching...


100%|██████████| 66/66 [01:29<00:00,  1.36s/it]


[('38496920',
  'Isocitrate dehydrogenase (IDH) is commonly mutated (mIDH) in gliomas, and this mutant enzyme produces the oncometabolite 2-hydroxyglutarate (2HG). 2HG promotes gliomagenesis and is implicated in epileptogenesis. Ivosidenib (IVO), a small molecule oral mIDH1 inhibitor, is FDA-approved for mIDH1 newly diagnosed and relapsed/refractory acute myeloid leukemia. Moreover, IVO has efficacy in clinical trials for recurrent mIDH1 gliomas. Given the lack of targeted treatments for gliomas, we initiated off-label IVO for mIDH glioma patients in October 2020. Retrospectively, we sought to assess early outcomes in our patients and describe their experience on IVO from October 2020 through February 2022. Our objective was to report on the following variables of off-label use of IVO: radiographic response, seizure control, tolerability, and access to the medication. All patients initially received single-agent IVO dosed at 500 mg orally once daily. The cohort age range was 21-74 year

In [11]:
abstract = pd.DataFrame(abstracts, columns=["Title", "Abstract"])
abstract

Unnamed: 0,Title,Abstract
0,38496920,Isocitrate dehydrogenase (IDH) is commonly mut...
1,10100098,The tissue renin-angiotensin system and extrac...
2,25686592,Intestinal P-glycoprotein (P-gp) is a limiting...
3,23920485,"Design, synthesis and anticancer activity of a..."
4,23459444,Compelling evidence points to a key role for i...
...,...,...
12756,20855208,Cancer therapy has moved beyond conventional c...
12757,22369181,The ligand-regulated nuclear receptor peroxiso...
12758,37773077,Approximately 10% to 15% of triple-negative br...
12759,29992026,"Tramadol is a complex drug, being metabolized ..."


In [11]:
results = novel.batch(abstract['Abstract'])
results

100%|██████████| 12761/12761 [32:21<00:00,  6.57it/s] 


Unnamed: 0,entity_group,score,word,start,end,original_text
0,GENETIC,0.999992,Isocitrate dehydrogenase,0,24,Isocitrate dehydrogenase (IDH) is commonly mut...
1,GENETIC,0.999984,IDH,26,29,Isocitrate dehydrogenase (IDH) is commonly mut...
2,GENETIC,0.999972,mIDH,52,56,Isocitrate dehydrogenase (IDH) is commonly mut...
3,GENETIC,0.999993,mIDH1,252,257,Isocitrate dehydrogenase (IDH) is commonly mut...
4,GENETIC,0.999970,mIDH1,289,294,Isocitrate dehydrogenase (IDH) is commonly mut...
...,...,...,...,...,...,...
317746,DISEASE,0.999982,tumor,1252,1257,Bladder cancer accounts for nearly 5% of all n...
317747,DISEASE,0.999979,tumor,1333,1338,Bladder cancer accounts for nearly 5% of all n...
317748,DISEASE,0.999992,bladder cancer,1403,1417,Bladder cancer accounts for nearly 5% of all n...
317749,DISEASE,0.999992,bladder cancer,1509,1523,Bladder cancer accounts for nearly 5% of all n...


In [13]:
results[results['entity_group'] == 'CHEMICAL']

Unnamed: 0,entity_group,score,word,start,end,original_text,concept_match_type,concept_id,concept_label
5,CHEMICAL,0.999998,Isocitrate,0,10,Isocitrate dehydrogenase (IDH) is commonly mut...,80.0,normalize.therapy.chembl:CHEMBL539669,ISOCITRATE
6,CHEMICAL,0.999999,2 - hydroxyglutarate,121,139,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
7,CHEMICAL,0.999998,2HG,141,144,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
8,CHEMICAL,0.999994,2HG,147,150,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
9,CHEMICAL,0.999996,Ivosidenib,212,222,Isocitrate dehydrogenase (IDH) is commonly mut...,80.0,normalize.therapy.rxcui:2049873,ivosidenib
...,...,...,...,...,...,...,...,...,...
317719,CHEMICAL,0.999972,olaparib,1450,1458,Approximately 10% to 15% of triple-negative br...,,,
317720,CHEMICAL,0.999997,ceralasertib,1463,1475,Approximately 10% to 15% of triple-negative br...,,,
317721,CHEMICAL,0.958486,olaparib,1641,1649,Approximately 10% to 15% of triple-negative br...,,,
317728,CHEMICAL,0.999998,Tramadol,0,8,"Tramadol is a complex drug, being metabolized ...",,,


In [None]:
def _singularize(word):
    inflector = inflect.engine()
    return inflector.singular_noun(word) or word

def _normalize_therapy(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/therapy/normalize?q={word}&infer_namespace=true',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['therapy']['id'],
                    response['therapy']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]

# Main loop
checkpoint_interval = 5000
output_base = "normalized_results_checkpoint"
for idx, (index, row) in enumerate(tqdm(results[results['entity_group'] == 'CHEMICAL'].iterrows()), 1):
    word = _singularize(row['word'])
    norm_result = _normalize_therapy(word)

    results.at[index, 'concept_match_type'] = norm_result[0]
    results.at[index, 'concept_id'] = norm_result[1]
    results.at[index, 'concept_label'] = norm_result[2]

    if idx % checkpoint_interval == 0:
        checkpoint_filename = f"{output_base}_checkpoint_{idx}.xlsx"
        results.to_excel(checkpoint_filename, index=False)
        print(f"Checkpoint saved at row {idx} -> {checkpoint_filename}")

# Final save after loop completes
final_filename = f"{output_base}_final.xlsx"
# results.to_excel(final_filename, index=False)
print(f"Final results saved -> {final_filename}")

  results.at[index, 'concept_match_type'] = norm_result[0]
5000it [22:47,  8.47s/it]

Checkpoint saved at row 5000 -> normalized_results_checkpoint_checkpoint_5000.xlsx


10001it [48:24,  5.89s/it]

Checkpoint saved at row 10000 -> normalized_results_checkpoint_checkpoint_10000.xlsx


15001it [1:08:17,  5.96s/it]

Checkpoint saved at row 15000 -> normalized_results_checkpoint_checkpoint_15000.xlsx


20000it [1:28:44,  8.67s/it]

Checkpoint saved at row 20000 -> normalized_results_checkpoint_checkpoint_20000.xlsx


25001it [1:49:18,  6.03s/it]

Checkpoint saved at row 25000 -> normalized_results_checkpoint_checkpoint_25000.xlsx


30000it [2:09:52,  8.64s/it]

Checkpoint saved at row 30000 -> normalized_results_checkpoint_checkpoint_30000.xlsx


35002it [2:28:09,  4.17s/it]

Checkpoint saved at row 35000 -> normalized_results_checkpoint_checkpoint_35000.xlsx


40000it [2:40:19,  5.14s/it]

Checkpoint saved at row 40000 -> normalized_results_checkpoint_checkpoint_40000.xlsx


45001it [2:52:33,  5.54s/it]

Checkpoint saved at row 45000 -> normalized_results_checkpoint_checkpoint_45000.xlsx


50000it [3:04:42,  8.20s/it]

Checkpoint saved at row 50000 -> normalized_results_checkpoint_checkpoint_50000.xlsx


55001it [3:17:14,  5.88s/it]

Checkpoint saved at row 55000 -> normalized_results_checkpoint_checkpoint_55000.xlsx


60001it [3:29:22,  5.57s/it]

Checkpoint saved at row 60000 -> normalized_results_checkpoint_checkpoint_60000.xlsx


65002it [3:41:28,  4.50s/it]

Checkpoint saved at row 65000 -> normalized_results_checkpoint_checkpoint_65000.xlsx


70001it [3:53:19,  5.01s/it]

Checkpoint saved at row 70000 -> normalized_results_checkpoint_checkpoint_70000.xlsx


75001it [4:05:26,  5.32s/it]

Checkpoint saved at row 75000 -> normalized_results_checkpoint_checkpoint_75000.xlsx


80001it [4:17:25,  5.81s/it]

Checkpoint saved at row 80000 -> normalized_results_checkpoint_checkpoint_80000.xlsx


85001it [4:29:30,  4.80s/it]

Checkpoint saved at row 85000 -> normalized_results_checkpoint_checkpoint_85000.xlsx


90000it [4:41:41,  6.98s/it]

Checkpoint saved at row 90000 -> normalized_results_checkpoint_checkpoint_90000.xlsx


95001it [4:53:59,  4.49s/it]

Checkpoint saved at row 95000 -> normalized_results_checkpoint_checkpoint_95000.xlsx


100000it [5:06:24,  8.67s/it]

Checkpoint saved at row 100000 -> normalized_results_checkpoint_checkpoint_100000.xlsx


100297it [5:07:03,  5.44it/s]


Final results saved -> normalized_results_checkpoint_final.xlsx


In [None]:
results = pd.read_excel('normalized_results_checkpoint_final.xslx')

NameError: name 'results' is not defined

In [6]:
results = pd.read_excel('control_test3_normalized_results_final.xlsx')
tdf = results[(results['concept_match_type']!=0) & (results['concept_match_type'].isna()==False)].reset_index(drop=True)
condensed_results = tdf.groupby('original_text').apply(
    lambda group: pd.Series({
        'DRUG_LABELS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_label'].dropna().astype(str).unique()),
        'DRUG_IDS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_id'].dropna().astype(str).unique())
    })
).reset_index()

condensed_results

  condensed_results = tdf.groupby('original_text').apply(


Unnamed: 0,original_text,DRUG_LABELS,DRUG_IDS
0,"EAY131-H is an open-label, single-arm study. ...",dabrafenib | trametinib dimethyl sulfoxide,normalize.therapy.rxcui:1424911 | normalize.th...
1,Eligible patients had HER2-negative MBC with ...,alpelisib | paclitaxel,normalize.therapy.rxcui:2169285 | normalize.th...
2,Eligible patients had tumors with an activati...,Taselisib,normalize.therapy.ncit:C116876
3,Following transfection of exon 17 (E17) and e...,tyrosine | Ozogamicin,normalize.therapy.rxcui:10962 | normalize.ther...
4,One hundred six patients with Progression-fr...,vemurafenib | irinotecan hydrochloride,normalize.therapy.rxcui:1147220 | normalize.th...
...,...,...,...
10703,β-Thalassemia patients develop deficiency in v...,vitamin D | calcitriol | deferasirox,normalize.therapy.rxcui:11253 | normalize.ther...
10704,β1-Adrenergic receptor (β1-AR) stimulation mod...,sodium | cyclic AMP | flecainide acetate,normalize.therapy.rxcui:9853 | normalize.thera...
10705,γ-Butyrobetaine hydroxylase (BBOX) is a 2-oxog...,carnitine | zinc | selenium | Ebselen,normalize.therapy.rxcui:2106 | normalize.thera...
10706,σ-1 receptor (S1R) radioligands have the poten...,haloperidol decanoate,normalize.therapy.rxcui:26420


In [12]:
merged_df = pd.merge(
    abstract,
    condensed_results,
    left_on='Abstract',
    right_on='original_text',
    how='left'
)
merged_df = merged_df[merged_df['DRUG_LABELS'].isnull()==False].reset_index(drop=True)
merged_df

Unnamed: 0,Title,Abstract,original_text,DRUG_LABELS,DRUG_IDS
0,38496920,Isocitrate dehydrogenase (IDH) is commonly mut...,Isocitrate dehydrogenase (IDH) is commonly mut...,ISOCITRATE | ivosidenib,normalize.therapy.chembl:CHEMBL539669 | normal...
1,10100098,The tissue renin-angiotensin system and extrac...,The tissue renin-angiotensin system and extrac...,sodium chloride,normalize.therapy.rxcui:9863
2,25686592,Intestinal P-glycoprotein (P-gp) is a limiting...,Intestinal P-glycoprotein (P-gp) is a limiting...,XANTHONE | daunorubicin liposomal | paclitaxel...,normalize.therapy.chembl:CHEMBL186784 | normal...
3,23459444,Compelling evidence points to a key role for i...,Compelling evidence points to a key role for i...,ESTROGEN | tamoxifen | formalin,normalize.therapy.chembl:CHEMBL592868 | normal...
4,23948351,"Icotinib, an oral EGFR tyrosine kinase inhibit...","Icotinib, an oral EGFR tyrosine kinase inhibit...",Icotinib | tyrosine | gefitinib | platinum,normalize.therapy.ncit:C138996 | normalize.the...
...,...,...,...,...,...
10703,23786520,"Luteolin (1), a natural product occurring in m...","Luteolin (1), a natural product occurring in m...",Luteolin | flavanone | chrysin | apigenin | tr...,normalize.therapy.drugbank:DB15584 | normalize...
10704,15464068,The aim of the present study was to determine ...,The aim of the present study was to determine ...,tramadol | dopamine,normalize.therapy.rxcui:10689 | normalize.ther...
10705,20855208,Cancer therapy has moved beyond conventional c...,Cancer therapy has moved beyond conventional c...,vorinostat | selenium,normalize.therapy.rxcui:194337 | normalize.the...
10706,37773077,Approximately 10% to 15% of triple-negative br...,Approximately 10% to 15% of triple-negative br...,olaparib | Ceralasertib,normalize.therapy.rxcui:1597582 | normalize.th...


### Generate Score

In [13]:
dgidb_df = pd.read_csv('search/2025-08-13_BCL2_clin_score.csv')
dgidb_df = dgidb_df.drop_duplicates(subset=['Drug','Gene'], keep='first')
dgidb_df.head()

Unnamed: 0,nomenclature,Gene,long_name,gene_concept_id,interaction_score,drug_specificity,gene_specificity,evidence_score,source_db_name,source_db_version,...,directionality,definition,reference,drug_name,nomenclature-2,Drug,approved,immunotherapy,anti_neoplastic,drug_concept_id
0,Gene Symbol,BCL2,BCL2 apoptosis regulator,hgnc:990,2.284732,4.141413,0.183893,3.0,TALC,12-May-16,...,1.0,"In inhibitor interactions, the drug binds to a...","<a href=""https://en.wikipedia.org/wiki/Enzyme_...",OBLIMERSEN,Primary Name,OBLIMERSEN,False,False,True,ncit:C1870
3,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,DOLASTATIN 10,Primary Name,DOLASTATIN 10,False,False,False,ncit:C1300
4,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,EPISIALIN,Primary Name,MUC-1 ANTIGEN,False,False,False,ncit:C2407
5,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,MICELLAR PACLITAXEL,Primary Name,MICELLAR PACLITAXEL,False,False,False,ncit:C29256
6,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,BEAUVERICIN,Primary Name,BEAUVERICIN,False,False,False,ncit:C1011


In [14]:
import importlib
importlib.reload(indicator)

indicator.generate_interaction_evidence(merged_df, dgidb_df)

                                     

Results saved to 2025-08-25_BCL2.zip!


## Load Scores

In [16]:
tdf = score.load_pmid_assessments(f'2025-08-25_BCL2.zip', 'interaction_search')
tdf = tdf[tdf['label']=='interaction_evidence'].reset_index(drop=True)
tdf


Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method
0,38496920,Isocitrate dehydrogenase (IDH) is commonly mut...,interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",ISOCITRATE | ivosidenib,normalize.therapy.chembl:CHEMBL539669 | normal...,BCL2,interaction_search,interaction_search
1,10100098,The tissue renin-angiotensin system and extrac...,interaction_evidence,"{'direct_interaction': 2, 'binding_interaction...",sodium chloride,normalize.therapy.rxcui:9863,BCL2,interaction_search,interaction_search
2,25686592,Intestinal P-glycoprotein (P-gp) is a limiting...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",XANTHONE | daunorubicin liposomal | paclitaxel...,normalize.therapy.chembl:CHEMBL186784 | normal...,BCL2,interaction_search,interaction_search
3,23459444,Compelling evidence points to a key role for i...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",ESTROGEN | tamoxifen | formalin,normalize.therapy.chembl:CHEMBL592868 | normal...,BCL2,interaction_search,interaction_search
4,23948351,"Icotinib, an oral EGFR tyrosine kinase inhibit...",interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",Icotinib | tyrosine | gefitinib | platinum,normalize.therapy.ncit:C138996 | normalize.the...,BCL2,interaction_search,interaction_search
...,...,...,...,...,...,...,...,...,...
10210,7499260,It has recently been reported that protein-tyr...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine | genistein | herbimycin A,normalize.therapy.rxcui:10962 | normalize.ther...,BCL2,interaction_search,interaction_search
10211,23786520,"Luteolin (1), a natural product occurring in m...",interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",Luteolin | flavanone | chrysin | apigenin | tr...,normalize.therapy.drugbank:DB15584 | normalize...,BCL2,interaction_search,interaction_search
10212,15464068,The aim of the present study was to determine ...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",tramadol | dopamine,normalize.therapy.rxcui:10689 | normalize.ther...,BCL2,interaction_search,interaction_search
10213,20855208,Cancer therapy has moved beyond conventional c...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",vorinostat | selenium,normalize.therapy.rxcui:194337 | normalize.the...,BCL2,interaction_search,interaction_search


In [17]:
def unpack_total(score):
    if type(score) is float:
        return 0
    if score is None:
        return 0
    return ast.literal_eval(score)['unweighted_total']

def unpack_regulation(score):
    if type(score) is float:
        return 0
    if score is None:
        return 0
    return ast.literal_eval(score)['regulation_changes']

tdf['total_interaction_evidence'] = tdf['scores'].apply(unpack_total)
tdf['total_regulation_evidence'] = tdf['scores'].apply(unpack_total)


tdf.sort_values(by='total_regulation_evidence', ascending=False)[0:10]

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_regulation_evidence
3947,12941843,Signaling pathways involved in survival respon...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",TRAIL | calphostin C | G&ouml; 6976 | rottleri...,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,16,16
8892,24398428,Activating BRAF kinase mutations serve as onco...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",vemurafenib | Ganetespib,normalize.therapy.rxcui:1147220 | normalize.th...,BCL2,interaction_search,interaction_search,16,16
7649,15870285,The expression of the glutathione S-transferas...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",glutathione | dexamethasone | Oltipraz | Retin...,normalize.therapy.rxcui:4890 | normalize.thera...,BCL2,interaction_search,interaction_search,16,16
9374,22649091,During a clinical trial of the tyrosine kinase...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine | dasatinib anhydrous,normalize.therapy.rxcui:10962 | normalize.ther...,BCL2,interaction_search,interaction_search,15,15
7309,15319449,Leukemia inhibitory factor (LIF) cooperates wi...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",serine,normalize.therapy.rxcui:9671,BCL2,interaction_search,interaction_search,15,15
2170,17227768,Tumor necrosis factor-alpha (TNF-alpha) induce...,interaction_evidence,"{'direct_interaction': 5, 'binding_interaction...",anhydrous dextrose | serine,normalize.therapy.rxcui:349730 | normalize.the...,BCL2,interaction_search,interaction_search,15,15
6823,25658463,The wild-type p53-induced phosphatase 1 (WIP1)...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",serine | threonine | GSK2830371 | doxorubicin ...,normalize.therapy.rxcui:9671 | normalize.thera...,BCL2,interaction_search,interaction_search,15,15
7961,22806357,Nicotinamide adenine dinucleotide phosphate (N...,interaction_evidence,"{'direct_interaction': 6, 'binding_interaction...",Nicotinamide adenine dinucleotide phosphate | ...,normalize.therapy.drugbank:DB03461 | normalize...,BCL2,interaction_search,interaction_search,15,15
3062,18283314,The ErbB3 binding protein (Ebp1) is a transcri...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",serine | threonine | tamoxifen | glutamic acid,normalize.therapy.rxcui:9671 | normalize.thera...,BCL2,interaction_search,interaction_search,15,15
4075,16682955,Fibroblast growth factor (FGF) signaling can b...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",ESTROGEN | tyrosine | sirolimus | PI3K/BET Inh...,normalize.therapy.chembl:CHEMBL592868 | normal...,BCL2,interaction_search,interaction_search,14,14


In [18]:
position = 1
print(tdf.sort_values(by='total_interaction_evidence', ascending=False).reset_index(drop=True)['scores'][position])

{'direct_interaction': 4, 'binding_interaction': 1, 'regulation_changes': 2, 'sensitivity_resistance': 5, 'pharmacogenomic_signals': 4, 'unweighted_total': 16}


### Build Prompts

In [19]:
here_we_go = tdf.sort_values(by='total_interaction_evidence', ascending=False)[0:100].reset_index(drop=True)

def generate_prompt_base(drugs, gene):
    prompt = f"""You are an expert biomedical scientist, biochemist, and scientific curator trained to identify drug-gene interactions from scientific literature. Given a list of drugs, a gene, and a scientific abstract, your task is to determine whether an interaction between a drug and a gene is occuring and assign it an interaction directionality. Use the following tools to help perform these tasks.
    
    *Interaction*
    - An interaction between a small molecule and a gene or gene product. 

    *Interaction Directionality*
    - Activating -> Activating interactions are those where the drug increases the biological activity or expression of a gene target.
    - Inhibiting -> Inhibiting interactions are those where the drug decreases the biological activity or expression of a gene target.

    The drugs to consider for this task are {drugs}. For each drug, fill out the following JSON schema:
    {{
        "pmid": "EXACT PMID FROM CONTEXT",
        "drug_name": "NAME OF DRUG",
        "gene_name": "NAME OF GENE BEING INTERACTED WITH",
        "interaction_occurs_with_gene": "YES" or "NO",
        "interaction_type": "ACTIVATING" or "INHIBITING",
        "evidence": "EXACT SENTENCE FROM ABSTRACT THAT SUPPORTS INTERACTION"        
    }} 

"""
    
    return prompt

here_we_go['context'] = None
here_we_go['prompt'] = None

for idx, row in here_we_go.iterrows():
    drugs = row['tagged_drugs']
    gene = row['gene']
    prompt_base = generate_prompt_base(drugs, gene)

    # tdf = here_we_go[here_we['pmid']==row['pmid']].reset_index(drop=True)
    context = f'PMID: {row['pmid']}\n Abstract: {row['abstract']}'

    full_prompt = f'{prompt_base}\n\n{context}'

    here_we_go.at[idx,'context'] = context
    here_we_go.at[idx,'prompt'] = full_prompt    



here_we_go.head()

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_regulation_evidence,context,prompt
0,12941843,Signaling pathways involved in survival respon...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",TRAIL | calphostin C | G&ouml; 6976 | rottleri...,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,16,16,PMID: 12941843\n Abstract: Signaling pathways ...,"You are an expert biomedical scientist, bioche..."
1,24398428,Activating BRAF kinase mutations serve as onco...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",vemurafenib | Ganetespib,normalize.therapy.rxcui:1147220 | normalize.th...,BCL2,interaction_search,interaction_search,16,16,PMID: 24398428\n Abstract: Activating BRAF kin...,"You are an expert biomedical scientist, bioche..."
2,15870285,The expression of the glutathione S-transferas...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",glutathione | dexamethasone | Oltipraz | Retin...,normalize.therapy.rxcui:4890 | normalize.thera...,BCL2,interaction_search,interaction_search,16,16,PMID: 15870285\n Abstract: The expression of t...,"You are an expert biomedical scientist, bioche..."
3,22649091,During a clinical trial of the tyrosine kinase...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine | dasatinib anhydrous,normalize.therapy.rxcui:10962 | normalize.ther...,BCL2,interaction_search,interaction_search,15,15,PMID: 22649091\n Abstract: During a clinical t...,"You are an expert biomedical scientist, bioche..."
4,15319449,Leukemia inhibitory factor (LIF) cooperates wi...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",serine,normalize.therapy.rxcui:9671,BCL2,interaction_search,interaction_search,15,15,PMID: 15319449\n Abstract: Leukemia inhibitory...,"You are an expert biomedical scientist, bioche..."


In [22]:
print(here_we_go['prompt'][0])

You are an expert biomedical scientist, biochemist, and scientific curator trained to identify drug-gene interactions from scientific literature. Given a list of drugs, a gene, and a scientific abstract, your task is to determine whether an interaction between a drug and a gene is occuring and assign it an interaction directionality. Use the following tools to help perform these tasks.
    
    *Interaction*
    - An interaction between a small molecule and a gene or gene product. 

    *Interaction Directionality*
    - Activating -> Activating interactions are those where the drug increases the biological activity or expression of a gene target.
    - Inhibiting -> Inhibiting interactions are those where the drug decreases the biological activity or expression of a gene target.

    The drugs to consider for this task are TRAIL | calphostin C | G&ouml; 6976 | rottlerin | phorbol 12-myristate 13-acetate. For each drug, fill out the following JSON schema:
    {
        "pmid": "EXACT P

In [23]:
import boto3
import json

# Initialize the Bedrock Runtime client
bedrock = boto3.client("bedrock-runtime", region_name="us-east-2")

# Replace with your actual inference profile ID or ARN
INFERENCE_PROFILE_ID = "us.anthropic.claude-3-5-sonnet-20240620-v1:0"
# Not opus 4 I guess????


def query_claude_sonnet(prompt: str) -> str:
    body = {
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 1024,
        "temperature": 0.0,
        "anthropic_version": "bedrock-2023-05-31"
    }

    try:
        response = bedrock.invoke_model(
            body=json.dumps(body),
            modelId=INFERENCE_PROFILE_ID,
            contentType="application/json",
            accept="application/json"
        )
        response_body = json.loads(response["body"].read())
        return response_body["content"][0]["text"]
    except Exception as e:
        return f"[Error] {str(e)}"

# Example usage
# response = query_claude_opus("Hello! This is a test message!")
# print(response)


In [24]:
here_we_go['response'] = None

for idx, row in here_we_go.iterrows():
    here_we_go.at[idx,'response'] = query_claude_sonnet(row['prompt'])
    print(f'{idx} Done')

0 Done
1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
9 Done
10 Done
11 Done
12 Done
13 Done
14 Done
15 Done
16 Done
17 Done
18 Done
19 Done
20 Done
21 Done
22 Done
23 Done
24 Done
25 Done
26 Done
27 Done
28 Done
29 Done
30 Done
31 Done
32 Done
33 Done
34 Done
35 Done
36 Done
37 Done
38 Done
39 Done
40 Done
41 Done
42 Done
43 Done
44 Done
45 Done
46 Done
47 Done
48 Done
49 Done
50 Done
51 Done
52 Done
53 Done
54 Done
55 Done
56 Done
57 Done
58 Done
59 Done
60 Done
61 Done
62 Done
63 Done
64 Done
65 Done
66 Done
67 Done
68 Done
69 Done
70 Done
71 Done
72 Done
73 Done
74 Done
75 Done
76 Done
77 Done
78 Done
79 Done
80 Done
81 Done
82 Done
83 Done
84 Done
85 Done
86 Done
87 Done
88 Done
89 Done
90 Done
91 Done
92 Done
93 Done
94 Done
95 Done
96 Done
97 Done
98 Done
99 Done


In [None]:
# here_we_go.to_excel('Literature_Prioritization_control3_use_this.xlsx')

### Extract!

In [29]:
import re, json

def extract_json_objects(text):
    """
    Extract all JSON-like objects (dicts) from a text string.
    Returns a list of parsed dicts.
    """
    objects = []
    brace_stack = []
    start = None
    
    for i, ch in enumerate(text):
        if ch == "{":
            if not brace_stack:
                start = i
            brace_stack.append("{")
        elif ch == "}":
            if brace_stack:
                brace_stack.pop()
                if not brace_stack and start is not None:
                    snippet = text[start:i+1]
                    try:
                        objects.append(json.loads(snippet))
                    except json.JSONDecodeError:
                        # fallback: strip trailing commas etc.
                        snippet = re.sub(r",\s*}", "}", snippet)
                        snippet = re.sub(r",\s*]", "]", snippet)
                        try:
                            objects.append(json.loads(snippet))
                        except:
                            pass
                    start = None
    return objects


def extract_note_text(text):
    """
    Grab any trailing 'Note:' or 'Explanation:' text from a response.
    Returns a single string ('' if none found).
    """
    match = re.search(r"(?:Note|Explanation)[:\-–]\s*(.+)", text, flags=re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else ""


In [30]:
here_we_go['json'] = None
here_we_go['free_text_explanation'] = None
for idx, row in here_we_go.iterrows():
    here_we_go.at[idx, 'free_text_explanation'] = extract_note_text(row['response'])
    here_we_go.at[idx,'json'] = extract_json_objects(row['response'])

here_we_go.head()

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_regulation_evidence,context,prompt,response,json,free_text_explanation
0,12941843,Signaling pathways involved in survival respon...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",TRAIL | calphostin C | G&ouml; 6976 | rottleri...,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,16,16,PMID: 12941843\n Abstract: Signaling pathways ...,"You are an expert biomedical scientist, bioche...","Based on the abstract provided, I will analyze...","[{'pmid': '12941843', 'drug_name': 'TRAIL', 'g...",
1,24398428,Activating BRAF kinase mutations serve as onco...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",vemurafenib | Ganetespib,normalize.therapy.rxcui:1147220 | normalize.th...,BCL2,interaction_search,interaction_search,16,16,PMID: 24398428\n Abstract: Activating BRAF kin...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I will fill ou...","[{'pmid': '24398428', 'drug_name': 'vemurafeni...",
2,15870285,The expression of the glutathione S-transferas...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",glutathione | dexamethasone | Oltipraz | Retin...,normalize.therapy.rxcui:4890 | normalize.thera...,BCL2,interaction_search,interaction_search,16,16,PMID: 15870285\n Abstract: The expression of t...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I will analyze...","[{'pmid': '15870285', 'drug_name': 'dexamethas...",
3,22649091,During a clinical trial of the tyrosine kinase...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine | dasatinib anhydrous,normalize.therapy.rxcui:10962 | normalize.ther...,BCL2,interaction_search,interaction_search,15,15,PMID: 22649091\n Abstract: During a clinical t...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I can identify...","[{'pmid': '22649091', 'drug_name': 'dasatinib ...",The abstract describes an interaction between ...
4,15319449,Leukemia inhibitory factor (LIF) cooperates wi...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",serine,normalize.therapy.rxcui:9671,BCL2,interaction_search,interaction_search,15,15,PMID: 15319449\n Abstract: Leukemia inhibitory...,"You are an expert biomedical scientist, bioche...","Based on the given abstract, there is no menti...","[{'pmid': '15319449', 'drug_name': 'serine', '...",


In [31]:
here_we_go['json'][0]

[{'pmid': '12941843',
  'drug_name': 'TRAIL',
  'gene_name': 'XIAP',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'INHIBITING',
  'evidence': 'In RKO, rottlerin induced the release of cytochrome c, HtrA2/Omi, Smac/DIABLO, and AIF from the mitochondria, potentiated in combination with TRAIL, with concomitant caspase activation and down-regulation of XIAP.'},
 {'pmid': '12941843',
  'drug_name': 'calphostin C',
  'gene_name': 'PKC',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'INHIBITING',
  'evidence': 'Calphostin c [an inhibitor of classic and novel isoforms of protein kinase C (PKC)] sensitized five of six cell lines to TRAIL'},
 {'pmid': '12941843',
  'drug_name': 'Gö 6976',
  'gene_name': 'PKC',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'INHIBITING',
  'evidence': 'Go6976, (inhibitor of classic PKC isoforms), did not influence TRAIL sensitivity.'},
 {'pmid': '12941843',
  'drug_name': 'rottlerin',
  'gene_name': 'PKC delta',


In [32]:
flat_df = pd.DataFrame([obj for lst in here_we_go["json"] for obj in lst])
flat_df


Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence
0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc..."
1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...
2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d..."
3,12941843,rottlerin,PKC delta,YES,INHIBITING,"Rottlerin, an inhibitor of novel isoforms of P..."
4,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w..."
...,...,...,...,...,...,...
264,8947586,glycine,RARA,YES,ACTIVATING,DNA sequence analysis revealed a mutation in t...
265,8947586,aspartic acid,RARA,YES,INHIBITING,DNA sequence analysis revealed a mutation in t...
266,11125300,Recombinant Tumor Necrosis Factor Family Protein,TNFR1,YES,ACTIVATING,Stimulation of TNFR1 was sufficient to induce ...
267,11125300,Recombinant Tumor Necrosis Factor Family Protein,TNFR2,YES,ACTIVATING,Experiments based on the use of receptor-agoni...


In [33]:
flat_df = flat_df[(flat_df['interaction_occurs_with_gene']=='YES') & (flat_df['gene_name']!='N/A')]
flat_df

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence
0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc..."
1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...
2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d..."
3,12941843,rottlerin,PKC delta,YES,INHIBITING,"Rottlerin, an inhibitor of novel isoforms of P..."
4,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w..."
...,...,...,...,...,...,...
263,8947586,vitamin D3,RARA,YES,ACTIVATING,We have observed that the resistant cell line ...
264,8947586,glycine,RARA,YES,ACTIVATING,DNA sequence analysis revealed a mutation in t...
265,8947586,aspartic acid,RARA,YES,INHIBITING,DNA sequence analysis revealed a mutation in t...
266,11125300,Recombinant Tumor Necrosis Factor Family Protein,TNFR1,YES,ACTIVATING,Stimulation of TNFR1 was sufficient to induce ...


In [36]:
def _normalize_gene(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/gene/normalize?q={word}',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['gene']['id'],
                    response['gene']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]
    
def _normalize_therapy(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/therapy/normalize?q={word}&infer_namespace=true',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['therapy']['id'],
                    response['therapy']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]


In [37]:
flat_df['gene_concept'] = None
flat_df['gene_label'] = None
flat_df['gene_match_type'] = None
flat_df['drug_concept'] = None
flat_df['drug_label'] = None
flat_df['drug_match_type'] = None
for idx, row in tqdm(flat_df.iterrows()):
    drug_match_type, drug_concept, drug_label = _normalize_therapy(row['drug_name'])
    gene_match_type, gene_concept, gene_label = _normalize_gene(row['gene_name'])

    flat_df.at[idx, 'gene_concept'] = gene_concept
    flat_df.at[idx, 'gene_label'] = gene_label
    flat_df.at[idx, 'gene_match_type'] = gene_match_type
    flat_df.at[idx, 'drug_concept'] = drug_concept
    flat_df.at[idx, 'drug_label'] = drug_label
    flat_df.at[idx, 'drug_match_type'] = drug_match_type

flat_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_concept'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_label'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_match_type'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc...",normalize.gene.hgnc:592,XIAP,100,normalize.therapy.iuphar.ligand:5065,TRAIL,80
1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...,normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:5156,calphostin C,80
2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d...",normalize.gene.hgnc:30500,PRRT2,60,,,0
3,12941843,rottlerin,PKC delta,YES,INHIBITING,"Rottlerin, an inhibitor of novel isoforms of P...",,,0,normalize.therapy.iuphar.ligand:2611,rottlerin,80
4,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w...",normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:2341,phorbol 12-myristate 13-acetate,80
...,...,...,...,...,...,...,...,...,...,...,...,...
263,8947586,vitamin D3,RARA,YES,ACTIVATING,We have observed that the resistant cell line ...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:1244014,vitamin D3,80
264,8947586,glycine,RARA,YES,ACTIVATING,DNA sequence analysis revealed a mutation in t...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:4919,glycine,80
265,8947586,aspartic acid,RARA,YES,INHIBITING,DNA sequence analysis revealed a mutation in t...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:1169,aspartic acid,80
266,11125300,Recombinant Tumor Necrosis Factor Family Protein,TNFR1,YES,ACTIVATING,Stimulation of TNFR1 was sufficient to induce ...,normalize.gene.hgnc:11916,TNFRSF1A,80,normalize.therapy.ncit:C1941,Recombinant Tumor Necrosis Factor Family Protein,80


In [38]:
flat_df['gene_concept'].value_counts(dropna=False)

gene_concept
None                         24
normalize.gene.hgnc:1097     11
normalize.gene.hgnc:336      10
normalize.gene.hgnc:8975      9
normalize.gene.hgnc:12679     9
                             ..
normalize.gene.hgnc:6973      1
normalize.gene.hgnc:5173      1
normalize.gene.hgnc:7646      1
normalize.gene.hgnc:3553      1
normalize.gene.hgnc:11917     1
Name: count, Length: 77, dtype: int64

In [39]:
flat_df['drug_concept'].value_counts(dropna=False)

drug_concept
normalize.therapy.rxcui:1147220         6
normalize.therapy.rxcui:282388          6
normalize.therapy.ncit:C148229          6
normalize.therapy.rxcui:6964            5
normalize.therapy.ncit:C1277            3
                                       ..
normalize.therapy.rxcui:9671            1
normalize.therapy.drugbank:DB02097      1
normalize.therapy.rxcui:3024            1
normalize.therapy.rxcui:24698           1
normalize.therapy.iuphar.ligand:9404    1
Name: count, Length: 150, dtype: int64

In [40]:
final_results = flat_df[(flat_df['gene_concept'].isna()==False)].reset_index(drop=True)
final_results

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,12941843,TRAIL,XIAP,YES,INHIBITING,"In RKO, rottlerin induced the release of cytoc...",normalize.gene.hgnc:592,XIAP,100,normalize.therapy.iuphar.ligand:5065,TRAIL,80
1,12941843,calphostin C,PKC,YES,INHIBITING,Calphostin c [an inhibitor of classic and nove...,normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:5156,calphostin C,80
2,12941843,Gö 6976,PKC,YES,INHIBITING,"Go6976, (inhibitor of classic PKC isoforms), d...",normalize.gene.hgnc:30500,PRRT2,60,,,0
3,12941843,phorbol 12-myristate 13-acetate,PKC,YES,INHIBITING,"Furthermore, the incubation of HCT116 or RKO w...",normalize.gene.hgnc:30500,PRRT2,60,normalize.therapy.iuphar.ligand:2341,phorbol 12-myristate 13-acetate,80
4,24398428,vemurafenib,BRAF,YES,INHIBITING,"Selective BRAF(V600E) inhibitors, such as vemu...",normalize.gene.hgnc:1097,BRAF,100,normalize.therapy.rxcui:1147220,vemurafenib,80
...,...,...,...,...,...,...,...,...,...,...,...,...
183,8947586,vitamin D3,RARA,YES,ACTIVATING,We have observed that the resistant cell line ...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:1244014,vitamin D3,80
184,8947586,glycine,RARA,YES,ACTIVATING,DNA sequence analysis revealed a mutation in t...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:4919,glycine,80
185,8947586,aspartic acid,RARA,YES,INHIBITING,DNA sequence analysis revealed a mutation in t...,normalize.gene.hgnc:9864,RARA,100,normalize.therapy.rxcui:1169,aspartic acid,80
186,11125300,Recombinant Tumor Necrosis Factor Family Protein,TNFR1,YES,ACTIVATING,Stimulation of TNFR1 was sufficient to induce ...,normalize.gene.hgnc:11916,TNFRSF1A,80,normalize.therapy.ncit:C1941,Recombinant Tumor Necrosis Factor Family Protein,80


In [41]:
188 / 212

0.8867924528301887

In [None]:
# final_results.to_excel('final_results_control_test3.xlsx')

### Graph

In [4]:
import pandas as pd

final_results = pd.read_excel('literature_curation/final_results_control_test3.xlsx')

In [5]:
import psycopg2
import pandas as pd
import numpy as np

# 1) Prepare PMIDs from your DataFrame
pmids = pd.Series(final_results['pmid']).dropna().astype(str).unique().tolist()

# 2) Connect
conn = psycopg2.connect(
    dbname="dgidb_2025",
    user="mjc014",
    password="",
    host="localhost",
    port="5432"
)

# 3) Query: cast column -> text so it matches the text[] param
query = """
SELECT *
FROM public.publications
WHERE pmid::text = ANY(%s)
ORDER BY id ASC;
"""

df = pd.read_sql(query, conn, params=(pmids,))
conn.close()

print(f"Found {len(df)} PMIDs already in DGIdb")
display(df.head())

# 4) Novelty %
found_pmids = set(df['pmid'].astype(str).unique())
all_pmids   = set(pmids)
novel_pmids = sorted(all_pmids - found_pmids)

novelty_pct = 100.0 * (len(novel_pmids) / max(1, len(all_pmids)))
print(f"Novel PMIDs: {len(novel_pmids)} / {len(all_pmids)} = {novelty_pct:.2f}%")


Found 88 PMIDs already in DGIdb


  df = pd.read_sql(query, conn, params=(pmids,))


Unnamed: 0,id,pmid,citation,created_at,updated_at
0,0109e3d8-8420-454b-9654-b6bd6a1da050,34710737,"Fang DD et al., 2022, FLT3 inhibition by olver...",2024-11-26 15:26:39.610621,2024-11-26 15:28:51.964822
1,029dfb2e-6ad3-449f-a579-7cd05f85c487,26494859,"Tonsing-Carter E et al., 2015, Potentiation of...",2024-11-26 15:28:40.380681,2024-11-26 15:28:52.002905
2,03138d54-85b6-4b66-b184-53fdcbef8df5,15632153,"White KA et al., 2005, Limiting effects of RIP...",2024-11-26 14:23:28.626572,2024-11-26 14:25:28.349141
3,03392c0e-8dd1-45b8-897e-6f0a5529ca1d,19891553,"Kim SH et al., 2009, Genetic polymorphisms of ...",2024-11-26 14:27:50.762585,2024-11-26 14:33:10.628539
4,052c03a9-2519-4f0b-9635-48896af98936,26554404,"Infarinato NR et al., 2016, The ALK/ROS1 Inhib...",2024-11-26 14:05:03.412722,2024-11-26 14:08:11.783127


Novel PMIDs: 0 / 88 = 0.00%


In [6]:

import plotly.express as px

# Data summary
counts = {
    "Novel PMIDs": len(novel_pmids),
    "Known PMIDs (in DGIdb)": len(found_pmids)
}

# Create dataframe
plot_df = pd.DataFrame(list(counts.items()), columns=["Category", "Count"])

# Build pie chart
fig = px.pie(
    plot_df,
    names="Category",
    values="Count",
    color="Category",
    color_discrete_map={
        "Novel PMIDs": "#1f77b4",            # blue
        "Known PMIDs (in DGIdb)": "#ff7f0e"  # orange
    },
    hole=0.35  # makes it a donut chart, easier to read in talks/posters
)

# Update layout for professional styling
fig.update_traces(
    textinfo="label+percent",   # show both category and %
    textfont_size=16,
    pull=[0.05, 0]  # gently pull out the "Novel" slice for emphasis
)

fig.update_layout(
    title=dict(
        text="Novelty of PMIDs Compared to DGIdb",
        font=dict(size=22, family="Arial, sans-serif"),
        x=0.5, xanchor="center"
    ),
    legend=dict(
        font=dict(size=14),
        orientation="h",
        yanchor="bottom",
        y=-0.15,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=60, b=60, l=40, r=40)
)

fig.show()


import plotly
import plotly.io as pio

print("plotly:", plotly.__version__)

# Save as high-res PNG
fig.write_image("pmid_control_novelty_pie.png",
                engine="kaleido",
                width=1600, height=1200, scale=3)

# Vector versions (great for posters)
fig.write_image("pmid_control_novelty_pie.svg", engine="kaleido")
fig.write_image("pmid_control_novelty_pie.pdf", engine="kaleido")



plotly: 6.0.0


In [7]:
import pandas as pd
import requests
from collections import defaultdict

# -------- Inputs --------
df_in = final_results.copy()
df_in["gene_label"] = df_in["gene_label"].astype(str).str.strip()
df_in["drug_label"] = df_in["drug_label"].astype(str).str.strip()

# Unique pairs and gene list
pairs = df_in[["gene_label", "drug_label"]].dropna().drop_duplicates()
genes = sorted(pairs["gene_label"].unique().tolist())

GQL_URL = "https://dgidb.org/api/graphql"

GQL_QUERY = """
query GeneInteractions($genes: [String!]!) {
  genes(names: $genes) {
    nodes {
      name
      conceptId
      interactions {
        drug {
          name
          conceptId
          approved
        }
        gene {
          name
          conceptId
          longName
        }
        interactionScore
        interactionTypes { type directionality }
        interactionAttributes { name value }
        publications { pmid }
        sources { sourceDbName }
      }
    }
  }
}
"""

def batched(seq, n=40):
    for i in range(0, len(seq), n):
        yield seq[i:i+n]

# gene -> set of lowercase drug names that DGIdb reports for that gene
gene_to_known_drugs = defaultdict(set)
session = requests.Session()
session.headers.update({"Accept": "application/json"})

for chunk in batched(genes, 40):
    resp = session.post(GQL_URL, json={"query": GQL_QUERY, "variables": {"genes": chunk}}, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    if "errors" in data:
        raise RuntimeError(data["errors"])
    nodes = (data.get("data", {}).get("genes", {}) or {}).get("nodes", []) or []
    for node in nodes:
        gname = (node.get("name") or "").strip()
        if not gname:
            continue
        known = gene_to_known_drugs[gname]
        for itx in node.get("interactions", []) or []:
            drug_obj = itx.get("drug") or {}
            # Prefer structured drug.name; fall back if older fields exist
            dname = (drug_obj.get("name")
                     or itx.get("drugName")  # backward compat (older examples)
                     or "").strip()
            if dname:
                known.add(dname.lower())

# Flag known vs novel (case-insensitive drug name check per gene)
pairs["known_in_dgidb"] = pairs.apply(
    lambda r: r["drug_label"].lower() in gene_to_known_drugs.get(r["gene_label"], set()),
    axis=1
)

# Metrics
n_total = len(pairs)
n_known = int(pairs["known_in_dgidb"].sum())
n_novel = int((~pairs["known_in_dgidb"]).sum())
pct_novel = (100.0 * n_novel / n_total) if n_total else 0.0

print(f"Unique gene–drug pairs: {n_total}")
print(f"Known in DGIdb:         {n_known}")
print(f"Novel (not in DGIdb):   {n_novel}")
print(f"Novelty = {pct_novel:.2f}%")

# Inspect a few novel pairs
pairs.loc[~pairs["known_in_dgidb"]].sort_values(["gene_label","drug_label"]).head(10)


Unique gene–drug pairs: 169
Known in DGIdb:         72
Novel (not in DGIdb):   97
Novelty = 57.40%


Unnamed: 0,gene_label,drug_label,known_in_dgidb
42,ABL1,panobinostat,False
52,AGTR1,EXP3174,False
177,AGTR2,ORTHOVANADATE,False
164,AR,Therapeutic Androgen,False
19,BCL2,Lisaftoclax,False
72,CD5,cyclosporine,False
75,CD5,phorbol 12-myristate 13-acetate,False
73,CD5,sirolimus,False
130,CDK1,threonine,False
129,CDK1,tyrosine,False


In [8]:
import pandas as pd
import plotly.graph_objects as go
from datetime import date

# ---- Inputs: counts from your earlier classification ----
n_total = len(pairs)
n_known = int(pairs["known_in_dgidb"].sum())
n_novel = int((~pairs["known_in_dgidb"]).sum())

pct_known = (100.0 * n_known / n_total) if n_total else 0.0
pct_novel = (100.0 * n_novel / n_total) if n_total else 0.0

# ---- Data frame (for clarity; not strictly required) ----
df_plot = pd.DataFrame({
    "Category": ["Novel Interactions", "Known Interactions (DGIdb)"],
    "Count": [n_novel, n_known],
    "Percent": [pct_novel, pct_known],
})

# ---- Colors (high-contrast, colorblind-friendly) ----
COLOR_NOVEL = "#1f77b4"   # blue
COLOR_KNOWN = "#ff7f0e"   # orange

# ---- Build a 100% stacked horizontal bar ----
fig = go.Figure()

fig.add_trace(go.Bar(
    x=[pct_novel], y=["Interactions"],
    orientation="h",
    name="Novel",
    marker_color=COLOR_NOVEL,
    text=[f"{pct_novel:.1f}% ({n_novel:,})"],
    textposition="inside",
    textfont=dict(size=16),
    hovertemplate="Novel: %{x:.1f}%<br>Count: " + f"{n_novel:,}" + "<extra></extra>"
))

fig.add_trace(go.Bar(
    x=[pct_known], y=["Interactions"],
    orientation="h",
    name="Known in DGIdb",
    marker_color=COLOR_KNOWN,
    text=[f"{pct_known:.1f}% ({n_known:,})"],
    textposition="inside",
    textfont=dict(size=16),
    hovertemplate="Known: %{x:.1f}%<br>Count: " + f"{n_known:,}" + "<extra></extra>"
))

fig.update_layout(
    barmode="stack",
    xaxis=dict(range=[0, 100], title=None, showgrid=False, showticklabels=False),
    yaxis=dict(title=None, showgrid=False),
    plot_bgcolor="white",
    paper_bgcolor="white",
    legend=dict(
        orientation="h",
        yanchor="bottom", y=-0.15,
        xanchor="center", x=0.5,
        font=dict(size=14)
    ),
    margin=dict(t=80, b=80, l=40, r=40),
    title=dict(
        text=f"Novelty of Gene–Drug Interactions vs DGIdb",
        x=0.5, xanchor="center",
        font=dict(size=22, family="Arial, sans-serif")
    ),
    font=dict(family="Arial, sans-serif", size=16)
)

# Subtitle-like annotation with totals and date
fig.add_annotation(
    x=0.5, y=-0.35, xref="paper", yref="paper", showarrow=False,
    text=f"Total unique pairs: {n_total:,} • Generated {date.today().isoformat()}",
    font=dict(size=14, color="gray")
)

fig.show()

# ---- High-quality export (PNG/SVG/PDF) via PlotlyScope ----
from kaleido.scopes.plotly import PlotlyScope
scope = PlotlyScope()

png = scope.transform(fig.to_dict(), format="png", width=1600, height=900, scale=3)
with open("interaction_control_novelty_100pct_bar.png", "wb") as f:
    f.write(png)

svg = scope.transform(fig.to_dict(), format="svg", width=1600, height=900, scale=3)
with open("interaction_control_novelty_100pct_bar.svg", "wb") as f:
    f.write(svg)

pdf = scope.transform(fig.to_dict(), format="pdf", width=1600, height=900, scale=3)
with open("interaction_control_novelty_100pct_bar.pdf", "wb") as f:
    f.write(pdf)
