In [81]:
import pandas as pd
import sys, os
import inflect
from tqdm import tqdm
import requests
import ast

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import score
import novel
import search_set
import indicator
import literature


# Surpress NLP Mask Warning for Apple Silicon
import warnings
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r".*encoder_attention_mask.*BertSdpaSelfAttention\.forward"
)

## Example Code for Identifying Literature to Curate
Demo notebook demonstrating how to use NLP methods to identify literature containing possible interactions for prioritization, manual review, and inclusion as supporting evidence in DGIdb.

In [None]:
gene = 'BCL2'
search_set.generate_search_set(gene)

In [2]:
abstracts = pd.read_csv('abstracts.csv')
abstracts

Unnamed: 0,Title,Abstract
0,39796006,"In the tissue regeneration field, stem cell tr..."
1,40524014,"Follicular lymphoma (FL), marginal zone lympho..."
2,40393040,Primary cutaneous diffuse large B cell lymphom...
3,40619042,"Primary ovarian insufficiency (POI), which aff..."
4,40444042,This investigation sought to explore the inhib...
...,...,...
39523,25873999,The role played by microRNAs in the deregulati...
39524,34394999,Late-onset posttransplant lymphoproliferative ...
39525,22431999,Several studies demonstrated that treatment wi...
39526,31465999,Antiviral interferons (IFN-alpha/beta) are pos...


In [11]:
import importlib
importlib.reload(novel)
results = novel.batch(abstracts['Abstract'])
results

Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
100%|██████████| 39528/39528 [1:27:22<00:00,  7.54it/s]


Unnamed: 0,original_text,entity_group,score,word,start,end
0,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999927,CD47 - blocking antibody,273,295
1,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999985,CD47,306,310
2,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999992,SIRPα,311,316
3,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999992,anti - CD20 antibody,403,421
4,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.882086,rituximab,422,431
...,...,...,...,...,...,...
712789,This study aimed to determine the effect of 8-...,DISEASE,0.999993,T2D,596,599
712790,This study aimed to determine the effect of 8-...,DISEASE,0.937653,insulin,781,788
712791,This study aimed to determine the effect of 8-...,DISEASE,0.999973,musculoaponeurotic fibrosarcoma,826,857
712792,This study aimed to determine the effect of 8-...,DISEASE,0.999990,T2D,1078,1081


In [15]:
results.to_excel('checkpoint.xlsx')

In [24]:
def _singularize(word):
    inflector = inflect.engine()
    return inflector.singular_noun(word) or word

def _normalize_therapy(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/therapy/normalize?q={word}&infer_namespace=true',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['therapy']['id'],
                    response['therapy']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]

# Main loop
checkpoint_interval = 5000
output_base = "normalized_results_checkpoint"
for idx, (index, row) in enumerate(tqdm(results[results['entity_group'] == 'CHEMICAL'].iterrows()), 1):
    word = _singularize(row['word'])
    norm_result = _normalize_therapy(word)

    results.at[index, 'concept_match_type'] = norm_result[0]
    results.at[index, 'concept_id'] = norm_result[1]
    results.at[index, 'concept_label'] = norm_result[2]

    if idx % checkpoint_interval == 0:
        checkpoint_filename = f"{output_base}_checkpoint_{idx}.xlsx"
        results.to_excel(checkpoint_filename, index=False)
        print(f"Checkpoint saved at row {idx} -> {checkpoint_filename}")

# Final save after loop completes
final_filename = f"{output_base}_final.xlsx"
results.to_excel(final_filename, index=False)
print(f"Final results saved -> {final_filename}")

  results.at[index, 'concept_match_type'] = norm_result[0]
5000it [20:29, 18.59s/it]

Checkpoint saved at row 5000 -> normalized_results_checkpoint_checkpoint_5000.xlsx


10000it [40:53, 20.07s/it]

Checkpoint saved at row 10000 -> normalized_results_checkpoint_checkpoint_10000.xlsx


15000it [59:47, 19.45s/it]

Checkpoint saved at row 15000 -> normalized_results_checkpoint_checkpoint_15000.xlsx


20000it [1:19:35, 10.96s/it]

Checkpoint saved at row 20000 -> normalized_results_checkpoint_checkpoint_20000.xlsx


25001it [1:32:19, 10.39s/it]

Checkpoint saved at row 25000 -> normalized_results_checkpoint_checkpoint_25000.xlsx


30000it [1:45:40, 16.57s/it]

Checkpoint saved at row 30000 -> normalized_results_checkpoint_checkpoint_30000.xlsx


35000it [1:58:38, 17.44s/it]

Checkpoint saved at row 35000 -> normalized_results_checkpoint_checkpoint_35000.xlsx


40001it [2:11:23, 11.58s/it]

Checkpoint saved at row 40000 -> normalized_results_checkpoint_checkpoint_40000.xlsx


45001it [2:23:48, 11.69s/it]

Checkpoint saved at row 45000 -> normalized_results_checkpoint_checkpoint_45000.xlsx


50000it [2:36:12, 15.80s/it]

Checkpoint saved at row 50000 -> normalized_results_checkpoint_checkpoint_50000.xlsx


55001it [2:48:44,  8.49s/it]

Checkpoint saved at row 55000 -> normalized_results_checkpoint_checkpoint_55000.xlsx


60000it [3:01:07, 12.66s/it]

Checkpoint saved at row 60000 -> normalized_results_checkpoint_checkpoint_60000.xlsx


65001it [3:13:22, 11.92s/it]

Checkpoint saved at row 65000 -> normalized_results_checkpoint_checkpoint_65000.xlsx


70001it [3:25:41, 11.18s/it]

Checkpoint saved at row 70000 -> normalized_results_checkpoint_checkpoint_70000.xlsx


75000it [3:37:54, 12.61s/it]

Checkpoint saved at row 75000 -> normalized_results_checkpoint_checkpoint_75000.xlsx


80000it [3:50:11, 15.41s/it]

Checkpoint saved at row 80000 -> normalized_results_checkpoint_checkpoint_80000.xlsx


85001it [4:02:34, 11.23s/it]

Checkpoint saved at row 85000 -> normalized_results_checkpoint_checkpoint_85000.xlsx


90001it [4:15:19, 10.00s/it]

Checkpoint saved at row 90000 -> normalized_results_checkpoint_checkpoint_90000.xlsx


95001it [4:27:55, 11.33s/it]

Checkpoint saved at row 95000 -> normalized_results_checkpoint_checkpoint_95000.xlsx


96459it [4:31:13,  5.93it/s]


Final results saved -> normalized_results_checkpoint_final.xlsx


In [None]:
tdf = results[(results['concept_match_type']!=0) & (results['concept_match_type'].isna()==False)].reset_index(drop=True)

condensed_results = tdf.groupby('original_text').apply(
    lambda group: pd.Series({
        'DRUG_LABELS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_label'].dropna().astype(str).unique()),
        'DRUG_IDS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_id'].dropna().astype(str).unique())
    })
).reset_index()

condensed_results

  condensed_results = tdf.groupby('original_text').apply(


Unnamed: 0,original_text,DRUG_LABELS,DRUG_IDS
0,ADLE was administered to high-fat diet treate...,nitric oxide | palmitate | oxygen | nitrite io...,normalize.therapy.rxcui:7442 | normalize.thera...
1,GOT1-bearing BALB/c nude mice were treated wi...,sonidegib,normalize.therapy.rxcui:1659191
2,"In this study, we have investigated the cytot...",lactate | ACRIDINE ORANGE | PROPIDIUM | oxygen,normalize.therapy.rxcui:114202 | normalize.the...
3,PNS were identified from the Traditional Chin...,"glutathione | malondialdehyde | ESTROGENS, CON...",normalize.therapy.rxcui:4890 | normalize.thera...
4,Several databases were screened for bioactive...,Tenamfetamine | glutathione,normalize.therapy.ncit:C80152 | normalize.ther...
...,...,...,...
15320,β-thalassemias are common hemoglobinopathies d...,oxygen,normalize.therapy.rxcui:7806
15321,δ-Tocotrienol is a naturally occurring proteas...,CERAMIDE,normalize.therapy.chembl:CHEMBL155886
15322,The malondialdehyde (MDA) level and TA count ...,malondialdehyde | Tenamfetamine | tantalum,normalize.therapy.rxcui:1657018 | normalize.th...
15323,• To investigate the role that oxidative stres...,streptozocin | glutathione | oxygen,normalize.therapy.rxcui:10114 | normalize.ther...


In [43]:
merged_df = pd.merge(
    abstracts,
    condensed_results,
    left_on='Abstract',
    right_on='original_text',
    how='left'
)
merged_df = merged_df[merged_df['DRUG_LABELS'].isnull()==False].reset_index(drop=True)
merged_df

Unnamed: 0,Title,Abstract,original_text,DRUG_LABELS,DRUG_IDS
0,40393040,Primary cutaneous diffuse large B cell lymphom...,Primary cutaneous diffuse large B cell lymphom...,cyclophosphamide anhydrous | doxorubicin hydro...,normalize.therapy.rxcui:1545988 | normalize.th...
1,40619042,"Primary ovarian insufficiency (POI), which aff...","Primary ovarian insufficiency (POI), which aff...",estradiol valerate | progesterone | Luteolin |...,normalize.therapy.rxcui:24395 | normalize.ther...
2,40444042,This investigation sought to explore the inhib...,This investigation sought to explore the inhib...,wogonin | bromocriptine,normalize.therapy.rxcui:2001619 | normalize.th...
3,40447053,The fate of granulosa cells determines ovarian...,The fate of granulosa cells determines ovarian...,acetate | propionate | butyric acid,normalize.therapy.rxcui:164 | normalize.therap...
4,40512124,BCL-2 is an anti-apoptotic protein expressed b...,BCL-2 is an anti-apoptotic protein expressed b...,venetoclax,normalize.therapy.rxcui:1747556
...,...,...,...,...,...
15399,36042999,Background The Oncotype DX Recurrence Score (O...,Background The Oncotype DX Recurrence Score (O...,"ESTROGENS, CONJUGATED",normalize.therapy.chembl:CHEMBL1201649
15400,38580999,Smilax china L. (SCL) is a traditional herbal ...,Smilax china L. (SCL) is a traditional herbal ...,ASTILBIN,normalize.therapy.chembl:CHEMBL486017
15401,34376999,Glioblastoma multiforme (GBM) is the primary a...,Glioblastoma multiforme (GBM) is the primary a...,curcumin | BISDEMETHOXYCURCUMIN | DEMETHOXYCUR...,normalize.therapy.rxcui:2955 | normalize.thera...
15402,30515999,Lung squamous cell carcinoma (SCC) accounts fo...,Lung squamous cell carcinoma (SCC) accounts fo...,honokiol,normalize.therapy.rxcui:2562546


### Generate Score

In [44]:
dgidb_df = pd.read_csv('search/2025-08-13_BCL2_clin_score.csv')
dgidb_df = dgidb_df.drop_duplicates(subset=['Drug','Gene'], keep='first')
dgidb_df.head()

Unnamed: 0,nomenclature,Gene,long_name,gene_concept_id,interaction_score,drug_specificity,gene_specificity,evidence_score,source_db_name,source_db_version,...,directionality,definition,reference,drug_name,nomenclature-2,Drug,approved,immunotherapy,anti_neoplastic,drug_concept_id
0,Gene Symbol,BCL2,BCL2 apoptosis regulator,hgnc:990,2.284732,4.141413,0.183893,3.0,TALC,12-May-16,...,1.0,"In inhibitor interactions, the drug binds to a...","<a href=""https://en.wikipedia.org/wiki/Enzyme_...",OBLIMERSEN,Primary Name,OBLIMERSEN,False,False,True,ncit:C1870
3,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,DOLASTATIN 10,Primary Name,DOLASTATIN 10,False,False,False,ncit:C1300
4,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,EPISIALIN,Primary Name,MUC-1 ANTIGEN,False,False,False,ncit:C2407
5,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,MICELLAR PACLITAXEL,Primary Name,MICELLAR PACLITAXEL,False,False,False,ncit:C29256
6,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,BEAUVERICIN,Primary Name,BEAUVERICIN,False,False,False,ncit:C1011


In [87]:
import importlib
importlib.reload(indicator)

indicator.generate_interaction_evidence(merged_df, dgidb_df)

                                     

Results saved to 2025-08-19_BCL2.zip!


## Load Scores

In [88]:
tdf = score.load_pmid_assessments(f'2025-08-19_BCL2.zip', 'interaction_search')
tdf = tdf[tdf['label']=='interaction_evidence'].reset_index(drop=True)
tdf


Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method
0,40393040,Primary cutaneous diffuse large B cell lymphom...,interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",cyclophosphamide anhydrous | doxorubicin hydro...,normalize.therapy.rxcui:1545988 | normalize.th...,BCL2,interaction_search,interaction_search
1,40619042,"Primary ovarian insufficiency (POI), which aff...",interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",estradiol valerate | progesterone | Luteolin |...,normalize.therapy.rxcui:24395 | normalize.ther...,BCL2,interaction_search,interaction_search
2,40444042,This investigation sought to explore the inhib...,interaction_evidence,"{'direct_interaction': 2, 'binding_interaction...",wogonin | bromocriptine,normalize.therapy.rxcui:2001619 | normalize.th...,BCL2,interaction_search,interaction_search
3,40447053,The fate of granulosa cells determines ovarian...,interaction_evidence,"{'direct_interaction': 2, 'binding_interaction...",acetate | propionate | butyric acid,normalize.therapy.rxcui:164 | normalize.therap...,BCL2,interaction_search,interaction_search
4,40512124,BCL-2 is an anti-apoptotic protein expressed b...,interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",venetoclax,normalize.therapy.rxcui:1747556,BCL2,interaction_search,interaction_search
...,...,...,...,...,...,...,...,...,...
14070,35804999,Mantle cell lymphoma (MCL) is an aggressive B-...,interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",Anthracycline Antineoplastic Antibiotic | cyta...,normalize.therapy.ncit:C1594 | normalize.thera...,BCL2,interaction_search,interaction_search
14071,37296999,Prostate cancer is the second most common canc...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",Therapeutic Androgen | oxygen,normalize.therapy.ncit:C243 | normalize.therap...,BCL2,interaction_search,interaction_search
14072,38580999,Smilax china L. (SCL) is a traditional herbal ...,interaction_evidence,"{'direct_interaction': 0, 'binding_interaction...",ASTILBIN,normalize.therapy.chembl:CHEMBL486017,BCL2,interaction_search,interaction_search
14073,34376999,Glioblastoma multiforme (GBM) is the primary a...,interaction_evidence,"{'direct_interaction': 1, 'binding_interaction...",curcumin | BISDEMETHOXYCURCUMIN | DEMETHOXYCUR...,normalize.therapy.rxcui:2955 | normalize.thera...,BCL2,interaction_search,interaction_search


In [120]:
def unpack_total(score):
    if type(score) is float:
        return 0
    if score is None:
        return 0
    return ast.literal_eval(score)['unweighted_total']

def unpack_regulation(score):
    if type(score) is float:
        return 0
    if score is None:
        return 0
    return ast.literal_eval(score)['regulation_changes']

tdf['total_interaction_evidence'] = tdf['scores'].apply(unpack_total)
tdf['total_regulation_evidence'] = tdf['scores'].apply(unpack_total)


tdf.sort_values(by='total_regulation_evidence', ascending=False)[0:10]

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_reguation_evidence,total_regulation_evidence
5047,16983347,In order to define genetic determinants of pri...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",TRAIL | adenosine triphosphate | imatinib,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,19,19,19
13789,37935978,High metabolic flexibility is pivotal for the ...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",pyruvate | tyrosine | quizartinib | anhydrous ...,normalize.therapy.rxcui:72031 | normalize.ther...,BCL2,interaction_search,interaction_search,16,16,16
7706,27997540,Pediatric acute lymphoblastic leukemia (ALL) i...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",prednisolone,normalize.therapy.rxcui:8638,BCL2,interaction_search,interaction_search,16,16,16
8337,30312583,Maternal diabetes induces neural tube defects ...,interaction_evidence,"{'direct_interaction': 7, 'binding_interaction...",streptozocin,normalize.therapy.rxcui:10114,BCL2,interaction_search,interaction_search,16,16,16
12831,32816907,"Regulation of the stemness factor, SOX2, by cy...",interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine,normalize.therapy.rxcui:10962,BCL2,interaction_search,interaction_search,16,16,16
13577,26469962,Pancreatic ductal adenocarcinoma (PDAC) is an ...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",gemcitabine,normalize.therapy.rxcui:12574,BCL2,interaction_search,interaction_search,15,15,15
11405,29991802,The development of Barrett's esophagus (BE) an...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",APE1/Ref-1 Redox Inhibitor APX3330,normalize.therapy.ncit:C150216,BCL2,interaction_search,interaction_search,15,15,15
2877,38178196,Major depressive disorder (MDD) is a common bu...,interaction_evidence,"{'direct_interaction': 5, 'binding_interaction...",leucine | sucrose | sirolimus | tyrosine,normalize.therapy.rxcui:6308 | normalize.thera...,BCL2,interaction_search,interaction_search,15,15,15
13941,37004989,Mitophagy removes damaged mitochondria to main...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",Kynurenine | oxygen,normalize.therapy.drugbank:DB02070 | normalize...,BCL2,interaction_search,interaction_search,15,15,15
6205,26297434,The oncogenic transcription factor signal tran...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",sorafenib,normalize.therapy.rxcui:495881,BCL2,interaction_search,interaction_search,15,15,15


In [179]:
position = 1
print(tdf.sort_values(by='total_interaction_evidence', ascending=False).reset_index(drop=True)['scores'][position])

{'direct_interaction': 4, 'binding_interaction': 2, 'regulation_changes': 5, 'sensitivity_resistance': 3, 'pharmacogenomic_signals': 2, 'unweighted_total': 16}


## Build Prompts

In [181]:
here_we_go = tdf.sort_values(by='total_interaction_evidence', ascending=False)[0:100].reset_index(drop=True)

def generate_prompt_base(drugs, gene):
    prompt = f"""You are an expert biomedical scientist, biochemist, and scientific curator trained to identify drug-gene interactions from scientific literature. Given a list of drugs, a gene, and a scientific abstract, your task is to determine whether an interaction between a drug and a gene is occuring and assign it an interaction directionality. Use the following tools to help perform these tasks.
    
    *Interaction*
    - An interaction between a small molecule and a gene or gene product. 

    *Interaction Directionality*
    - Activating -> Activating interactions are those where the drug increases the biological activity or expression of a gene target.
    - Inhibiting -> Inhibiting interactions are those where the drug decreases the biological activity or expression of a gene target.

    The drugs to consider for this task are {drugs}. For each drug, fill out the following JSON schema:
    {{
        "pmid": "EXACT PMID FROM CONTEXT",
        "drug_name": "NAME OF DRUG",
        "gene_name": "NAME OF GENE BEING INTERACTED WITH",
        "interaction_occurs_with_gene": "YES" or "NO",
        "interaction_type": "ACTIVATING" or "INHIBITING",
        "evidence": "EXACT SENTENCE FROM ABSTRACT THAT SUPPORTS INTERACTION"        
    }} 

"""
    
    return prompt

here_we_go['context'] = None
here_we_go['prompt'] = None

for idx, row in here_we_go.iterrows():
    drugs = row['tagged_drugs']
    gene = row['gene']
    prompt_base = generate_prompt_base(drugs, gene)

    # tdf = here_we_go[here_we['pmid']==row['pmid']].reset_index(drop=True)
    context = f'PMID: {row['pmid']}\n Abstract: {row['abstract']}'

    full_prompt = f'{prompt_base}\n\n{context}'

    here_we_go.at[idx,'context'] = context
    here_we_go.at[idx,'prompt'] = full_prompt    



here_we_go.head()

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_reguation_evidence,total_regulation_evidence,context,prompt
0,16983347,In order to define genetic determinants of pri...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",TRAIL | adenosine triphosphate | imatinib,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,19,19,19,PMID: 16983347\n Abstract: In order to define ...,"You are an expert biomedical scientist, bioche..."
1,37935978,High metabolic flexibility is pivotal for the ...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",pyruvate | tyrosine | quizartinib | anhydrous ...,normalize.therapy.rxcui:72031 | normalize.ther...,BCL2,interaction_search,interaction_search,16,16,16,PMID: 37935978\n Abstract: High metabolic flex...,"You are an expert biomedical scientist, bioche..."
2,27997540,Pediatric acute lymphoblastic leukemia (ALL) i...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",prednisolone,normalize.therapy.rxcui:8638,BCL2,interaction_search,interaction_search,16,16,16,PMID: 27997540\n Abstract: Pediatric acute lym...,"You are an expert biomedical scientist, bioche..."
3,30312583,Maternal diabetes induces neural tube defects ...,interaction_evidence,"{'direct_interaction': 7, 'binding_interaction...",streptozocin,normalize.therapy.rxcui:10114,BCL2,interaction_search,interaction_search,16,16,16,PMID: 30312583\n Abstract: Maternal diabetes i...,"You are an expert biomedical scientist, bioche..."
4,32816907,"Regulation of the stemness factor, SOX2, by cy...",interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine,normalize.therapy.rxcui:10962,BCL2,interaction_search,interaction_search,16,16,16,PMID: 32816907\n Abstract: Regulation of the s...,"You are an expert biomedical scientist, bioche..."


In [130]:
print(here_we_go['prompt'][0])

You are an expert biomedical scientist, biochemist, and scientific curator trained to identify drug-gene interactions from scientific literature. Given a list of drugs, a gene, and a scientific abstract, your task is to determine whether an interaction between a drug and a gene is occuring and assign it an interaction directionality. Use the following tools to help perform these tasks.
    
    *Interaction*
    - An interaction between a small molecule and a gene or gene product. 

    *Interaction Directionality*
    - Activating -> Activating interactions are those where the drug increases the biological activity or expression of a gene target.
    - Inhibiting -> Inhibiting interactions are those where the drug decreases the biological activity or expression of a gene target.

    The drugs to consider for this task are TRAIL | adenosine triphosphate | imatinib. For each drug, fill out the following JSON schema:
    {
        "pmid": "EXACT PMID FROM CONTEXT",
        "drug_name": 

In [182]:
import boto3
import json

# Initialize the Bedrock Runtime client
bedrock = boto3.client("bedrock-runtime", region_name="us-east-2")

# Replace with your actual inference profile ID or ARN
INFERENCE_PROFILE_ID = "us.anthropic.claude-3-5-sonnet-20240620-v1:0"
# Not opus 4 I guess????


def query_claude_sonnet(prompt: str) -> str:
    body = {
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 1024,
        "temperature": 0.0,
        "anthropic_version": "bedrock-2023-05-31"
    }

    try:
        response = bedrock.invoke_model(
            body=json.dumps(body),
            modelId=INFERENCE_PROFILE_ID,
            contentType="application/json",
            accept="application/json"
        )
        response_body = json.loads(response["body"].read())
        return response_body["content"][0]["text"]
    except Exception as e:
        return f"[Error] {str(e)}"

# Example usage
# response = query_claude_opus("Hello! This is a test message!")
# print(response)


In [None]:
here_we_go['response'] = None

for idx, row in here_we_go.iterrows():
    # here_we_go.at[idx,'response'] = query_claude_sonnet(row['prompt'])
    print(f'{idx} Done')

0 Done
1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
9 Done
10 Done
11 Done
12 Done
13 Done
14 Done
15 Done
16 Done
17 Done
18 Done
19 Done
20 Done
21 Done
22 Done
23 Done
24 Done
25 Done
26 Done
27 Done
28 Done
29 Done
30 Done
31 Done
32 Done
33 Done
34 Done
35 Done
36 Done
37 Done
38 Done
39 Done
40 Done
41 Done
42 Done
43 Done
44 Done
45 Done
46 Done
47 Done
48 Done
49 Done
50 Done
51 Done
52 Done
53 Done
54 Done
55 Done
56 Done
57 Done
58 Done
59 Done
60 Done
61 Done
62 Done
63 Done
64 Done
65 Done
66 Done
67 Done
68 Done
69 Done
70 Done
71 Done
72 Done
73 Done
74 Done
75 Done
76 Done
77 Done
78 Done
79 Done
80 Done
81 Done
82 Done
83 Done
84 Done
85 Done
86 Done
87 Done
88 Done
89 Done
90 Done
91 Done
92 Done
93 Done
94 Done
95 Done
96 Done
97 Done
98 Done
99 Done


In [194]:
response = 10
print(here_we_go['response'][response])

Based on the provided abstract, I will analyze the interactions between the specified drugs and genes. Here are the results in the requested JSON format:

{
    "pmid": "27809310",
    "drug_name": "tamoxifen",
    "gene_name": "miR-27b-3p",
    "interaction_occurs_with_gene": "YES",
    "interaction_type": "INHIBITING",
    "evidence": "Notably, tamoxifen repressed miR-27b-3p expression, whereas estrogen induced miR-27b-3p expression in breast cancer cells."
}

{
    "pmid": "27809310",
    "drug_name": "ESTROGEN",
    "gene_name": "miR-27b-3p",
    "interaction_occurs_with_gene": "YES",
    "interaction_type": "ACTIVATING",
    "evidence": "Notably, tamoxifen repressed miR-27b-3p expression, whereas estrogen induced miR-27b-3p expression in breast cancer cells."
}

{
    "pmid": "27809310",
    "drug_name": "Cyclic adenosine monophosphate",
    "gene_name": "miR-27b-3p",
    "interaction_occurs_with_gene": "NO",
    "interaction_type": "",
    "evidence": ""
}

Note: For the third dr

In [None]:
# here_we_go.to_excel('Literature_Priortization_test3_use_this.xlsx')

### Extract!

In [195]:
import re, json

def extract_json_objects(text):
    """
    Extract all JSON-like objects (dicts) from a text string.
    Returns a list of parsed dicts.
    """
    objects = []
    brace_stack = []
    start = None
    
    for i, ch in enumerate(text):
        if ch == "{":
            if not brace_stack:
                start = i
            brace_stack.append("{")
        elif ch == "}":
            if brace_stack:
                brace_stack.pop()
                if not brace_stack and start is not None:
                    snippet = text[start:i+1]
                    try:
                        objects.append(json.loads(snippet))
                    except json.JSONDecodeError:
                        # fallback: strip trailing commas etc.
                        snippet = re.sub(r",\s*}", "}", snippet)
                        snippet = re.sub(r",\s*]", "]", snippet)
                        try:
                            objects.append(json.loads(snippet))
                        except:
                            pass
                    start = None
    return objects


def extract_note_text(text):
    """
    Grab any trailing 'Note:' or 'Explanation:' text from a response.
    Returns a single string ('' if none found).
    """
    match = re.search(r"(?:Note|Explanation)[:\-–]\s*(.+)", text, flags=re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else ""


In [207]:
here_we_go['json'] = None
here_we_go['free_text_explanation'] = None
for idx, row in here_we_go.iterrows():
    here_we_go.at[idx, 'free_text_explanation'] = extract_note_text(row['response'])
    here_we_go.at[idx,'json'] = extract_json_objects(row['response'])

here_we_go.head()

Unnamed: 0,pmid,abstract,label,scores,tagged_drugs,concepts,gene,drug,method,total_interaction_evidence,total_reguation_evidence,total_regulation_evidence,context,prompt,response,json,free_text_explanation
0,16983347,In order to define genetic determinants of pri...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",TRAIL | adenosine triphosphate | imatinib,normalize.therapy.iuphar.ligand:5065 | normali...,BCL2,interaction_search,interaction_search,19,19,19,PMID: 16983347\n Abstract: In order to define ...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I will analyze...","[{'pmid': '16983347', 'drug_name': 'TRAIL', 'g...",There is no specific interaction mentioned bet...
1,37935978,High metabolic flexibility is pivotal for the ...,interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",pyruvate | tyrosine | quizartinib | anhydrous ...,normalize.therapy.rxcui:72031 | normalize.ther...,BCL2,interaction_search,interaction_search,16,16,16,PMID: 37935978\n Abstract: High metabolic flex...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I will analyze...","[{'pmid': '37935978', 'drug_name': 'pyruvate',...",
2,27997540,Pediatric acute lymphoblastic leukemia (ALL) i...,interaction_evidence,"{'direct_interaction': 3, 'binding_interaction...",prednisolone,normalize.therapy.rxcui:8638,BCL2,interaction_search,interaction_search,16,16,16,PMID: 27997540\n Abstract: Pediatric acute lym...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I can provide ...","[{'pmid': '27997540', 'drug_name': 'prednisolo...",
3,30312583,Maternal diabetes induces neural tube defects ...,interaction_evidence,"{'direct_interaction': 7, 'binding_interaction...",streptozocin,normalize.therapy.rxcui:10114,BCL2,interaction_search,interaction_search,16,16,16,PMID: 30312583\n Abstract: Maternal diabetes i...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I can fill out...","[{'pmid': '30312583', 'drug_name': 'streptozot...",- The PMID is clearly stated at the beginning ...
4,32816907,"Regulation of the stemness factor, SOX2, by cy...",interaction_evidence,"{'direct_interaction': 4, 'binding_interaction...",tyrosine,normalize.therapy.rxcui:10962,BCL2,interaction_search,interaction_search,16,16,16,PMID: 32816907\n Abstract: Regulation of the s...,"You are an expert biomedical scientist, bioche...","Based on the provided abstract, I can provide ...","[{'pmid': '32816907', 'drug_name': 'tyrosine',...",- The abstract mentions tyrosine kinase inhibi...


In [212]:
here_we_go['json'][0]

[{'pmid': '16983347',
  'drug_name': 'TRAIL',
  'gene_name': 'c-FLIP(L)',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'INHIBITING',
  'evidence': 'Moreover, c-FLIP(L) knockdown partly restored TRAIL sensitivity in G1 cells, indicating that the expression level of c-FLIP(L) and its interaction with TRAIL receptor2 play a crucial role in determining TRAIL resistance in metastatic melanoma cells.'},
 {'pmid': '16983347',
  'drug_name': 'imatinib',
  'gene_name': 'c-FLIP(L)',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'INHIBITING',
  'evidence': 'Our data indicate that imatinib sensitizes T1 cells by directly downregulating c-FLIP(L), with the use of an alternative pathway for antitumor activity, because PDGFRalpha is not activated in T1 cells and these cells do not express c-kit, c-ABL or PDGFRbeta.'},
 {'pmid': '16983347',
  'drug_name': 'imatinib',
  'gene_name': 'Bax',
  'interaction_occurs_with_gene': 'YES',
  'interaction_type': 'ACTIVATING',
  

In [None]:
flat_df = pd.DataFrame([obj for lst in here_we_go["json"] for obj in lst])
flat_df


In [230]:
flat_df = flat_df[(flat_df['interaction_occurs_with_gene']=='YES') & (flat_df['gene_name']!='N/A')]
flat_df

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence
0,16983347,TRAIL,c-FLIP(L),YES,INHIBITING,"Moreover, c-FLIP(L) knockdown partly restored ..."
1,16983347,imatinib,c-FLIP(L),YES,INHIBITING,Our data indicate that imatinib sensitizes T1 ...
2,16983347,imatinib,Bax,YES,ACTIVATING,Imatinib did not modulate the expression or ac...
4,37935978,pyruvate,PDP1,YES,ACTIVATING,"PDP1, an activator of the PDC."
5,37935978,quizartinib,PDP1,YES,ACTIVATING,Upon incubation with the FLT3 tyrosine kinase ...
...,...,...,...,...,...,...
195,31439879,cadmium,Fas,YES,ACTIVATING,"Cd produced tumor necrosis factor (TNF)-α, pea..."
196,39047882,galangin,TRPV1,YES,INHIBITING,"Molecularly, galangin demonstrated favorable b..."
201,31635329,tretinoin,PML-RARa,YES,INHIBITING,ATRA activates the transcription of blocked ge...
202,31635329,arsenic trioxide,PML-RARa,YES,INHIBITING,ATO degrades PML-RARa by promoting apoptosis a...


In [232]:
def _normalize_gene(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/gene/normalize?q={word}',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['gene']['id'],
                    response['gene']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]


In [233]:
flat_df['gene_concept'] = None
flat_df['gene_label'] = None
flat_df['gene_match_type'] = None
flat_df['drug_concept'] = None
flat_df['drug_label'] = None
flat_df['drug_match_type'] = None
for idx, row in tqdm(flat_df.iterrows()):
    drug_match_type, drug_concept, drug_label = _normalize_therapy(row['drug_name'])
    gene_match_type, gene_concept, gene_label = _normalize_gene(row['gene_name'])

    flat_df.at[idx, 'gene_concept'] = gene_concept
    flat_df.at[idx, 'gene_label'] = gene_label
    flat_df.at[idx, 'gene_match_type'] = gene_match_type
    flat_df.at[idx, 'drug_concept'] = drug_concept
    flat_df.at[idx, 'drug_label'] = drug_label
    flat_df.at[idx, 'drug_match_type'] = drug_match_type

flat_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_concept'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_label'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flat_df['gene_match_type'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,16983347,TRAIL,c-FLIP(L),YES,INHIBITING,"Moreover, c-FLIP(L) knockdown partly restored ...",,,0,normalize.therapy.iuphar.ligand:5065,TRAIL,80
1,16983347,imatinib,c-FLIP(L),YES,INHIBITING,Our data indicate that imatinib sensitizes T1 ...,,,0,normalize.therapy.rxcui:282388,imatinib,80
2,16983347,imatinib,Bax,YES,ACTIVATING,Imatinib did not modulate the expression or ac...,normalize.gene.hgnc:959,BAX,100,normalize.therapy.rxcui:282388,imatinib,80
4,37935978,pyruvate,PDP1,YES,ACTIVATING,"PDP1, an activator of the PDC.",normalize.gene.hgnc:9279,PDP1,100,normalize.therapy.rxcui:72031,pyruvate,80
5,37935978,quizartinib,PDP1,YES,ACTIVATING,Upon incubation with the FLT3 tyrosine kinase ...,normalize.gene.hgnc:9279,PDP1,100,normalize.therapy.rxcui:2643048,quizartinib,80
...,...,...,...,...,...,...,...,...,...,...,...,...
195,31439879,cadmium,Fas,YES,ACTIVATING,"Cd produced tumor necrosis factor (TNF)-α, pea...",normalize.gene.hgnc:11920,FAS,100,normalize.therapy.rxcui:1362694,cadmium,80
196,39047882,galangin,TRPV1,YES,INHIBITING,"Molecularly, galangin demonstrated favorable b...",normalize.gene.hgnc:12716,TRPV1,100,normalize.therapy.iuphar.ligand:410,galangin,80
201,31635329,tretinoin,PML-RARa,YES,INHIBITING,ATRA activates the transcription of blocked ge...,,,0,normalize.therapy.rxcui:10753,tretinoin,80
202,31635329,arsenic trioxide,PML-RARa,YES,INHIBITING,ATO degrades PML-RARa by promoting apoptosis a...,,,0,normalize.therapy.rxcui:18330,arsenic trioxide,80


In [235]:
flat_df['gene_concept'].value_counts(dropna=False)

gene_concept
None                         27
normalize.gene.hgnc:990       7
normalize.gene.hgnc:3236      5
normalize.gene.hgnc:644       5
normalize.gene.hgnc:44048     4
                             ..
normalize.gene.hgnc:959       1
normalize.gene.hgnc:7562      1
normalize.gene.hgnc:12612     1
normalize.gene.hgnc:9287      1
normalize.gene.hgnc:9864      1
Name: count, Length: 88, dtype: int64

In [236]:
flat_df['drug_concept'].value_counts(dropna=False)

drug_concept
normalize.therapy.iuphar.ligand:9978     12
normalize.therapy.rxcui:1747556           8
normalize.therapy.rxcui:2555              7
normalize.therapy.rxcui:337525            4
normalize.therapy.chembl:CHEMBL592868     4
                                         ..
normalize.therapy.iuphar.ligand:7710      1
normalize.therapy.rxcui:1366666           1
normalize.therapy.rxcui:8129              1
normalize.therapy.chembl:CHEMBL510380     1
normalize.therapy.ncit:C804               1
Name: count, Length: 98, dtype: int64

In [237]:
final_results = flat_df[(flat_df['gene_concept'].isna()==False)].reset_index(drop=True)
final_results

Unnamed: 0,pmid,drug_name,gene_name,interaction_occurs_with_gene,interaction_type,evidence,gene_concept,gene_label,gene_match_type,drug_concept,drug_label,drug_match_type
0,16983347,imatinib,Bax,YES,ACTIVATING,Imatinib did not modulate the expression or ac...,normalize.gene.hgnc:959,BAX,100,normalize.therapy.rxcui:282388,imatinib,80
1,37935978,pyruvate,PDP1,YES,ACTIVATING,"PDP1, an activator of the PDC.",normalize.gene.hgnc:9279,PDP1,100,normalize.therapy.rxcui:72031,pyruvate,80
2,37935978,quizartinib,PDP1,YES,ACTIVATING,Upon incubation with the FLT3 tyrosine kinase ...,normalize.gene.hgnc:9279,PDP1,100,normalize.therapy.rxcui:2643048,quizartinib,80
3,27997540,prednisolone,JAK1,YES,INHIBITING,Our analysis revealed that mutations in JAK1 a...,normalize.gene.hgnc:6190,JAK1,100,normalize.therapy.rxcui:8638,prednisolone,80
4,30312583,streptozotocin,FOXO3a,YES,ACTIVATING,We used a well-established type 1 diabetic emb...,normalize.gene.hgnc:3821,FOXO3,80,normalize.therapy.rxcui:10114,streptozocin,60
...,...,...,...,...,...,...,...,...,...,...,...,...
132,37455373,paclitaxel,KPNA4,YES,INHIBITING,Circ_0000376 could modulate KPNA4 expression b...,normalize.gene.hgnc:6397,KPNA4,100,normalize.therapy.rxcui:56946,paclitaxel,80
133,32238563,docetaxel,C3,YES,ACTIVATING,"Interestingly, docetaxel, a common therapy for...",normalize.gene.hgnc:1318,C3,100,normalize.therapy.rxcui:1299922,docetaxel anhydrous,80
134,31439879,cadmium,Fas,YES,ACTIVATING,"Cd produced tumor necrosis factor (TNF)-α, pea...",normalize.gene.hgnc:11920,FAS,100,normalize.therapy.rxcui:1362694,cadmium,80
135,39047882,galangin,TRPV1,YES,INHIBITING,"Molecularly, galangin demonstrated favorable b...",normalize.gene.hgnc:12716,TRPV1,100,normalize.therapy.iuphar.ligand:410,galangin,80


In [238]:
137 / 204

0.6715686274509803

In [None]:
# final_results.to_excel('final_results_test3.xlsx')