In [23]:
import pandas as pd
import sys, os
import inflect
from tqdm import tqdm
import requests

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import score
import novel
import search_set

# Surpress NLP Mask Warning for Apple Silicon
import warnings
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r".*encoder_attention_mask.*BertSdpaSelfAttention\.forward"
)


## Example Code for Identifying Literature to Curate
Demo notebook demonstrating how to use NLP methods to identify literature containing possible interactions for prioritization, manual review, and inclusion as supporting evidence in DGIdb.

In [None]:
gene = 'BCL2'
search_set.generate_search_set(gene)

In [2]:
abstracts = pd.read_csv('abstracts.csv')
abstracts

Unnamed: 0,Title,Abstract
0,39796006,"In the tissue regeneration field, stem cell tr..."
1,40524014,"Follicular lymphoma (FL), marginal zone lympho..."
2,40393040,Primary cutaneous diffuse large B cell lymphom...
3,40619042,"Primary ovarian insufficiency (POI), which aff..."
4,40444042,This investigation sought to explore the inhib...
...,...,...
39523,25873999,The role played by microRNAs in the deregulati...
39524,34394999,Late-onset posttransplant lymphoproliferative ...
39525,22431999,Several studies demonstrated that treatment wi...
39526,31465999,Antiviral interferons (IFN-alpha/beta) are pos...


In [11]:
import importlib
importlib.reload(novel)
results = novel.batch(abstracts['Abstract'])
results

Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
100%|██████████| 39528/39528 [1:27:22<00:00,  7.54it/s]


Unnamed: 0,original_text,entity_group,score,word,start,end
0,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999927,CD47 - blocking antibody,273,295
1,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999985,CD47,306,310
2,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999992,SIRPα,311,316
3,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.999992,anti - CD20 antibody,403,421
4,"Follicular lymphoma (FL), marginal zone lympho...",GENETIC,0.882086,rituximab,422,431
...,...,...,...,...,...,...
712789,This study aimed to determine the effect of 8-...,DISEASE,0.999993,T2D,596,599
712790,This study aimed to determine the effect of 8-...,DISEASE,0.937653,insulin,781,788
712791,This study aimed to determine the effect of 8-...,DISEASE,0.999973,musculoaponeurotic fibrosarcoma,826,857
712792,This study aimed to determine the effect of 8-...,DISEASE,0.999990,T2D,1078,1081


In [15]:
results.to_excel('checkpoint.xlsx')

In [24]:
def _singularize(word):
    inflector = inflect.engine()
    return inflector.singular_noun(word) or word

def _normalize_therapy(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/therapy/normalize?q={word}&infer_namespace=true',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['therapy']['id'],
                    response['therapy']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]

# Main loop
checkpoint_interval = 5000
output_base = "normalized_results_checkpoint"
for idx, (index, row) in enumerate(tqdm(results[results['entity_group'] == 'CHEMICAL'].iterrows()), 1):
    word = _singularize(row['word'])
    norm_result = _normalize_therapy(word)

    results.at[index, 'concept_match_type'] = norm_result[0]
    results.at[index, 'concept_id'] = norm_result[1]
    results.at[index, 'concept_label'] = norm_result[2]

    if idx % checkpoint_interval == 0:
        checkpoint_filename = f"{output_base}_checkpoint_{idx}.xlsx"
        results.to_excel(checkpoint_filename, index=False)
        print(f"Checkpoint saved at row {idx} -> {checkpoint_filename}")

# Final save after loop completes
final_filename = f"{output_base}_final.xlsx"
results.to_excel(final_filename, index=False)
print(f"Final results saved -> {final_filename}")

  results.at[index, 'concept_match_type'] = norm_result[0]
5000it [20:29, 18.59s/it]

Checkpoint saved at row 5000 -> normalized_results_checkpoint_checkpoint_5000.xlsx


10000it [40:53, 20.07s/it]

Checkpoint saved at row 10000 -> normalized_results_checkpoint_checkpoint_10000.xlsx


15000it [59:47, 19.45s/it]

Checkpoint saved at row 15000 -> normalized_results_checkpoint_checkpoint_15000.xlsx


20000it [1:19:35, 10.96s/it]

Checkpoint saved at row 20000 -> normalized_results_checkpoint_checkpoint_20000.xlsx


25001it [1:32:19, 10.39s/it]

Checkpoint saved at row 25000 -> normalized_results_checkpoint_checkpoint_25000.xlsx


30000it [1:45:40, 16.57s/it]

Checkpoint saved at row 30000 -> normalized_results_checkpoint_checkpoint_30000.xlsx


35000it [1:58:38, 17.44s/it]

Checkpoint saved at row 35000 -> normalized_results_checkpoint_checkpoint_35000.xlsx


40001it [2:11:23, 11.58s/it]

Checkpoint saved at row 40000 -> normalized_results_checkpoint_checkpoint_40000.xlsx


45001it [2:23:48, 11.69s/it]

Checkpoint saved at row 45000 -> normalized_results_checkpoint_checkpoint_45000.xlsx


50000it [2:36:12, 15.80s/it]

Checkpoint saved at row 50000 -> normalized_results_checkpoint_checkpoint_50000.xlsx


55001it [2:48:44,  8.49s/it]

Checkpoint saved at row 55000 -> normalized_results_checkpoint_checkpoint_55000.xlsx


60000it [3:01:07, 12.66s/it]

Checkpoint saved at row 60000 -> normalized_results_checkpoint_checkpoint_60000.xlsx


65001it [3:13:22, 11.92s/it]

Checkpoint saved at row 65000 -> normalized_results_checkpoint_checkpoint_65000.xlsx


70001it [3:25:41, 11.18s/it]

Checkpoint saved at row 70000 -> normalized_results_checkpoint_checkpoint_70000.xlsx


75000it [3:37:54, 12.61s/it]

Checkpoint saved at row 75000 -> normalized_results_checkpoint_checkpoint_75000.xlsx


80000it [3:50:11, 15.41s/it]

Checkpoint saved at row 80000 -> normalized_results_checkpoint_checkpoint_80000.xlsx


85001it [4:02:34, 11.23s/it]

Checkpoint saved at row 85000 -> normalized_results_checkpoint_checkpoint_85000.xlsx


90001it [4:15:19, 10.00s/it]

Checkpoint saved at row 90000 -> normalized_results_checkpoint_checkpoint_90000.xlsx


95001it [4:27:55, 11.33s/it]

Checkpoint saved at row 95000 -> normalized_results_checkpoint_checkpoint_95000.xlsx


96459it [4:31:13,  5.93it/s]


Final results saved -> normalized_results_checkpoint_final.xlsx


In [25]:
results[results['entity_group']=='CHEMICAL']

Unnamed: 0,original_text,entity_group,score,word,start,end,concept_match_type,concept_id,concept_label
26,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.997218,R - CHOP,780,786,0.0,,
27,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,cyclophosphamide,799,815,80.0,normalize.therapy.rxcui:1545988,cyclophosphamide anhydrous
28,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,doxorubicin hydrochloride,817,842,80.0,normalize.therapy.rxcui:142433,doxorubicin hydrochloride
29,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,vincristine sulfate,844,863,80.0,normalize.therapy.rxcui:11203,vincristine sulfate
30,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999996,prednisone,865,875,80.0,normalize.therapy.rxcui:8640,prednisone
...,...,...,...,...,...,...,...,...,...
712721,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,cisplatin,358,367,80,normalize.therapy.rxcui:2555,cisplatin
712722,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,cisplatin,434,443,80,normalize.therapy.rxcui:2555,cisplatin
712723,Several studies demonstrated that treatment wi...,CHEMICAL,0.955203,BUN,759,762,0,,
712724,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,creatinine,767,777,80,normalize.therapy.rxcui:2913,creatinine


In [29]:
tdf = results[(results['concept_match_type']!=0) & (results['concept_match_type'].isna()==False)].reset_index(drop=True)
tdf

Unnamed: 0,original_text,entity_group,score,word,start,end,concept_match_type,concept_id,concept_label
0,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,cyclophosphamide,799,815,80.0,normalize.therapy.rxcui:1545988,cyclophosphamide anhydrous
1,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,doxorubicin hydrochloride,817,842,80.0,normalize.therapy.rxcui:142433,doxorubicin hydrochloride
2,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999997,vincristine sulfate,844,863,80.0,normalize.therapy.rxcui:11203,vincristine sulfate
3,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999996,prednisone,865,875,80.0,normalize.therapy.rxcui:8640,prednisone
4,Primary cutaneous diffuse large B cell lymphom...,CHEMICAL,0.999984,tyrosine,1092,1100,80.0,normalize.therapy.rxcui:10962,tyrosine
...,...,...,...,...,...,...,...,...,...
62291,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,cisplatin,87,96,80,normalize.therapy.rxcui:2555,cisplatin
62292,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,cisplatin,358,367,80,normalize.therapy.rxcui:2555,cisplatin
62293,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,cisplatin,434,443,80,normalize.therapy.rxcui:2555,cisplatin
62294,Several studies demonstrated that treatment wi...,CHEMICAL,0.999998,creatinine,767,777,80,normalize.therapy.rxcui:2913,creatinine


In [31]:
tdf = results[(results['concept_match_type']!=0) & (results['concept_match_type'].isna()==False)].reset_index(drop=True)

condensed_results = tdf.groupby('original_text').apply(
    lambda group: pd.Series({
        'DRUG_LABELS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_label'].dropna().astype(str).unique()),
        'DRUG_IDS': ' | '.join(group.loc[group['entity_group'] == 'CHEMICAL', 'concept_id'].dropna().astype(str).unique())
    })
).reset_index()

condensed_results

# condensed_results = tdf.groupby('original_text').apply(
#     lambda group: pd.Series({
#         'GENETIC_LABELS': ' | '.join(group.loc[group['entity_group'] == 'GENETIC', 'concept_label'].unique()),
#         'GENETIC_IDS': ' | '.join(group.loc[group['entity_group'] == 'GENETIC', 'concept_id'].unique()),
#         'DISEASE_LABELS': ' | '.join(group.loc[group['entity_group'] == 'DISEASE', 'concept_label'].unique()),
#         'DISEASE_IDS': ' | '.join(group.loc[group['entity_group'] == 'DISEASE', 'concept_id'].unique())
#     })
# ).reset_index()


  condensed_results = tdf.groupby('original_text').apply(


Unnamed: 0,original_text,DRUG_LABELS,DRUG_IDS
0,ADLE was administered to high-fat diet treate...,nitric oxide | palmitate | oxygen | nitrite io...,normalize.therapy.rxcui:7442 | normalize.thera...
1,GOT1-bearing BALB/c nude mice were treated wi...,sonidegib,normalize.therapy.rxcui:1659191
2,"In this study, we have investigated the cytot...",lactate | ACRIDINE ORANGE | PROPIDIUM | oxygen,normalize.therapy.rxcui:114202 | normalize.the...
3,PNS were identified from the Traditional Chin...,"glutathione | malondialdehyde | ESTROGENS, CON...",normalize.therapy.rxcui:4890 | normalize.thera...
4,Several databases were screened for bioactive...,Tenamfetamine | glutathione,normalize.therapy.ncit:C80152 | normalize.ther...
...,...,...,...
15320,β-thalassemias are common hemoglobinopathies d...,oxygen,normalize.therapy.rxcui:7806
15321,δ-Tocotrienol is a naturally occurring proteas...,CERAMIDE,normalize.therapy.chembl:CHEMBL155886
15322,The malondialdehyde (MDA) level and TA count ...,malondialdehyde | Tenamfetamine | tantalum,normalize.therapy.rxcui:1657018 | normalize.th...
15323,• To investigate the role that oxidative stres...,streptozocin | glutathione | oxygen,normalize.therapy.rxcui:10114 | normalize.ther...


In [35]:
print(condensed_results['original_text'][15321])

δ-Tocotrienol is a naturally occurring proteasome inhibitor, which has the capacity to inhibit proliferation and induce apoptosis in several cancer cells obtained from several organs of humans, and other cancer cell lines. Moreover, results of plasma total mRNAs after δ-tocotrienol feeding to hepatitis C patients revealed significant inhibition in the expression of pro-inflammatory cytokines (TNF-α, VCAM1, proteasome subunits) and induction in the expression of ICAM1 and IFN-γ after post-treatment. This down-regulation of proteasome subunits leads to autophagy, apoptosis of immune cells and several genes. The present study describes RNA-sequence analysis of plasma total mRNAs obtained from δ-tocotrienol treatment of hepatitis C patients on gene expression regulated by proteasome. Pooled specimens of plasma total mRNAs of pre-dose versus post-dose of δ-tocotrienol treatment of hepatitis C patients were submitted to RNA-sequence analyses. The data based on > 1 and 8-fold expression chang

In [36]:
merged_df = pd.merge(
    abstracts,
    condensed_results,
    left_on='Abstract',
    right_on='original_text',
    how='left'
)
merged_df

Unnamed: 0,Title,Abstract,original_text,DRUG_LABELS,DRUG_IDS
0,39796006,"In the tissue regeneration field, stem cell tr...",,,
1,40524014,"Follicular lymphoma (FL), marginal zone lympho...",,,
2,40393040,Primary cutaneous diffuse large B cell lymphom...,Primary cutaneous diffuse large B cell lymphom...,cyclophosphamide anhydrous | doxorubicin hydro...,normalize.therapy.rxcui:1545988 | normalize.th...
3,40619042,"Primary ovarian insufficiency (POI), which aff...","Primary ovarian insufficiency (POI), which aff...",estradiol valerate | progesterone | Luteolin |...,normalize.therapy.rxcui:24395 | normalize.ther...
4,40444042,This investigation sought to explore the inhib...,This investigation sought to explore the inhib...,wogonin | bromocriptine,normalize.therapy.rxcui:2001619 | normalize.th...
...,...,...,...,...,...
39523,25873999,The role played by microRNAs in the deregulati...,,,
39524,34394999,Late-onset posttransplant lymphoproliferative ...,,,
39525,22431999,Several studies demonstrated that treatment wi...,Several studies demonstrated that treatment wi...,cisplatin | creatinine,normalize.therapy.rxcui:2555 | normalize.thera...
39526,31465999,Antiviral interferons (IFN-alpha/beta) are pos...,,,


In [38]:
merged_df

Unnamed: 0,Title,Abstract,original_text,DRUG_LABELS,DRUG_IDS
0,39796006,"In the tissue regeneration field, stem cell tr...",,,
1,40524014,"Follicular lymphoma (FL), marginal zone lympho...",,,
2,40393040,Primary cutaneous diffuse large B cell lymphom...,Primary cutaneous diffuse large B cell lymphom...,cyclophosphamide anhydrous | doxorubicin hydro...,normalize.therapy.rxcui:1545988 | normalize.th...
3,40619042,"Primary ovarian insufficiency (POI), which aff...","Primary ovarian insufficiency (POI), which aff...",estradiol valerate | progesterone | Luteolin |...,normalize.therapy.rxcui:24395 | normalize.ther...
4,40444042,This investigation sought to explore the inhib...,This investigation sought to explore the inhib...,wogonin | bromocriptine,normalize.therapy.rxcui:2001619 | normalize.th...
...,...,...,...,...,...
39523,25873999,The role played by microRNAs in the deregulati...,,,
39524,34394999,Late-onset posttransplant lymphoproliferative ...,,,
39525,22431999,Several studies demonstrated that treatment wi...,Several studies demonstrated that treatment wi...,cisplatin | creatinine,normalize.therapy.rxcui:2555 | normalize.thera...
39526,31465999,Antiviral interferons (IFN-alpha/beta) are pos...,,,
