In [1]:
import pandas as pd
import sys, os
import inflect
from tqdm import tqdm
import requests
import ast

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import score
import novel
import search_set
import indicator
import literature


# Surpress NLP Mask Warning for Apple Silicon
import warnings
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r".*encoder_attention_mask.*BertSdpaSelfAttention\.forward"
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0


## Curate Literature (DGIdb Control)

In [5]:
publications = pd.read_csv('data/dgidb/publications.csv')
pmids = publications['pmid'].tolist()
pmids = [str(pmid) for pmid in pmids]
abstracts = literature.fetch_abstracts(pmids)
abstracts[0:5]

13142 PMIDs found!
Fetching...


100%|██████████| 66/66 [01:34<00:00,  1.44s/it]


[('38496920',
  'Isocitrate dehydrogenase (IDH) is commonly mutated (mIDH) in gliomas, and this mutant enzyme produces the oncometabolite 2-hydroxyglutarate (2HG). 2HG promotes gliomagenesis and is implicated in epileptogenesis. Ivosidenib (IVO), a small molecule oral mIDH1 inhibitor, is FDA-approved for mIDH1 newly diagnosed and relapsed/refractory acute myeloid leukemia. Moreover, IVO has efficacy in clinical trials for recurrent mIDH1 gliomas. Given the lack of targeted treatments for gliomas, we initiated off-label IVO for mIDH glioma patients in October 2020. Retrospectively, we sought to assess early outcomes in our patients and describe their experience on IVO from October 2020 through February 2022. Our objective was to report on the following variables of off-label use of IVO: radiographic response, seizure control, tolerability, and access to the medication. All patients initially received single-agent IVO dosed at 500 mg orally once daily. The cohort age range was 21-74 year

In [9]:
abstract = pd.DataFrame(abstracts, columns=["Title", "Abstract"])
abstract

Unnamed: 0,Title,Abstract
0,38496920,Isocitrate dehydrogenase (IDH) is commonly mut...
1,10100098,The tissue renin-angiotensin system and extrac...
2,25686592,Intestinal P-glycoprotein (P-gp) is a limiting...
3,23920485,"Design, synthesis and anticancer activity of a..."
4,23459444,Compelling evidence points to a key role for i...
...,...,...
12756,20855208,Cancer therapy has moved beyond conventional c...
12757,22369181,The ligand-regulated nuclear receptor peroxiso...
12758,37773077,Approximately 10% to 15% of triple-negative br...
12759,29992026,"Tramadol is a complex drug, being metabolized ..."


In [11]:
results = novel.batch(abstract['Abstract'])
results

100%|██████████| 12761/12761 [32:21<00:00,  6.57it/s] 


Unnamed: 0,entity_group,score,word,start,end,original_text
0,GENETIC,0.999992,Isocitrate dehydrogenase,0,24,Isocitrate dehydrogenase (IDH) is commonly mut...
1,GENETIC,0.999984,IDH,26,29,Isocitrate dehydrogenase (IDH) is commonly mut...
2,GENETIC,0.999972,mIDH,52,56,Isocitrate dehydrogenase (IDH) is commonly mut...
3,GENETIC,0.999993,mIDH1,252,257,Isocitrate dehydrogenase (IDH) is commonly mut...
4,GENETIC,0.999970,mIDH1,289,294,Isocitrate dehydrogenase (IDH) is commonly mut...
...,...,...,...,...,...,...
317746,DISEASE,0.999982,tumor,1252,1257,Bladder cancer accounts for nearly 5% of all n...
317747,DISEASE,0.999979,tumor,1333,1338,Bladder cancer accounts for nearly 5% of all n...
317748,DISEASE,0.999992,bladder cancer,1403,1417,Bladder cancer accounts for nearly 5% of all n...
317749,DISEASE,0.999992,bladder cancer,1509,1523,Bladder cancer accounts for nearly 5% of all n...


In [13]:
results[results['entity_group'] == 'CHEMICAL']

Unnamed: 0,entity_group,score,word,start,end,original_text,concept_match_type,concept_id,concept_label
5,CHEMICAL,0.999998,Isocitrate,0,10,Isocitrate dehydrogenase (IDH) is commonly mut...,80.0,normalize.therapy.chembl:CHEMBL539669,ISOCITRATE
6,CHEMICAL,0.999999,2 - hydroxyglutarate,121,139,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
7,CHEMICAL,0.999998,2HG,141,144,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
8,CHEMICAL,0.999994,2HG,147,150,Isocitrate dehydrogenase (IDH) is commonly mut...,0.0,,
9,CHEMICAL,0.999996,Ivosidenib,212,222,Isocitrate dehydrogenase (IDH) is commonly mut...,80.0,normalize.therapy.rxcui:2049873,ivosidenib
...,...,...,...,...,...,...,...,...,...
317719,CHEMICAL,0.999972,olaparib,1450,1458,Approximately 10% to 15% of triple-negative br...,,,
317720,CHEMICAL,0.999997,ceralasertib,1463,1475,Approximately 10% to 15% of triple-negative br...,,,
317721,CHEMICAL,0.958486,olaparib,1641,1649,Approximately 10% to 15% of triple-negative br...,,,
317728,CHEMICAL,0.999998,Tramadol,0,8,"Tramadol is a complex drug, being metabolized ...",,,


In [14]:
def _singularize(word):
    inflector = inflect.engine()
    return inflector.singular_noun(word) or word

def _normalize_therapy(word):
    try:
        r = requests.get(
            f'https://normalize.cancervariants.org/therapy/normalize?q={word}&infer_namespace=true',
            timeout=10  # Set timeout for network reliability
        )
        r.raise_for_status()
        response = r.json()

        if isinstance(response, dict) and response.get('match_type') is not None:
            if response['match_type'] != 0:
                return [
                    response['match_type'],
                    response['therapy']['id'],
                    response['therapy']['name']
                ]
            else:
                return [0, None, None]  # Not matched
        else:
            return ['Unexpected Response Format', None, None]
    except requests.exceptions.RequestException as e:
        return ['HTTP Error', str(e), None]
    except Exception as e:
        return ['Failure to Normalize', str(e), None]

# Main loop
checkpoint_interval = 5000
output_base = "normalized_results_checkpoint"
for idx, (index, row) in enumerate(tqdm(results[results['entity_group'] == 'CHEMICAL'].iterrows()), 1):
    word = _singularize(row['word'])
    norm_result = _normalize_therapy(word)

    results.at[index, 'concept_match_type'] = norm_result[0]
    results.at[index, 'concept_id'] = norm_result[1]
    results.at[index, 'concept_label'] = norm_result[2]

    if idx % checkpoint_interval == 0:
        checkpoint_filename = f"{output_base}_checkpoint_{idx}.xlsx"
        results.to_excel(checkpoint_filename, index=False)
        print(f"Checkpoint saved at row {idx} -> {checkpoint_filename}")

# Final save after loop completes
final_filename = f"{output_base}_final.xlsx"
results.to_excel(final_filename, index=False)
print(f"Final results saved -> {final_filename}")

  results.at[index, 'concept_match_type'] = norm_result[0]
5000it [22:47,  8.47s/it]

Checkpoint saved at row 5000 -> normalized_results_checkpoint_checkpoint_5000.xlsx


10001it [48:24,  5.89s/it]

Checkpoint saved at row 10000 -> normalized_results_checkpoint_checkpoint_10000.xlsx


15001it [1:08:17,  5.96s/it]

Checkpoint saved at row 15000 -> normalized_results_checkpoint_checkpoint_15000.xlsx


20000it [1:28:44,  8.67s/it]

Checkpoint saved at row 20000 -> normalized_results_checkpoint_checkpoint_20000.xlsx


25001it [1:49:18,  6.03s/it]

Checkpoint saved at row 25000 -> normalized_results_checkpoint_checkpoint_25000.xlsx


30000it [2:09:52,  8.64s/it]

Checkpoint saved at row 30000 -> normalized_results_checkpoint_checkpoint_30000.xlsx


35002it [2:28:09,  4.17s/it]

Checkpoint saved at row 35000 -> normalized_results_checkpoint_checkpoint_35000.xlsx


40000it [2:40:19,  5.14s/it]

Checkpoint saved at row 40000 -> normalized_results_checkpoint_checkpoint_40000.xlsx


45001it [2:52:33,  5.54s/it]

Checkpoint saved at row 45000 -> normalized_results_checkpoint_checkpoint_45000.xlsx


50000it [3:04:42,  8.20s/it]

Checkpoint saved at row 50000 -> normalized_results_checkpoint_checkpoint_50000.xlsx


55001it [3:17:14,  5.88s/it]

Checkpoint saved at row 55000 -> normalized_results_checkpoint_checkpoint_55000.xlsx


60001it [3:29:22,  5.57s/it]

Checkpoint saved at row 60000 -> normalized_results_checkpoint_checkpoint_60000.xlsx


65002it [3:41:28,  4.50s/it]

Checkpoint saved at row 65000 -> normalized_results_checkpoint_checkpoint_65000.xlsx


70001it [3:53:19,  5.01s/it]

Checkpoint saved at row 70000 -> normalized_results_checkpoint_checkpoint_70000.xlsx


75001it [4:05:26,  5.32s/it]

Checkpoint saved at row 75000 -> normalized_results_checkpoint_checkpoint_75000.xlsx


80001it [4:17:25,  5.81s/it]

Checkpoint saved at row 80000 -> normalized_results_checkpoint_checkpoint_80000.xlsx


85001it [4:29:30,  4.80s/it]

Checkpoint saved at row 85000 -> normalized_results_checkpoint_checkpoint_85000.xlsx


90000it [4:41:41,  6.98s/it]

Checkpoint saved at row 90000 -> normalized_results_checkpoint_checkpoint_90000.xlsx


95001it [4:53:59,  4.49s/it]

Checkpoint saved at row 95000 -> normalized_results_checkpoint_checkpoint_95000.xlsx


100000it [5:06:24,  8.67s/it]

Checkpoint saved at row 100000 -> normalized_results_checkpoint_checkpoint_100000.xlsx


100297it [5:07:03,  5.44it/s]


Final results saved -> normalized_results_checkpoint_final.xlsx
