In [39]:
import sys
import os
import pandas as pd
from tqdm import tqdm

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import search_set
import indicator
import literature


## Example Code for Generating Literature Indicators
This example notebook shows how to use the clinical score package to generate literature indicators for genes and drugs present in DGIdb data. It demonstrates three of the four search strategies: *Raw String*, *NCBI GeneID*, and *PubTator3*.

### Generate DGIdb Search Set
Trim the DGIdb interactions database file to just the set of relevant interactions for a given gene.

In [40]:
# Gene symbol whose DGIdb interactions will be scored; later cells reuse `gene`.
gene = "BCL2"

# Build the gene-specific search set. Judging by the saved output, the helper
# writes a dated CSV under `search/` and prints where it was saved.
search_set.generate_search_set(gene)

Search set saved to search/2025-08-14_BCL2_clin_score.csv


### Grab PMIDs, Fetch Abstracts
Use the desired search method to obtain a list of relevant PubMed IDs (PMIDs), whose abstracts are then retrieved for downstream indicator analysis.  
  
Currently defined search methods are: *Raw String*, *NCBI GeneID*, *PubTator3*, *PubTator3+Drug*  
  
After PMIDs are identified, fetch all the abstracts.

In [45]:
# Look up the PMIDs associated with `gene` via the PubTator3 search strategy,
# then download their abstracts.
# The abstract fetch is slow (the saved run took several minutes for ~50k PMIDs)
# and may return fewer abstracts than the number of PMIDs requested.
# To pick up edits to the `literature` module, restart the kernel or enable
# `%autoreload` instead of reloading the module inside this cell.
pmids = literature.fetch_pmids_by_pubtator3(gene)
abstracts = literature.fetch_abstracts(pmids)

  gene_reference = pd.read_csv('data/pubtator/gene2pubtator3', sep='\t', header=None)


Gene Pubtator3 set loaded!
49965 PMIDs found!
Fetching...


100%|██████████| 250/250 [08:42<00:00,  2.09s/it]


In [46]:
# Number of abstracts actually retrieved; this can be lower than the PMID count reported by the fetch step.
len(abstracts)

39528

### Generate Scores
Use the retrieved abstracts to determine the literature indicators for each gene–drug interaction pair.


In [47]:
# Locate the search set generated for `gene` instead of hard-coding a dated
# filename, so this cell keeps working when the notebook is re-run on another
# day or for another gene. Judging by the saved output, files are named
# `<YYYY-MM-DD>_<gene>_clin_score.csv`, so a lexicographic sort puts the most
# recent one last.
search_dir = 'search'
search_set_files = sorted(
    name for name in os.listdir(search_dir)
    if name.endswith(f'_{gene}_clin_score.csv')
)
if not search_set_files:
    raise FileNotFoundError(
        f'No search set found for {gene} in {search_dir}/; '
        'run search_set.generate_search_set(gene) first'
    )
search_set_path = os.path.join(search_dir, search_set_files[-1])

# Keep a single row per (Drug, Gene) pair; the first occurrence wins.
dgidb_df = (
    pd.read_csv(search_set_path)
    .drop_duplicates(subset=['Drug', 'Gene'], keep='first')
)
dgidb_df.head()

Unnamed: 0,nomenclature,Gene,long_name,gene_concept_id,interaction_score,drug_specificity,gene_specificity,evidence_score,source_db_name,source_db_version,...,directionality,definition,reference,drug_name,nomenclature-2,Drug,approved,immunotherapy,anti_neoplastic,drug_concept_id
0,Gene Symbol,BCL2,BCL2 apoptosis regulator,hgnc:990,2.284732,4.141413,0.183893,3.0,TALC,12-May-16,...,1.0,"In inhibitor interactions, the drug binds to a...","<a href=""https://en.wikipedia.org/wiki/Enzyme_...",OBLIMERSEN,Primary Name,OBLIMERSEN,False,False,True,ncit:C1870
3,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,DOLASTATIN 10,Primary Name,DOLASTATIN 10,False,False,False,ncit:C1300
4,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,EPISIALIN,Primary Name,MUC-1 ANTIGEN,False,False,False,ncit:C2407
5,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,MICELLAR PACLITAXEL,Primary Name,MICELLAR PACLITAXEL,False,False,False,ncit:C29256
6,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,BEAUVERICIN,Primary Name,BEAUVERICIN,False,False,False,ncit:C1011


In [48]:
# Compute the literature indicators for the interaction pairs in `dgidb_df`
# from the fetched abstracts. The result is bound to a name so later cells do
# not have to rely on `_` / `Out[N]`; the bare expression still displays it.
# NOTE(review): the return type is not visible in this notebook — confirm.
literature_indicators = indicator.generate_indicators(abstracts, dgidb_df)
literature_indicators

0it [00:25, ?it/s]


KeyboardInterrupt: 