In [None]:
import sys
import os
import pandas as pd
from tqdm import tqdm

# Append the parent directory to the Python path so the modules imported below
# can be found. Note that '..' is resolved against the current working directory
# (normally the folder this notebook is launched from), not the notebook file itself.
sys.path.append(os.path.abspath('..'))

import search_set
import indicator
import literature


## Example Code for Generating Literature Indicators
Example notebook showing how to use this clinical score package to generate literature indicators for genes and drugs present in DGIdb data. This notebook is specific to the *PubTator3+Drug* search strategy, which requires a PMID look-up per drug in the dataset.

### Generate DGIdb Search Set
Trim the DGIdb interactions database file to just the set of relevant interactions for a given gene.

In [None]:
# Gene symbol to analyse; the PMID look-up in a later cell reuses this variable.
gene = 'EGFR'
# Trim the DGIdb interactions data down to the interactions relevant to this gene.
search_set.generate_search_set(gene)

### Grab PMIDs, Fetch Abstracts
Use a desired search method to obtain a list of relevant PubMed IDs to retrieve abstracts for downstream indicator analysis.  
  
To use *PubTator3+Drug* as a search method, the overlap between the gene and each drug must be identified beforehand to determine the unique set of abstracts to pull for indicator generation. Depending on the search set, this may take anywhere from 20 minutes to an hour (or two).  
  
  
After PMIDs are identified, fetch all the abstracts.

In [None]:
# Load the CSV and keep only the first row for each (Drug, Gene) combination.
# NOTE(review): the file name hard-codes a date and 'EGFR' instead of using `gene`;
# confirm it matches the file produced for the gene currently being analysed.
dgidb_df = pd.read_csv('search/2025-08-13_EGFR_clin_score.csv').drop_duplicates(
    subset=['Drug', 'Gene'],
    keep='first',
)

# Look up PMIDs for the gene against every drug left in the table.
pmid_dicts = literature.fetch_pmids_by_pubtator3drug(
    gene,
    list(dgidb_df['Drug']),
)

In [None]:
# Flatten every PMID collection in pmid_dicts into a single de-duplicated list.
unique_pmids = list(
    {pmid for pmid_group in pmid_dicts.values() for pmid in pmid_group}
)

# Report how many distinct abstracts are about to be requested.
print(len(unique_pmids))

# Retrieve the abstracts for all distinct PMIDs.
abstracts = literature.fetch_abstracts(unique_pmids)

### Generate Scores
Use the retrieved abstracts to determine the literature indicators for each gene-drug interaction pair. Depending on the number of abstracts retrieved, this could take many hours.


In [None]:
# Generate the literature indicators from the fetched abstracts for the
# de-duplicated Drug/Gene rows in dgidb_df. This step can be long-running
# (potentially many hours when a large number of abstracts was retrieved).
indicator.generate_indicators(abstracts, dgidb_df)