In [98]:
import pandas as pd
import sys, os, ast

# Add the parent directory of this notebook to the Python path
sys.path.append(os.path.abspath('..'))

import score

## Example Code for Generating Clinical Prioritization Scores
This notebook demonstrates how to turn a set of literature indicators into new prioritization scores for downstream curation or hypothesis-generation efforts.

### Load Indicator Scores

In [None]:
search_strategies = ['raw','ncbi','pubtator3']

# One (strategy, archive path) pair per search strategy. 'pubtator3+drug' is
# listed explicitly because its archive is dated 2025-08-13, not 2025-08-14
# like the three strategies above.
assessment_archives = [
    (strategy, f'search_strategy_evaluation/{strategy}/2025-08-14_BCL2.zip')
    for strategy in search_strategies
]
assessment_archives.append(
    ('pubtator3+drug', 'search_strategy_evaluation/pubtator3+drug/2025-08-13_BCL2.zip')
)

# Load every archive first and concatenate once. Growing a DataFrame with
# pd.concat inside a loop re-copies all previously loaded rows on each pass.
# The per-archive row indices are kept (no ignore_index).
assessments = pd.concat(
    [
        score.load_pmid_assessments(archive, strategy)
        for strategy, archive in assessment_archives
    ]
)

Unnamed: 0,pmid,label,scores,gene,drug,method
0,40810224,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,raw
1,40809918,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,raw
2,40809891,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,raw
3,40809237,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,raw
4,40809170,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,raw
...,...,...,...,...,...,...
986057,31799611,not_evaluated,0.0,BCL2,VINCRISTINE,pubtator3+drug
986058,34567953,not_evaluated,0.0,BCL2,VINCRISTINE,pubtator3+drug
986059,34631583,not_evaluated,0.0,BCL2,VINCRISTINE,pubtator3+drug
986060,33760220,not_evaluated,0.0,BCL2,VINCRISTINE,pubtator3+drug


In [70]:
# Up to five rows per PMID among the assessments tagged with the 'pubtator3' method.
assessments.loc[assessments['method'].eq('pubtator3')].groupby('pmid').head(5)

Unnamed: 0,pmid,label,scores,gene,drug,method
0,39796006,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,pubtator3
1,40524014,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,pubtator3
2,40393040,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,pubtator3
3,40619042,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,pubtator3
4,40444042,not_evaluated,0.0,BCL2,4-PHENYLBUTYRIC ACID,pubtator3
...,...,...,...,...,...,...
49995,34898246,not_evaluated,0.0,BCL2,ANG1005,pubtator3
49996,34434246,not_evaluated,0.0,BCL2,ANG1005,pubtator3
49997,30644246,not_evaluated,0.0,BCL2,ANG1005,pubtator3
49998,37748246,not_evaluated,0.0,BCL2,ANG1005,pubtator3


In [None]:
# Count rows per method, skipping rows whose `scores` is the placeholder
# '0.0' / 0.0 (both the string and the float are checked) and rows labelled
# 'no_indicator_evidence'.
(
    assessments
    .loc[
        assessments['scores'].ne('0.0')
        & assessments['scores'].ne(0.0)
        & assessments['label'].ne('no_indicator_evidence'),
        'method',
    ]
    .value_counts()
)

method
pubtator3+drug    1073
raw                382
pubtator3          270
ncbi                45
Name: count, dtype: int64

### Attach Indicators to DGIdb Search Set
Now that we have indicators loaded from our NLP methods, let's total them for each drug and attach those totals to the matching DGIdb search terms.

In [35]:
# Read the DGIdb search set and keep only the first row seen for each
# (Drug, Gene) pair.
dgidb_df = (
    pd.read_csv('search/2025-08-14_BCL2_clin_score.csv')
    .drop_duplicates(subset=['Drug', 'Gene'], keep='first')
)
dgidb_df.head(5)

Unnamed: 0,nomenclature,Gene,long_name,gene_concept_id,interaction_score,drug_specificity,gene_specificity,evidence_score,source_db_name,source_db_version,...,directionality,definition,reference,drug_name,nomenclature-2,Drug,approved,immunotherapy,anti_neoplastic,drug_concept_id
0,Gene Symbol,BCL2,BCL2 apoptosis regulator,hgnc:990,2.284732,4.141413,0.183893,3.0,TALC,12-May-16,...,1.0,"In inhibitor interactions, the drug binds to a...","<a href=""https://en.wikipedia.org/wiki/Enzyme_...",OBLIMERSEN,Primary Name,OBLIMERSEN,False,False,True,ncit:C1870
3,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,DOLASTATIN 10,Primary Name,DOLASTATIN 10,False,False,False,ncit:C1300
4,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,EPISIALIN,Primary Name,MUC-1 ANTIGEN,False,False,False,ncit:C2407
5,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,MICELLAR PACLITAXEL,Primary Name,MICELLAR PACLITAXEL,False,False,False,ncit:C29256
6,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,0.183893,2.0,NCI,14-Sep-17,...,,,,BEAUVERICIN,Primary Name,BEAUVERICIN,False,False,False,ncit:C1011


In [None]:
def unpack_total(score):
    """Return the ``'unweighted_total'`` entry of one ``scores`` cell.

    Parameters
    ----------
    score : str or dict
        Either the Python-literal text of a dict, e.g.
        ``"{'clinical_study': 1, 'unweighted_total': 5}"``, or a dict that
        has already been parsed. Note that this parameter shadows the
        imported ``score`` module inside the function body only.

    Returns
    -------
    The value stored under the ``'unweighted_total'`` key.

    Raises
    ------
    ValueError, SyntaxError
        If ``score`` is neither a dict nor something ``ast.literal_eval``
        can parse.
    KeyError
        If the parsed dict has no ``'unweighted_total'`` key.
    """
    # A dict needs no parsing, and ast.literal_eval would reject it.
    # Everything else goes through literal_eval as before.
    parsed = score if isinstance(score, dict) else ast.literal_eval(score)
    return parsed['unweighted_total']

# Keep only rows that carry a score payload: drop the '0.0' / 0.0 placeholders
# (string and float) and rows labelled 'no_indicator_evidence'. Then add
#   pmid_indicator  - the 'unweighted_total' unpacked from each row's scores
#   total_indicator - the pmid_indicator sum over each (drug, method) group,
#                     repeated on every row of that group
trimmed_data = (
    assessments
    .loc[
        assessments['scores'].ne('0.0')
        & assessments['scores'].ne(0.0)
        & assessments['label'].ne('no_indicator_evidence')
    ]
    .reset_index(drop=True)
    .assign(pmid_indicator=lambda rows: rows['scores'].apply(unpack_total))
    .assign(
        total_indicator=lambda rows: (
            rows.groupby(['drug', 'method'])['pmid_indicator'].transform('sum')
        )
    )
)

# Distinct drugs that still have at least one row after the filter.
drugs = list(trimmed_data['drug'].unique())

trimmed_data


Unnamed: 0,pmid,label,scores,gene,drug,method,pmid_indicator,total_indicator
0,37753296,indicator_evidence,"{'clinical_study': 0, 'case_report': 0, 'anima...",BCL2,4-PHENYLBUTYRIC ACID,raw,2,2
1,40253027,indicator_evidence,"{'clinical_study': 1, 'case_report': 0, 'anima...",BCL2,AMINOLEVULINIC ACID,raw,5,5
2,40612794,indicator_evidence,"{'clinical_study': 0, 'case_report': 0, 'anima...",BCL2,BEAUVERICIN,raw,2,2
3,40593443,indicator_evidence,"{'clinical_study': 1, 'case_report': 0, 'anima...",BCL2,BORTEZOMIB,raw,4,27
4,40539846,indicator_evidence,"{'clinical_study': 1, 'case_report': 0, 'anima...",BCL2,BORTEZOMIB,raw,2,27
...,...,...,...,...,...,...,...,...
1765,24289107,indicator_evidence,"{'clinical_study': 1, 'case_report': 0, 'anima...",BCL2,VINCRISTINE,pubtator3+drug,2,349
1766,26301373,indicator_evidence,"{'clinical_study': 0, 'case_report': 0, 'anima...",BCL2,VINCRISTINE,pubtator3+drug,4,349
1767,25090026,indicator_evidence,"{'clinical_study': 1, 'case_report': 0, 'anima...",BCL2,VINCRISTINE,pubtator3+drug,1,349
1768,27628560,indicator_evidence,"{'clinical_study': 0, 'case_report': 0, 'anima...",BCL2,VINCRISTINE,pubtator3+drug,2,349


In [133]:
# Search strategy whose totals are attached to the DGIdb rows below.
method = 'pubtator3'

# One row per (gene, drug, method). 'first' is used because total_indicator
# was computed as a per-(drug, method) sum and repeated on every row of that
# group. Only the rows for the selected method are kept.
summary = (
    trimmed_data
    .groupby(['gene', 'drug', 'method'], as_index=False)
    .agg(total_indicator=('total_indicator', 'first'))
    .loc[lambda totals: totals['method'].eq(method)]
    .reset_index(drop=True)
)

# Left join keeps every summarised drug, with or without a DGIdb match.
merged_df = summary.merge(dgidb_df, how='left', left_on='drug', right_on='Drug')
merged_df.head(5)

Unnamed: 0,gene,drug,method,total_indicator,nomenclature,Gene,long_name,gene_concept_id,interaction_score,drug_specificity,...,directionality,definition,reference,drug_name,nomenclature-2,Drug,approved,immunotherapy,anti_neoplastic,drug_concept_id
0,BCL2,ALPHA-TOCOPHEROL,pubtator3,2,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,0.138469,0.376492,...,,,,ALPHA-TOCOPHEROL,Primary Name,ALPHA-TOCOPHEROL,False,False,False,ncit:C74960
1,BCL2,AMINOLEVULINIC ACID,pubtator3,11,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,1.523154,4.141413,...,,,,5-AMINOLAEVULINIC ACID,Primary Name,AMINOLEVULINIC ACID,False,False,False,ncit:C234
2,BCL2,BORTEZOMIB,pubtator3,12,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,0.009066,0.049303,...,,,,Bortezomib,Primary Name,BORTEZOMIB,True,False,True,rxcui:358258
3,BCL2,C5A,pubtator3,4,Gene Name,BCL2,BCL2 apoptosis regulator,hgnc:990,0.217593,0.59163,...,,,,C5A,Primary Name,C5A,False,False,False,iuphar.ligand:3576
4,BCL2,CARBOPLATIN,pubtator3,7,Gene Symbol,BCL2,BCL2 apoptosis regulator,hgnc:990,0.00896,0.048723,...,,,,CARBOPLATIN,Primary Name,CARBOPLATIN,True,False,True,rxcui:40048


### Create Graphs for Research