## Topic Classifier (specific sub/child categories)
This notebook demonstrates the usage of the scripts that support the classification of outbreak publications, clinical trials, and datasets into specific subcategories (child categories).

In [None]:
%%time
#### Update training data for clinical trials
# Builds the data directories relative to the notebook and calls
# update_clin_cats with the data directory and the clinical-trial subtopic
# directory. What update_clin_cats writes is not visible from this notebook.

# NOTE(review): pathlib, pd and read_csv are not used in this cell. Some later
# cells call read_csv without importing it themselves, so they may depend on
# this import having been executed earlier in the same kernel.
import os
import pathlib
import pandas as pd
from pandas import read_csv


#### MAIN
# Script-style path lookup, disabled here; paths are instead taken relative
# to the current working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')  # defined but not used in this cell
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')
CLINDATAPATH = os.path.join(SUBDATAPATH,'ct_topics/')


# NOTE(review): this star import runs after the path constants are assigned,
# so any module-level name in src.fetch_clinical_trials that matches one of
# them would replace it. Keep this ordering unless that module is checked.
from src.fetch_clinical_trials import *
update_clin_cats(DATAPATH,CLINDATAPATH)

#### time: 1 min, 54 sec

In [None]:
%%time
#### Update training data for all subtopics
# Runs, in order: update_clin_cats, get_sub_topics, map_keywords and
# load_subtopics_data, keeping the result of the last call in subtopics_only.

# pathlib, pd and read_csv are not used directly in this cell.
import os
import pathlib
import pandas as pd
from pandas import read_csv


#### MAIN
# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')
CLINDATAPATH = os.path.join(SUBDATAPATH,'ct_topics/')


# Same call as the clinical-trials-only cell of this notebook.
# NOTE(review): each star import below runs after the path constants are set
# and could rebind same-named globals; statement order is kept as written.
from src.fetch_clinical_trials import *
update_clin_cats(DATAPATH,CLINDATAPATH)

from src.fetch_litsubtopics import *
from src.fetch_litcovid_topics import *
get_sub_topics(DATAPATH,RESULTSPATH)
map_keywords(DATAPATH)

from src.fetch_subtopics import *
from src.common import topic_dict
# NOTE(review): SUBDATAPATH is passed as the first argument here, whereas the
# "Update all topics" cell passes DATAPATH to the same function -- confirm
# which directory load_subtopics_data expects.
subtopics_only = load_subtopics_data(SUBDATAPATH,RESULTSPATH,topic_dict)

## This took 1 hr, 40 min

In [1]:
%%time
## Update all topics
# Full refresh: runs each fetch/update step in sequence and prints a progress
# line before each one. The final result is kept in subtopics_only.

import os
import pathlib
# NOTE(review): topic_dict (used in the last call of this cell) is not imported
# by name here; presumably it arrives through this star import, since other
# cells import topic_dict from src.common explicitly -- confirm.
from src.common import *

# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')
CLINDATAPATH = os.path.join(SUBDATAPATH ,'ct_topics/')

# NOTE(review): every star import below executes after the path constants are
# assigned and could rebind same-named globals; the statement order is
# therefore significant and is left untouched.
print('fetching litcovid topics')
from src.fetch_litcovid_topics import *
get_litcovid_topics(DATAPATH)

print('updating other broad topics in litcovid')
from src.fetch_offtopics import *
get_other_topics(DATAPATH,RESULTSPATH)

print('updating clinical trials annotations')
from src.fetch_clinical_trials import *
update_clin_cats(DATAPATH,CLINDATAPATH)

print('fetching subtopics from litcovid via keyword-mapping')
from src.fetch_litsubtopics import *
# src.fetch_litcovid_topics was already star-imported earlier in this cell.
from src.fetch_litcovid_topics import *
get_sub_topics(DATAPATH,RESULTSPATH)
map_keywords(DATAPATH)

print('fetching all available subtopic data')
from src.fetch_subtopics import *
# NOTE(review): DATAPATH is passed as the first argument here, whereas the
# "Update training data for all subtopics" cell passes SUBDATAPATH to the
# same function -- confirm which directory load_subtopics_data expects.
subtopics_only = load_subtopics_data(DATAPATH,RESULTSPATH,topic_dict)

### run time:1h 10min

fetching litcovid topics
updating other broad topics in litcovid
updating clinical trials annotations
fetching subtopics from litcovid via keyword-mapping
fetching all available subtopic data
Wall time: 1h 5min 47s


In [None]:
#### Inspect the saved subtopic training data
# This cell previously relied on os, pickle and SUBDATAPATH being left in the
# kernel by earlier cells. Import and rebuild them here (same construction as
# the other cells) so the cell also runs on a fresh kernel.
import os
import pickle

script_path = ''
DATAPATH = os.path.join(script_path,'data/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')

# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this project.
with open(os.path.join(SUBDATAPATH,'subtopics_only.pickle'),'rb') as save_file:
    subtopics_only = pickle.load(save_file)

# Row count per topicCategory value.
print(subtopics_only.groupby('topicCategory').size().reset_index(name='counts'))

In [None]:
%%time
#### update models for all subtopic
# Loads the pickled subtopic training data and passes it to generate_models
# together with the 'best' classifier set.

# pickle is imported explicitly: the cell calls pickle.load but previously only
# had the name available if a star import or an earlier cell provided it.
# The explicit import sits before the star imports so that they keep the same
# precedence over global names as before.
import os
import pickle
import pandas as pd
import pathlib
from src.train_classifier import *
from src.fetch_subtopics import *
from src.common import topic_dict

# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
SUBMODELPATH = os.path.join(MODELPATH,'subtopics/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')

# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this project.
with open(os.path.join(SUBDATAPATH,'subtopics_only.pickle'),'rb') as save_file:
    subtopics_only = pickle.load(save_file)

# NOTE(review): load_classifiers is not imported by name in this cell;
# presumably it comes from one of the star imports above -- confirm.
classifiers = load_classifiers('best')
# NOTE(review): the meaning of the trailing "all" and False arguments is
# defined by generate_models and is not visible from this notebook.
generate_models(SUBMODELPATH,subtopics_only,classifiers,"all",False)

## This took 9hrs, 10 min
## Adding a limitation on max training size (30000) actually slowed down the script, so it was removed

In [None]:
%%time
## update models
# Interactive cell: asks which group of models to regenerate and calls
# generate_models accordingly.
#   b -> broad topic models only
#   c -> child/sub-topic models only
#   a -> both of the above
#   s -> a single topic category, entered at a second prompt

# read_csv is imported explicitly: the cell calls it but previously only had
# the name available if a star import or an earlier cell provided it. The
# explicit import sits before the star imports so that they keep the same
# precedence over global names as before.
import os
import pandas as pd
import pathlib
from pandas import read_csv
from src.train_classifier import *
from src.fetch_subtopics import *
from src.common import topic_dict

# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
SUBMODELPATH = os.path.join(MODELPATH,'subtopics/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)

classifiers = load_classifiers('best')

# Keep asking until one of the four option letters is entered. The answer is
# stripped and lower-cased so that "A" or " a " are accepted too.
models_to_update = 'i'
while models_to_update not in ['b','a','s','c']:
    models_to_update = input("Which models need to be updated? (b: broad topics, c: child/sub-topics, a: all topics, s: single topic) ").strip().lower()

# NOTE(review): load_subtopics_data receives SUBDATAPATH in the 'a' branch but
# DATAPATH in the 'c' and 's' branches. The arguments are left exactly as they
# were because the expected directory cannot be determined from this notebook
# -- confirm against the function and make the branches agree.
if models_to_update == 'a':
    topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
    topiclist = topicsdf['topicCategory'].unique().tolist()
    generate_models(MODELPATH,topicsdf,classifiers)
    subtopics_only = load_subtopics_data(SUBDATAPATH,RESULTSPATH,topic_dict)
    generate_models(SUBMODELPATH,subtopics_only,classifiers,"all",False)
elif models_to_update == 'b':
    topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
    topiclist = topicsdf['topicCategory'].unique().tolist()
    generate_models(MODELPATH,topicsdf,classifiers)
elif models_to_update == 'c':
    subtopics_only = load_subtopics_data(DATAPATH,RESULTSPATH,topic_dict)
    generate_models(SUBMODELPATH,subtopics_only,classifiers,"all",False)
elif models_to_update == 's':
    # Surrounding whitespace is removed; case is kept because the entry is
    # compared against the category names in topic_dict['broadtopics'].
    topic_to_check = input("enter the topic Category: ").strip()
    if topic_to_check in topic_dict['broadtopics']:
        topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
        topiclist = topicsdf['topicCategory'].unique().tolist()
        generate_models(MODELPATH,topicsdf,classifiers,topic_to_check)
    else:
        # Anything that is not a broad topic is handled as a subtopic.
        subtopics_only = load_subtopics_data(DATAPATH,RESULTSPATH,topic_dict)
        generate_models(SUBMODELPATH,subtopics_only,classifiers,topic_to_check,False)

#### Subtopics update run time: 2hrs 52 min


In [None]:
%%time
#### classify clinical trials
# Loads the 'best' classifier set and hands it to classify_clins together with
# the data, model and prediction directories and topic_dict.
import os
import pathlib

import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers, topic_dict

# Working directories, resolved relative to the current working directory.
script_path = ''
DATAPATH, MODELPATH, PREDICTPATH = (
    os.path.join(script_path, folder)
    for folder in ('data/', 'models/', 'predictions/')
)

classifiers = load_classifiers('best')
classify_clins(DATAPATH, MODELPATH, PREDICTPATH, classifiers, topic_dict)

#### run time: 4 min, 17 sec

In [None]:
%%time
#### classify publications
# Fetches metadata for publications that need classification and runs
# predict_class twice: once with the subtopic models and once with the broad
# topic models.

# read_csv is imported explicitly: the cell calls it but previously only had
# the name available if a star import or an earlier cell provided it. The
# explicit import sits before the star import so that the latter keeps the
# same precedence over global names as before.
import os
import pandas as pd
import pathlib
from pandas import read_csv
from src.classify_pubs import *
from src.common import load_classifiers


script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
subtopicsfile = os.path.join(DATAPATH,'subtopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
subtopicsdf = read_csv(subtopicsfile,delimiter='\t',header=0,index_col=0)
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
# Broad topics are the categories found in the two topic files; subtopics are
# the categories in subtopics.tsv that are not already broad topics.
topiclist = topicsdf['topicCategory'].unique().tolist()
allsubslist = subtopicsdf['topicCategory'].unique().tolist()
subtopiclist = [x for x in allsubslist if x not in topiclist]

new_pubs_only,new_topic_ids = check_for_new(RESULTSPATH,topicsdf,"nonlitcovid")
# Union of both id collections, de-duplicated.
all_new_ids = list(set(new_pubs_only).union(set(new_topic_ids)))
newonly = False
alldf = batch_fetch_meta(all_new_ids)
alldata = merge_texts(alldf)

classifiers = load_classifiers("best")
classifierlist = classifiers.keys()
PUBPREDICTPATH = os.path.join(PREDICTPATH,'pubpredict/')
SUBMODELPATH = os.path.join(MODELPATH,'subtopics/')

# The newonly=True branch used to call predict_class twice with the broad-topic
# arguments, so the subtopic models were never applied in that mode. Both modes
# now run the subtopic pass followed by the broad-topic pass, and differ only
# in the final flag, which is the value of newonly.
predict_class(SUBMODELPATH,PUBPREDICTPATH,subtopiclist,classifierlist,alldata,newonly)
predict_class(MODELPATH,PUBPREDICTPATH,topiclist,classifierlist,alldata,newonly)


#### runtime: 1 hr 28 min

In [None]:
#### Run tests
# Depends on run_test and topicsdf already being present in the kernel from a
# previously executed cell; neither is defined here.
# Note the name is RESULTPATH (no "S"), distinct from the RESULTSPATH variable
# used by the other cells.
RESULTPATH = 'results/'
testresultsdf = run_test(
    RESULTPATH,
    topicsdf,
    classifierset_type='best',
    export_report=True,
)

In [None]:
%%time
#### load annotations
# Classifies clinical trials and publications, merges the predictions with the
# existing topic annotations and writes the combined result to
# results/topicCats.tsv and results/topicCats.json.
#
# Fixes relative to the earlier version of this cell:
#   * reset_index(drop=True) returns a new frame; its result is now assigned
#     instead of being discarded.
#   * in append mode the header row is only written when the output file does
#     not exist yet (or is empty), so no header line ends up as a data row.
#   * read_csv is imported explicitly instead of relying on another cell or a
#     star import to provide it.
import os
import pandas as pd
import pathlib
from pandas import read_csv
from src.classify_pubs import *
from src.common import load_classifiers
from src.common import topic_dict
from datetime import datetime

script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')
topiccatsfile = os.path.join(RESULTSPATH,'topicCats.tsv')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
#subtopicsfile = os.path.join(DATAPATH,'subtopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
#subtopicsdf = read_csv(subtopicsfile,delimiter='\t',header=0,index_col=0)
subtopic_results = read_csv(os.path.join(RESULTSPATH,'subtopicCats.tsv'),delimiter='\t',header=0,index_col=0)
topicsdf = pd.concat((littopicsdf,offtopicsdf,subtopic_results),ignore_index=True)
topicsdf.drop_duplicates(keep='first',inplace=True)
# Re-number the rows after dropping duplicates.
topicsdf = topicsdf.reset_index(drop=True)

# Sanity-check counts for one category.
print('topicsdf: Molecular Epi: ',len(topicsdf.loc[topicsdf['topicCategory']=='Molecular Epidemiology']))
print('topicsdf: Molecular Epi PMIDs: ',len(topicsdf.loc[((topicsdf['topicCategory']=='Molecular Epidemiology')&
                                               (topicsdf['_id'].str.contains('pmid',regex=False)))]))

topiclist = topicsdf['topicCategory'].unique().tolist()
classifiers = load_classifiers('best')
# False: classify everything and overwrite topicCats.tsv.
# True: classify only new ids and append to topicCats.tsv.
newonly=False

classifierlist = classifiers.keys()
CLINPREDICTPATH = os.path.join(PREDICTPATH,'clinpredict/')
PUBPREDICTPATH = os.path.join(PREDICTPATH,'pubpredict/')
classify_clins(DATAPATH,MODELPATH,PREDICTPATH,classifiers,topic_dict)
clin_total_agree = merge_predictions(CLINPREDICTPATH,topic_dict,classifierlist,agreetype='perfect')
if newonly:
    new_pubs_only,new_topic_ids = check_for_new(RESULTSPATH,topicsdf,"nonlitcovid")
    all_new_ids = list(set(new_pubs_only).union(set(new_topic_ids)))
    classify_pubs(MODELPATH,PUBPREDICTPATH,new_pubs_only,topic_dict,classifiers)
    total_agree = merge_predictions(PUBPREDICTPATH,topic_dict,classifierlist,'perfect')
    # Keep only predictions / existing annotations that belong to the new ids.
    new_total_agree = total_agree.loc[total_agree['_id'].isin(new_pubs_only)].copy()
    new_topics_df = topicsdf.loc[topicsdf['_id'].isin(new_topic_ids)].copy()
    totalnewresults = pd.concat((new_total_agree,new_topics_df,clin_total_agree),ignore_index=True)
    allnewresults = include_clin(totalnewresults)
    allnewresults['topicCategory'] = allnewresults['topicCategory'].str.replace('-','/')
    allnewresults.dropna(axis=0,inplace=True)
    allnewresults = allnewresults.reset_index(drop=True)
    cleanresults = clean_results(allnewresults)
    # Only emit the column header when starting a new file; appending it to an
    # existing file would add a bogus data row.
    write_header = (not os.path.exists(topiccatsfile)) or os.path.getsize(topiccatsfile) == 0
    cleanresults.to_csv(topiccatsfile,mode='a',sep='\t',header=write_header)
else:
    all_ids = get_pub_ids(sourceset="nonlitcovid")
    classify_pubs(MODELPATH,PUBPREDICTPATH,all_ids,topic_dict,classifiers,False)
    total_agree = merge_predictions(PUBPREDICTPATH,topic_dict,classifierlist,'perfect')
    totalresults = pd.concat((total_agree,topicsdf,clin_total_agree),ignore_index=True)
    print('total results: ',len(totalresults))
    allresults = include_clin(totalresults)
    print('allresults: ',len(allresults))
    allresults['topicCategory'] = allresults['topicCategory'].str.replace('-','/')
    allresults.dropna(axis=0,inplace=True)
    print('allresults less na: ',len(allresults))
    allresults = allresults.reset_index(drop=True)
    print('allresults with new index: ',len(allresults))
    cleanresults = clean_results(allresults)
    print('clean results: ',len(cleanresults))
    cleanresults.to_csv(topiccatsfile,mode='w',sep='\t',header=True)
# Re-read what was written, keep the first row per _id, and export the result
# as both TSV and JSON.
updated_results = read_csv(topiccatsfile,delimiter='\t',header=0,index_col=0)
updated_results.drop_duplicates(subset='_id',keep='first',inplace=True)
updated_results.to_csv(topiccatsfile,sep='\t',header=True)
updated_results.to_json(os.path.join(RESULTSPATH,'topicCats.json'), orient='records')

####time: 3hrs 14 min

In [None]:
#### Inspect the results
# Builds a per-topicCategory frequency table that combines several counts
# (all results, LitCovid-derived topics, citizen-science curation, clinical
# trial ids, remaining "preprint" ids) and writes it to
# results/topic_frequencies.tsv.
#
# Fixes relative to the earlier version of this cell:
#   * clinicalfreq was computed from an undefined name `clinical`; it now uses
#     rawclinical, the frame built on the line above it.
#   * imports and path constants are set up before their first use instead of
#     part-way through the cell, so the cell no longer depends on an earlier
#     cell having defined them.
import os
import pandas as pd
from pandas import read_csv
from src.fetch_subtopics import load_citsci_data

script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')

# Substrings used to recognise id families in the `_id` column.
CLINICAL_ID_PATTERN = 'NCT|DKRS|DRKS|ACTRN|ChiCTR|IRCT'
NON_PREPRINT_ID_PATTERN = CLINICAL_ID_PATTERN + '|pmid|zenodo|pdb|figshare'

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
subtopicsdf = read_csv(os.path.join(RESULTSPATH,'subtopicCats.tsv'),delimiter='\t',header=0,index_col=0)
subtopicsdf.dropna(axis=0,inplace=True)

# Topic counts over the topic files plus the pmid rows of subtopicCats.tsv.
litsub = subtopicsdf.loc[subtopicsdf['_id'].str.contains('pmid')]
litcovidtopics = pd.concat((littopicsdf,offtopicsdf,litsub),ignore_index=True)
litcovidtopics.drop_duplicates(keep='first',inplace=True)
litcovidtopics.dropna(axis=0,inplace=True)
littopicfreq = litcovidtopics.groupby('topicCategory').size().reset_index(name='litcounts')

# topicCategory is stored as the text form of a list (e.g. "['A', 'B']");
# the converter turns it back into a list so it can be exploded to one row
# per (record, category).
updated_results = read_csv(os.path.join(RESULTSPATH,'topicCats.tsv'),delimiter='\t',header=0,index_col=0,converters={"topicCategory": lambda x: x.strip("[]").replace("'","").split(", ")})
check_it = updated_results.explode('topicCategory')
frequency = check_it.groupby('topicCategory').size().reset_index(name='allcounts')

curate_df = load_citsci_data(SUBDATAPATH)
curate_freq = curate_df.groupby('topicCategory').size().reset_index(name='citsci_counts')
# Normalise "A / B" to "A/B" so the names line up with the other tables.
curate_freq['topicCategory'] = curate_freq['topicCategory'].str.replace(' / ','/')

rawclinical = check_it.loc[check_it['_id'].str.contains(CLINICAL_ID_PATTERN)].copy()
clinicalfreq = rawclinical.groupby('topicCategory').size().reset_index(name='clinical_counts')

# Everything whose id matches none of the known id families.
rawpreprint = check_it.loc[~(check_it['_id'].str.contains(NON_PREPRINT_ID_PATTERN))].copy()
preprintfreq = rawpreprint.groupby('topicCategory').size().reset_index(name='preprint_count')

# Outer merges keep categories that appear in only some tables; missing counts become 0.
basic_info = frequency.merge(littopicfreq.merge(curate_freq,on='topicCategory',how='outer'),on='topicCategory',how='outer').fillna(0)
freq_info = basic_info.merge(clinicalfreq.merge(preprintfreq,on='topicCategory',how='outer'),on='topicCategory',how='outer').fillna(0)

freq_info.to_csv(os.path.join(RESULTSPATH,'topic_frequencies.tsv'),sep='\t',header=True)

In [None]:
%%time
## refresh annotations
# Rebuilds the combined topic table from the topic files and subtopicCats.tsv
# and passes it to load_annotations with the final argument set to False.

# pd and read_csv are imported explicitly: the cell uses both but previously
# only had them available if a star import or an earlier cell provided them.
# The explicit imports sit before the star import so that the latter keeps the
# same precedence over global names as before.
import os
import pathlib
import pandas as pd
from pandas import read_csv

from src.classify_pubs import *
from src.common import load_classifiers
#### MAIN
# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
subtopic_results = read_csv(os.path.join(RESULTSPATH,'subtopicCats.tsv'),delimiter='\t',header=0,index_col=0)
# Stack the three annotation sources and drop exact duplicate rows.
topicsdf = pd.concat((littopicsdf,offtopicsdf,subtopic_results),ignore_index=True)
topicsdf.drop_duplicates(keep='first',inplace=True)

classifiers = load_classifiers('best')
# NOTE(review): the final False is presumably the "new only" switch (the
# "update annotations" cell makes the same call with True) -- confirm against
# load_annotations.
load_annotations(DATAPATH,MODELPATH,PREDICTPATH,RESULTSPATH,topicsdf,classifiers,False)

#### run time: 8 hr 46 min
#### After efficiency changes implemented, run time was: 2 hr 13 min

In [None]:
%%time
## update annotations
# Rebuilds the combined topic table from the topic files and subtopicCats.tsv
# and passes it to load_annotations with the final argument set to True.

# pd and read_csv are imported explicitly: the cell uses both but previously
# only had them available if a star import or an earlier cell provided them.
# The explicit imports sit before the star import so that the latter keeps the
# same precedence over global names as before.
import os
import pathlib
import pandas as pd
from pandas import read_csv

from src.classify_pubs import *
from src.common import load_classifiers
#### MAIN
# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
subtopic_results = read_csv(os.path.join(RESULTSPATH,'subtopicCats.tsv'),delimiter='\t',header=0,index_col=0)
# Stack the three annotation sources and drop exact duplicate rows.
topicsdf = pd.concat((littopicsdf,offtopicsdf,subtopic_results),ignore_index=True)
topicsdf.drop_duplicates(keep='first',inplace=True)

classifiers = load_classifiers('best')
# NOTE(review): the final True is presumably the "new only" switch (the
# "refresh annotations" cell makes the same call with False) -- confirm
# against load_annotations.
load_annotations(DATAPATH,MODELPATH,PREDICTPATH,RESULTSPATH,topicsdf,classifiers,True)

#### run time:

In [None]:
## Speed up map_keywords function

In [None]:
%%time
# Times a run of map_keywords and then inspects the 'Repurposing' rows of the
# pickled training table found under data/subtopics/.

# pickle is imported explicitly: the cell calls pickle.load but previously only
# had the name available if a star import or an earlier cell provided it. The
# explicit import sits before the star imports so that they keep the same
# precedence over global names as before.
import os
import pickle
import pandas as pd
import pathlib
from src.fetch_subtopics import *
from src.common import *
from src.fetch_litsubtopics import *

# Script-style path lookup, disabled here; paths are relative to the current
# working directory via the empty script_path below.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
SUBMODELPATH = os.path.join(MODELPATH,'subtopics/')

map_keywords(DATAPATH)

SUBDATAPATH = os.path.join(DATAPATH,'subtopics/')
# NOTE(review): presumably this pickle is (re)written by map_keywords above --
# confirm. pickle.load can execute arbitrary code; only load pickle files that
# were produced by this project.
with open(os.path.join(SUBDATAPATH,'subtopic_pmids_for_training.pickle'),"rb") as dumpfile:
    curated_pmids_df = pickle.load(dumpfile)

print(curated_pmids_df.loc[curated_pmids_df['topicCategory']=='Repurposing'])