# This is the TopicCategory classifier for outbreak.info publications. 

It has not yet been tested on non-publication resources, because it is trained entirely on publication data, primarily from LitCovid. It contains functions for the following tasks:

1. Retrieve and format LitCovid Topics (must be done frequently)
2. Build Behavioral and offtopic training sets (must be done frequently)
3. Generate Models (should be done rarely, or only on newly introduced topics)
4. Classify non-litcovid publications using models (must be done for new publications)
5. Merge formatted LitCovid Topics and Offtopic data with predicted data for inclusion into all publications

In [None]:
%%time
#### Update Litcovid topics
import os
import pathlib
from src.fetch_litcovid_topics import *

# The script-style lookup below is disabled (presumably because a notebook
# cell has no __file__); with an empty script_path every path is resolved
# relative to the current working directory.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')

# get_litcovid_topics is expected to come from the star import above; it is
# given the data directory as its only argument.
get_litcovid_topics(DATAPATH)

## Runtime: 43.7 seconds

In [16]:
%%time
####  Update Offtopics
import os
import pathlib
from src.fetch_offtopics import *
#### MAIN
# The script-style lookup below is disabled (presumably because a notebook
# cell has no __file__); with an empty script_path every path is resolved
# relative to the current working directory.
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')

# get_other_topics is expected to come from the star import above; it is
# given both the data and the results directory.
get_other_topics(DATAPATH,RESULTSPATH)

## Runtime: 21 min

Wall time: 21min 2s


In [None]:
#### Run classifier tests
import os
import pandas as pd
import pathlib
from src.train_classifier import *

#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
# read_csv is called through the pandas namespace imported above, so the cell
# does not depend on `read_csv` leaking in through the star import.
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Training table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

# run_test comes from src.train_classifier; the results directory is passed in
# and report export is requested.
testresultsdf = run_test(RESULTSPATH,topicsdf,classifierset_type='best',export_report=True)

In [None]:
#### Update all models
import os
import pandas as pd
import pathlib
from src.train_classifier import *

#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
# read_csv is called through the pandas namespace imported above, so the cell
# does not depend on `read_csv` leaking in through the star import.
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Training table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

# No topic argument is given, unlike the single-topic variant of this call.
classifiers = load_classifiers('best')
generate_models(MODELPATH,topicsdf,classifiers)

In [None]:
#### Update single topic model
import os
import pandas as pd
import pathlib
from src.train_classifier import *

#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
# read_csv is called through the pandas namespace imported above, so the cell
# does not depend on `read_csv` leaking in through the star import.
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Training table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')
# The topic is typed in interactively and passed on as the fourth argument.
# NOTE(review): the input is not checked against topiclist - confirm how
# generate_models treats an unknown topic name.
topic_to_check = input("enter the topic Category: ")
generate_models(MODELPATH,topicsdf,classifiers,topic_to_check)

In [None]:
#### Update annotations for all pubs
import os
import pathlib

# pandas is imported explicitly: the cell uses pd.concat and read_csv, which
# were previously only available if the star import happened to expose them.
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers
#### MAIN
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Topic table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')
# newonly=False is passed explicitly; judging by the cell title this
# re-annotates every publication rather than only new ones.
load_annotations(MODELPATH,PREDICTPATH,RESULTSPATH,topicsdf,classifiers,newonly = False)

In [None]:
#### Update annotations for new pubs only
import os
import pathlib

# pandas is imported explicitly: the cell uses pd.concat and read_csv, which
# were previously only available if the star import happened to expose them.
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers
#### MAIN
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Topic table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')
# newonly is left at its default here.
# NOTE(review): the cell title implies that default restricts the run to new
# publications - confirm against load_annotations.
load_annotations(MODELPATH,PREDICTPATH,RESULTSPATH,topicsdf,classifiers)

In [None]:
#### Update (refresh) the annotations for all publications
import os
import pathlib

# pandas is imported explicitly: the cell uses pd.concat and read_csv, which
# were previously only available if the star import happened to expose them.
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers
#### MAIN
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')

littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
# Topic table: LitCovid topics plus the off-topic examples.
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')
# newonly=False is passed explicitly; judging by the cell title this
# refreshes the annotations of every publication.
load_annotations(MODELPATH,PREDICTPATH,RESULTSPATH,topicsdf,classifiers,newonly=False)

## Increasing the efficiency of common functions

In [None]:
%%time
#### Increasing the efficiency of the cleanresults function

import os
import pandas as pd
from pandas import read_csv

script_path = ''
# Working folders, all resolved relative to script_path.
DATAPATH, RESULTSPATH, MODELPATH, PREDICTPATH = (
    os.path.join(script_path, folder)
    for folder in ('data/', 'results/', 'models/', 'predictions/')
)

# Every input is tab-separated, has a header row and keeps its row index in
# the first column.
_tsv_kwargs = dict(delimiter='\t', header=0, index_col=0)

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')
subtopicsfile = os.path.join(DATAPATH, 'subtopics.tsv')

littopicsdf = read_csv(littopicsfile, **_tsv_kwargs)
offtopicsdf = read_csv(offtopicsfile, **_tsv_kwargs)
subtopicsdf = read_csv(subtopicsfile, **_tsv_kwargs)
subtopic_results = read_csv(os.path.join(RESULTSPATH, 'subtopicCats.tsv'), **_tsv_kwargs)

# Stack the four tables into the single frame used by the benchmark.
topicsdf = pd.concat(
    (littopicsdf, offtopicsdf, subtopicsdf, subtopic_results),
    ignore_index=True,
)


def clean_results(allresults):
    """Collapse a long topic table into one row per ``_id``.

    Args:
        allresults: DataFrame with at least the columns ``_id`` and
            ``topicCategory``, one row per (id, category) assignment.
            The frame is not modified.

    Returns:
        DataFrame with the columns ``_id`` and ``topicCategory``, where
        ``topicCategory`` is the list of distinct categories of that id in
        order of first appearance. Rows are ordered by ``_id`` (groupby
        default).
    """
    # Repeated (id, category) pairs would otherwise show up as repeated list
    # entries. The earlier loop-based implementation removed identical rows
    # first; only these two columns reach the output, so they define what
    # counts as a duplicate. Dropping on a copy leaves the caller's frame
    # untouched.
    unique_pairs = allresults.drop_duplicates(subset=['_id','topicCategory'],keep='first')
    cleanresults = unique_pairs.groupby('_id')['topicCategory'].apply(list).reset_index(name='topicCategory')
    return cleanresults

# Collapse the combined topic table to one row per _id and preview the first
# two rows; the %%time magic at the top of the cell reports the run time.
cleanresults = clean_results(topicsdf)
print(cleanresults.head(n=2))

#### The new method takes 14.5 seconds

In [None]:
%%time
#### The old cleanresults function (timing baseline for the faster version)

def clean_results(allresults):
    """Collapse a long topic table into one row per ``_id`` (slow baseline).

    Identical rows are removed from ``allresults`` IN PLACE first. Ids that
    occur more than once are then handled one at a time, ids that occur once
    are wrapped into single-element lists, and the two parts are stacked with
    the multi-category ids first.

    Args:
        allresults: DataFrame with the columns ``_id`` and ``topicCategory``.

    Returns:
        DataFrame with the columns ``_id`` and ``topicCategory`` (a list).
    """
    allresults.drop_duplicates(keep="first", inplace=True)

    id_sizes = allresults.groupby('_id').size().reset_index(name='counts')
    repeated_ids = id_sizes.loc[id_sizes['counts'] > 1, '_id'].unique().tolist()
    lone_ids = id_sizes.loc[id_sizes['counts'] == 1, '_id'].tolist()

    # One full-frame filter per repeated id.
    multi_rows = [
        {
            '_id': pub_id,
            'topicCategory': allresults.loc[allresults['_id'] == pub_id, 'topicCategory'].tolist(),
        }
        for pub_id in repeated_ids
    ]
    multi_df = pd.DataFrame(multi_rows)

    # Ids with a single category keep their order from the input frame.
    lone_rows = allresults.loc[allresults['_id'].isin(lone_ids), ['_id', 'topicCategory']]
    wrapped = [
        (pub_id, [category])
        for pub_id, category in zip(lone_rows['_id'].tolist(), lone_rows['topicCategory'].tolist())
    ]
    single_df = pd.DataFrame(wrapped, columns=['_id', 'topicCategory'])

    return pd.concat((multi_df, single_df), ignore_index=True)

# Same call as for the faster version, timed by the %%time magic at the top of
# the cell. NOTE: this clean_results drops duplicate rows from topicsdf in
# place, so topicsdf itself is changed by the call.
cleanresults = clean_results(topicsdf)
print(cleanresults.head(n=2))

#### The old method took a ridiculously long time! (over 30 min)

In [None]:
%%time
#### The old get agreement function

import os
import pandas as pd
from pandas import read_csv
from src.common import load_classifiers

def get_agreement(PREDICTPATH,eachtopic,classifierlist):
    """Summarise which classifiers predicted each record to be in ``eachtopic``.

    Older implementation, kept as the timing baseline. It reads one
    ``<eachtopic>_<classifier>.tsv`` file per classifier from PREDICTPATH,
    keeps the rows whose ``prediction`` is ``'in category'`` and assembles the
    result from three groups of ids: exactly one positive row, as many
    positive rows as there are classifiers, and anything in between.

    Args:
        PREDICTPATH: directory that holds the per-classifier prediction files.
        eachtopic: topic name; used as file-name prefix and as the
            ``topicCategory`` value of the partial-agreement rows.
        classifierlist: classifier names (file-name suffixes). ``len()`` and
            ``str()`` are applied to it, so a list is expected.

    Returns:
        DataFrame made of the single-hit, partial and perfect agreement rows,
        in that order. It has the columns ``_id``, ``topicCategory``,
        ``pos_pred_count`` and ``pos_pred_algorithms`` (assuming the files
        carry the columns ``_id``, ``prediction``, ``topicCategory`` and
        ``classifier`` - TODO confirm). ``pos_pred_algorithms`` is a bare
        classifier name for single hits and the ``str()`` of a list otherwise.
    """
    # Empty frames that only fix the expected column names.
    agreement = pd.DataFrame(columns=['_id','topicCategory','pos_pred_count','pos_pred_algorithms'])
    classresult = pd.DataFrame(columns=['_id','prediction','topicCategory','classifier'])
    # Stack the prediction files of all classifiers.
    for eachclass in classifierlist:
        tmpfile = read_csv(os.path.join(PREDICTPATH,eachtopic+"_"+eachclass+".tsv"),delimiter='\t',header=0,index_col=0)
        classresult = pd.concat((classresult,tmpfile),ignore_index=True)
    posresults = classresult.loc[classresult['prediction']=='in category']
    # Number of positive rows per id.
    agreecounts = posresults.groupby('_id').size().reset_index(name='counts')
    # Ids with exactly one positive row: that row's classifier is the only "algorithm".
    no_agree = posresults.loc[posresults['_id'].isin(agreecounts['_id'].loc[agreecounts['counts']==1].tolist())].copy()
    no_agree.rename(columns={'classifier':'pos_pred_algorithms'},inplace=True)
    no_agree['pos_pred_count']=1
    no_agree.drop('prediction',axis=1,inplace=True)
    # Ids whose positive-row count equals the number of classifiers.
    perfect_agree = posresults.loc[posresults['_id'].isin(agreecounts['_id'].loc[agreecounts['counts']==len(classifierlist)].tolist())].copy()
    perfect_agree['pos_pred_count']=len(classifierlist)
    perfect_agree['pos_pred_algorithms']=str(classifierlist)
    perfect_agree.drop(['prediction','classifier'],axis=1,inplace=True)
    # These ids still have several rows each; keep one row per id.
    perfect_agree.drop_duplicates('_id',keep='first',inplace=True)
    # Ids with more than one positive row but fewer than the number of classifiers.
    partialcountids = agreecounts['_id'].loc[((agreecounts['counts']>1)&
                                          (agreecounts['counts']<len(classifierlist)))].tolist()
    tmplist = []
    # One filter over the whole positive-result frame per id.
    for eachid in list(set(partialcountids)):
        tmpdf = posresults.loc[posresults['_id']==eachid]
        tmpdict = {'_id':eachid,'topicCategory':eachtopic,'pos_pred_count':len(tmpdf),
                   'pos_pred_algorithms':str(tmpdf['classifier'].tolist())}
        tmplist.append(tmpdict)
    partial_agree = pd.DataFrame(tmplist)    
    agreement = pd.concat((agreement,no_agree,partial_agree,perfect_agree),ignore_index=True)
    return(agreement)

classifiers = load_classifiers('best')
classifierlist = list(classifiers.keys())
# script_path is defined here so the cell does not rely on a value left behind
# by an earlier cell.
script_path = ''
PREDICTPATH = os.path.join(script_path,'predictions/')
CLINPREDICTPATH = os.path.join(PREDICTPATH,'clinpredict/')
PUBPREDICTPATH = os.path.join(PREDICTPATH,'pubpredict/')
# Timed test run on the publication predictions of a single topic.
agreement = get_agreement(PUBPREDICTPATH,'Mechanism',classifierlist)
print(agreement.tail(n=2))

#### A test run for the Mechanism topic took 54 seconds to run

In [None]:
%%time
#### increasing the efficiency of the get agreement function
import os
import pandas as pd
from pandas import read_csv
from src.common import load_classifiers

def get_agreement(PREDICTPATH,eachtopic,classifierlist):
    """Summarise which classifiers predicted each record to be in ``eachtopic``.

    Reads one ``<eachtopic>_<classifier>.tsv`` file per classifier from
    PREDICTPATH, removes identical rows, keeps the rows whose ``prediction``
    is ``'in category'`` and groups them per record.

    Args:
        PREDICTPATH: directory that holds the per-classifier prediction files.
        eachtopic: topic name used as the file-name prefix.
        classifierlist: iterable of classifier names (file-name suffixes).

    Returns:
        DataFrame with the columns ``_id``, ``topicCategory``,
        ``pos_pred_algorithms`` (list of classifiers with a positive
        prediction) and ``pos_pred_count`` (length of that list). The frame is
        empty, with the same columns, if there are no classifiers or no
        positive predictions.
    """
    result_columns = ['_id','topicCategory','pos_pred_algorithms','pos_pred_count']
    # Read everything first and concatenate once instead of growing a frame
    # inside the loop.
    frames = [
        read_csv(os.path.join(PREDICTPATH,eachtopic+"_"+eachclass+".tsv"),delimiter='\t',header=0,index_col=0)
        for eachclass in classifierlist
    ]
    if not frames:
        return pd.DataFrame(columns=result_columns)
    classresult = pd.concat(frames,ignore_index=True).drop_duplicates(keep='first')
    posresults = classresult.loc[classresult['prediction']=='in category']
    if posresults.empty:
        return pd.DataFrame(columns=result_columns)
    agreement = posresults.groupby(['_id','topicCategory'])['classifier'].apply(list).reset_index(name='pos_pred_algorithms')
    # The count is taken from the frame built above; the previous code read it
    # from `agreecounts`, a name that does not exist in this function.
    agreement['pos_pred_count'] = agreement['pos_pred_algorithms'].str.len()
    return agreement


classifiers = load_classifiers('best')
classifierlist = list(classifiers.keys())
# script_path is defined here so the cell does not rely on a value left behind
# by an earlier cell.
script_path = ''
PREDICTPATH = os.path.join(script_path,'predictions/')
CLINPREDICTPATH = os.path.join(PREDICTPATH,'clinpredict/')
PUBPREDICTPATH = os.path.join(PREDICTPATH,'pubpredict/')
# Timed test run on the publication predictions of a single topic.
agreement = get_agreement(PUBPREDICTPATH,'Mechanism',classifierlist)
print(agreement.tail(n=2))
#### A test run for the Mechanism topic took 5 seconds to run

In [None]:
import os
import pathlib
import requests
import json
import pandas as pd
import pickle
# time.sleep is used in the loop below; import it explicitly instead of hoping
# that the star import exposes it.
import time

from src.common import *
#### MAIN
#script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
RESULTSPATH = os.path.join(script_path,'results/')
MODELPATH = os.path.join(script_path,'models/')
PREDICTPATH = os.path.join(script_path,'predictions/')



WIKIDATAPATH = os.path.join(DATAPATH,'from wikidata/')
# Wikidata class ids; every item that is an instance (P31) of one of them is fetched.
repurposetypes = ['Q12140', 'Q35456', 'Q28885102','Q8386']
headers = {'User-Agent': 'outbreak resource topic classifier bot (https://outbreak.info/; help@outbreak.info)'}
querystart = """
SELECT
  ?item ?itemLabel ?itemAltLabel
  ?value 
WHERE 
{
  ?item wdt:P31 wd:"""
queryend = """.        
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""
repurpose = pd.DataFrame(columns=['wdid','drug_name','name','alias'])
for eachwdid in repurposetypes:
    query = querystart+eachwdid+queryend
    # NOTE(review): `headers` travels inside `params`; whether it ends up as
    # HTTP headers depends on make_request - confirm.
    params = {'format': 'json', 'query': query, 'headers': headers}
    r = make_request(params)
    # make_request appears to return 0 on failure: a successful response is
    # cached as a pickle, a failed one falls back to the cached copy.
    if r != 0:
        data = r.json()
        with open(os.path.join(WIKIDATAPATH,eachwdid+'.pickle'),'wb') as dumpfile:
            pickle.dump(data,dumpfile)
    else:
        # The pickle is a cache written by this cell; only load it from a
        # directory you trust, since unpickling can execute arbitrary code.
        with open(os.path.join(WIKIDATAPATH,eachwdid+'.pickle'),'rb') as loadfile:
            data = pickle.load(loadfile)  
    tmpdf = parse_wikidata(data)
    repurpose = pd.concat((repurpose,tmpdf),ignore_index=True)
    # Pause between queries.
    time.sleep(1)
repurpose.drop_duplicates(keep='first',inplace=True)
