# This is the TopicCategory classifier for outbreak.info publications. 

It has not yet been tested on anything other than publications, because it is trained entirely on publication data (primarily from LitCovid). It contains functions for the following tasks:

1. Retrieve and format LitCovid Topics (must be done frequently)
2. Build Behavioral and offtopic training sets (must be done frequently)
3. Generate Models (should be done rarely, or only on newly introduced topics)
4. Classify non-LitCovid publications using the models (must be done for new publications)
5. Merge formatted LitCovid Topics and Offtopic data with predicted data for inclusion into all publications

In [None]:
#### Update Litcovid topics
# Hands the local data directory to get_litcovid_topics, which is supplied
# by the wildcard import from src.fetch_litcovid_topics.
# NOTE(review): presumably this downloads and stores the LitCovid topic
# table under DATAPATH - confirm against src/fetch_litcovid_topics.py.
import os
import pathlib
from src.fetch_litcovid_topics import *

# The __file__-based lookup is kept for reference only; with an empty
# script_path the directory below resolves relative to the current
# working directory.
# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')

get_litcovid_topics(DATAPATH)

In [None]:
####  Update Offtopics
# Hands the data and results directories to get_other_topics, which is
# supplied by the wildcard import from src.fetch_offtopics.
# NOTE(review): presumably this rebuilds the off-topic training table
# (data/othertopics.tsv is read by the training cells) - confirm against
# src/fetch_offtopics.py.
import os
import pathlib
from src.fetch_offtopics import *

#### MAIN
# The __file__-based lookup is kept for reference only; with an empty
# script_path both directories resolve relative to the current working
# directory.
# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')

get_other_topics(DATAPATH, RESULTSPATH)

In [3]:
#### Run classifier tests
# Reads the LitCovid and off-topic training tables, stacks them into one
# frame and passes that frame to run_test with the 'best' classifier set,
# asking for a report to be exported.
#
# NOTE: the output recorded for this cell covers roughly eleven hours
# (09:53 -> after 21:05) and contains a "TOTAL NO. of ITERATIONS REACHED
# LIMIT" solver warning while testing the Prevention topic, so plan for a
# long run.
import os
import pandas as pd
import pathlib
from src.train_classifier import *

# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')

# read_csv is called through the pandas import of this cell rather than as
# a bare name, so the cell no longer depends on the wildcard import above
# happening to expose `read_csv`.
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Not used again in this cell; kept because notebook variables stay
# available to later cells, and the lookup fails early if the
# 'topicCategory' column is missing.
topiclist = topicsdf['topicCategory'].unique().tolist()

testresultsdf = run_test(
    RESULTSPATH,
    topicsdf,
    classifierset_type='best',
    export_report=True,
)

fetching the abstracts:  2021-06-14 09:53:39.954352
fetching complete:  0:21:01.342979
now testing:  Behavioral Research 2021-06-14 10:14:41.315312
now testing:  Case Descriptions 2021-06-14 11:45:47.309046
now testing:  Clinical 2021-06-14 12:00:02.667610
now testing:  Diagnosis 2021-06-14 12:04:02.873544
now testing:  Environment 2021-06-14 13:50:17.173233
now testing:  Epidemiology 2021-06-14 13:57:53.682438
now testing:  Forecasting 2021-06-14 14:33:18.103157
now testing:  Information Sciences 2021-06-14 14:34:53.918354
now testing:  Mechanism 2021-06-14 14:43:30.381435
now testing:  Prevention 2021-06-14 15:29:57.509145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


now testing:  Risk Factors 2021-06-14 20:51:54.454521
now testing:  Transmission 2021-06-14 20:56:37.818074
now testing:  Treatment 2021-06-14 21:05:34.288737


In [None]:
#### Update all models
# Reads the LitCovid and off-topic training tables, stacks them into one
# frame and passes it, together with the 'best' classifier set, to
# generate_models with the models directory as target.
import os
import pandas as pd
import pathlib
from src.train_classifier import *

# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
MODELPATH = os.path.join(script_path, 'models/')

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')

# read_csv is called through the pandas import of this cell rather than as
# a bare name, so the cell no longer depends on the wildcard import above
# happening to expose `read_csv`.
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Not used again in this cell; kept because notebook variables stay
# available to later cells, and the lookup fails early if the
# 'topicCategory' column is missing.
topiclist = topicsdf['topicCategory'].unique().tolist()

# No topic argument is given here, unlike the single-topic cell.
# NOTE(review): presumably that means "all topics" - confirm against
# generate_models in src/train_classifier.py.
classifiers = load_classifiers('best')
generate_models(MODELPATH, topicsdf, classifiers)

In [None]:
#### Update single topic model
# Same preparation as the "Update all models" cell, but the topic to
# process is typed in interactively and passed to generate_models as a
# fourth argument.
import os
import pandas as pd
import pathlib
from src.train_classifier import *

# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
MODELPATH = os.path.join(script_path, 'models/')

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')

# read_csv is called through the pandas import of this cell rather than as
# a bare name, so the cell no longer depends on the wildcard import above
# happening to expose `read_csv`.
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Distinct values of the 'topicCategory' column; handy to inspect before
# answering the prompt below.
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')

# The answer is forwarded exactly as typed (no trimming or validation).
# NOTE(review): presumably it has to match a 'topicCategory' value such as
# those in topiclist - confirm what generate_models accepts.
topic_to_check = input("enter the topic Category: ")
generate_models(MODELPATH, topicsdf, classifiers, topic_to_check)

In [None]:
#### Update annotations for all pubs
# Reads the LitCovid and off-topic training tables, stacks them into one
# frame and calls load_annotations with newonly=False.
import os
import pathlib

# pandas is imported here explicitly: this cell uses pd.concat and
# read_csv, and previously only worked when `pd` / `read_csv` were already
# in the kernel namespace (from an earlier cell or via the wildcard import
# below).
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers

#### MAIN
# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
MODELPATH = os.path.join(script_path, 'models/')
PREDICTPATH = os.path.join(script_path, 'predictions/')

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Not used again in this cell; kept because notebook variables stay
# available to later cells, and the lookup fails early if the
# 'topicCategory' column is missing.
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')

# NOTE(review): newonly=False presumably makes load_annotations process
# every publication instead of only new ones - confirm against
# src/classify_pubs.py.
load_annotations(
    MODELPATH,
    PREDICTPATH,
    RESULTSPATH,
    topicsdf,
    classifiers,
    newonly=False,
)

In [None]:
#### Update annotations for new pubs only
# Reads the LitCovid and off-topic training tables, stacks them into one
# frame and calls load_annotations without a newonly argument.
import os
import pathlib

# pandas is imported here explicitly: this cell uses pd.concat and
# read_csv, and previously only worked when `pd` / `read_csv` were already
# in the kernel namespace (from an earlier cell or via the wildcard import
# below).
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers

#### MAIN
# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
MODELPATH = os.path.join(script_path, 'models/')
PREDICTPATH = os.path.join(script_path, 'predictions/')

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Not used again in this cell; kept because notebook variables stay
# available to later cells, and the lookup fails early if the
# 'topicCategory' column is missing.
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')

# newonly is left at load_annotations' own default here.
# NOTE(review): presumably that default restricts the run to publications
# without existing predictions - confirm against src/classify_pubs.py.
load_annotations(MODELPATH, PREDICTPATH, RESULTSPATH, topicsdf, classifiers)

In [2]:
#### Update (refresh) the annotations for all publications
# Reads the LitCovid and off-topic training tables, stacks them into one
# frame and calls load_annotations with newonly=False. This is the same
# call, with the same arguments, as the "Update annotations for all pubs"
# cell.
import os
import pathlib

# pandas is imported here explicitly: this cell uses pd.concat and
# read_csv, and previously only worked when `pd` / `read_csv` were already
# in the kernel namespace (from an earlier cell or via the wildcard import
# below).
import pandas as pd

from src.classify_pubs import *
from src.common import load_classifiers

#### MAIN
# script_path = pathlib.Path(__file__).parent.absolute()
script_path = ''
DATAPATH = os.path.join(script_path, 'data/')
RESULTSPATH = os.path.join(script_path, 'results/')
MODELPATH = os.path.join(script_path, 'models/')
PREDICTPATH = os.path.join(script_path, 'predictions/')

littopicsfile = os.path.join(DATAPATH, 'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH, 'othertopics.tsv')
littopicsdf = pd.read_csv(littopicsfile, delimiter='\t', header=0, index_col=0)
offtopicsdf = pd.read_csv(offtopicsfile, delimiter='\t', header=0, index_col=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
topicsdf = pd.concat((littopicsdf, offtopicsdf), ignore_index=True)

# Not used again in this cell; kept because notebook variables stay
# available to later cells, and the lookup fails early if the
# 'topicCategory' column is missing.
topiclist = topicsdf['topicCategory'].unique().tolist()

classifiers = load_classifiers('best')

# NOTE(review): newonly=False presumably makes load_annotations process
# every publication instead of only new ones - confirm against
# src/classify_pubs.py.
load_annotations(
    MODELPATH,
    PREDICTPATH,
    RESULTSPATH,
    topicsdf,
    classifiers,
    newonly=False,
)