# Outbreak resource litcovid and preprint matcher

This code pings the outbreak.info api to pull an updated list of ids, compares the ids with files containing previously run ids and identifies the newly updated ids. For the newly updated ids, it pings the api to pull the relevant metadata so that a similarity test can be run for the new ids.

**Requirements**
This code was written in python 3.6 and uses the following libraries:
* requests
* pickle
* json
* pandas
* nltk
* string
* datetime

**Limitations**
This code does not account for publications hosted in Zenodo, Dataverse, Figshare, or any other general repository, as the relationship between publications hosted on those sites and litcovid publications cannot be automatically determined.  This code is only for linking preprints in biorxiv and medrxiv to litcovid. Note that it currently does not accommodate preprint rxivs outside of biorxiv and medrxiv as the parsers for those preprints have yet to be written.

**Assumptions**
To minimize manual review, the thresholds have been set fairly high, so precision is expected to be high but sensitivity is expected to be low. An initial run has already been performed and all the relevant data has been saved. This data is included in the repo as detailed below.

**File structure**
Previous results are 'cached' (i.e., saved and updated), so that recalculations are not required and time isn't wasted re-running them.
Files may be named by type of meta compared (either 'text' or 'auth' (author)), and source (either 'litcovid' or 'preprint')

**file paths**:
* 'results/archives/' - stores precomputed files from previous runs and lists of identifiers in previous runs
* 'results/temp/' - temporarily stores the type-specific successful matches in a run
* 'results/to review/' - stores the results of the matching that require manual review
* 'results/update dumps/' - stores the dataframe of updates to make based on sorted matches in this run

**Pre-existing files**
* 'results/archives/all_`source`_dict.txt' - a pickled dictionary mapping the date of the run to the list of identifiers that have already been run (where `source` is either litcovid or preprint)
* 'results/archives/`compare_type`_`source`_set.txt' - a pickled pandas dataframe containing preprocessed text for comparison. The `source` is again either litcovid or preprints, while the `compare_type` is either auth (author), or text (title and abstract)
* 'results/temp/`compare_type`_above_threshold.txt' - a tab-delimited text file containing all matches based on the `compare_type` (either auth or text) where the similarity was found to be above the minimum threshold. These files are merged to identify match candidates
* 'results/to review/low_scores.txt' - a tab-delimited pandas dump for matches where the sum score was below the threshold for acceptance
* 'results/to review/manual_check.txt' - a tab-delimited pandas dump for matches where a litcovid item matched with more than one preprint or vice versa
* 'results/archives/clean_results.txt' - a tab-delimited pandas dump for matches which do not need further screening. This file is processed for creating the update dump
* 'results/update dumps/update_file.tsv' - a tab-delimited pandas dump for matches which do not need further screening and have been formatted with the appropriate fields for importing into outbreak.info resource metadata

Note that the script has been broken up into parts for ease of re-running and troubleshooting. It also uses topicCategories generated by the topic classifier, and date information to limit the number of comparisons to be made

In [None]:
import os
import requests
import pickle
import json
import pandas as pd
from pandas import read_csv
import nltk
import string
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from datetime import datetime,timedelta
import pathlib


#### Set paths
#scriptpath = pathlib.Path(__file__).parent.absolute()
#try:
#    generalpath = pathlib.Path(__file__).parents[1].absolute()
#except:
#    generalpath = pathlib.Path(__file__).resolve().parents[1].absolute()
# Base directory for every path below. It is left empty, so all paths are
# relative to the current working directory.
scriptpath = ''
# Root folder for everything the functions below read and write.
RESULTSPATH = os.path.join(scriptpath,'results/')
# Archived id dictionaries, preprocessed '<type>_<source>_set.txt' frames and 'clean_results.txt'.
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
# Per-run scratch files: new id dictionaries, cleaned metadata and '*_above_threshold.txt'.
TEMPPATH = os.path.join(RESULTSPATH,'temp/')
# Update files ('*update_file.tsv' / '.json') generated from the accepted matches.
OUTPUTPATH = os.path.join(RESULTSPATH,'update dumps/')
# Matches set aside for manual review ('manual_check.txt', 'low_scores.txt').
REVIEWPATH = os.path.join(RESULTSPATH,'to review/')
# NOTE(review): presumably the location of the topic classifier results; it is
# empty here and not used by any function in this section - confirm.
TOPICPATH = ''
#TOPICPATH = os.path.join(generalpath,'topic_classifier/results/')

## Functions to fetch all ids and metadata from a source

In [None]:
#### Get the size of the source (to make it easy to figure out when to stop scrolling)
def fetch_src_size(source):
    """Return the number of records outbreak.info reports for ``source``.

    The API is queried with ``size=0`` and an ``@type`` aggregation, so only
    the facet total is read from the response and no records are downloaded.

    Parameters:
        source: value matched against ``curatedBy.name`` (e.g. "litcovid").

    Returns:
        The ``facets.@type.total`` value of the response as an ``int``.
    """
    query_url = ("https://api.outbreak.info/resources/query?q=curatedBy.name:"
                 + source
                 + "&size=0&aggs=@type")
    api_reply = requests.get(query_url)
    payload = json.loads(api_reply.text)
    type_facet = payload["facets"]["@type"]
    return int(type_facet["total"])

#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    """Return the unique ``_id`` values found in an API response.

    Parameters:
        jsonfile: decoded JSON response; must contain a ``"hits"`` list whose
            items each carry an ``"_id"`` key.

    Returns:
        A list of ids in order of first appearance, without duplicates.

    Raises:
        KeyError: if ``"hits"`` (or an item's ``"_id"``) is missing.
    """
    # A companion set keeps the duplicate check O(1) per hit; testing
    # membership on the growing list made the loop quadratic.
    seen = set()
    idlist = []
    for eachhit in jsonfile["hits"]:
        hit_id = eachhit["_id"]
        if hit_id not in seen:
            seen.add(hit_id)
            idlist.append(hit_id)
    return idlist

#### Ping the API and get all the ids for a specific source and scroll through the source until number of ids matches meta
def get_source_ids(source):
    """Collect every ``_id`` that outbreak.info holds for ``source``.

    The first request starts a ``fetch_all`` scroll; subsequent requests follow
    the scroll id until as many ids as ``fetch_src_size`` reported have been
    gathered, the API stops returning usable pages, or a page adds nothing new.

    Parameters:
        source: value matched against ``curatedBy.name`` (e.g. "bioRxiv").

    Returns:
        The ids gathered so far: a ``list`` if no scroll page was merged,
        otherwise a ``set`` (same as the original behaviour).
    """
    source_size = fetch_src_size(source)
    base_url = ("https://api.outbreak.info/resources/query?q=curatedBy.name:"
                + source + "&fields=_id&fetch_all=true")
    r = requests.get(base_url)
    response = json.loads(r.text)
    idlist = get_ids_from_json(response)
    try:
        scroll_id = response["_scroll_id"]
        while len(idlist) < source_size:
            r2 = requests.get(base_url + "&scroll_id=" + scroll_id)
            response2 = json.loads(r2.text)
            merged = set(idlist).union(get_ids_from_json(response2))
            made_progress = len(merged) > len(idlist)
            idlist = merged
            try:
                scroll_id = response2["_scroll_id"]
            except KeyError:
                print("no new scroll id")
            if not made_progress:
                # A page that adds nothing would otherwise be requested again
                # and again because the expected total can never be reached.
                break
    except (KeyError, ValueError, requests.RequestException):
        # Best effort, as before: a response without "hits"/"_scroll_id"
        # (end of the scroll), undecodable JSON or a network failure ends the
        # collection and whatever was gathered is returned. Unlike the bare
        # ``except`` this no longer swallows KeyboardInterrupt/SystemExit.
        pass
    return idlist

#### Pull ids from the major publication sources (litcovid, medrxiv,biorxiv)
def get_pub_ids():
    """Fetch the current ids for the preprint servers and for litcovid.

    Returns:
        A tuple ``(preprint_dict, litcovid_dict)``. Each dictionary has a
        single key, today's date formatted as ``YYYY-MM-DD``, mapped to the
        ids fetched for that group. The preprint ids are the union of the
        medRxiv and bioRxiv ids.
    """
    run_stamp = datetime.now().strftime('%Y-%m-%d')
    # Sources are queried in the same order as before: bioRxiv, medRxiv, litcovid.
    fetched = {name: get_source_ids(name) for name in ("bioRxiv", "medRxiv", "litcovid")}
    preprint_idlist = list(set(fetched["medRxiv"]).union(set(fetched["bioRxiv"])))
    return ({run_stamp: preprint_idlist}, {run_stamp: fetched["litcovid"]})


def get_date(datedict):
    """Return the date encoded in the keys of an id dictionary.

    Parameters:
        datedict: dictionary whose keys are ``YYYY-MM-DD`` strings.

    Returns:
        The ``datetime`` parsed from the last key (in iteration order). Every
        key is parsed, so a malformed key is reported even if it is not last.

    Raises:
        ValueError: if a key is not a ``YYYY-MM-DD`` date, or if the
            dictionary is empty (this previously surfaced as an
            UnboundLocalError).
    """
    dict_date = None
    for eachdate in datedict:
        dict_date = datetime.strptime(eachdate, '%Y-%m-%d')
    if dict_date is None:
        raise ValueError("cannot read a run date from an empty dictionary")
    return dict_date

#### Load the previously saved id lists, and compare the two to identify only the new ids
def _store_new_ids(current_dict, previous_dict, source, ARCHIVEPATH, TEMPPATH):
    """Save the ids of ``current_dict`` that are absent from ``previous_dict``."""
    previous_date = get_date(previous_dict)
    current_date = get_date(current_dict)
    # NOTE(review): both dates are parsed from day-only strings, so this skips
    # runs made on consecutive days (difference of exactly one day) - confirm
    # that this is intended.
    if (current_date - previous_date) <= timedelta(days=1):
        return
    previous_key = datetime.strftime(previous_date, '%Y-%m-%d')
    current_key = datetime.strftime(current_date, '%Y-%m-%d')
    # Set lookup avoids scanning the whole archived list for every new id.
    already_run = set(previous_dict[previous_key])
    new_ids = [x for x in current_dict[current_key] if x not in already_run]
    with open(os.path.join(TEMPPATH, "new_" + source + "_dict.txt"), "wb") as dumpfile:
        pickle.dump({current_key: new_ids}, dumpfile)
    with open(os.path.join(ARCHIVEPATH, "all_" + source + "_dict.txt"), "wb") as dumpfile:
        pickle.dump(current_dict, dumpfile)


def remove_old_ids(preprint_dict,litcovid_dict,ARCHIVEPATH,TEMPPATH):
    """Compare freshly fetched ids with the archived ones and save the new ids.

    For each source (preprint, litcovid) whose fetch date is more than one day
    after the archived date, the ids not present in the archive are written to
    ``TEMPPATH/new_<source>_dict.txt`` and the archive
    ``ARCHIVEPATH/all_<source>_dict.txt`` is replaced by the fetched dictionary.

    Parameters:
        preprint_dict: ``{date string: preprint ids}`` from the current fetch.
        litcovid_dict: ``{date string: litcovid ids}`` from the current fetch.
        ARCHIVEPATH: folder holding the ``all_<source>_dict.txt`` pickles.
        TEMPPATH: folder receiving the ``new_<source>_dict.txt`` pickles.
    """
    # Both archives are loaded before anything is written, as before; the
    # ``with`` blocks make sure the file handles are closed.
    with open(os.path.join(ARCHIVEPATH,"all_preprint_dict.txt"), "rb") as infile:
        preprint_run = pickle.load(infile)
    with open(os.path.join(ARCHIVEPATH,"all_litcovid_dict.txt"), "rb") as infile:
        litcovid_run = pickle.load(infile)
    _store_new_ids(preprint_dict, preprint_run, "preprint", ARCHIVEPATH, TEMPPATH)
    _store_new_ids(litcovid_dict, litcovid_run, "litcovid", ARCHIVEPATH, TEMPPATH)
            
            
def check_id_update_status(TEMPPATH):
    """Report whether the saved new-id dictionaries were produced today.

    Parameters:
        TEMPPATH: folder containing ``new_preprint_dict.txt`` and
            ``new_litcovid_dict.txt``.

    Returns:
        ``{'preprint_updated': bool, 'litcovid_updated': bool}``; a flag is
        True when the date stored in the matching file is less than one day
        before the current time.
    """
    today = datetime.now()
    run_dict = {'preprint_updated':False,'litcovid_updated':False}
    for source in ('preprint', 'litcovid'):
        # ``with`` closes the handle that ``pickle.load(open(...))`` leaked.
        with open(os.path.join(TEMPPATH,"new_"+source+"_dict.txt"), "rb") as infile:
            last_run = pickle.load(infile)
        if (today - get_date(last_run)) < timedelta(days = 1):
            run_dict[source+'_updated'] = True
    return run_dict


def run_id_update(ARCHIVEPATH,TEMPPATH):
    """Refresh the id lists unless both were already refreshed today.

    Parameters:
        ARCHIVEPATH: folder holding the archived ``all_<source>_dict.txt`` files.
        TEMPPATH: folder holding the ``new_<source>_dict.txt`` files.

    Returns:
        The status dictionary from ``check_id_update_status``, re-evaluated
        after the refresh when one was needed.
    """
    run_dict = check_id_update_status(TEMPPATH)
    if False in run_dict.values():
        all_preprint_dict,all_litcovid_dict = get_pub_ids()
        remove_old_ids(all_preprint_dict,all_litcovid_dict,ARCHIVEPATH,TEMPPATH)
        # The new-id files are written to TEMPPATH, so the status has to be
        # re-read from there (it was previously looked up in ARCHIVEPATH).
        run_dict = check_id_update_status(TEMPPATH)
    return run_dict

In [None]:
def load_new_ids(TEMPPATH):
    """Load the new preprint and litcovid ids saved by the id update step.

    Parameters:
        TEMPPATH: folder containing ``new_preprint_dict.txt`` and
            ``new_litcovid_dict.txt`` (pickled ``{date string: ids}`` dicts).

    Returns:
        A tuple ``(new_preprint_ids, new_litcovid_ids)`` holding the value
        stored under the first key of each dictionary.
    """
    loaded_ids = []
    for source in ('preprint', 'litcovid'):
        # ``with`` closes the handle that ``pickle.load(open(...))`` leaked.
        with open(os.path.join(TEMPPATH,"new_"+source+"_dict.txt"), "rb") as infile:
            new_dict = pickle.load(infile)
        datekey = list(new_dict.keys())[0]
        loaded_ids.append(new_dict[datekey])
    new_preprint_ids, new_litcovid_ids = loaded_ids
    return (new_preprint_ids, new_litcovid_ids)

In [None]:
#### Get the metadata for each list
#### Note, I've tried batches of 1000, and the post request has failed, so this uses a batch size that's less likely to fail
def _fetch_clean_fields(sample_ids, fields, columns):
    """Post one batch of ids and return the exact-match rows, or None on failure."""
    from io import StringIO  # local import: only needed to wrap the response text
    r = requests.post("https://api.outbreak.info/resources/query/",
                      params = {'q': sample_ids, 'scopes': '_id', 'fields': fields})
    if r.status_code != 200:
        return None
    # Passing literal JSON text to read_json is deprecated in recent pandas;
    # a file-like wrapper is accepted by old and new versions alike.
    rawresult = pd.read_json(StringIO(r.text))
    cleanresult = rawresult[columns].loc[rawresult['_score']==1].copy()
    cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
    return cleanresult


def batch_fetch_meta(idlist, batch_size=100):
    """Fetch text and author metadata for ``idlist`` in small batches.

    The ids are posted ``batch_size`` at a time (larger batches were observed
    to fail). Each batch is requested twice: once for name/abstract/date and
    once for author/date. Batches whose request does not return HTTP 200 are
    skipped.

    Parameters:
        idlist: ids to look up.
        batch_size: number of ids per request (default 100, as before).

    Returns:
        A tuple ``(textdf, authdf)`` of DataFrames with the columns
        ``_id, abstract, name, date`` and ``_id, author, date``. Both are empty
        when ``idlist`` is empty (no request is sent in that case).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    idlist = list(idlist)
    separator = ','
    ## Create dummy dataframe to store the meta data
    textdf = pd.DataFrame(columns = ['_id','abstract','name','date'])
    authdf = pd.DataFrame(columns = ['_id','author','date'])
    # Stepping through the list by slice start visits every id exactly once.
    # The previous round()-based run count fetched lists of 51-99 ids twice
    # and sent an empty trailing batch for sizes such as 150.
    for start in range(0, len(idlist), batch_size):
        sample_ids = separator.join(idlist[start:start+batch_size])
        ## Get the text-based metadata (abstract, title) and save it
        text_result = _fetch_clean_fields(sample_ids, 'name,abstract,date',
                                          ['_id','name','abstract','date'])
        if text_result is not None:
            textdf = pd.concat((textdf,text_result),ignore_index=True)
        ## Get the author metadata and save it
        auth_result = _fetch_clean_fields(sample_ids, 'author,date',
                                          ['_id','author','date'])
        if auth_result is not None:
            authdf = pd.concat((authdf,auth_result),ignore_index=True)
    return (textdf, authdf)
        

## Functions for cleaning up metadata for new entries prior to running comparisons

In [None]:
## reduce camelcase differences by lower casing everything, deal with punctuation oddities, remove stopwords and tokenize
def text2word_tokens(section_text):
    """Turn a piece of text into a list of comparable word tokens.

    The text is lower-cased, stripped of every ``string.punctuation``
    character and tokenized with nltk; tokens found in the module-level
    ``stopwords`` collection are dropped.

    Parameters:
        section_text: the string to tokenize.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = section_text.lower().translate(punctuation_table)
    tokens = nltk.tokenize.word_tokenize(normalized)
    return list(filter(lambda token: token not in stopwords, tokens))

## Pull the ids from a dataframe
def get_ids_from_df(rawdf_set):
    """Return the distinct values of the ``_id`` column as a list.

    Parameters:
        rawdf_set: DataFrame with an ``_id`` column.

    Returns:
        The unique ids in order of first appearance.
    """
    id_column = rawdf_set['_id']
    distinct_ids = id_column.unique()
    return distinct_ids.tolist()

## merge title and abstract and create bag of words, remove entries missing abstract (can't be compared)
def remove_text_na(rawdf):
    """Build the bag of words for every entry that has a title and an abstract.

    Parameters:
        rawdf: DataFrame with ``name`` and ``abstract`` columns. A ``text``
            column (``name | abstract``) is added to it in place, as before.

    Returns:
        A copy of the rows whose abstract and merged text are present, with an
        extra ``words`` column holding the tokens of ``text``.
    """
    rawdf['text'] = rawdf['name'].str.cat(rawdf['abstract'], sep=" | ")
    has_text = rawdf['abstract'].notna() & rawdf['text'].notna()
    rawdf_set = rawdf.loc[has_text].copy()
    # Applying over the column (rather than row-wise over the frame) also works
    # when no row survives the filter; the row-wise form returned a DataFrame
    # for an empty frame and the column assignment then failed.
    rawdf_set['words'] = rawdf_set['text'].apply(text2word_tokens)
    return rawdf_set

## create bag of words from author and remove entries missing authors (can't be compared)
def remove_auth_na(rawdf,textset_ids):
    """Build the bag of words for the authors of entries that kept their text.

    Parameters:
        rawdf: DataFrame with ``_id`` and ``author`` columns.
        textset_ids: ids that passed the text filter; other rows are dropped.

    Returns:
        A copy of the rows that have an author value and an id in
        ``textset_ids``. ``author`` is converted to its string form and a
        ``words`` column holds the tokens of that string.
    """
    keep = rawdf['author'].notna() & rawdf['_id'].isin(textset_ids)
    rawdf_set = rawdf.loc[keep].copy()
    rawdf_set['author'] = rawdf_set['author'].astype(str)
    # Column-wise apply also works when no row survives the filter; the
    # row-wise form returned a DataFrame for an empty frame and the column
    # assignment then failed.
    rawdf_set['words'] = rawdf_set['author'].apply(text2word_tokens)
    return rawdf_set

## run the cleaning functions above on a given text dataframe, author dataframe, and source (preprint or litcovid)
def clean_source_data(textdf,authdf,source):
    """Run the text and author cleaning steps for one source.

    Parameters:
        textdf: DataFrame with the title/abstract metadata.
        authdf: DataFrame with the author metadata.
        source: 'preprint' or 'litcovid'; accepted for interface
            compatibility, it does not influence the result.

    Returns:
        A tuple ``(textdf_set, authdf_set)``; the author frame only keeps
        entries whose id is also present in the cleaned text frame.
    """
    textdf_set = remove_text_na(textdf)
    textdf_ids = get_ids_from_df(textdf_set)
    authdf_set = remove_auth_na(authdf,textdf_ids)
    # The ids of the author frame used to be computed here as well, but the
    # value was never used.
    return (textdf_set, authdf_set)

## Remove previous successful matches from old metadata prior to running comparisons
def remove_matched_values(source,dftype,ARCHIVEPATH):
    """Load an archived metadata frame without the already matched entries.

    Parameters:
        source: 'litcovid' or 'preprint'; selects both the column of
            ``clean_results.txt`` to read and the archive file.
        dftype: 'text' or 'auth'; selects the archive file.
        ARCHIVEPATH: folder holding ``clean_results.txt`` and the pickled
            ``<dftype>_<source>_set.txt`` frames.

    Returns:
        The archived frame restricted to rows whose ``_id`` does not appear in
        the ``source`` column of ``clean_results.txt``.
    """
    results_file = os.path.join(ARCHIVEPATH,'clean_results.txt')
    accepted = read_csv(results_file,delimiter='\t',header=0,index_col=0)
    already_matched = accepted[source].unique().tolist()
    archive_file = os.path.join(ARCHIVEPATH, dftype+"_"+source+"_set.txt")
    with open(archive_file, "rb") as archive_handle:
        archived = pickle.load(archive_handle)
    still_unmatched = ~archived['_id'].isin(already_matched)
    return archived.loc[still_unmatched]


## Blank out the previous temp files 
def blank_temps(TEMPPATH):
    """Reset both threshold files so that they only contain the header row.

    Parameters:
        TEMPPATH: folder holding ``auth_above_threshold.txt`` and
            ``text_above_threshold.txt``; both files are overwritten.
    """
    header_row = '\tlitcovid\tpreprint\tj_sim\n'
    for temp_name in ('auth_above_threshold.txt', 'text_above_threshold.txt'):
        temp_file = os.path.join(TEMPPATH, temp_name)
        with open(temp_file, 'w') as handle:
            handle.write(header_row)

In [None]:
def _refresh_clean_meta(new_ids, text_file, auth_file, source, TEMPPATH):
    """Re-fetch and re-clean the metadata of ``new_ids`` unless the cache covers them."""
    if not new_ids:
        # Nothing new to fetch; this also avoids dividing by zero below.
        return
    with open(os.path.join(TEMPPATH, text_file), "rb") as infile:
        cached_text = pickle.load(infile)
    cached_ids = set(get_ids_from_df(cached_text))
    overlap = len(set(new_ids).intersection(cached_ids))/len(new_ids)
    if overlap >= 0.66:
        # Most of the new ids were already cleaned in an earlier run.
        return
    new_textdf,new_authdf = batch_fetch_meta(new_ids)
    clean_text,clean_auth = clean_source_data(new_textdf,new_authdf,source)
    with open(os.path.join(TEMPPATH, text_file), "wb") as dmpfile:
        pickle.dump(clean_text, dmpfile)
    with open(os.path.join(TEMPPATH, auth_file), "wb") as dmpfile:
        pickle.dump(clean_auth, dmpfile)


def update_meta(ARCHIVEPATH,TEMPPATH):
    """Update the id lists and refresh the cleaned metadata of the new ids.

    The metadata is only processed when both id lists are up to date. For each
    source, the cleaned frames in TEMPPATH are regenerated when fewer than 66%
    of the new ids are already present in the cached text frame.

    Parameters:
        ARCHIVEPATH: folder holding the archived id dictionaries.
        TEMPPATH: folder holding the new id dictionaries and the
            ``clean_rxiv_*`` / ``clean_lit_*`` pickles.
    """
    run_dict = run_id_update(ARCHIVEPATH,TEMPPATH)
    if False in run_dict.values():
        return
    new_preprint_ids,new_litcovid_ids = load_new_ids(TEMPPATH)
    _refresh_clean_meta(new_preprint_ids, "clean_rxiv_text.txt", "clean_rxiv_auth.txt",
                        'preprint', TEMPPATH)
    _refresh_clean_meta(new_litcovid_ids, "clean_lit_text.txt", "clean_lit_auth.txt",
                        'litcovid', TEMPPATH)

In [None]:
#### Functions for updating the save files

## Function to update the bag of words dataframes
def update_precompute(clean_df_set,ARCHIVEPATH):
    """Merge freshly cleaned metadata into the matching archived frame.

    The target file ``<type>_<source>_set.txt`` is derived from the data:
    the source is 'litcovid' when the first ``_id`` contains 'pmid' (otherwise
    'preprint'), and the type is 'auth' when an ``author`` column is present
    (otherwise 'text').

    Parameters:
        clean_df_set: cleaned DataFrame with an ``_id`` column; an empty frame
            leaves the archive untouched.
        ARCHIVEPATH: folder holding the pickled archive frames.
    """
    if len(clean_df_set) == 0:
        # Nothing to merge, and the source cannot be derived without a first id.
        return
    df_source = "litcovid" if 'pmid' in clean_df_set['_id'].iloc[0] else "preprint"
    df_type = 'auth' if 'author' in clean_df_set.columns else 'text'
    archive_file = os.path.join(ARCHIVEPATH, df_type+"_"+df_source+"_set.txt")
    with open(archive_file, "rb") as tmpfile:
        old_info = pickle.load(tmpfile)
    updated_info = pd.concat((old_info,clean_df_set),ignore_index=True)
    # De-duplicate on the identifier: comparing whole rows fails because the
    # list-valued 'words' column is unhashable. keep='last' lets the newly
    # cleaned entry replace the archived one.
    updated_info.drop_duplicates(subset='_id',keep='last',inplace=True)
    with open(archive_file, "wb") as dmpfile:
        pickle.dump(updated_info, dmpfile)

In [None]:
## load the metadata for comparison
def load_new_meta(TEMPPATH):
    """Load the cleaned metadata frames of the current run.

    Parameters:
        TEMPPATH: folder holding the ``clean_rxiv_*`` and ``clean_lit_*`` pickles.

    Returns:
        A tuple ``(clean_rxiv_text, clean_rxiv_auth, clean_lit_text,
        clean_lit_auth)``.
    """
    filenames = ("clean_rxiv_text.txt", "clean_rxiv_auth.txt",
                 "clean_lit_text.txt", "clean_lit_auth.txt")
    loaded = []
    for filename in filenames:
        # ``with`` closes the handle that ``pickle.load(open(...))`` leaked.
        with open(os.path.join(TEMPPATH, filename), "rb") as infile:
            loaded.append(pickle.load(infile))
    return tuple(loaded)

def load_previous_runs(ARCHIVEPATH):
    """Load the archived metadata frames without already matched entries.

    Parameters:
        ARCHIVEPATH: folder passed on to ``remove_matched_values``.

    Returns:
        A tuple ``(old_rxiv_text, old_rxiv_auth, old_lit_text, old_lit_auth)``.
    """
    ## Same load order as the returned tuple: preprint text/auth, then litcovid text/auth
    return tuple(
        remove_matched_values(source_name, meta_type, ARCHIVEPATH)
        for source_name in ('preprint', 'litcovid')
        for meta_type in ('text', 'auth')
    )

#### Check if 2/3 of the new ids are already in the old ids. If so, then there's no need to do the comparison
#### Ideally, we'd want to see if 100% of new ids are in the old ids, but if an id was dropped due to lack of
#### text or any other error, this may trigger an unnecessary re-run, so it's relaxed to 2/3 
def check_b4_compare(newdf,olddf):
    """Decide whether ``newdf`` holds enough unseen ids to warrant a comparison.

    Parameters:
        newdf: DataFrame with the ``_id`` values of the new data.
        olddf: DataFrame with the ``_id`` values of the archived data.

    Returns:
        False when more than 67% of the new ids are already in ``olddf`` (or
        when ``newdf`` has no ids at all), True otherwise.
    """
    newids = set(newdf['_id'].unique().tolist())
    oldids = set(olddf['_id'].unique().tolist())
    if not newids:
        # No new ids means no new data; this also avoids dividing by zero.
        return False
    incommon = oldids.intersection(newids)
    ## newdata is False when most of the new ids are already in the archived data
    newdata = not (len(incommon)/len(newids) > 0.67)
    return newdata

## Functions to subset the data

In [None]:
#### Generate id_lists by topicCategory  
def generate_comparison_dfs(topicdf,litcoviddf,preprintdf,topicCategory):
    """Subset the litcovid and preprint frames for one topic category.

    Parameters:
        topicdf: DataFrame with ``_id`` and ``topicCategory`` columns.
        litcoviddf: litcovid DataFrame with an ``_id`` column.
        preprintdf: preprint DataFrame with an ``_id`` column.
        topicCategory: category to select, or None to select only the entries
            that do not appear in ``topicdf`` at all.

    Returns:
        A tuple ``(preprint_topicdf, litcovid_topicdf)``. For a category, each
        frame keeps the entries of that category plus the entries that do not
        appear in ``topicdf``.
    """
    alltopicids = topicdf['_id'].unique().tolist()
    # ``is None`` is the identity test PEP 8 asks for; ``== None`` depends on
    # the argument's own ``__eq__``.
    if topicCategory is None:
        preprint_topicdf = preprintdf.loc[~preprintdf['_id'].isin(alltopicids)]
        litcovid_topicdf = litcoviddf.loc[~litcoviddf['_id'].isin(alltopicids)]
    else:
        idlist = topicdf['_id'].loc[topicdf['topicCategory']==topicCategory].unique().tolist()
        preprint_topicdf = preprintdf.loc[((preprintdf['_id'].isin(idlist))|
                                          (~preprintdf['_id'].isin(alltopicids)))]
        litcovid_topicdf = litcoviddf.loc[((litcoviddf['_id'].isin(idlist))|
                                          (~litcoviddf['_id'].isin(alltopicids)))]
    return (preprint_topicdf, litcovid_topicdf)

## Functions to perform comparisons

In [None]:
def calculate_jsim(sample_set1,sample_set2,thresholds,set_type):
    """Return the Jaccard similarity of two token collections if it is high enough.

    Parameters:
        sample_set1: first collection of tokens.
        sample_set2: second collection of tokens.
        thresholds: dictionary of minimum similarities keyed by comparison type.
        set_type: key of ``thresholds`` to apply (e.g. 'text' or 'auth').

    Returns:
        The similarity (1 - Jaccard distance) when it is strictly greater than
        ``thresholds[set_type]``, otherwise -1. Two empty collections also
        yield -1.
    """
    tokens1 = set(sample_set1)
    tokens2 = set(sample_set2)
    if not tokens1 and not tokens2:
        # The Jaccard distance divides by the size of the union, which is zero
        # here; two empty token sets carry no evidence of a match.
        return -1
    j_sim = 1 - nltk.jaccard_distance(tokens1, tokens2)
    if j_sim > thresholds[set_type]:
        return j_sim
    return -1
    

## The comparison function re-written with pandas iterrows AND lambda apply in hopes of speeding it up even more
def run_comparison(preprint_set,litcovid_set,set_type,thresholds,TEMPPATH):
    """Compare every litcovid entry with the preprints dated on or before it.

    Pairs whose similarity passes ``calculate_jsim`` are appended (without a
    header row) to ``TEMPPATH/<set_type>_above_threshold.txt`` as tab-delimited
    rows of index, litcovid id, preprint id and similarity.

    Parameters:
        preprint_set: DataFrame with ``_id``, ``date`` and ``words`` columns.
        litcovid_set: DataFrame with ``_id``, ``date`` and ``words`` columns.
        set_type: comparison type ('text' or 'auth'); selects the threshold
            and the output file.
        thresholds: dictionary of minimum similarities keyed by comparison type.
        TEMPPATH: folder holding the threshold files.
    """
    filename = set_type+"_above_threshold.txt"
    # Matching rows are collected and concatenated once at the end; growing a
    # DataFrame inside the loop copies all previous matches on every hit.
    match_frames = []
    for _, lit_row in litcovid_set.iterrows():
        litcovidwords = set(lit_row['words'])
        # Only preprints dated on or before the litcovid entry are candidates.
        preprint_subset = preprint_set.loc[preprint_set['date']<=lit_row['date']].copy()
        if len(preprint_subset) == 0:
            continue
        preprint_subset.rename(columns={'_id':'preprint'},inplace=True)
        preprint_subset['j_sim'] = preprint_subset.apply(
            lambda x: calculate_jsim(litcovidwords,set(x['words']),thresholds,set_type),axis=1)
        preprint_subset['litcovid'] = lit_row['_id']
        clean = preprint_subset[['litcovid','preprint','j_sim']].loc[preprint_subset['j_sim']!=-1].copy()
        if len(clean)>0:
            match_frames.append(clean)
    if match_frames:
        matches = pd.concat(match_frames,ignore_index=True)
    else:
        matches = pd.DataFrame(columns=['litcovid','preprint','j_sim'])
    matches.to_csv(os.path.join(TEMPPATH,filename),mode="a",sep='\t',header=False)
    
    
## Merge the author and text matches that meet threshold, calculate sum score, and sort results
def sort_matches(new_text_matches,new_auth_matches,threshold):
    """Combine text and author matches and split them by review need.

    Both input frames have their ``j_sim`` column renamed in place (to
    ``j_sim_text`` and ``j_sim_author``) and are inner-merged on the
    litcovid/preprint pair. The merged rows get a ``sum_score`` (sum of both
    similarities) and a ``date`` (today, ``YYYY-MM-DD``).

    Parameters:
        new_text_matches: DataFrame with ``litcovid``, ``preprint``, ``j_sim``.
        new_auth_matches: DataFrame with ``litcovid``, ``preprint``, ``j_sim``.
        threshold: dictionary providing the minimum sum score under 'sum_min'.

    Returns:
        A tuple ``(clean_matches, lowscores, manual_check)``: pairs that are
        unambiguous and reach the minimum score, pairs below the minimum
        score, and the ambiguous pairs followed by the low-scoring pairs.
    """
    def _ids_with_several_matches(frame, column):
        ## ids of ``column`` that occur in more than one matched pair
        pair_counts = frame.groupby(column).size()
        return pair_counts[pair_counts > 1].index.tolist()

    new_text_matches.rename(columns={'j_sim':'j_sim_text'},inplace=True)
    new_auth_matches.rename(columns={'j_sim':'j_sim_author'},inplace=True)
    merged = new_text_matches.merge(new_auth_matches,on=['litcovid','preprint'],how='inner')
    merged['sum_score'] = merged['j_sim_text']+merged['j_sim_author']
    merged['date'] = datetime.now().strftime('%Y-%m-%d')

    ## Set duplicates aside for manual checking
    dup_preprints = _ids_with_several_matches(merged,'preprint') ## does a preprint map to more than one pmid?
    dup_pmids = _ids_with_several_matches(merged,'litcovid') ## does a pmid map to more than one preprint?
    is_ambiguous = merged['litcovid'].isin(dup_pmids) | merged['preprint'].isin(dup_preprints)
    duplicates = merged.loc[is_ambiguous]

    ## Set low scores aside for manual checking
    lowscores = merged.loc[merged['sum_score']<threshold['sum_min']]

    ## Save the clean matches for auto updating
    reaches_minimum = merged['sum_score']>=threshold['sum_min']
    clean_matches = merged.loc[~is_ambiguous & reaches_minimum]

    manual_check = pd.concat((duplicates,lowscores),ignore_index=True)
    return (clean_matches, lowscores, manual_check)


## Functions for cleaning up the results and caching processed data

In [None]:
## Format the results for easier updating in biothings
def convert_txt_dumps(txtdump):
    """Convert a table of corrections into a list of nested dictionaries.

    The ``correction.identifier``, ``correction.url`` and
    ``correction.correctionType`` columns of ``txtdump`` are renamed in place
    to ``identifier``, ``url`` and ``correctionType``.

    Parameters:
        txtdump: DataFrame with ``_id`` and the correction columns above;
            a ``correction.pmid`` column is optional.

    Returns:
        One dictionary per row of the form
        ``{'_id': ..., 'correction': [{'@type': 'Correction', ...}]}``; the
        correction carries a ``pmid`` entry when the input had a
        ``correction.pmid`` column.
    """
    ## the pmid check refers to the column names as they were before renaming
    carries_pmid = 'correction.pmid' in list(txtdump.columns)
    txtdump.rename(columns={'correction.identifier':'identifier','correction.url':'url','correction.correctionType':'correctionType'}, inplace=True)
    dictlist = []
    for _, record in txtdump.iterrows():
        correction = {'@type':'Correction',
                      'identifier':record['identifier'],
                      'correctionType':record['correctionType'],
                      'url':record['url']}
        if carries_pmid:
            correction['pmid'] = record['correction.pmid']
        dictlist.append({'_id':record['_id'],'correction':[correction]})
    return dictlist

def generate_updates(updatedf,OUTPUTPATH):
    """Add the accepted matches to the combined update file (tsv and json).

    Every match produces two corrections: one attached to the litcovid record
    pointing at the preprint, and one attached to the preprint pointing at the
    peer-reviewed version. They are appended to the corrections already stored
    in ``update_file.tsv``; exact duplicates are removed before saving.

    Parameters:
        updatedf: DataFrame with ``litcovid`` and ``preprint`` columns.
        OUTPUTPATH: folder holding ``update_file.tsv`` (must already exist)
            and receiving ``update_file.json``.

    Returns:
        The total number of corrections written to the update file.
    """
    tsv_file = os.path.join(OUTPUTPATH,'update_file.tsv')
    priorupdates = read_csv(tsv_file,delimiter="\t",header=0,index_col=0)
    ## Corrections attached to the litcovid record, pointing at the preprint
    correctionA = updatedf[['litcovid','preprint']].copy()
    correctionA.rename(columns={'litcovid':'_id','preprint':'correction.identifier'},inplace=True)
    correctionA['correction.@type']='outbreak:Correction'
    correctionA['correction.correctionType']='preprint'
    correctionA['baseurl']='https://doi.org/10.1101/'
    correctionA['correction.url']=correctionA['baseurl'].str.cat(correctionA['correction.identifier'])
    correctionA.drop('baseurl',axis=1,inplace=True)
    ## Corrections attached to the preprint, pointing at the pubmed record
    correctionB = updatedf[['litcovid','preprint']].copy()
    correctionB.rename(columns={'litcovid':'correction.identifier','preprint':'_id'},inplace=True)
    correctionB['correction.@type']='outbreak:Correction'
    correctionB['correction.correctionType']='peer-reviewed version'
    correctionB['baseurl']='https://pubmed.ncbi.nlm.nih.gov/'
    ## the pmid is only needed to build the url and is dropped again below
    correctionB['correction.pmid'] = correctionB['correction.identifier'].astype(str).str.replace('pmid','',regex=False)
    correctionB['correction.url']=correctionB['baseurl'].str.cat(correctionB['correction.pmid'])
    correctionB.drop(['baseurl','correction.pmid'],axis=1,inplace=True)
    correctionupdate = pd.concat((priorupdates,correctionA,correctionB),ignore_index=True)
    # drop_duplicates returns a new frame; the result used to be discarded, so
    # re-running with the same matches kept appending duplicate corrections.
    correctionupdate = correctionupdate.drop_duplicates(keep='first').reset_index(drop=True)
    correctionupdate.to_csv(tsv_file,sep="\t",header=True)
    corrections_added = len(correctionupdate)
    json_corrections = convert_txt_dumps(correctionupdate)
    with open(os.path.join(OUTPUTPATH,'update_file.json'), 'w', encoding='utf-8') as f:
        json.dump(json_corrections, f)
    return corrections_added


#### save the results to different files
def generate_split_updates(updatedf,OUTPUTPATH):
    """Add the accepted matches to the per-source update files (tsv and json).

    Corrections attached to litcovid records go to ``litcovid_update_file.*``
    and corrections attached to preprints go to ``preprint_update_file.*``.
    Each tsv file is read, extended with the new corrections, stripped of
    exact duplicates and written back together with its json counterpart.

    Parameters:
        updatedf: DataFrame with ``litcovid`` and ``preprint`` columns.
        OUTPUTPATH: folder holding both tsv files (they must already exist)
            and receiving the json files.

    Returns:
        The total number of corrections held by both update files.
    """
    ## Corrections attached to the litcovid record, pointing at the preprint
    litcovid_tsv = os.path.join(OUTPUTPATH,'litcovid_update_file.tsv')
    priorlitcovidupdates = read_csv(litcovid_tsv,delimiter="\t",header=0,index_col=0)
    correctionA = updatedf[['litcovid','preprint']].copy()
    correctionA.rename(columns={'litcovid':'_id','preprint':'correction.identifier'},inplace=True)
    correctionA['correction.@type']='outbreak:Correction'
    correctionA['correction.correctionType']='preprint'
    correctionA['baseurl']='https://doi.org/10.1101/'
    correctionA['correction.url']=correctionA['baseurl'].str.cat(correctionA['correction.identifier'])
    correctionA.drop('baseurl',axis=1,inplace=True)
    correctionAupdate = pd.concat((priorlitcovidupdates,correctionA),ignore_index=True)
    # drop_duplicates returns a new frame; the result used to be discarded.
    correctionAupdate = correctionAupdate.drop_duplicates(keep='first').reset_index(drop=True)
    correctionAupdate.to_csv(litcovid_tsv,sep="\t",header=True)
    json_correctionsA = convert_txt_dumps(correctionAupdate)
    with open(os.path.join(OUTPUTPATH,'litcovid_update_file.json'), 'w', encoding='utf-8') as f:
        json.dump(json_correctionsA, f)

    ## Corrections attached to the preprint, pointing at the pubmed record
    preprint_tsv = os.path.join(OUTPUTPATH,'preprint_update_file.tsv')
    # The stray second comma after index_col=0 was a syntax error. The pmid is
    # read as text so that it compares equal to the newly built values.
    priorpreprintupdates = read_csv(preprint_tsv,delimiter="\t",header=0,index_col=0,
                                    converters = {'correction.pmid': str})
    correctionB = updatedf[['litcovid','preprint']].copy()
    correctionB.rename(columns={'litcovid':'correction.identifier','preprint':'_id'},inplace=True)
    correctionB['correction.@type']='outbreak:Correction'
    correctionB['correction.correctionType']='peer-reviewed version'
    correctionB['baseurl']='https://pubmed.ncbi.nlm.nih.gov/'
    correctionB['correction.pmid'] = correctionB['correction.identifier'].astype(str).str.replace('pmid','',regex=False)
    correctionB['correction.url']=correctionB['baseurl'].str.cat(correctionB['correction.pmid'])
    correctionB.drop('baseurl',axis=1,inplace=True)
    correctionBupdate = pd.concat((priorpreprintupdates,correctionB),ignore_index=True)
    correctionBupdate = correctionBupdate.drop_duplicates(keep='first').reset_index(drop=True)
    correctionBupdate.to_csv(preprint_tsv,sep="\t",header=True)
    json_correctionsB = convert_txt_dumps(correctionBupdate)
    with open(os.path.join(OUTPUTPATH,'preprint_update_file.json'), 'w', encoding='utf-8') as f:
        json.dump(json_correctionsB, f)

    corrections_added = len(correctionBupdate)+len(correctionAupdate)
    return corrections_added


## Function to update the save files for manual review or further processing (formatting for biothings)        
def _merge_into_result_file(result_df, filepath, label, update_dict):
    """Append ``result_df`` to the tab-delimited file and record the row counts."""
    previous = read_csv(filepath,delimiter='\t',header=0,index_col=0)
    update_dict['previous matches '+label] = len(previous)
    update_dict['current matches '+label] = len(result_df)
    combined = pd.concat((previous,result_df),ignore_index=True)
    combined.drop_duplicates(subset=['litcovid','preprint'],keep='first',inplace=True)
    combined.to_csv(filepath,sep='\t',header=True)


def update_results(result_df,ARCHIVEPATH,REVIEWPATH,sum_min=0.75):
    """Append a set of matches to the result file that fits its content.

    The target is chosen from the content of ``result_df``:
    * any litcovid or preprint id occurring more than once ->
      ``REVIEWPATH/manual_check.txt``
    * otherwise, highest ``sum_score`` below ``sum_min`` ->
      ``REVIEWPATH/low_scores.txt``
    * otherwise, every id unique -> ``ARCHIVEPATH/clean_results.txt``
    Pairs already present in the target file are not added again.

    Parameters:
        result_df: DataFrame with ``litcovid``, ``preprint`` and ``sum_score``.
        ARCHIVEPATH: folder holding ``clean_results.txt``.
        REVIEWPATH: folder holding ``manual_check.txt`` and ``low_scores.txt``.
        sum_min: minimum sum score of a clean match (default 0.75, the value
            that was previously hard-coded).

    Returns:
        A dictionary with the number of matches previously stored in the
        target file and the number of matches in ``result_df``.
    """
    update_dict = {}
    litcovid_counts = result_df.groupby('litcovid').size()
    preprint_counts = result_df.groupby('preprint').size()
    has_duplicates = bool((litcovid_counts > 1).any() or (preprint_counts > 1).any())
    if has_duplicates:
        _merge_into_result_file(result_df, os.path.join(REVIEWPATH,'manual_check.txt'),
                                'for manual checking', update_dict)
    elif result_df['sum_score'].max() < sum_min:
        _merge_into_result_file(result_df, os.path.join(REVIEWPATH,'low_scores.txt'),
                                'with low scores', update_dict)
    elif (len(litcovid_counts) == len(result_df)) and (len(preprint_counts) == len(result_df)):
        _merge_into_result_file(result_df, os.path.join(ARCHIVEPATH,'clean_results.txt'),
                                'for updating', update_dict)
    return update_dict


def check_comparison_run(TEMPPATH):
    """Sanity-check the files left in the temp folder by a comparison run.

    Reads the file-system metadata of the cleaned metadata files
    ('clean_rxiv_text.txt', 'clean_lit_text.txt') and of the match files
    ('text_above_threshold.txt', 'auth_above_threshold.txt') in TEMPPATH.

    Returns a dict with three booleans:
      * "size_check_success"     - both match files are larger than 27 bytes
      * "preprint_check_success" - both match files were modified less than
                                   3 days after the preprint metadata file
      * "litcovid_check_success" - both match files were modified less than
                                   3 days after the litcovid metadata file

    The time differences are signed (match mtime minus metadata mtime), so a
    match file that is older than the metadata file also passes.
    Raises whatever `Path.stat()` raises (e.g. FileNotFoundError) if any of
    the four files is missing.
    """
    def _stat_of(filename):
        return pathlib.Path(os.path.join(TEMPPATH,filename)).stat()

    rxiv_stat = _stat_of('clean_rxiv_text.txt')
    lit_stat = _stat_of('clean_lit_text.txt')
    text_stat = _stat_of('text_above_threshold.txt')
    auth_stat = _stat_of('auth_above_threshold.txt')
    match_stats = (text_stat,auth_stat)

    ## presumably 27 bytes is the size of a match file holding only its header - TODO confirm
    blank_kb = 27/1024
    max_lag = timedelta(days=3)
    match_mtimes = [datetime.fromtimestamp(each.st_mtime) for each in match_stats]

    def _matches_follow(meta_stat):
        meta_mtime = datetime.fromtimestamp(meta_stat.st_mtime)
        return(all((each_mtime - meta_mtime) < max_lag for each_mtime in match_mtimes))

    return({"size_check_success":all((each.st_size/1024) > blank_kb for each in match_stats),
            "preprint_check_success":_matches_follow(rxiv_stat),
            "litcovid_check_success":_matches_follow(lit_stat)})
        

## Main Functions:

The original preprint matching script was divided into three parts, each with its own save/load points:
**update metadata**
1. Load previous preprint and pmids
2. Fetch all new preprint ids and pmids (save both the old and the new preprint ids and pmids); record the date so that the fetch can be skipped if it was already done within the last day
3. Fetch all data from new preprint and pmids
4. Clean up metadata and save metadata

**compare data**
1. load metadata, use merges with preprint and pmid lists to determine old vs new
2. compare New litcovid to old preprints (save data)
3. compare New litcovid to new preprints (save data)
4. compare New preprints to old Litcovid (save data)

**clean up results**
1. Check results of saved data for items meeting criteria
2. generate results for appending data

## Check functionality of modularized script

In [None]:
%%time
import os
import requests
import pickle
import json
import pandas as pd
from pandas import read_csv
import nltk
import string
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from datetime import datetime,timedelta
import pathlib
from src.update_functions import *
from src.cleaning_functions import *

## Path set-up and metadata refresh for the "update metadata" step.
## The __file__-based path below is commented out; an empty scriptpath makes
## every path relative to the current working directory instead.
#scriptpath = pathlib.Path(__file__).parent.absolute()
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')   ## archives of previous runs
TEMPPATH = os.path.join(RESULTSPATH,'temp/')          ## temporary files for the current run

## update_meta comes from one of the src star-imports above
## presumably fetches the new ids and saves their cleaned metadata - TODO confirm against src
update_meta(ARCHIVEPATH,TEMPPATH)

#### Run time = 5 min

In [None]:
%%time
#### This code performs the actual bag of word comparisons

import os
import requests
import pickle
import json
import pandas as pd
from pandas import read_csv
import nltk
import string
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from datetime import datetime,timedelta
import pathlib
from src.comparison_functions import *

#scriptpath = pathlib.Path(__file__).parent.absolute()
#try:
#    generalpath = pathlib.Path(__file__).parents[1].absolute()
#except:
#    generalpath = pathlib.Path(__file__).resolve().parents[1].absolute()

## Paths are relative to the current working directory; the names match the
## ones used below and in the commented-out __file__-based version above.
scriptpath = ''
generalpath = os.path.abspath(os.path.join(os.getcwd(),"../"))


RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
TEMPPATH = os.path.join(RESULTSPATH,'temp/')
TOPICPATH = os.path.join(generalpath,'topic_classifier/results/') 
## The converter turns the stringified list in 'topicCategory' back into a python list
TOPICFILE = read_csv(os.path.join(TOPICPATH,'topicCats.tsv'),delimiter='\t',header=0,index_col=0,
                     converters={"topicCategory": lambda x: x.strip("[]").replace("'","").split(", ")})
## NOTE(review): the result of fillna is not assigned and inplace is not set,
## so this line appears to have no effect - confirm whether it can be removed
TOPICFILE.fillna({i: [] for i in TOPICFILE.index})
## one row per (record, topicCategory) pair
topicdf = TOPICFILE.explode('topicCategory').reset_index()
topicdf.drop(columns=['index'],inplace=True)

## presumably the minimum author score, text score and summed score for a match - TODO confirm
thresholds = {"auth":0.45,
              "text":0.2,
              "sum_min":0.75}

clean_rxiv_text,clean_rxiv_auth,clean_lit_text,clean_lit_auth = load_new_meta(TEMPPATH)
print(len(clean_rxiv_text),len(clean_lit_text))
old_rxiv_text,old_rxiv_auth,old_lit_text,old_lit_auth =  load_previous_runs(ARCHIVEPATH)
print(len(old_rxiv_text),len(old_lit_text))

new_rxiv = check_b4_compare(clean_rxiv_text,old_rxiv_text)
new_litcovid = check_b4_compare(clean_lit_text,old_lit_text)

## run_comparison is always called as (preprint data, litcovid data, type, thresholds, path)
if new_rxiv==True and new_litcovid==True:
    blank_temps(TEMPPATH)
    ## run old preprints against new litcovid entries:
    if len(clean_lit_text)>0:
        run_comparison(old_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)
    if len(clean_lit_auth)>0:
        run_comparison(old_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)

    ## run new preprints against new litcovid entries
    if len(clean_rxiv_text)>0:
        run_comparison(clean_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)

    if len(clean_rxiv_auth)>0:
        run_comparison(clean_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)


elif new_rxiv==False and new_litcovid==True:
    ## run old preprints against new litcovid entries
    if len(clean_lit_text)>0:
        run_comparison(old_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)

    if len(clean_lit_auth)>0:
        run_comparison(old_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)


elif new_rxiv==True and new_litcovid==False:
    print("no point in comparing new preprints to old litcovid entries")
          
else:
    print("nothing new to compare")

In [None]:
## quick check: number of entries in clean_lit_text (loaded by load_new_meta in the cell above)
print(len(clean_lit_text))

In [None]:
%%time
#### clean_up_results.py
import os
import requests
import pickle
import json
import pandas as pd
from pandas import read_csv
from datetime import datetime,timedelta
import pathlib
from src.comparison_functions import *
from src.archive_functions import *


#### Set paths
#scriptpath = pathlib.Path(__file__).parent.absolute()
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
TEMPPATH = os.path.join(RESULTSPATH,'temp/')
OUTPUTPATH = os.path.join(RESULTSPATH,'update dumps/')
REVIEWPATH = os.path.join(RESULTSPATH,'to review/')

thresholds = {"auth":0.45,
              "text":0.2,
              "sum_min":0.75}

## update the set after the run
## Only proceed if every check on the temp files passed; otherwise print the failed checks
runcheck_dict = check_comparison_run(TEMPPATH)
if False not in list(runcheck_dict.values()):
    clean_rxiv_text,clean_rxiv_auth,clean_lit_text,clean_lit_auth = load_new_meta(TEMPPATH)
    update_precompute(clean_lit_text,ARCHIVEPATH)
    update_precompute(clean_lit_auth,ARCHIVEPATH)
    update_precompute(clean_rxiv_text,ARCHIVEPATH)
    update_precompute(clean_rxiv_auth,ARCHIVEPATH)
    ## Best effort: a match file that cannot be read is treated as "no matches"
    try:
        new_text_matches = read_csv(os.path.join(TEMPPATH,'text_above_threshold.txt'),delimiter='\t',header=0,index_col=False)
        if 'Unnamed: 0' in new_text_matches.columns:
            new_text_matches.drop('Unnamed: 0',axis=1,inplace=True)
    except Exception:
        new_text_matches = pd.DataFrame(columns=['litcovid','preprint','j_sim'])
    try:
        new_auth_matches = read_csv(os.path.join(TEMPPATH,'auth_above_threshold.txt'),delimiter='\t',header=0,index_col=False)
        if 'Unnamed: 0' in new_auth_matches.columns:
            new_auth_matches.drop('Unnamed: 0',axis=1,inplace=True)
    except Exception:
        new_auth_matches = pd.DataFrame(columns=['litcovid','preprint','j_sim'])

    if len(new_text_matches)<1 or len(new_auth_matches)<1:
        matchupdates = False
        print("no new text and/or author matches, nothing to update")
    else:
        matchupdates = True
        clean_matches,lowscores,manual_check = sort_matches(new_text_matches,new_auth_matches,thresholds)
        ## The sorted matches only exist in this branch, so the updates that
        ## consume them must stay here (they previously ran unconditionally)
        corrections_added = generate_updates(clean_matches,OUTPUTPATH)
        split_corrections_added = generate_split_updates(clean_matches,OUTPUTPATH)
        manual_check_update = update_results(manual_check,ARCHIVEPATH,REVIEWPATH)
        lowscores_update = update_results(lowscores,ARCHIVEPATH,REVIEWPATH)
        clean_match_update = update_results(clean_matches,ARCHIVEPATH,REVIEWPATH)
else:
    print(runcheck_dict)

In [1]:
%%time
#### Sequentially run all scripts needed for preprint-matching
import subprocess
import pathlib
import os
import sys

#scriptpath = pathlib.Path(__file__).parent.absolute()
scriptpath = ''
update_script = os.path.join(scriptpath,'update_data.py')
compare_script = os.path.join(scriptpath,'compare_data.py')
cleanup_script = os.path.join(scriptpath,'clean_up_results.py')
program_list = [update_script,compare_script,cleanup_script]

for program in program_list:
    ## sys.executable is the interpreter running this notebook, so the scripts
    ## use the same environment instead of whichever 'python' is first on PATH
    exit_code = subprocess.call([sys.executable, program])
    if exit_code != 0:
        ## Each script works on the files written by the previous one, so stop at the first failure
        print("Failed (exit code " + str(exit_code) + "):" + program)
        break
    print("Finished:" + program)

Finished:update_data.py
Finished:compare_data.py
Finished:clean_up_results.py
Wall time: 6min 31s


### Parts of the main functions for trouble-shooting purposes

#### The comparison function parts

In [None]:
## Troubleshooting set-up: expects read_csv, os and TOPICPATH to already be
## defined by an earlier cell (none of them is defined in this cell)
set_type = 'text'
## presumably the minimum author score, text score and summed score for a match - TODO confirm
thresholds = {"auth":0.45,
              "text":0.2,
              "sum_min":0.75}

#### Load annotations file
## The converter strips the brackets and single quotes from the 'topicCategory'
## string and splits it on ", " so that each cell holds a python list
TOPICFILE = read_csv(os.path.join(TOPICPATH,'topicCats.tsv'),delimiter='\t',header=0,index_col=0,
                     converters={"topicCategory": lambda x: x.strip("[]").replace("'","").split(", ")})
## NOTE(review): the result of fillna is not assigned and inplace is not set,
## so this line appears to have no effect - confirm whether it can be removed
TOPICFILE.fillna({i: [] for i in TOPICFILE.index})
## explode gives one row per list element of 'topicCategory'; the old index is then dropped
topicdf = TOPICFILE.explode('topicCategory').reset_index()
topicdf.drop(columns=['index'],inplace=True)

In [None]:
%%time
## Re-run the metadata update on its own; ARCHIVEPATH and TEMPPATH must already be set by an earlier cell
update_meta(ARCHIVEPATH,TEMPPATH)

In [None]:
%%time
## load the metadata for comparison
## clean_* frames are loaded from the temp folder; old_* frames are loaded from the archive folder
## (text and author data, for preprints (rxiv) and for litcovid)
clean_rxiv_text,clean_rxiv_auth,clean_lit_text,clean_lit_auth = load_new_meta(TEMPPATH)
old_rxiv_text,old_rxiv_auth,old_lit_text,old_lit_auth =  load_previous_runs(ARCHIVEPATH)

In [None]:
## presumably resets the match files in the temp folder before a new comparison - TODO confirm against src
blank_temps(TEMPPATH)

#### Comparison functions if binning by topicCategory is desired

In [None]:
%%time
## run old preprints against new litcovid entries:
## The text and author passes are identical apart from their data, so both
## run through one loop. Each pass compares the new litcovid entries with the
## OLD preprints only, using the metadata type that matches the pass.
for meta_type,new_lit,old_rxiv in (('text',clean_lit_text,old_rxiv_text),
                                   ('auth',clean_lit_auth,old_rxiv_auth)):
    if len(new_lit)<1:
        continue
    ## one comparison per topic category
    for eachtopic in topicdf['topicCategory'].unique().tolist():
        preprint_topicdf,litcovid_topicdf = generate_comparison_dfs(topicdf,new_lit,old_rxiv,eachtopic)
        if len(preprint_topicdf)+len(litcovid_topicdf)>0:
            run_comparison(preprint_topicdf,litcovid_topicdf,meta_type,thresholds,TEMPPATH)
    ## extra pass with no topic
    ## presumably this covers the entries without a topic category - TODO confirm against generate_comparison_dfs
    try:
        eachtopic = None
        preprint_topicdf,litcovid_topicdf = generate_comparison_dfs(topicdf,new_lit,old_rxiv,eachtopic)
        if len(preprint_topicdf)+len(litcovid_topicdf)>0:
            run_comparison(preprint_topicdf,litcovid_topicdf,meta_type,thresholds,TEMPPATH)
    except Exception as err:
        ## still best effort, but report the failure instead of hiding it
        print("comparison without topic failed for " + meta_type + ": " + str(err))

In [None]:
%%time
## run new preprints against new litcovid entries:
## The text and author passes are identical apart from their data, so both
## run through one loop. Each pass compares the new litcovid entries with the
## NEW preprints only, using the metadata type that matches the pass.
for meta_type,new_lit,new_rxiv_meta in (('text',clean_lit_text,clean_rxiv_text),
                                        ('auth',clean_lit_auth,clean_rxiv_auth)):
    if len(new_lit)<1:
        continue
    ## one comparison per topic category
    for eachtopic in topicdf['topicCategory'].unique().tolist():
        preprint_topicdf,litcovid_topicdf = generate_comparison_dfs(topicdf,new_lit,new_rxiv_meta,eachtopic)
        if len(preprint_topicdf)+len(litcovid_topicdf)>0:
            run_comparison(preprint_topicdf,litcovid_topicdf,meta_type,thresholds,TEMPPATH)
    ## extra pass with no topic
    ## presumably this covers the entries without a topic category - TODO confirm against generate_comparison_dfs
    try:
        eachtopic = None
        preprint_topicdf,litcovid_topicdf = generate_comparison_dfs(topicdf,new_lit,new_rxiv_meta,eachtopic)
        if len(preprint_topicdf)+len(litcovid_topicdf)>0:
            run_comparison(preprint_topicdf,litcovid_topicdf,meta_type,thresholds,TEMPPATH)
    except Exception as err:
        ## still best effort, but report the failure instead of hiding it
        print("comparison without topic failed for " + meta_type + ": " + str(err))

#### Comparison function without binning

In [None]:
## Expects new_rxiv and new_litcovid (from check_b4_compare) and the clean_*/old_*
## frames to already be defined by earlier cells.
## run_comparison is always called as (preprint data, litcovid data, type, thresholds, path)
if new_rxiv==True and new_litcovid==True:
    blank_temps(TEMPPATH)
    ## run old preprints against new litcovid entries:
    if len(clean_lit_text)>0:
        run_comparison(old_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)
    if len(clean_lit_auth)>0:
        run_comparison(old_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)

    ## run new preprints against new litcovid entries
    if len(clean_rxiv_text)>0:
        run_comparison(clean_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)

    if len(clean_rxiv_auth)>0:
        run_comparison(clean_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)


elif new_rxiv==False and new_litcovid==True:
    ## run old preprints against new litcovid entries
    if len(clean_lit_text)>0:
        run_comparison(old_rxiv_text,clean_lit_text,'text',thresholds,TEMPPATH)

    if len(clean_lit_auth)>0:
        run_comparison(old_rxiv_auth,clean_lit_auth,'auth', thresholds,TEMPPATH)

#### The update function

In [None]:
## update the set after the run
## Expects the clean_* frames, thresholds and the path variables to already be
## defined by earlier cells.
## Only proceed if every check on the temp files passed; otherwise print the failed checks
runcheck_dict = check_comparison_run(TEMPPATH)
if False not in list(runcheck_dict.values()):
    update_precompute(clean_lit_text,ARCHIVEPATH)
    update_precompute(clean_lit_auth,ARCHIVEPATH)
    update_precompute(clean_rxiv_text,ARCHIVEPATH)
    update_precompute(clean_rxiv_auth,ARCHIVEPATH)
    ## Best effort: a match file that cannot be read is treated as "no matches"
    try:
        new_text_matches = read_csv(os.path.join(TEMPPATH,'text_above_threshold.txt'),delimiter='\t',header=0,index_col=False)
        if 'Unnamed: 0' in new_text_matches.columns:
            new_text_matches.drop('Unnamed: 0',axis=1,inplace=True)
    except Exception:
        new_text_matches = pd.DataFrame(columns=['litcovid','preprint','j_sim'])
    try:
        new_auth_matches = read_csv(os.path.join(TEMPPATH,'auth_above_threshold.txt'),delimiter='\t',header=0,index_col=False)
        if 'Unnamed: 0' in new_auth_matches.columns:
            new_auth_matches.drop('Unnamed: 0',axis=1,inplace=True)
    except Exception:
        new_auth_matches = pd.DataFrame(columns=['litcovid','preprint','j_sim'])

    if len(new_text_matches)<1 or len(new_auth_matches)<1:
        matchupdates = False
        print("no new text and/or author matches, nothing to update")
    else:
        matchupdates = True
        clean_matches,lowscores,manual_check = sort_matches(new_text_matches,new_auth_matches,thresholds)
        ## The sorted matches only exist in this branch, so the updates that
        ## consume them must stay here (they previously ran unconditionally)
        corrections_added = generate_updates(clean_matches,OUTPUTPATH)
        split_corrections_added = generate_split_updates(clean_matches,OUTPUTPATH)
        manual_check_update = update_results(manual_check,ARCHIVEPATH,REVIEWPATH)
        lowscores_update = update_results(lowscores,ARCHIVEPATH,REVIEWPATH)
        clean_match_update = update_results(clean_matches,ARCHIVEPATH,REVIEWPATH)
else:
    print(runcheck_dict)