## Fix archives from previous runs to be compatible with new code

In [2]:
import os
import requests
import pickle
import json
import pandas
from pandas import read_csv
import nltk
import string
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from datetime import datetime
import pathlib

In [4]:
#### Fetch the `date` metadata for a list of ids
#### Note: batches of 1000 ids made the POST request fail, so the ids are sent in smaller batches that are less likely to fail
def batch_fetch_date(idlist, batch_size=100):
    """Fetch the `date` field for a collection of ids from the outbreak.info API.

    The ids are split into consecutive, non-overlapping batches of at most
    ``batch_size`` ids and each batch is sent in its own POST request, because
    very large batches have been seen to make the request fail.

    Parameters
    ----------
    idlist : iterable of str
        The `_id` values to look up.
    batch_size : int, default 100
        Maximum number of ids sent per request.

    Returns
    -------
    pandas.DataFrame
        Columns `_id` and `date`, holding the hits whose `_score` equals 1,
        de-duplicated on `_id` within each batch. The frame is empty when
        `idlist` is empty or when no batch returned a usable response.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from io import StringIO

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    idlist = list(idlist)
    separator = ','
    frames = []
    ## Step through the ids in fixed-size slices: every id is sent exactly once and
    ## no empty batch is ever posted (an empty idlist sends no request at all)
    for start in range(0, len(idlist), batch_size):
        sample = idlist[start:start + batch_size]
        sample_ids = separator.join(sample)
        ## Get the date info and save it
        ## NOTE(review): `params=` places the ids in the URL query string rather than
        ## the POST body, so the URL grows with the batch size -- confirm whether the
        ## API also accepts them as form data (`data=`)
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'date'})
        if r.status_code != 200:
            ## Keep going (best effort) but make the gap visible instead of silent
            print('batch_fetch_date: batch starting at position', start, 'failed with HTTP status', r.status_code)
            continue
        ## Wrap the payload in StringIO: passing a literal JSON string to read_json
        ## is deprecated in recent pandas releases
        rawresult = pandas.read_json(StringIO(r.text))
        ## Without `_id`/`_score` columns there is nothing to keep from this batch, and
        ## indexing them would raise KeyError
        ## NOTE(review): presumably this happens when no id in the batch matched -- confirm
        if not {'_id', '_score'}.issubset(rawresult.columns):
            continue
        ## reindex (rather than [[...]]) so a batch in which no record carries a date
        ## yields an all-missing `date` column instead of a KeyError
        cleanresult = rawresult.reindex(columns=['_id', 'date']).loc[rawresult['_score'] == 1]
        frames.append(cleanresult.drop_duplicates(subset='_id', keep="first"))

    if not frames:
        return pandas.DataFrame(columns=['_id', 'date'])
    ## Concatenate once at the end instead of growing a frame inside the loop
    return pandas.concat(frames, ignore_index=True)
        

In [3]:
#### Update the archives so they contain the missing date information
## This step is DONE and should not be performed again

## Read the four pickled archive tables
## (pickle can execute code while loading: only load archives written by this project)
auth_litcovid_set = pickle.loads(pathlib.Path('results/archives/auth_litcovid_set.txt').read_bytes())
auth_preprint_set = pickle.loads(pathlib.Path('results/archives/auth_preprint_set.txt').read_bytes())
text_litcovid_set = pickle.loads(pathlib.Path('results/archives/text_litcovid_set.txt').read_bytes())
text_preprint_set = pickle.loads(pathlib.Path('results/archives/text_preprint_set.txt').read_bytes())


In [5]:
## Read the pickled id collections for litcovid and for the preprints
all_litcovid_ids = pickle.loads(pathlib.Path('results/archives/all_litcovid_ids.txt').read_bytes())
all_preprint_ids = pickle.loads(pathlib.Path('results/archives/all_preprint_ids.txt').read_bytes())

In [6]:
#### Check coverage of dateless litcovid entries
#### Before fix applied to litcovid (Run 2021.04.22)
litcovid_datedf = batch_fetch_date(list(all_litcovid_ids)).drop_duplicates(keep='first')

## A date counts as missing when it is null, -1 or the literal 'not found'
litcovid_dates = litcovid_datedf['date']
is_dateless = litcovid_dates.isna() | (litcovid_dates == -1) | (litcovid_dates == 'not found')
nodates = litcovid_datedf.loc[is_dateless]

print(len(nodates))
print(nodates['date'].unique().tolist())
print(nodates.head(n=5))

10
[None]
                _id date
2995   pmid32150360  NaT
5451   pmid32310553  NaT
8460   pmid32644403  NaT
12925  pmid32401466  NaT
19631  pmid32310612  NaT


In [8]:
#### only fetch date for old files
## The ids to look up are every `_id` that appears in either the author or the text archive
old_preprint_ids = list(set(auth_preprint_set['_id'].unique().tolist()).union(set(text_preprint_set['_id'].unique().tolist())))
old_litcovid_ids = list(set(auth_litcovid_set['_id'].unique().tolist()).union(set(text_litcovid_set['_id'].unique().tolist())))

## Both date tables are later left-merged onto the archives on `_id`; a repeated `_id`
## would multiply archive rows, so de-duplicate BOTH tables on that key
## (previously only the litcovid table was de-duplicated)
preprint_datedf = batch_fetch_date(old_preprint_ids).drop_duplicates(subset='_id', keep='first')
print(preprint_datedf.loc[(preprint_datedf['date']==-1)|(preprint_datedf['date'].isna())|(preprint_datedf['date']=='not found')])
litcovid_datedf = batch_fetch_date(old_litcovid_ids).drop_duplicates(subset='_id', keep='first')

Empty DataFrame
Columns: [_id, date]
Index: []


In [9]:
#### Add the missing dates to the table
## Any `date` column already present in an archive (e.g. one written by a previous run of
## this notebook) is dropped first; otherwise the merge would produce `date_x`/`date_y`
## instead of a single `date` column and the cells below would fail on ['date'].
## validate='many_to_one' makes pandas raise if a date table repeats an `_id`, which would
## otherwise silently multiply archive rows.
## NOTE(review): earlier commented-out code here dropped a `datePublished` column from the
## preprint tables -- confirm whether any old archive still carries that column.
text_litcovid_set_updated = text_litcovid_set.drop(columns='date', errors='ignore').merge(
    litcovid_datedf, on='_id', how='left', validate='many_to_one')
auth_litcovid_set_updated = auth_litcovid_set.drop(columns='date', errors='ignore').merge(
    litcovid_datedf, on='_id', how='left', validate='many_to_one')

text_preprint_set_updated = text_preprint_set.drop(columns='date', errors='ignore').merge(
    preprint_datedf, on='_id', how='left', validate='many_to_one')
auth_preprint_set_updated = auth_preprint_set.drop(columns='date', errors='ignore').merge(
    preprint_datedf, on='_id', how='left', validate='many_to_one')

In [10]:
#### Deal with preprints and litcovid entries that were removed
def _dateless_ids(*tables):
    """Return the set of `_id` values whose `date` is missing in any of `tables`.

    A date counts as missing when it is null, -1 or the literal 'not found'.
    """
    dateless = set()
    for table in tables:
        dates = table['date']
        no_date = (dates == -1) | dates.isna() | (dates == 'not found')
        dateless.update(table.loc[no_date, '_id'].unique().tolist())
    return dateless

## Collect the dateless ids from BOTH the text and the author tables: an id that only
## occurs in an author table would otherwise keep its dateless rows after the cleanup
missingpre = list(_dateless_ids(text_preprint_set_updated, auth_preprint_set_updated))
missinglit = list(_dateless_ids(text_litcovid_set_updated, auth_litcovid_set_updated))

text_litcovid_set_updated_clean = text_litcovid_set_updated.loc[~text_litcovid_set_updated['_id'].isin(missinglit)]
auth_litcovid_set_updated_clean = auth_litcovid_set_updated.loc[~auth_litcovid_set_updated['_id'].isin(missinglit)]

text_preprint_set_updated_clean = text_preprint_set_updated.loc[~text_preprint_set_updated['_id'].isin(missingpre)]
auth_preprint_set_updated_clean = auth_preprint_set_updated.loc[~auth_preprint_set_updated['_id'].isin(missingpre)]


## Sanity check: no dateless row should be left in the cleaned table
print(text_litcovid_set_updated_clean.loc[(text_litcovid_set_updated_clean['date']==-1)|
                                          (text_litcovid_set_updated_clean['date'].isna())|
                                          (text_litcovid_set_updated_clean['date']=='not found')])

Empty DataFrame
Columns: [_id, abstract, name, text, words, date]
Index: []


In [11]:
#### Verify that the merge happened correctly
## (label, table before the merge, table after the merge), in the order they are reported
merge_checks = [
    ('auth_litcovid_set', auth_litcovid_set, auth_litcovid_set_updated),
    ('text_litcovid_set', text_litcovid_set, text_litcovid_set_updated),
    ('auth_preprint_set', auth_preprint_set, auth_preprint_set_updated),
    ('text_preprint_set', text_preprint_set, text_preprint_set_updated),
]

## Report every size first so all four are visible even if a check below fails
for label, before, after in merge_checks:
    print('size of ' + label + ': ', len(before), ' updated: ', len(after))

## A left merge must keep the row count unchanged; enforce it instead of relying on
## someone comparing the printed numbers by eye
for label, before, after in merge_checks:
    assert len(before) == len(after), label + ': the merge changed the number of rows'

## Show the first rows before and after the merge for a visual check of the new column
for label, before, after in merge_checks:
    print(before.head(n=2))
    print("-------------------------------------------------------")
    print(after.head(n=2))
    print("=======================================================")

size of auth_litcovid_set:  19422  updated:  19422
zize of text_litcovid_set:  19488  updated:  19488
size of auth_preprint_set:  7205  updated:  7205
size of text_preprint_set:  7229  updated:  7229
            _id                                             author  \
0  pmid32633231  [{'@type': 'outbreak:Person', 'affiliation': [...   
1  pmid32633235  [{'@type': 'outbreak:Person', 'affiliation': [...   

                                               words  
0  [type, outbreakperson, affiliation, type, outb...  
1  [type, outbreakperson, affiliation, type, outb...  
-------------------------------------------------------
            _id                                             author  \
0  pmid32633231  [{'@type': 'outbreak:Person', 'affiliation': [...   
1  pmid32633235  [{'@type': 'outbreak:Person', 'affiliation': [...   

                                               words       date  
0  [type, outbreakperson, affiliation, type, outb... 2020-10-01  
1  [type, outbreakperson,

In [12]:
#### Update the archive files
## Each cleaned table is pickled back over the archive file it was loaded from
archive_outputs = {
    'results/archives/auth_litcovid_set.txt': auth_litcovid_set_updated_clean,
    'results/archives/text_litcovid_set.txt': text_litcovid_set_updated_clean,
    'results/archives/auth_preprint_set.txt': auth_preprint_set_updated_clean,
    'results/archives/text_preprint_set.txt': text_preprint_set_updated_clean,
}

for archive_path, cleaned_table in archive_outputs.items():
    with open(archive_path, 'wb') as dmpfile:
        pickle.dump(cleaned_table, dmpfile)

In [12]:
#### Wrap each archived id list in a single-entry dict and save it
## NOTE(review): the key '2020-09-01' is hard-coded; presumably the date of the archived
## run -- confirm before reusing this cell

all_litcovid_dict = {}
all_litcovid_dict['2020-09-01'] = all_litcovid_ids
pathlib.Path('results/archives/all_litcovid_dict.txt').write_bytes(pickle.dumps(all_litcovid_dict))

all_preprint_dict = {}
all_preprint_dict['2020-09-01'] = all_preprint_ids
pathlib.Path('results/archives/all_preprint_dict.txt').write_bytes(pickle.dumps(all_preprint_dict))

{'pmid32532430', 'pmid32635681', 'pmid32278865', 'pmid32553478', 'pmid32364119', 'pmid32555983', 'pmid32493070', 'pmid32554953', 'pmid32417123', 'pmid32389754', 'pmid32496097', 'pmid32139552', 'pmid32291593', 'pmid32409759', 'pmid32452955', 'pmid32717489', 'pmid32512075', 'pmid32621352', 'pmid32467382', 'pmid32523144', 'pmid32501807', 'pmid32358858', 'pmid32401679', 'pmid32705978', 'pmid32427226', 'pmid32495819', 'pmid32269081', 'pmid32363809', 'pmid32213231', 'pmid32329337', 'pmid32693249', 'pmid32573126', 'pmid32203360', 'pmid32402477', 'pmid32393502', 'pmid32697030', 'pmid32710149', 'pmid32347027', 'pmid32548209', 'pmid32656307', 'pmid32501451', 'pmid32475151', 'pmid32175719', 'pmid32566158', 'pmid32589524', 'pmid32483035', 'pmid32697344', 'pmid32543015', 'pmid32503846', 'pmid32433304', 'pmid32554344', 'pmid32314729', 'pmid32296181', 'pmid32585180', 'pmid32512244', 'pmid32690058', 'pmid32475179', 'pmid32554998', 'pmid32686780', 'pmid32513515', 'pmid32346680', 'pmid32679506', 'pmid32

In [3]:
## Reload the four archive tables from disk, then preview the first rows of each
## (pickle can execute code while loading: only load archives written by this project)
auth_litcovid_set, auth_preprint_set, text_litcovid_set, text_preprint_set = (
    pickle.loads(pathlib.Path(archive_file).read_bytes())
    for archive_file in (
        'results/archives/auth_litcovid_set.txt',
        'results/archives/auth_preprint_set.txt',
        'results/archives/text_litcovid_set.txt',
        'results/archives/text_preprint_set.txt',
    )
)

for archive_table in (auth_litcovid_set, auth_preprint_set, text_litcovid_set, text_preprint_set):
    print(archive_table.head(n=2))

            _id                                             author  \
0  pmid32633231  [{'@type': 'outbreak:Person', 'affiliation': [...   
1  pmid32633235  [{'@type': 'outbreak:Person', 'affiliation': [...   

                                               words  
0  [type, outbreakperson, affiliation, type, outb...  
1  [type, outbreakperson, affiliation, type, outb...  
                   _id                                             author  \
0  2020.01.10.20017145  [{'@type': 'outbreak:Person', 'affiliation': [...   
1  2020.01.23.20018549  [{'@type': 'outbreak:Person', 'affiliation': [...   

                                               words  
0  [type, outbreakperson, affiliation, type, outb...  
1  [type, outbreakperson, affiliation, type, outb...  
            _id                                           abstract  \
0  pmid32633231  The current pandemic restarts a debate on perm...   
1  pmid32633235  COVID-19, which is caused by the single-strand...   

                