In [1]:
import os
import pandas
from pandas import read_csv
import json
import pickle
from datetime import datetime
import sys
import requests

In [2]:
#### Create curatedBy Object
def generate_curator():
    """Build the ``curatedBy`` record for the COVID-19 Literature Surveillance Team.

    Returns
    -------
    dict
        An ``Organization`` object whose ``curationDate`` is today's local
        date formatted as ``YYYY-MM-DD``.
    """
    curation_date = datetime.now().strftime("%Y-%m-%d")
    curator = {
        "@type": "Organization",
        "identifier": "covid19LST",
        "url": "https://www.covid19lst.org/",
        "name": "COVID-19 Literature Surveillance Team",
        "affiliation": "",
        "curationDate": curation_date,
    }
    return curator

In [3]:
def save_missing(missing):
    """Merge ``missing`` into the pickled list of publications not yet in outbreak.info.

    The list is stored in ``results/pubs_not_yet_in_outbreak.txt`` (a pickle
    file despite the ``.txt`` suffix).  Entries already stored are kept, new
    ones are appended, and duplicates are dropped while preserving first-seen
    order so the file content is deterministic.

    Parameters
    ----------
    missing : iterable of hashable, or None
        Identifiers to record.  ``None`` means there is nothing to record and
        the file is left untouched.

    Returns
    -------
    None
    """
    if missing is None:
        return

    missing_path = 'results/pubs_not_yet_in_outbreak.txt'
    try:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this function.
        with open(missing_path, 'rb') as infile:
            previously_missing = pickle.load(infile)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # First run (or an unreadable file): start from an empty list.
        previously_missing = []

    # dict.fromkeys de-duplicates like a set but keeps insertion order.
    total_missing = list(dict.fromkeys([*previously_missing, *missing]))

    os.makedirs(os.path.dirname(missing_path), exist_ok=True)
    with open(missing_path, 'wb') as dmpfile:
        pickle.dump(total_missing, dmpfile)

In [4]:
#### Note that this script is to be revised after the COVID19 LST reports are added to the outbreak.info database.
#### The API can then be used to create the table associating each report with each PMID
#### by querying the API for COVID19 LST reports and their 'isBasedOn' field.

def generated_citedBy_dict():
    """Convert the LitCovid ``citedBy`` table (TSV) into a JSON update dump.

    Reads ``results/update dumps/litcovid_citedBy.tsv`` (tab-delimited, header
    row, first column used as the index) and writes
    ``results/update dumps/litcovid_citedBy.json``: a list with one record per
    row, each holding the row's ``_id`` and a single-element ``citedBy`` list
    describing the citing ``Publication`` (``identifier``, ``name``, ``url``).

    Returns
    -------
    None
    """
    txtdump = read_csv('results/update dumps/litcovid_citedBy.tsv', delimiter='\t', header=0, index_col=0)
    # to_dict('records') walks the table once instead of doing four
    # positional row lookups per record.
    rows = txtdump[['_id', 'identifier', 'name', 'url']].to_dict('records')
    dictlist = [
        {'_id': row['_id'],
         'citedBy': [{'@type': 'Publication',
                      'identifier': row['identifier'],
                      'name': row['name'],
                      'url': row['url']}]}
        for row in rows
    ]
    with open('results/update dumps/litcovid_citedBy.json', 'w', encoding='utf-8') as f:
        json.dump(dictlist, f)


## Download CSVs from google drive

https://drive.google.com/drive/folders/1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF

Note that the covid19 LST dumps will be update dumps rather than whole database dumps in order to minimize their file size (and upload time) as whole dumps will get progressively larger and take longer to upload.

Note that this script uses the Google Drive API, which requires authentication even when accessing a public Google Drive folder. To fulfill this requirement without needing to log in manually, credentials from a service account are needed. The Google Drive API is only used to read the files in the drive so that the newest ones (by date) can be identified and their IDs retrieved.

Additionally, the pydrive2 library (used to access the Google Drive API) sometimes has trouble finding the client_secrets.json file, so you may need to point to it manually.

The downloader uses the GoogleDriveDownloader library which is based off of requests and should not require the google drive api.

In [1]:
## This function identifies files uploaded after 2020.09.11 that have NOT yet been downloaded
## Note that this is the function if a service account is not available. It requires a login

def check_google():
    """List Drive files newer than the 2020-09-11 baseline that are not yet on disk.

    Interactive-login variant: authentication is configured from
    ``client_secrets.json`` rather than from a service account.

    Returns
    -------
    pandas.DataFrame
        Columns ``createdDate``, ``id``, ``title`` and ``date`` (the parsed,
        UTC-aware ``createdDate``) for every file in the shared folder that
        was created after the baseline timestamp and whose title is not
        already present in ``data/reports/``.  Empty when nothing is new.
    """
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive

    GoogleAuth.DEFAULT_SETTINGS['client_config_file'] = 'client_secrets.json' ##point to secrets file location
    gauth = GoogleAuth()
    #gauth.LocalWebserverAuth()

    drive = GoogleDrive(gauth)
    folder_id = '1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF'
    file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % folder_id}).GetList()

    # reindex (rather than plain column selection) keeps an empty listing from
    # raising KeyError.
    dfclean = pandas.DataFrame(file_list).reindex(columns=['createdDate', 'id', 'title'])
    # createdDate is a full ISO-8601 timestamp; forcing format='%Y-%m-%d'
    # coerces every value to NaT on recent pandas, so let pandas parse it.
    dfclean['date'] = pandas.to_datetime(dfclean['createdDate'], utc=True, errors='coerce')

    # Baseline: creation time of the last dump handled before this script.
    # Using the constant avoids an IndexError if that file leaves the folder.
    lastupdate = pandas.Timestamp('2020-09-11T01:53:29.639Z')
    dfnew = dfclean.loc[dfclean['date'] > lastupdate]

    # NOTE(review): the downloader and parser cells use a different directory
    # than 'data/reports/' -- confirm which one holds the downloaded dumps.
    local_dir = 'data/reports/'
    downloaded = set(os.listdir(local_dir)) if os.path.isdir(local_dir) else set()
    # Keep only Drive files that have no local copy yet.
    reportdf = dfnew.loc[~dfnew['title'].isin(downloaded)]
    return reportdf

In [3]:
## This function identifies files uploaded after 2020.09.11 that have NOT yet been downloaded
## Note that this is the function if a service account IS available. 
def check_google():
    """List Drive files newer than the 2020-09-11 baseline that are not yet on disk.

    Service-account variant: credentials are read from ``credentials.json``
    so no interactive login is needed.

    Returns
    -------
    pandas.DataFrame
        Columns ``createdDate``, ``id``, ``title`` and ``date`` (the parsed,
        UTC-aware ``createdDate``) for every file in the shared folder that
        was created after the baseline timestamp and whose title is not
        already present in ``data/reports/``.  Empty when nothing is new.
    """
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    from pydrive2.auth import ServiceAccountCredentials

    gauth = GoogleAuth()
    scope = ['https://www.googleapis.com/auth/drive']
    gauth.credentials = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
    drive = GoogleDrive(gauth)
    folder_id = '1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF'
    file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % folder_id}).GetList()

    # reindex (rather than plain column selection) keeps an empty listing from
    # raising KeyError.
    dfclean = pandas.DataFrame(file_list).reindex(columns=['createdDate', 'id', 'title'])
    # createdDate is a full ISO-8601 timestamp; forcing format='%Y-%m-%d'
    # coerces every value to NaT on recent pandas, so let pandas parse it.
    dfclean['date'] = pandas.to_datetime(dfclean['createdDate'], utc=True, errors='coerce')

    # Baseline: creation time of the last dump handled before this script.
    # Using the constant avoids an IndexError if that file leaves the folder.
    lastupdate = pandas.Timestamp('2020-09-11T01:53:29.639Z')
    dfnew = dfclean.loc[dfclean['date'] > lastupdate]

    # NOTE(review): the downloader and parser cells use a different directory
    # than 'data/reports/' -- confirm which one holds the downloaded dumps.
    local_dir = 'data/reports/'
    downloaded = set(os.listdir(local_dir)) if os.path.isdir(local_dir) else set()
    # Keep only Drive files that have no local copy yet.
    reportdf = dfnew.loc[~dfnew['title'].isin(downloaded)]
    return reportdf

         kind                                 id                  etag  \
0  drive#file  1EyccnGO2UOun668cv4ddE6kg4jggm31J  "MTYwMDQ0OTE5MTAwMA"   
1  drive#file  1fxpNQRGhg8UNBIzSEBDKD0m-DjSB-1fE  "MTYwMDM5NDk3MDAwMA"   

                                            selfLink  \
0  https://www.googleapis.com/drive/v2/files/1Eyc...   
1  https://www.googleapis.com/drive/v2/files/1fxp...   

                                      webContentLink  \
0  https://drive.google.com/uc?id=1EyccnGO2UOun66...   
1  https://drive.google.com/uc?id=1fxpNQRGhg8UNBI...   

                                       alternateLink  \
0  https://drive.google.com/file/d/1EyccnGO2UOun6...   
1  https://drive.google.com/file/d/1fxpNQRGhg8UNB...   

                                           embedLink  \
0  https://drive.google.com/file/d/1EyccnGO2UOun6...   
1  https://drive.google.com/file/d/1fxpNQRGhg8UNB...   

                                            iconLink  \
0  https://drive-thirdparty.googleusercontent

In [None]:
## This is the function to actually conduct the download
def download_dumps(dumpdf, dest_dir='data/tables/'):
    """Download every file listed in ``dumpdf`` from Google Drive.

    Parameters
    ----------
    dumpdf : pandas.DataFrame
        Must provide a ``title`` column (used as the local file name) and an
        ``id`` column (the Drive file id), one row per file to fetch.
    dest_dir : str, optional
        Directory the files are written to.  Defaults to ``data/tables/``,
        the directory ``update_filelist`` reads from; the previous hard-coded
        ``data/table/`` meant downloaded dumps were never picked up.

    Returns
    -------
    None
    """
    from google_drive_downloader import GoogleDriveDownloader as gdd
    for title, eachid in zip(dumpdf['title'], dumpdf['id']):
        gdd.download_file_from_google_drive(file_id=eachid,
                                            dest_path=os.path.join(dest_dir, title),
                                            unzip=False)

## Parse the data dump

Note that this code still needs a downloader to pull data from either github or google drive

In [5]:
def fix_keywords(keywordstring):
    """Turn a serialized keyword list such as ``'["a", "b"]'`` into a list of strings.

    Parameters
    ----------
    keywordstring : str, float('nan') or None
        Bracketed, comma-separated keywords, optionally double-quoted.

    Returns
    -------
    list of str
        The individual keywords with surrounding whitespace removed.  An
        empty list is returned for NaN, ``None``, ``""`` and for input that
        contains no keywords (e.g. ``'[]'``).
    """
    # NaN is the only value that compares unequal to itself.
    if keywordstring is None or keywordstring != keywordstring or keywordstring == "":
        return []
    unwrapped = keywordstring.lstrip('[').rstrip(']').replace('"', '')
    # Strip each entry so '"a", "b"' does not yield ' b', and drop blanks so
    # '[]' does not yield [''].
    candidates = (entry.strip() for entry in unwrapped.split(','))
    return [entry for entry in candidates if entry]

In [12]:
def generate_lst_dump(datadmp):
    """Convert the COVID-19 LST table into a list of annotation records.

    Parameters
    ----------
    datadmp : pandas.DataFrame
        Must contain the columns ``PMID``, ``Topics``, ``Methodology`` and
        ``LevelOfEvidence``.  An ``_id`` column (``'pmid'`` + PMID as text) is
        added to this frame in place.

    Returns
    -------
    list of dict
        One record per row with ``_id``, ``keywords`` (parsed from ``Topics``
        by ``fix_keywords``) and a ``covid19LST`` ``Rating`` object.  All
        records share the same ``author`` dict from ``generate_curator``.
        Missing ratings/explanations are emitted as ``None`` and numpy
        scalars are converted to plain Python values so the result can be
        serialised as valid JSON.
    """
    def _json_safe(value):
        # NaN would be written as the invalid JSON token `NaN`; numpy scalars
        # are rejected by the json module altogether.
        if pandas.isna(value):
            return None
        return value.item() if hasattr(value, 'item') else value

    authorObject = generate_curator()
    # NOTE(review): if PMID was read as float (e.g. because of blanks) the ids
    # come out as 'pmid123.0' -- confirm the column is always integer/text.
    datadmp['_id'] = 'pmid'+datadmp['PMID'].astype(str)

    cleandata = []
    # Iterate the columns in lockstep instead of doing a positional row
    # lookup for every field.
    for record_id, topics, methodology, level in zip(datadmp['_id'], datadmp['Topics'],
                                                     datadmp['Methodology'],
                                                     datadmp['LevelOfEvidence']):
        cleandata.append({'_id': record_id,
                          'keywords': fix_keywords(topics),
                          'covid19LST': {'@type': 'Rating',
                                         'ratingExplanation': _json_safe(methodology),
                                         'ratingValue': _json_safe(level),
                                         'reviewAspect': 'Oxford 2011 Levels of Evidence',
                                         'author': authorObject}})
    return cleandata

In [7]:
def update_filelist():
    """Load the initial LST dump plus every update CSV and keep the newest row per PMID.

    Reads ``data/tables/covid19LST_1st_dump.csv`` and every other ``.csv``
    file in ``data/tables/``, restricted to the columns ``PMID``, ``Topics``,
    ``LevelOfEvidence``, ``Methodology`` and ``Updated Date``.

    Returns
    -------
    pandas.DataFrame
        All rows combined, sorted by ``Updated Date`` descending, with only
        the first (most recently updated) row kept for each ``PMID``.
    """
    table_dir = 'data/tables/'
    initial_name = 'covid19LST_1st_dump.csv'
    wanted_columns = ['PMID', 'Topics', 'LevelOfEvidence', 'Methodology', 'Updated Date']

    # list.remove() returns None, so the former `updatefiles = all_files.remove(...)`
    # meant update files were never read; build the list explicitly instead.
    # Non-CSV entries (checkpoints, OS metadata files) are skipped.
    update_names = sorted(name for name in os.listdir(table_dir)
                          if name != initial_name and name.lower().endswith('.csv'))

    frames = [read_csv(table_dir + name, header=0, usecols=wanted_columns)
              for name in [initial_name, *update_names]]
    df = pandas.concat(frames, ignore_index=True)

    # NOTE(review): 'Updated Date' is compared exactly as read_csv returns it;
    # this is chronological only if the column is ISO-formatted -- confirm.
    # A stable sort keeps the result deterministic when dates tie.
    df = df.sort_values('Updated Date', ascending=False, kind='mergesort')
    df = df.drop_duplicates(subset='PMID', keep='first')
    return df

In [16]:
## Run an update

def run_loe_update():
    """Rebuild the level-of-evidence annotations and write them to disk as JSON.

    Combines the tables via ``update_filelist``, converts them with
    ``generate_lst_dump`` and writes the result, indented by four spaces, to
    ``results/update_dumps/lst_loe_annotations.json``.
    """
    # NOTE(review): the citedBy dump is written under 'results/update dumps/'
    # (with a space) -- confirm which directory name is intended.
    output_path = 'results/update_dumps/lst_loe_annotations.json'
    annotations = generate_lst_dump(update_filelist())
    with open(output_path, 'w', encoding='utf-8') as outfile:
        serialized = json.dumps(annotations, indent=4)
        outfile.write(serialized)
       

In [17]:
# Run the full update: merge the CSV tables and write the annotations JSON.
run_loe_update()