In [130]:
import os
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from datetime import datetime
import re

# Show up to 1000 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 1000)

# Configure seaborn in a single call: every sns.set(...) call re-applies the
# default style for arguments that are not passed, so splitting the settings
# over two calls would silently drop style="ticks".
# No figure is created in this setup cell; an empty plt.figure() here would
# only render as a blank "0 Axes" output.
sns.set(style="ticks", color_codes=True, rc={'figure.figsize': (30, 15)})

<Figure size 1152x432 with 0 Axes>

<Figure size 1152x432 with 0 Axes>

In [131]:
# Directory holding the CORD-19 research challenge files, given relative to
# the current working directory.
pathBase = "./data/CORD-19-research-challenge"

# Data

## Load the metadata file

In [132]:
# List the entries of the dataset directory to see what is available.
# Note: os.listdir() returns the names in arbitrary order, so positions in
# this list are not stable across machines or file systems.
dirs = os.listdir(pathBase)

print(dirs)

['.DS_Store', 'custom_license', 'metadata.readme', 'json_schema.txt', 'noncomm_use_subset', 'metadata.csv', 'biorxiv_medrxiv', 'COVID.DATA.LIC.AGMT.pdf', 'comm_use_subset']


In [133]:
# Load the metadata table by file name. Indexing the directory listing
# (dirs[5]) is unreliable because os.listdir() returns entries in arbitrary
# order, so the same index can point at a different file elsewhere.
rawMeta = pd.read_csv(os.path.join(pathBase, "metadata.csv"))

In [136]:
# Parse the "publish_time" column into datetime objects.
#
# Result convention (relied upon by the following cells):
#   * missing value (NaN)      -> the string "Unknown"
#   * unrecognised/invalid text -> None
#   * everything else           -> datetime.datetime

# Numeric notation, e.g. 1996-03-27 ("-", "/", "." or " " as separator).
dashNot = re.compile(r"^((?:19|20)\d\d)[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$")
# Name notation, e.g. 2018 Jun 30; anything after the day is ignored.
nameNot = re.compile(r"^(\d{4})\s(\w{3})\s(\d+)(?:\s.*)?$")
# Year + month shorthand, e.g. 2007 May, or a month range such as 2006 Jun-Dec.
yearmoNot = re.compile(r"^(\d{4})\s(\w{3})(?:-(\w{3}))?$")
# Year only, e.g. 2003.
yearNot = re.compile(r"^\d{4}$")
# First full numeric date inside "string-array" notation,
# e.g. ['2020-02-05', '2020-02'].
listNot = re.compile(r"(\d{4})-(\d{2})-(\d{2})")


def _withDay(monthStart, day):
    """Return ``monthStart`` with its day set to ``day``.

    Invalid days (such as Feb 31) fall back to ``monthStart`` unchanged,
    i.e. the first day of that month.
    """
    try:
        return monthStart.replace(day=int(day))
    except ValueError:
        return monthStart


def _parsePublishTime(value):
    """Convert one raw ``publish_time`` value.

    Parameters
    ----------
    value : object
        Raw cell value; it is converted with ``str()`` before matching.

    Returns
    -------
    datetime.datetime, str or None
        A datetime when the text is recognised, ``"Unknown"`` for missing
        values (NaN), and ``None`` when the text cannot be interpreted.
    """
    text = str(value).strip()  # str() so that NaN floats can be handled too

    if text == "nan":
        return "Unknown"

    try:
        found = dashNot.match(text)
        if found:
            year, month, day = found.groups()
            # Built from the captured groups so every accepted separator works.
            return _withDay(datetime(int(year), int(month), 1), day)

        found = nameNot.match(text)
        if found:
            year, month, day = found.groups()
            return _withDay(datetime.strptime(year + " " + month, "%Y %b"), day)

        found = yearmoNot.match(text)
        if found:
            year, startMonth, endMonth = found.groups()
            # A range such as "2006 Jun-Dec" is dated by its last month
            # ("2006 Dec"), as the original cell intended.
            return datetime.strptime(year + " " + (endMonth or startMonth), "%Y %b")

        if yearNot.match(text):
            return datetime.strptime(text, "%Y")

        if text.startswith("["):
            found = listNot.search(text)
            if found:
                return datetime(*(int(part) for part in found.groups()))
    except ValueError:
        # Text that looks like a date but is not one (e.g. an unknown month
        # abbreviation) is treated like any other unrecognised value.
        return None

    return None


dates = [_parsePublishTime(value) for value in rawMeta["publish_time"].values]

print("DateTime-Conversion Done!")
   
# Output column name -> source column in rawMeta. A source of None marks the
# "Published" column, which is filled from the parsed `dates` list instead.
_extractColumns = [
    ("Reference-ID", "doi"),
    ("Title", "title"),
    ("Authors", "authors"),
    ("Abstract", "abstract"),
    ("Published", None),
    ("Has_Fulltext", "has_full_text"),
    ("Directory", "full_text_file"),
]

extract = {
    label: (dates if source is None else rawMeta[source].values)
    for label, source in _extractColumns
}

DateTime-Conversion Done!


In [137]:
# Build the working table from the extracted columns.
meta = pd.DataFrame(extract)

# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on a column selected from the frame acts on an
# intermediate object and is not guaranteed to update `meta` itself.
meta["Directory"] = meta["Directory"].fillna("n/a")
# Dates that could not be parsed (None) get the sentinel 0001-01-01; rows
# whose date is the "Unknown" placeholder are kept as they are.
meta["Published"] = meta["Published"].fillna(datetime(1, 1, 1))

meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 7 columns):
Reference-ID    40750 non-null object
Title           43996 non-null object
Authors         41074 non-null object
Abstract        35806 non-null object
Published       44220 non-null object
Has_Fulltext    44220 non-null bool
Directory       44220 non-null object
dtypes: bool(1), object(6)
memory usage: 2.1+ MB


In [153]:
# Display the rows whose publication date is the "Unknown" placeholder.
# Alternative check, rows without a reference id:
#   meta.loc[(meta["Reference-ID"].isna())]
isUnknownDate = meta["Published"] == "Unknown"
meta.loc[isUnknownDate]

Unnamed: 0,Reference-ID,Title,Authors,Abstract,Published,Has_Fulltext,Directory
8553,,,,,Unknown,False,custom_license
8995,,,,,Unknown,False,custom_license
8996,,,,,Unknown,False,custom_license
8998,,,,,Unknown,False,custom_license
8999,,,,,Unknown,False,custom_license
...,...,...,...,...,...,...,...
42227,10.7554/eLife.31257,MERS-CoV spillover at the camel-human interface,"Dudas, Gytis; Carvalho, Luiz Max; Rambaut, And...",Middle East respiratory syndrome coronavirus (...,Unknown,True,comm_use_subset
42935,10.7554/eLife.44345,ADAM17-dependent signaling is required for onc...,"Mikuličić, Snježana; Finke, Jérôme; Boukhallou...",Oncogenic human papillomaviruses (HPV) are sma...,Unknown,True,comm_use_subset
43413,10.7554/eLife.31662,Functional role of the type 1 pilus rod struct...,"Spaulding, Caitlin N; Schreiber, Henry Louis; ...","Uropathogenic E. coli (UPEC), which cause urin...",Unknown,True,comm_use_subset
43550,10.7189/jogh.05.020418,Etiology of community acquired pneumonia among...,"Mathew, Joseph L.; Singhi, Sunit; Ray, Pallab;...",BACKGROUND: Childhood community acquired pneum...,Unknown,True,comm_use_subset
