In [1]:
import os
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from datetime import datetime
import re

pd.set_option("display.max_rows", 1000)


sns.set(style="ticks", color_codes=True)
sns.set(rc={'figure.figsize':(30,15)})
plt.figure(figsize=(16, 6))

<Figure size 1600x600 with 0 Axes>

In [2]:
pathBase = "./data/CORD-19-research-challenge"

# Data

## Load Metadata-File

In [3]:
dirs = os.listdir(pathBase)

print(dirs)

['.DS_Store', 'custom_license', 'metadata.readme', 'json_schema.txt', 'noncomm_use_subset', 'metadata.csv', 'biorxiv_medrxiv', 'COVID.DATA.LIC.AGMT.pdf', 'comm_use_subset']


In [4]:
rawMeta = pd.read_csv(pathBase + '/' + dirs[5])

#rawMeta.info()
#rawMeta.loc[rawMeta.publish_time.isnull()]

In [5]:
vals = rawMeta["publish_time"].unique()

vals



array(['1972-12-31', '1980-03-31', '1973-08-31', ..., '2009 Aug 23',
       '2015-10-10', '2019 Jan 10 Mar-Apr'], dtype=object)

In [18]:
x = ["2018 Feb 31",
     "2018 Jun 31",
    "2019 Feb 30",
    "2014 Apr 31",
    "2015 Feb 29",
    "2017 Apr 31",
    "2019 Feb 30",
    "2011 Sep 31",
    "2014 Feb 30",
    "2020 Feb 30",
    "2010 Apr 31"
]
nameNot = re.compile("^(\d\d\d\d)\s(\w\w\w)\s(\d*)")

for s in x:
    if(nameNot.match(s)):
        tmp = datetime.strptime(s, "%Y %b %d")
        print("Got DateTime: " + str(tmp))
    else: 
        print("no match...")



ValueError: day is out of range for month

In [31]:
# Parse Published Columns to dates

dates = []

for date in rawMeta["publish_time"].values:
    date = str(date) # Convert datee to string to avoid exceptions 
    tmp = None
    
    backup = date
    
    dashNot = re.compile("^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$")
    nameNot = re.compile("^(\d\d\d\d)\s(\w\w\w)\s(\d*)$")
    
    yearmoNot = re.compile("^(\d\d\d\d)\s(\w\w\w)$")
    
    testi = re.compile("^(\w\w\w)[-](\w\w\w)")
    
    
    if(date == "nan"):
        tmp = "Unknown"

    # Match cases with dash notation e.g.: 1996-03-27
    elif(dashNot.match(date)):
        tmp = datetime.strptime(date, "%Y-%m-%d")

    # Match cases with name notation e.g.: 2018 Jun 31
    elif(nameNot.match(date)):
        if("-" in date):
            date = date[:11].rstrip()

        try:
            tmp = datetime.strptime(date, "%Y %b %d")
        except ValueError:
            tmp = datetime.strptime((date[:8] + " 01"), "%Y %b %d")

    # Match cases with Year + Monthshorthand e.g.: 2007 May
    elif(yearmoNot.match(date)):

        if("-" in date):
            # ===== IS THIS REASONABLE? =====
            #print("[yearmoNot] Error-Entry: " + str(date))
            # Fix the error-dates such as "2006 Jun-Dec" => "2006 Dec"
            frmt = date.split("-")
            date = frmt[0][:5] + frmt[1]
        tmp = datetime.strptime(date, "%Y %b")

    # Match Year only
    elif(len(date) == 4):
        tmp = datetime.strptime(date, "%Y")

    # Some dates come in "string-array"-notation:
    #['2020-02-05', '2020-02'], ['2020-02-04', '2020-02'], ['2020-02-04', '2020-02'], ['2019-09-11', '2020']
    elif("[]" in date):
        date = date[2:12]
        tmp = datetime.strptime(date, "%Y-%m-%d")

    dates.append(tmp)

    
print("\nConversion Done!")
    
extract = {"Reference-ID": rawMeta["doi"].values,
           "Title": rawMeta["title"].values,
           "Authors": rawMeta["authors"].values,
           "Abstract": rawMeta["abstract"].values,
           "Published": rawMeta["publish_time"].values,
           "Has_Fulltext": rawMeta["has_full_text"].values,
           "Directory": rawMeta["full_text_file"].values
          }

ValueError occured in NameNotation
Fixed Days: 2018 Feb 01
ValueError occured in NameNotation
Fixed Days: 2018 Jun 01
ValueError occured in NameNotation
Fixed Days: 2019 Feb 01
ValueError occured in NameNotation
Fixed Days: 2014 Apr 01
ValueError occured in NameNotation
Fixed Days: 2015 Feb 01
ValueError occured in NameNotation
Fixed Days: 2017 Apr 01
ValueError occured in NameNotation
Fixed Days: 2019 Feb 01
ValueError occured in NameNotation
Fixed Days: 2011 Sep 01
ValueError occured in NameNotation
Fixed Days: 2014 Feb 01
ValueError occured in NameNotation
Fixed Days: 2020 Feb 01
ValueError occured in NameNotation
Fixed Days: 2010 Apr 01

Conversion Done!


In [32]:
meta = pd.DataFrame(extract)

meta.info()

meta.Published.fillna("Unknown", inplace=True)
meta.Directory.fillna("Fulltext N/A", inplace=True)

meta.sort_values("Published", ascending=False).head(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 7 columns):
Reference-ID    40750 non-null object
Title           43996 non-null object
Authors         41074 non-null object
Abstract        35806 non-null object
Published       34197 non-null object
Has_Fulltext    44220 non-null bool
Directory       32829 non-null object
dtypes: bool(1), object(6)
memory usage: 2.1+ MB


Unnamed: 0,Reference-ID,Title,Authors,Abstract,Published,Has_Fulltext,Directory
39449,10.3390/ijerph17031002,Does Wuhan Need to be in Lockdown during the C...,"Wu, Jing; Gamber, Michelle; Sun, Wenjie",,"['2020-02-05', '2020-02']",False,Fulltext N/A
39450,10.3390/ijerph17030946,Public Opinion Polarization by Individual Reve...,"Chen, Tinggui; Li, Qianqian; Fu, Peihua; Yang,...",Social conflicts occur frequently during the s...,"['2020-02-04', '2020-02']",False,Fulltext N/A
39459,10.3390/ijerph17030949,Turnover Intention among Field Epidemiologists...,"Ryu, Sukhyun",The purpose of this study was to explore the l...,"['2020-02-04', '2020-02']",False,Fulltext N/A
39461,10.1007/s13365-019-00800-8,Enterovirus A71 capsid protein VP1 increases b...,"Wang, Wenjing; Sun, Jiandong; Wang, Nan; Sun, ...",Enterovirus A71 (EV-A71) is the major cause of...,"['2019-09-11', '2020']",False,Fulltext N/A
36334,,Understanding Helicases as a Means of Virus Co...,"Frick, D. N.; Lam, A. M. I.",Helicases are promising antiviral drug targets...,Unknown,False,Fulltext N/A
36344,10.1128/JVI.80.1.218-225.2006,ICP0 Prevents RNase L-Independent rRNA Cleavag...,"Sobol, Paul T.; Mossman, Karen L.",The classical interferon (IFN)-dependent antiv...,Unknown,False,Fulltext N/A
36343,10.1128/JVI.80.1.262-269.2006,Identification and Characterization of a Penae...,"Assavalapsakul, Wanchai; Smith, Duncan R.; Pan...","The yellow head virus is a positive-sense, sin...",Unknown,False,Fulltext N/A
36342,10.1128/JVI.80.2.941-950.2006,Monoclonal Antibodies Targeting the HR2 Domain...,"Lip, Kuo-Ming; Shen, Shuo; Yang, Xiaoming; Ken...",We have previously shown that an Escherichia c...,Unknown,False,Fulltext N/A
36341,10.1128/JVI.80.2.682-688.2006,Live-Cell Characterization and Analysis of a C...,"Santangelo, Philip; Nitin, Nitin; LaConte, Les...",Understanding viral pathogenesis is critical f...,Unknown,False,Fulltext N/A
36340,10.1128/JVI.80.2.689-696.2006,Defining the Cellular Target(s) of Porcine Rep...,"Kim, Jeong-Ki; Fahad, Al-Majhdi; Shanmukhappa,...",We produced a monoclonal antibody (MAb) (7G10)...,Unknown,False,Fulltext N/A
