In [1]:
import os
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from datetime import datetime
import re

pd.set_option("display.max_rows", 1000)


sns.set(style="ticks", color_codes=True)
sns.set(rc={'figure.figsize':(30,15)})
plt.figure(figsize=(16, 6))

<Figure size 1600x600 with 0 Axes>

In [2]:
pathBase = "./data/CORD-19-research-challenge"

# Data

## Load Metadata-File

In [3]:
dirs = os.listdir(pathBase)

print(dirs)

['.DS_Store', 'custom_license', 'metadata.readme', 'json_schema.txt', 'noncomm_use_subset', 'metadata.csv', 'biorxiv_medrxiv', 'COVID.DATA.LIC.AGMT.pdf', 'comm_use_subset']


In [4]:
rawMeta = pd.read_csv(pathBase + '/' + dirs[5])

In [5]:
# Parse Published Columns to dates

dates = []

dashNot = re.compile("^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$")
nameNot = re.compile("^(\d\d\d\d)\s(\w\w\w)\s(\d*)$")
yearmoNot = re.compile("^(\d\d\d\d)\s(\w\w\w)$")

for date in rawMeta["publish_time"].values:
    date = str(date) # Convert datee to string to avoid exceptions 
    tmp = None
    
    if(date == "nan"):
        tmp = "Unknown"

    # Match cases with dash notation e.g.: 1996-03-27
    elif(dashNot.match(date)):
        tmp = datetime.strptime(date, "%Y-%m-%d")

    # Match cases with name notation e.g.: 2018 Jun 31
    elif(nameNot.match(date)):
        if("-" in date):
            date = date[:11].rstrip()
            
        # Try/Except is being used for invalid dates such as Feb 31
        try:
            tmp = datetime.strptime(date, "%Y %b %d")
        except ValueError:
            # Removes the Day and simply adds 01 as the day
            tmp = datetime.strptime((date[:8] + " 01"), "%Y %b %d")

    # Match cases with Year + Monthshorthand e.g.: 2007 May
    elif(yearmoNot.match(date)):

        if("-" in date):
            # ===== IS THIS REASONABLE? =====
            # Fix the error-dates such as "2006 Jun-Dec" => "2006 Dec"
            frmt = date.split("-")
            date = frmt[0][:5] + frmt[1]
        tmp = datetime.strptime(date, "%Y %b")

    # Match Year only
    elif(len(date) == 4):
        tmp = datetime.strptime(date, "%Y")

    # Some dates come in "string-array"-notation:
    #['2020-02-05', '2020-02'], ['2020-02-04', '2020-02'], ['2020-02-04', '2020-02'], ['2019-09-11', '2020']
    elif("[]" in date):
        date = date[2:12]
        tmp = datetime.strptime(date, "%Y-%m-%d")

    dates.append(tmp)

    
print("DateTime-Conversion Done!")
   
extract = {"Reference-ID": rawMeta["doi"].values,
           "Title": rawMeta["title"].values,
           "Authors": rawMeta["authors"].values,
           "Abstract": rawMeta["abstract"].values,
           "Published": dates,
           "Has_Fulltext": rawMeta["has_full_text"].values,
           "Directory": rawMeta["full_text_file"].values
          }

DateTime-Conversion Done!


In [6]:
meta = pd.DataFrame(extract)

meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 7 columns):
Reference-ID    40750 non-null object
Title           43996 non-null object
Authors         41074 non-null object
Abstract        35806 non-null object
Published       43981 non-null object
Has_Fulltext    44220 non-null bool
Directory       32829 non-null object
dtypes: bool(1), object(6)
memory usage: 2.1+ MB


In [7]:
meta.Directory.fillna("n/a", inplace=True)
meta.Published.fillna(pd.NaT, inplace=True)

# Drop all entries without Refernce ID, Title and publishing date;
# Since these are virtually worth nothing (All properties = NaN)
drop = meta.loc[(meta["Reference-ID"].isna()) & (meta["Title"].isna()) & (meta.Published == "Unknown")].index
meta.drop(drop, inplace=True)

meta.loc[(meta.Published == "Unknown")] = pd.NaT

In [15]:
meta.sort_values(by="Published")

Unnamed: 0,Reference-ID,Title,Authors,Abstract,Published,Has_Fulltext,Directory
26672,,ACUTE HEPATITIS ASSOCIATED WITH MOUSE LEUKEMIA...,"Nelson, John B.",Observations on the behavior of MHV (Pr) in th...,1955-10-31 00:00:00,False,noncomm_use_subset
26673,,THE ENHANCING EFFECT OF MURINE HEPATITIS VIRUS...,"Nelson, John B.",Pleuropneumoma-like organisms (PPLO) of the ca...,1957-08-01 00:00:00,True,noncomm_use_subset
28224,,Enhancement of the Pathogenicity of Mouse Hepa...,"Gledhill, A. W.",,1961-09-01 00:00:00,True,custom_license
26674,,THE CELLULAR NATURE OF GENETIC SUSCEPTIBILITY ...,"Kantoch, M.; Warwick, A.; Bang, F. B.",Using peritoneal macrophage cultures it was fo...,1963-05-01 00:00:00,True,noncomm_use_subset
26616,,AN ELECTRON MICROSCOPE STUDY OF THE DEVELOPMEN...,"David-Ferreira, J. F.; Manaker, R. A.",Samples taken at different intervals of time f...,1965-01-01 00:00:00,True,noncomm_use_subset
...,...,...,...,...,...,...,...
43550,NaT,NaT,NaT,NaT,NaT,NaT,NaT
43557,NaT,NaT,NaT,NaT,NaT,NaT,NaT
43671,10.1111/jvim.15548,Plasma and tissue angiotensin‐converting enzym...,"Larouche‐Lebel, Éva; Loughran, Kerry A.; Oyama...",BACKGROUND: Angiotensin‐converting enzyme 2 (A...,NaT,True,noncomm_use_subset
43950,10.1111/jvim.15481,Efficacy of an orally administered anti‐diarrh...,"Nixon, Sophie L.; Rose, Lindsay; Muller, Annik...",BACKGROUND: Acute diarrhea is a common clinica...,NaT,True,noncomm_use_subset


In [19]:
#meta.loc[(meta["Reference-ID"] == pd.NaT)]
#meta.iloc[43550]

Reference-ID                           10.1016/j.bmcl.2015.05.039
Title           Design, synthesis and evaluation of a series o...
Authors         Peters, Hannah L.; Jochmans, Dirk; de Wilde, A...
Abstract        Abstract A series of doubly flexible nucleosid...
Published                                     2015-08-01 00:00:00
Has_Fulltext                                                 True
Directory                                          custom_license
Name: 43620, dtype: object

## Analyze Titles

In [25]:
titles = meta.Title.values

words = titles[0].split(" ")

words
#for t in titles:
    

['Intrauterine',
 'virus',
 'infections',
 'and',
 'congenital',
 'heart',
 'disease']