In [1]:
import os
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from datetime import datetime
import re

pd.set_option("display.max_rows", 1000)


sns.set(style="ticks", color_codes=True)
sns.set(rc={'figure.figsize':(30,15)})
plt.figure(figsize=(16, 6))

<Figure size 1600x600 with 0 Axes>

In [2]:
pathBase = "./data/CORD-19-research-challenge"

# Data

## Load Metadata-File

In [3]:
dirs = os.listdir(pathBase)

print(dirs)

['.DS_Store', 'custom_license', 'metadata.readme', 'json_schema.txt', 'noncomm_use_subset', 'metadata.csv', 'biorxiv_medrxiv', 'COVID.DATA.LIC.AGMT.pdf', 'comm_use_subset']


In [4]:
rawMeta = pd.read_csv(pathBase + '/' + dirs[5])

In [5]:
# Parse Published Columns to dates

dates = []

dashNot = re.compile("^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$")
nameNot = re.compile("^(\d\d\d\d)\s(\w\w\w)\s(\d*)$")
yearmoNot = re.compile("^(\d\d\d\d)\s(\w\w\w)$")

for date in rawMeta["publish_time"].values:
    date = str(date) # Convert datee to string to avoid exceptions 
    tmp = None
    
    if(date == "nan"):
        tmp = "Unknown"

    # Match cases with dash notation e.g.: 1996-03-27
    elif(dashNot.match(date)):
        tmp = datetime.strptime(date, "%Y-%m-%d")

    # Match cases with name notation e.g.: 2018 Jun 31
    elif(nameNot.match(date)):
        if("-" in date):
            date = date[:11].rstrip()
            
        # Try/Except is being used for invalid dates such as Feb 31
        try:
            tmp = datetime.strptime(date, "%Y %b %d")
        except ValueError:
            # Removes the Day and simply adds 01 as the day
            tmp = datetime.strptime((date[:8] + " 01"), "%Y %b %d")

    # Match cases with Year + Monthshorthand e.g.: 2007 May
    elif(yearmoNot.match(date)):

        if("-" in date):
            # ===== IS THIS REASONABLE? =====
            # Fix the error-dates such as "2006 Jun-Dec" => "2006 Dec"
            frmt = date.split("-")
            date = frmt[0][:5] + frmt[1]
        tmp = datetime.strptime(date, "%Y %b")

    # Match Year only
    elif(len(date) == 4):
        tmp = datetime.strptime(date, "%Y")

    # Some dates come in "string-array"-notation:
    #['2020-02-05', '2020-02'], ['2020-02-04', '2020-02'], ['2020-02-04', '2020-02'], ['2019-09-11', '2020']
    elif("[]" in date):
        date = date[2:12]
        tmp = datetime.strptime(date, "%Y-%m-%d")

    dates.append(tmp)

    
print("DateTime-Conversion Done!")
   
extract = {"Reference-ID": rawMeta["doi"].values,
           "Title": rawMeta["title"].values,
           "Authors": rawMeta["authors"].values,
           "Abstract": rawMeta["abstract"].values,
           "Published": dates,
           "Has_Fulltext": rawMeta["has_full_text"].values,
           "Directory": rawMeta["full_text_file"].values
          }

DateTime-Conversion Done!


In [6]:
meta = pd.DataFrame(extract)

meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 7 columns):
Reference-ID    40750 non-null object
Title           43996 non-null object
Authors         41074 non-null object
Abstract        35806 non-null object
Published       43981 non-null object
Has_Fulltext    44220 non-null bool
Directory       32829 non-null object
dtypes: bool(1), object(6)
memory usage: 2.1+ MB


In [7]:
meta.Directory.fillna("n/a", inplace=True)
meta.Published.fillna(pd.NaT, inplace=True)

# Drop all entries without Refernce ID, Title and publishing date;
# Since these are virtually worth nothing (All properties = NaN)
drop = meta.loc[(meta["Reference-ID"].isna()) & (meta["Title"].isna()) & (meta.Published == "Unknown")].index
meta.drop(drop, inplace=True)

meta.loc[(meta.Published == "Unknown")] = pd.NaT

In [26]:
meta.sort_values(by="Published", ascending=False)

Unnamed: 0,Reference-ID,Title,Authors,Abstract,Published,Has_Fulltext,Directory
3123,10.1016/B978-0-12-814966-9.00003-2,Chapter 3 Infectious Bronchitis Virus in Poult...,"Ennaji, Youssef; Khataby, Khadija; Ennaji, Mou...",Abstract Infectious bronchitis virus (IBV) is ...,2020-12-31 00:00:00,True,custom_license
3162,10.1016/B978-0-12-816331-3.00006-4,Chapter 6 Virus population dynamics examined w...,"Domingo, Esteban",Abstract Experimental evolution permits explor...,2020-12-31 00:00:00,True,custom_license
3066,10.1016/B978-0-12-811924-2.00019-5,Chapter 19 Current and New Approaches for Muco...,"Rhee, Joon Haeng",Abstract Mucosal surfaces are the interface be...,2020-12-31 00:00:00,True,custom_license
17773,10.1016/bs.ircmb.2019.10.004,Chapter Three Type I interferons and endoplasm...,"Sprooten, Jenny; Garg, Abhishek D.",Abstract Type I interferons (IFNs) comprise of...,2020-12-31 00:00:00,True,custom_license
17592,10.1016/bs.pmbts.2020.01.001,Chapter Seven Computer simulations of protein–...,"Loschwitz, Jennifer; Olubiyi, Olujide O.; Hub,...",Abstract The interactions between proteins and...,2020-12-31 00:00:00,True,custom_license
...,...,...,...,...,...,...,...
43550,NaT,NaT,NaT,NaT,NaT,NaT,NaT
43557,NaT,NaT,NaT,NaT,NaT,NaT,NaT
43671,10.1111/jvim.15548,Plasma and tissue angiotensin‐converting enzym...,"Larouche‐Lebel, Éva; Loughran, Kerry A.; Oyama...",BACKGROUND: Angiotensin‐converting enzyme 2 (A...,NaT,True,noncomm_use_subset
43950,10.1111/jvim.15481,Efficacy of an orally administered anti‐diarrh...,"Nixon, Sophie L.; Rose, Lindsay; Muller, Annik...",BACKGROUND: Acute diarrhea is a common clinica...,NaT,True,noncomm_use_subset


In [61]:
x = meta.iloc[3123]
tit = x.Title
abst = x.Abstract

print("Title: \n\n" + tit)
print("\n")
print("Abstract: \n\n" + abst)
print("\n")

splt = str.split(abst, " ")
print(str(len(splt)) + " Words in Abstract.")

Title: 

Chapter 3 Infectious Bronchitis Virus in Poultry: Molecular Epidemiology and Factors Leading to the Emergence and Reemergence of Novel Strains of Infectious Bronchitis Virus


Abstract: 

Abstract Infectious bronchitis virus (IBV) is a coronavirus that causes an acute and highly contagious disease in chickens. The virus can cause substantial economic losses throughout the poultry industry worldwide. It can affect the upper respiratory tract and the reproductive tract, and some strains can cause nephritis. The causative agent IBV is an RNA virus with great ability for mutation and recombination, thus capable of generating new virus strains that are difficult to control. There are many IBV strains found worldwide, including the Massachusetts, 4/91, D274, and QX-like strains that can be grouped under the classic or variant serotypes. In addition, new types of the virus continue to arise due to mutations and recombination events in the viral genome and even more factors, making th

## Analyze Titles

In [25]:
titles = meta.Title.values

words = titles[0].split(" ")

words
#for t in titles:
    

['Intrauterine',
 'virus',
 'infections',
 'and',
 'congenital',
 'heart',
 'disease']

# Use BioBERT for Stuff

## Initialize Tensorflow and BERT-Model

In [144]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.models import Model
import tensorflow.keras as keras
#import bert

print("Currently running Tensorflow v" + tf.version.VERSION)

Currently running Tensorflow v2.1.0


In [145]:
bertPath = "./biobert/model/"

In [146]:
max_seq_length = 128  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# model is now a Keras BERT-Model
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

print("TF-BERT Model ready to use!")
model.summary()

TF-BERT Model ready to use!
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_4 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                

## Import BioBERT-Weights (TODO)

* Informations about checkpoint files:
 * The .meta-File is a protocol buffer which saves the complete Tensorflow Graph(Graph = Model?) 
     * This contains all variables, operations, collections, etc.
 * The .index-File seems to be referencing a decision tree about the file (????)
     * "Kinda source": https://www.tensorflow.org/guide/checkpoint#loading_mechanics
 

* Tensorflow >v1 seems to be missing the "import_meta_graph()" function
    * The desired code would look like this: "saver = tf.train.import_graph_def((bertPath + bioBertMeta))"
    * https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/train/import_meta_graph

    
* How to import meta-graphs in Tensorflow v2?


In [159]:
print(type(model))

# Can't load the weights because the BioBERT Files aren't weights but TF (?) Checkpoints;
# model.load_weights("./biobert/model/model.ckpt")

bertCheckpoint = "model.ckpt-1000000.index"
bioBertMeta = "model.ckpt-1000000.meta"


# What does this even do?
#bert = tf.train.load_checkpoint((bertPath + bertCheckpoint))

test = tf.saved_model.load((bertPath + bertCheckpoint))



#with tf.Session() as sess:
#       saver = tf.train.import_graph_def((bertPath + bioBertMeta))


#bert = tf.train.import_meta_graph((bertPath + "model.ckpt"))

#cp_callback = keras.callback.ModelCheckpoint(filepath=(bertPath + "model.ckpt"))


<class 'tensorflow.python.keras.engine.training.Model'>


OSError: SavedModel file does not exist at: ./biobert/model/model.ckpt-1000000.index/{saved_model.pbtxt|saved_model.pb}