### Import libraries


In [2]:
!pip install pandas
!pip install requests
!pip install bert-serving-server --no-deps
!pip install scipy

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/bb/71/8f53bdbcbc67c912b888b40def255767e475402e9df64050019149b1a943/pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0MB)
[K     |████████████████████████████████| 10.0MB 1.1MB/s eta 0:00:01
Collecting pytz>=2017.2
[?25l  Downloading https://files.pythonhosted.org/packages/e7/f9/f0b53f88060247251bf481fa6ea62cd0d25bf1b11a87888e53ce5b7c8ad2/pytz-2019.3-py2.py3-none-any.whl (509kB)
[K     |████████████████████████████████| 512kB 26.5MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.0.3 pytz-2019.3
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting requests
[?25l  Downloading https://files.pythonhosted.org/packages/1a/70/1935c770cb3be6e3a8b78ced23d7e0f3b187f5cbfab4749523ed65d7c9b1/requests-2.23.0-py2.py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 954kB/s eta 0:00:011
[?25hCollecting urllib3!=1.25.0,!=1.

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import pandas as pd
pd.set_option('display.max_colwidth', 100)

import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
import re
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd
from IPython.display import display

from scipy.spatial import distance_matrix

!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 


nltk.data.path.append("'../../data/nltk_data/'")

from nltk.corpus import stopwords

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
[K     |████████████████████████████████| 1.5MB 961kB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.4.5-cp36-none-any.whl size=1450719 sha256=46f3aa4a10e7cdb988d9199a8207127173fe290a8b3543b3873976fd96edbab6
  Stored in directory: /root/.cache/pip/wheels/96/86/f6/68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.5
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### Connect to personal google drive google drive to enable data download
Requires you to have this data available on your personal drive: https://drive.google.com/open?id=1ZVxvPnrnA8ffGoFsVxJs75QL9li6AfG7

from google.colab import drive # for connecting to dataset on personal google drive
# mount personal google drive that has data uploaded (Requires verification)
drive.mount('/content/drive')

### Download data

In [4]:
# upload data and list contents
input_dir = "../../data/covid-19-research-challenge/"
list(Path(input_dir).glob('*'))

[PosixPath('../../data/covid-19-research-challenge/metadata.csv'),
 PosixPath('../../data/covid-19-research-challenge/comm_use_subset'),
 PosixPath('../../data/covid-19-research-challenge/noncomm_use_subset'),
 PosixPath('../../data/covid-19-research-challenge/biorxiv_medrxiv'),
 PosixPath('../../data/covid-19-research-challenge/COVID.DATA.LIC.AGMT.pdf'),
 PosixPath('../../data/covid-19-research-challenge/metadata.readme'),
 PosixPath('../../data/covid-19-research-challenge/json_schema.txt'),
 PosixPath('../../data/covid-19-research-challenge/custom_license')]

In [5]:
metadata_path = input_dir + 'metadata.csv'
metadata = pd.read_csv(metadata_path,
                               dtype={'Microsoft Academic Paper ID': str,
                                      'pubmed_id': str})

# Set the abstract to the paper title if it is null
metadata.abstract = metadata.abstract.fillna(metadata.title)
print("Number of articles before removing duplicates: %s " % len(metadata))

Number of articles before removing duplicates: 44220 


In [6]:
# Some papers are duplicated since they were collected from separate sources. Thanks Joerg Rings
duplicate_paper = ~(metadata.title.isnull() | metadata.abstract.isnull() | metadata.publish_time.isnull()) & (metadata.duplicated(subset=['title', 'abstract']))
metadata.dropna(subset=['publish_time', 'journal'])
metadata = metadata[~duplicate_paper].reset_index(drop=True)
print("Number of articles AFTER removing duplicates: %s " % len(metadata))

Number of articles AFTER removing duplicates: 42938 


### **TODO**

### Create Data Classes for the Research Dataset and Papers
These classes make it easier to navigate through the datasources. There is a class called ResearchPapers that wraps the entire dataset an provide useful functions to navigate through it, and Paper, that make it easier to view each paper.

In [13]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)

# Convert the doi to a url
def doi_url(d): 
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'


class ResearchPapers:
    
    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata
        
    def __getitem__(self, item):
        return Paper(self.metadata.iloc[item])
    
    def __len__(self):
        return len(self.metadata)
    
    def head(self, n):
        return ResearchPapers(self.metadata.head(n).copy().reset_index(drop=True))
    
    def tail(self, n):
        return ResearchPapers(self.metadata.tail(n).copy().reset_index(drop=True))
    
    def abstracts(self):
        return self.metadata.abstract.dropna()
    
    def titles(self):
        return self.metadata.title.dropna()
        
    def _repr_html_(self):
        return self.metadata._repr_html_()
    
class Paper:
    
    '''
    A single research paper
    '''
    def __init__(self, item):
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        return self.paper.loc['doi'].values[0]
    
    def html(self):
        '''
        Load the paper from doi.org and display as HTML. Requires internet to be ON
        '''
        if self.doi():
            url = doi_url(self.doi()) 
            text = get(url)
            return widgets.HTML(text)
    
    def text(self):
        '''
        Load the paper from doi.org and display as text. Requires Internet to be ON
        '''
        text = get(self.doi())
        return text
    
    def abstract(self):
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        return self.paper.loc['title'].values[0]
    
    def authors(self, split=False):
        '''
        Get a list of authors
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        return self.paper._repr_html_()
    

papers = ResearchPapers(metadata)

#### Show a Paper

In [6]:
papers[1]

Unnamed: 0,Value
sha,
source_x,Elsevier
title,Coronaviruses in Balkan nephritis
doi,10.1016/0002-8703(80)90355-5
pmcid,
pubmed_id,6243850
license,els-covid
abstract,Coronaviruses in Balkan nephritis
publish_time,1980-03-31
authors,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;..."


#### Pull info from a paper

In [7]:
index=1
paper=papers[index]
print("Example paper #%s\nTitle: %s\nAuthors: %s " % (index, paper.title(), paper.authors(split=True)))


Example paper #1
Title: Coronaviruses in Balkan nephritis
Authors: ['Georgescu, Leonida', 'Diosi, Peter', 'Buţiu, Ioan', 'Plavoşin, Livia', 'Herzog, Georgeta'] 


### Text Preprocessing
To prepare the text for the search index we perform the following steps
1.   Remove punctuations and special characters
2.   Convert to lowercase
3.   Tokenize into individual tokens (words mostly)
4.   Remove stopwords like (and, to))
5.   Lemmatize

In [8]:
# Download the stop words we plan on using
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download('wordnet')

In [9]:
# Hardcode the data we want to use in search
SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal', 'publish_time']

In [10]:
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    t = re.sub('\(|\)|:|,|;|\.|’|”|“|\?|%|>|<', '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'",'')
    return t

def clean(text):
    t = text.lower()
    t = strip_characters(t)
    return t

def tokenize(text):
    words = nltk.word_tokenize(text)
    return list(set([word for word in words 
                     if len(word) > 1
                     and not word in english_stopwords
                     and not (word.isnumeric() and len(word) is not 4)
                     and (not word.isnumeric() or word.isalpha())] )
               )
    
def lemmatize(word_list,lemmatizer):
    # Init the Wordnet Lemmatizer
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output


def preprocess(text):
    t = clean(text)
    tokens = tokenize(t)
    lemmatizer=WordNetLemmatizer()
    tokens = lemmatize(tokens,lemmatizer)
    return tokens

In [11]:
class SearchResults:
    
    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        self.results = data
        if columns:
            self.results = self.results[columns]
            
    def __getitem__(self, item):
        return Paper(self.results.loc[item])
    
    def __len__(self):
        return len(self.results)
        
    def _repr_html_(self):
        return self.results._repr_html_()

class WordTokenIndex:
    
    def __init__(self, 
                 corpus: pd.DataFrame, 
                 columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.columns = columns

    
    def search(self, search_string):
        search_terms = preprocess(search_string)
        result_index = self.index.terms.apply(lambda terms: any(i in terms for i in search_terms))
        results = self.corpus[result_index].copy().reset_index().rename(columns={'index':'paper'})
        return SearchResults(results, self.columns + ['paper'])

### Creating a search index¶ - Using a RankBM25 Search Index
We will create a simple search index that will just match search tokens in a document. First we tokenize the abstract and store it in a dataframe. Then we just match search terms against it.

RankBM25 is a python library that implements algorithms for a simple search index. https://pypi.org/project/rank-bm25/

In [12]:
#!pip install rank_bm25
# Create a prebaked search engine with existing package: https://pypi.org/project/rank-bm25/
from rank_bm25 import BM25Okapi

In [13]:
class RankBM25Index(WordTokenIndex):
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        super().__init__(corpus, columns)
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=4):
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        ind = np.argsort(doc_scores)[::-1][:n]
        results = self.corpus.iloc[ind][self.columns]
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])

### Create the index (This takes several minutes)

In [14]:
bm25_index = RankBM25Index(metadata)

### Search by date

In [18]:
# example output
query='curise ship'
n=50
results = bm25_index.search(query,n)
results.results.sort_values(by=['publish_time'], ascending=False).head(5)

Unnamed: 0,title,abstract,doi,authors,journal,publish_time,Score
23,Inhibition of SARS-CoV-2 infection (previously...,AbstractThe recent outbreak of coronavirus dis...,10.1101/2020.03.09.983247,Shuai Xia; Meiqin Liu; Chao Wang; Wei Xu; Qiao...,,2020-03-12,59.917332
45,Will novel virus go pandemic or be contained?,The repatriation of 565 Japanese citizens from...,10.1126/science.367.6478.610,"Kupferschmidt, Kai; Cohen, Jon",Science,2020-02-06,59.895739
49,Fuzzy logic approach for infectious disease di...,Abstract This paper presents a systematic revi...,10.1016/j.bbe.2019.09.004,"Arji, Goli; Ahmadi, Hossein; Nilashi, Mehrbakh...",Biocybernetics and Biomedical Engineering,2019-12-31,59.891173
33,Qu’apprend-t-on de nouveau des épidémies émerg...,Points essentiels L’Afrique et l’Asie du Sud-E...,10.1016/j.lpm.2019.09.036,"Malvy, Denis; Gaüzère, Bernard-Alex; Migliani,...",La Presse Médicale,2019-12-31,59.902953
9,Alpha herpesvirus egress and spread from neuro...,Alpha herpesviruses naturally infect the perip...,10.1101/729830,"Ambrosini, A. E.; Deshmukh, N.; Berry, M. J.; ...",,2019-08-08,59.972306


In [16]:
# example output
query='ACE spike'
n=50
results = bm25_index.search(query,n)
results.results.sort_values(by=['publish_time'], ascending=False).head(5)

Unnamed: 0,title,abstract,doi,authors,journal,publish_time,Score
39,Will novel virus go pandemic or be contained?,The repatriation of 565 Japanese citizens from...,10.1126/science.367.6478.610,"Kupferschmidt, Kai; Cohen, Jon",Science,2020-02-06,48.683726
5,EGR1 upregulation following Venezuelan equine ...,Abstract Venezuelan equine encephalitis virus ...,10.1016/j.virol.2019.10.016,"Dahal, Bibha; Lin, Shih-Chao; Carey, Brian D.;...",Virology,2020-01-02,48.831123
24,Identification of a Novel Linear B-Cell Epitop...,"Porcine deltacoronavirus (PDCoV), first identi...",10.3390/ijms21020648,"Fu, Jiayu; Chen, Rui; Hu, Jingfei; Qu, Huan; Z...",Int J Mol Sci,2020 Jan 19,48.71872
10,Clinician perceptions of respiratory infection...,Abstract Outbreaks of emerging and re-emerging...,10.1016/j.idh.2019.01.003,"Barratt, Ruth; Shaban, Ramon Z.; Gilbert, Gwen...","Infection, Disease & Health",2019-08-31,48.786863
17,ULK1/2 Restricts the Formation of Inducible SI...,Membraneless organelles (MLOs) are liquid-like...,10.1016/j.isci.2019.08.001,"Saul, Vera Vivian; Seibert, Markus; Krüger, Ma...",iScience,2019 Aug 6,48.757805


### Creating an Autocomplete Search bar with ranking by score
Here we provide a search bar with autocomplete. This uses IPywidgets interactive rendering of a TextBox.

In [19]:
def search_papers(SearchTerms: str):
    results_to_consider=200
    results_to_display=10
    # gather search results by score
    output = bm25_index.search(SearchTerms, n=results_to_consider)
    # sort results by recency
    # output=search_results.results.sort_values(by=['publish_time'], ascending=False).head(results_to_display)
    if len(output) > 0:
        display(output) 
    return output

searchbar = widgets.interactive(search_papers, SearchTerms='ACE spike')
searchbar

interactive(children=(Text(value='ACE spike', description='SearchTerms'), Output()), _dom_classes=('widget-int…

### TODO

In [19]:
# Do search with option to restrict years available

### Looking at the Covid Research Tasks
This dataset has a number of tasks. We will try to organize the papers according to the tasks

What is known about transmission, incubation, and environmental stability?
What do we know about COVID-19 risk factors?
What do we know about virus genetics, origin, and evolution?
What has been published about ethical and social science considerations?
What do we know about diagnostics and surveillance?
What has been published about medical care?
What do we know about non-pharmaceutical interventions?
What has been published about information sharing and inter-sectoral collaboration?
What do we know about vaccines and therapeutics?

In [7]:
tasks = [('What is known about transmission, incubation, and environmental stability?', 
        'transmission incubation environment coronavirus'),
        ('What do we know about COVID-19 risk factors?', 'risk factors'),
        ('What do we know about virus genetics, origin, and evolution?', 'genetics origin evolution'),
        ('What has been published about ethical and social science considerations','ethics ethical social'),
        ('What do we know about diagnostics and surveillance?','diagnose diagnostic surveillance'),
        ('What has been published about medical care?', 'medical care'),
        ('What do we know about vaccines and therapeutics?', 'vaccines vaccine vaccinate therapeutic therapeutics')] 
tasks = pd.DataFrame(tasks, columns=['Task', 'Keywords'])

#### Research papers for each task
Here we add a dropdown that allows for selection of tasks and show the search results

In [8]:
def show_task(Task):
    print(Task)
    keywords = tasks[tasks.Task == Task].Keywords.values[0]
    search_results = bm25_index.search(keywords, n=200)
    return search_results
    
results = interact(show_task, Task = tasks.Task.tolist());

interactive(children=(Dropdown(description='Task', options=('What is known about transmission, incubation, and…

# Create a BERT sentance encoding search engine 
From: https://towardsdatascience.com/building-a-search-engine-with-bert-and-tensorflow-c6fdc0186c8a
By: Denis Antyukhov
In this experiment, we will use a pre-trained BERT model checkpoint to build a general-purpose text feature extractor.

These things are sometimes referred to as Natural Language Understanding (NLU) modules, because the features they extract are relevant for a wide array of downstream NLP tasks.

One use for these features is in instance-based learning, which relies on computing the similarity of the query to the training samples.

We will illustrate this by building a simple Information Retrieval system using the BERT NLU module for feature extraction.

**The plan for this experiment is:**
1. getting the pre-trained BERT model checkpoint
2. extracting a sub-graph optimized for inference
3. creating a feature extractor with tf.Estimator
4. exploring vector space with T-SNE and Embedding Projector
5. implementing an Information Retrieval engine
6. accelerating search queries with math
7. building a covid research article recommendation system

### Step 1: getting the pre-trained model
We start with a pre-trained english BERT-base model checkpoint.

For configuring and optimizing the graph for inference we will use bert-as-a-service repository, which allows for serving BERT models for remote clients over TCP.

Having a remote BERT-server is beneficial in multi-host environments. However, in this part of the experiment we will focus on creating a local (in-process) feature extractor. This is useful if one wishes to avoid additional latency and potential failure modes introduced by a client-server architecture. Now, let us download the model and install the package.

Now, let us download the model and install the package.

!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
!pip install bert-serving-server --no-deps

## Step 2: optimizing the inference graph
Normally, to modify the model graph we would have to do some low-level TensorFlow programming. 

However, thanks to bert-as-a-service, we can configure the inference graph using a simple CLI interface.

There are a couple of parameters in the below snippet too look out for.

For each text sample, BERT-base model encoding layers output a tensor of shape **[sequence_len, encoder_dim],** with one vector per input token. To obtain a fixed representation, we need to apply some sort of pooling.

**POOL_STRAT** parameter defines the pooling strategy applied to the  **POOL_LAYER** encoding layer. The default value **REDUCE_MEAN** averages the vectors for all tokens in a sequence. This strategy works best for most sentence-level tasks, when the model is not fine-tuned. Another option is NONE, in which case no pooling is applied at all. This is useful for word-level tasks such as Named Entity Recognition or POS tagging. For a detailed discussion of other options check out the Han Xiao's [blog post.](https://hanxiao.github.io/2019/01/02/Serving-Google-BERT-in-Production-using-Tensorflow-and-ZeroMQ/)

**SEQ_LEN** affects the maximum length of sequences processed by the model. Smaller values increase the model inference speed almost linearly.

In [9]:
import os
import tensorflow as tf
sesh = tf.InteractiveSession()

from bert_serving.server.graph import optimize_graph
from bert_serving.server.helper import get_args_parser

# input dir
MODEL_DIR = '../../data/BERT/uncased_L-12_H-768_A-12/' #@param {type:"string"}
# output dir
GRAPH_DIR = '../../data/BERT/graph/' #@param {type:"string"}
# output filename
GRAPH_OUT = 'extractor.pbtxt' #@param {type:"string"}

POOL_STRAT = 'REDUCE_MEAN' #@param ['REDUCE_MEAN', 'REDUCE_MAX', "NONE"]
POOL_LAYER = '-2' #@param {type:"string"}
SEQ_LEN = '256' #@param {type:"string"}

In [1]:


tf.gfile.MkDir(GRAPH_DIR)

parser = get_args_parser()
carg = parser.parse_args(args=['-model_dir', MODEL_DIR,
                               '-graph_tmp_dir', GRAPH_DIR,
                               '-max_seq_len', str(SEQ_LEN),
                               '-pooling_layer', str(POOL_LAYER),
                               '-pooling_strategy', POOL_STRAT])

tmp_name, config = optimize_graph(carg)
graph_fout = os.path.join(GRAPH_DIR, GRAPH_OUT)

tf.gfile.Rename(
    tmp_name,
    graph_fout,
    overwrite=True
)
print("\nSerialized graph to {}".format(graph_fout))

W0324 13:59:07.741838 140502332016448 module_wrapper.py:139] From /home/fatken/.local/lib/python3.6/site-packages/bert_serving/server/helper.py:180: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.

W0324 13:59:07.742825 140502332016448 module_wrapper.py:139] From /home/fatken/.local/lib/python3.6/site-packages/bert_serving/server/helper.py:180: The name tf.logging.ERROR is deprecated. Please use tf.compat.v1.logging.ERROR instead.

I:[36mGRAPHOPT[0m:[gra:opt: 52]:model config: /home/fatken/data/BERT/uncased_L-12_H-768_A-12/bert_config.json
I0324 13:59:07.744890 140502332016448 graph.py:52] model config: /home/fatken/data/BERT/uncased_L-12_H-768_A-12/bert_config.json
I:[36mGRAPHOPT[0m:[gra:opt: 55]:checkpoint: /home/fatken/data/BERT/uncased_L-12_H-768_A-12/bert_model.ckpt
I0324 13:59:07.746072 140502332016448 graph.py:55] checkpoint: /home/fatken/data/BERT/uncased_L-12_H-768_A-12/bert_model.ckpt
I:[36mGRAPHOPT[0m:[gra:opt: 59


Serialized graph to /home/fatken/data/BERT/graph/extractor.pbtxt


Running the above snippet will put the BERT model graph and weights from  **MODEL_DIR** into a GraphDef object which will be serialized to a pbtxt file at **GRAPH_OUT**. The file will be smaller than the original model because the nodes and variables required for training will be removed. This results in a quite portable solution: for example the english base model only takes 389 MB after exporting.

### Step 3: creating a feature extractor
Now, we will use the serialized graph to build a feature extractor using the tf.Estimator API. We will need to define two things: **input_fn** and **model_fn**

In [10]:
import logging
import numpy as np

from tensorflow.python.estimator.estimator import Estimator
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.model_fn import EstimatorSpec
from tensorflow.keras.utils import Progbar

from bert_serving.server.bert.tokenization import FullTokenizer
from bert_serving.server.bert.extract_features import convert_lst_to_features


log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
log.handlers = []

In [11]:
GRAPH_PATH = GRAPH_DIR + GRAPH_OUT #@param {type:"string"}
VOCAB_PATH = MODEL_DIR + "vocab.txt" #@param {type:"string"}

SEQ_LEN = 256 #@param {type:"integer"}

**input_fn** manages getting the data into the model. That includes executing the whole text preprocessing pipeline and preparing a feed_dict for BERT. 

First, each text sample is converted into a tf.Example instance containing the necessary features listed in **INPUT_NAMES**. The bert_tokenizer object contains  the WordPiece vocabulary and performs the text preprocessing. After that the examples are re-grouped by feature name in a **feed_dict**.

In [12]:
INPUT_NAMES = ['input_ids', 'input_mask', 'input_type_ids']
bert_tokenizer = FullTokenizer(VOCAB_PATH)

def build_feed_dict(texts):
    
    text_features = list(convert_lst_to_features(
        texts, SEQ_LEN, SEQ_LEN, 
        bert_tokenizer, log, False, False))

    target_shape = (len(texts), -1)

    feed_dict = {}
    for iname in INPUT_NAMES:
        features_i = np.array([getattr(f, iname) for f in text_features])
        features_i = features_i.reshape(target_shape).astype("int32")
        feed_dict[iname] = features_i

    return feed_dict

From /usr/local/lib/python3.6/dist-packages/bert_serving/server/bert/tokenization.py:75: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



tf.Estimators have a fun feature which makes them re-build and re-initialize the whole computational graph at each call to the predict function. 

So, in order to avoid the overhead, to the predict function we will pass a generator, which will yield the features to the model in a never-ending loop.

In [13]:
def build_input_fn(container):
    
    def gen():
        while True:
          try:
            yield build_feed_dict(container.get())
          except:
            yield build_feed_dict(container.get())

    def input_fn():
        return tf.data.Dataset.from_generator(
            gen,
            output_types={iname: tf.int32 for iname in INPUT_NAMES},
            output_shapes={iname: (None, None) for iname in INPUT_NAMES})
    return input_fn

class DataContainer:
  def __init__(self):
    self._texts = None
  
  def set(self, texts):
    if type(texts) is str:
      texts = [texts]
    self._texts = texts
    
  def get(self):
    return self._texts

**model_fn** contains the specification of the model. In our case, it is loaded from the pbtxt file we saved in the previous step. 

The features are mapped explicitly to the corresponding input nodes with input_map.

In [14]:
def model_fn(features, mode):
    with tf.gfile.GFile(GRAPH_PATH, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        
    output = tf.import_graph_def(graph_def,
                                 input_map={k + ':0': features[k] for k in INPUT_NAMES},
                                 return_elements=['final_encodes:0'])

    return EstimatorSpec(mode=mode, predictions={'output': output[0]})
  
estimator = Estimator(model_fn=model_fn)

Using temporary folder as model directory: /tmp/tmpm1xlif1p


Now we have everything we need to perform inference:

In [15]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

def build_vectorizer(_estimator, _input_fn_builder, batch_size=128):
  container = DataContainer()
  predict_fn = _estimator.predict(_input_fn_builder(container), yield_single_examples=False)
  
  def vectorize(text, verbose=False):
    x = []
    bar = Progbar(len(text))
    for text_batch in batch(text, batch_size):
      container.set(text_batch)
      x.append(next(predict_fn)['output'])
      if verbose:
        bar.add(len(text_batch))
      
    r = np.vstack(x)
    return r
  
  return vectorize

In [16]:
bert_vectorizer = build_vectorizer(estimator, build_input_fn)

In [17]:
bert_vectorizer(2*['sample text']).shape

From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/array_ops.py:1475: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


(2, 768)

### Step 4: exploring vector space with Projector

*A* standalone version of BERT feature extractor is available in the [repository](https://github.com/gaphex/bert_experimental).

Using the vectorizer we will generate embeddings for articles from the CORD-19 benchmark (in this tutorial, the Reuters-21578 benchmark corpus was used previously)

To visualise and explore the embedding vector space in 3D we will use a dimensionality reduction technique called [T-SNE](https://distill.pub/2016/misread-tsne/).

Lets get the article embeddings first.

In [18]:
import nltk
from nltk.corpus import reuters
nltk.data.path.append("../../data/nltk_data/")

# nltk.download("reuters")
# nltk.download("punkt")

In [19]:
# REUTERS EXAMPLE
max_samples = 256
categories = ['wheat', 'tea', 'strategic-metal', 
              'housing', 'money-supply', 'fuel']

S, X, Y = [], [], []

for category in categories:
  print(category)
  
  sents = reuters.sents(categories=category)
  sents = [' '.join(sent) for sent in sents][:max_samples]
  X.append(bert_vectorizer(sents, verbose=True))
  Y += [category] * len(sents)
  S += sents
  
X = np.vstack(X) 
X.shape

wheat
tea
strategic-metal
housing
money-supply
fuel


(1134, 768)

In [38]:
with open("embeddings.tsv", "w") as fo:
  for x in X.astype('float16'):
    line = "\t".join([str(v) for v in x])
    fo.write(line + "\n")

with open("metadata.tsv", "w") as fo:
  fo.write("Label\tSentence\n")
  for y, s in zip(Y, S):
    fo.write("{}\t{}\n".format(y, s))

The interactive visualization of generated embeddings is available on the [Embedding Projector](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/gaphex/7262af1e151957b1e7c638f4922dfe57/raw/3b946229fc58cbefbca2a642502cf51d4f8e81c5/reuters_proj_config.json). **<--CLICK THAT TO GENERATE**

From the link you can run T-SNE yourself, or load a checkpoint using the bookmark in lower-right corner (loading works only on Chrome).

To reproduce the input files used for this visualization, run the code below. Then, download the files to your machine and upload to Projector

(you can dowload files from the menu opened by the ">" button in the upper-left)

In [0]:
from IPython.display import HTML

HTML("""
<video width="900" height="632" controls>
  <source src="https://storage.googleapis.com/bert_resourses/reuters_tsne_hd.mp4" type="video/mp4">
</video>
""")

### Create embeddings for CORD19 Articles
#### Title

In [20]:
# Convert pandas dataframe to nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader
# From: https://stackoverflow.com/questions/49088978/how-to-create-corpus-from-pandas-data-frame-to-operate-with-nltk/49104725
def CreateCorpusFromDataFrame(corpusfolder,df):
    for index, r in df.iterrows():
        id=index
        title=r['title']
        body=r['title']
        # handler text for not properly munged data
        try: 
          category=re.sub('/', '', r['journal']) # remove odd characters as writing to file
        except TypeError:
          continue
        fname=str(category)+'_'+str(id)+'.txt'
        corpusfile=open(corpusfolder+'/'+fname,'a+')
        corpusfile.write(str(body) +" " +str(title))
        corpusfile.close()

In [41]:
# create folder to hold CORD19 nltk
dirName = 'CORD19_nltk_title_only'
try:
    # Create target Directory
    os.mkdir(dirName)
except FileExistsError:
    pass

# create corpus
CreateCorpusFromDataFrame(dirName,metadata)
print("Corpus created in folder: %s" % dirName)

Corpus created in folder: CORD19_nltk_title_only


In [42]:
# Import the corpus reader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Create NLTK data structure (with pattern matching to create the article names again)
CORD_corpus=CategorizedPlaintextCorpusReader(dirName,r'.*', cat_pattern=r'(.*)_.*.txt$') 

In [44]:
# total journals
print("Total number journals: %s" % (len(metadata.journal.unique())))

# select a subset of journals, where the journal will be the tag
num_journals=8
categories=metadata['journal'].value_counts()[:num_journals].index.tolist()
print ("\nPicking most common journals:")
categories



Total number journals: 3930

Picking most common journals:


['PLoS One',
 'Virology',
 'Emerg Infect Dis',
 'Viruses',
 'The Lancet',
 'Sci Rep',
 'Virus Research',
 'Vaccine']

In [0]:
#CORD19 Examples
max_samples = 5000

S, X, Y = [], [], []

for category in categories:
  print(category)
  
  sents = CORD_corpus.sents(categories=category)
  sents = [' '.join(sent) for sent in sents][:max_samples]
  X.append(bert_vectorizer(sents, verbose=True))
  Y += [category] * len(sents)
  S += sents
  
X = np.vstack(X) 
X.shape

In [0]:
# make folder in google drive to download files
location = '/content/drive/My Drive/'

with open(location + "embeddings_large.tsv", "w") as fo:
  for x in X.astype('float16'):
    line = "\t".join([str(v) for v in x])
    fo.write(line + "\n")

with open(location + "metadata_large.tsv", "w") as fo:
  fo.write("Label\tSentence\n")
  for y, s in zip(Y, S):
    fo.write("{}\t{}\n".format(y, s))

The interactive visualization of generated embeddings is available on the [Embedding Projector](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/gaphex/7262af1e151957b1e7c638f4922dfe57/raw/3b946229fc58cbefbca2a642502cf51d4f8e81c5/reuters_proj_config.json). **<--CLICK THAT TO GENERATE**

Then go to bottom right and load in those files

# Bert encoding of tasks.

In [22]:
tasks

Unnamed: 0,Task,Keywords
0,"What is known about transmission, incubation, ...",transmission incubation environment coronavirus
1,What do we know about COVID-19 risk factors?,risk factors
2,"What do we know about virus genetics, origin, ...",genetics origin evolution
3,What has been published about ethical and soci...,ethics ethical social
4,What do we know about diagnostics and surveill...,diagnose diagnostic surveillance
5,What has been published about medical care?,medical care
6,What do we know about vaccines and therapeutics?,vaccines vaccine vaccinate therapeutic therape...


In [90]:
tasks_bert = bert_vectorizer(tasks['Task'], verbose=True)
keywords_bert = bert_vectorizer(tasks['Keywords'], verbose=True)



# titles contain NA, dropped

In [24]:
metadata.shape

(42938, 15)

In [31]:
metadata.dropna(subset=['abstract', 'title'], inplace=True)
metadata.shape

(42714, 15)

# Bert embedding of titles

In [32]:
titles_bert = bert_vectorizer(metadata['title'], verbose = True)



In [33]:
abstracts_bert = bert_vectorizer(metadata['abstract'], verbose = True)



# Euclidean dist

In [85]:
tasks

Unnamed: 0,Task,Keywords
0,"What is known about transmission, incubation, and environmental stability?",transmission incubation environment coronavirus
1,What do we know about COVID-19 risk factors?,risk factors
2,"What do we know about virus genetics, origin, and evolution?",genetics origin evolution
3,What has been published about ethical and social science considerations,ethics ethical social
4,What do we know about diagnostics and surveillance?,diagnose diagnostic surveillance
5,What has been published about medical care?,medical care
6,What do we know about vaccines and therapeutics?,vaccines vaccine vaccinate therapeutic therapeutics


In [131]:
metadata[['title', 'abstract', 'publish_time']]

Unnamed: 0,title,abstract,publish_time
0,Intrauterine virus infections and congenital heart disease,Abstract The etiologic basis for the vast majority of cases of congenital heart disease remains ...,1972-12-31
1,Coronaviruses in Balkan nephritis,Coronaviruses in Balkan nephritis,1980-03-31
2,Cigarette smoking and coronary heart disease: new evidence and old reactions,Cigarette smoking and coronary heart disease: new evidence and old reactions,1980-03-31
3,Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus,"Abstract Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE),...",1973-08-31
4,"Epidemiology of community-acquired respiratory tract infections in adults Incidence, etiology, a...",Abstract Upper respiratory tract infections are the most common types of infectious diseases amo...,1985-06-28
...,...,...,...
42933,Epidemiology and clinical profile of pathogens responsible for the hospitalization of children i...,This study aimed to identify a broad spectrum of respiratory pathogens from hospitalized and not...,2017 Nov 17
42934,Functional Analysis of the Transmembrane Domain in Paramyxovirus F Protein-Mediated Membrane Fusion,"Abstract To enter cells, enveloped viruses use fusion-mediating glycoproteins to facilitate the ...",2009-02-13
42935,Viral Entry Properties Required for Fitness in Humans Are Lost through Rapid Genomic Change duri...,Human parainfluenza viruses cause a large burden of human respiratory illness. While much resear...,2018 Jul 3
42936,Arenavirus reverse genetics: New approaches for the investigation of arenavirus biology and deve...,"Abstract Several arenaviruses, chiefly Lassa virus, cause hemorrhagic fever disease in humans an...",2011-03-15


## 1. euc dist between `article title` and `task string`

In [44]:
euc_mat = distance_matrix(titles_bert, tasks_bert, p = 2)

### Ranking by 1st task

In [132]:
euc_titles_tasks = pd.DataFrame(euc_mat, columns=[tasks['Task']], index=metadata.index)
combined = pd.concat([metadata[['title', 'abstract', 'publish_time']], euc_titles_tasks], axis=1)
combined.sort_values(by=[euc_titles_tasks.columns[0]], axis = 0, ascending=True)

Unnamed: 0,title,abstract,publish_time,"(What is known about transmission, incubation, and environmental stability?,)","(What do we know about COVID-19 risk factors?,)","(What do we know about virus genetics, origin, and evolution?,)","(What has been published about ethical and social science considerations,)","(What do we know about diagnostics and surveillance?,)","(What has been published about medical care?,)","(What do we know about vaccines and therapeutics?,)"
4514,Does reduced MHC diversity decrease viability of vertebrate populations?,Abstract Loss of genetic variation may render populations more vulnerable to pathogens due to in...,2010-03-31,7.471977,8.262564,8.146552,10.062410,9.388145,9.720213,8.880879
15019,What is the importance of zoonotic trichomonads for human health?,"Trichomonads are common parasites of many vertebrate and invertebrate species, with four species...",2014-07-31,7.545735,7.035960,7.121302,9.467658,8.368005,8.876764,7.280060
31649,What can we predict about viral evolution and emergence?,Predicting the emergence of infectious diseases has been touted as one of the most important goa...,,7.561812,6.478772,5.380471,8.933305,6.696497,7.961433,6.213515
22840,Dengue Virus Glycosylation: What Do We Know?,"In many infectious diseases caused by either viruses or bacteria, pathogen glycoproteins play im...",2017 Jul 25,7.669997,6.886530,6.838970,9.753249,8.235622,9.339000,7.297268
33133,Emerging diseases in Chiroptera: why bats?,A conference entitled ‘2nd International Berlin Bat Meeting: Bat Biology and Infectious Diseases...,,7.684107,7.700914,7.297066,9.376574,8.341376,8.819533,7.898508
...,...,...,...,...,...,...,...,...,...,...
19523,"1,3-Diphenyl-4,5-dihydro-1H-pyrazol-5-one","In the title pyrazolone derivative, C(15)H(12)N(2)O, the five-membered ring is approximately pla...",2012 Mar 10,16.430590,14.615978,15.655774,16.230066,16.086476,16.607075,15.287028
19953,"Ethyl 4-(5-bromo-1H-indol-3-yl)-2,6,6-trimethyl-5-oxo-1,4,5,6,7,8-hexa­hydro­quinoline-3-carboxy...","The title compound, C(23)H(25)BrN(2)O(3), crystallizes with two independent mol­ecules in the as...",2012 Nov 24,16.520008,14.794494,15.789667,16.342148,16.281157,16.946889,15.546038
4162,Nowo pojawiające się choroby zakaźne w aspekcie bezpieczeństwa krwi,"Abstract The risk of transfusion-related infectious diseases, the markers for which are routinel...",2013-09-30,16.528353,15.753838,15.839911,16.539292,15.951589,16.391425,15.952603
32346,CD133作为肺癌干细胞标记物的应用及其局限性,"Lung cancer is one of the most common tumor, which lacks of effective clinical treatment to lead...",,18.073277,17.089324,17.671355,18.142165,17.625425,17.349947,17.329107


### Ranking by 2nd task

In [133]:
euc_titles_tasks = pd.DataFrame(euc_mat, columns=[tasks['Task']], index=metadata.index)
combined = pd.concat([metadata[['title', 'abstract', 'publish_time']], euc_titles_tasks], axis=1)
combined.sort_values(by=[euc_titles_tasks.columns[1]], axis = 0, ascending=True)

Unnamed: 0,title,abstract,publish_time,"(What is known about transmission, incubation, and environmental stability?,)","(What do we know about COVID-19 risk factors?,)","(What do we know about virus genetics, origin, and evolution?,)","(What has been published about ethical and social science considerations,)","(What do we know about diagnostics and surveillance?,)","(What has been published about medical care?,)","(What do we know about vaccines and therapeutics?,)"
42520,What are the risks of COVID-19 infection in pregnant women?,What are the risks of COVID-19 infection in pregnant women?,2020-03-13,8.289875,5.980003,8.107699,10.069599,8.934867,9.449055,8.218084
38952,From Isolation to Coordination: How Can Telemedicine Help Combat the COVID-19 Outbreak?,The rapid spread of Coronavirus disease 2019 (COVID-19) presents China with a critical challenge...,2020-02-23,8.875359,6.330186,8.247651,10.166074,8.425106,9.476560,8.095946
39211,"If containment is not possible, how do we minimize mortality for COVID-19 and other emerging inf...",If COVID-19 containment policies fail and social distancing measures cannot be sustained until v...,2020-03-17,8.304019,6.363608,7.696973,10.311798,8.176689,9.653903,7.814902
31649,What can we predict about viral evolution and emergence?,Predicting the emergence of infectious diseases has been touted as one of the most important goa...,,7.561812,6.478772,5.380471,8.933305,6.696497,7.961433,6.213515
31003,Host genome polymorphisms and tuberculosis infection: What we have to say?,Several epidemiology studies suggest that host genetic factors play important roles in susceptib...,,8.363339,6.533615,6.597945,8.497352,7.196075,7.770903,6.609006
...,...,...,...,...,...,...,...,...,...,...
11507,"On vous demande, sachez répondre","On vous demande, sachez répondre",2014-03-31,15.678123,15.197685,15.241371,15.558457,15.224225,15.614729,15.245713
5462,SARS: Solución mediante Acciones y Respuestas Sensatas,SARS: Solución mediante Acciones y Respuestas Sensatas,2004-10-31,15.649980,15.320809,15.204449,15.386911,15.083672,15.625673,15.203720
4162,Nowo pojawiające się choroby zakaźne w aspekcie bezpieczeństwa krwi,"Abstract The risk of transfusion-related infectious diseases, the markers for which are routinel...",2013-09-30,16.528353,15.753838,15.839911,16.539292,15.951589,16.391425,15.952603
32346,CD133作为肺癌干细胞标记物的应用及其局限性,"Lung cancer is one of the most common tumor, which lacks of effective clinical treatment to lead...",,18.073277,17.089324,17.671355,18.142165,17.625425,17.349947,17.329107


## 2. Cos sim between `article title` and `task string`

In [106]:
cosine_sim = np.dot(titles_bert, tasks_bert.T) / np.dot(abs(titles_bert), abs(tasks_bert.T))

### Ranking by 1st task

In [134]:
cos_sim_titles_tasks = pd.DataFrame(cosine_sim, columns=[tasks['Task']], index=metadata.index)
combined = pd.concat([metadata[['title', 'abstract', 'publish_time']], cos_sim_titles_tasks], axis=1)
combined.sort_values(by=[euc_titles_tasks.columns[0]], axis = 0, ascending=False)

Unnamed: 0,title,abstract,publish_time,"(What is known about transmission, incubation, and environmental stability?,)","(What do we know about COVID-19 risk factors?,)","(What do we know about virus genetics, origin, and evolution?,)","(What has been published about ethical and social science considerations,)","(What do we know about diagnostics and surveillance?,)","(What has been published about medical care?,)","(What do we know about vaccines and therapeutics?,)"
4514,Does reduced MHC diversity decrease viability of vertebrate populations?,Abstract Loss of genetic variation may render populations more vulnerable to pathogens due to in...,2010-03-31,0.945989,0.910229,0.917137,0.848254,0.875401,0.867827,0.894747
16429,"Mass extinctions, biodiversity and mitochondrial function: are bats ‘special’ as reservoirs for ...","For the past 10–15 years, bats have attracted growing attention as reservoirs of emerging zoonot...",2011-12-31,0.943527,0.912471,0.929108,0.845349,0.894548,0.878056,0.906711
31649,What can we predict about viral evolution and emergence?,Predicting the emergence of infectious diseases has been touted as one of the most important goa...,,0.938635,0.956186,0.977765,0.891810,0.954431,0.921685,0.961587
31943,Does genetic diversity limit disease spread in natural host populations?,It is a commonly held view that genetically homogenous host populations are more vulnerable to i...,,0.937123,0.898754,0.929170,0.831097,0.872806,0.873130,0.890013
23327,When are pathogen genome sequences informative of transmission events?,Recent years have seen the development of numerous methodologies for reconstructing transmission...,2018 Feb 8,0.935807,0.945693,0.935872,0.868899,0.901674,0.891892,0.912916
...,...,...,...,...,...,...,...,...,...,...
19953,"Ethyl 4-(5-bromo-1H-indol-3-yl)-2,6,6-trimethyl-5-oxo-1,4,5,6,7,8-hexa­hydro­quinoline-3-carboxy...","The title compound, C(23)H(25)BrN(2)O(3), crystallizes with two independent mol­ecules in the as...",2012 Nov 24,0.475482,0.654451,0.509662,0.503665,0.494460,0.480067,0.572192
7154,Lars Olof Lennart Nilsson,Lars Olof Lennart Nilsson,2017-04-21,0.472488,0.615177,0.550706,0.581026,0.563877,0.545921,0.544292
4234,Table of contents,Table of contents,2004-07-31,0.453591,0.526695,0.494899,0.602868,0.509470,0.495276,0.524821
4258,Table of Contents,Table of Contents,2016-01-31,0.453591,0.526695,0.494899,0.602868,0.509470,0.495276,0.524821


In [128]:
np.where(metadata['title'] == 'Table of Contents')

(array([4109]),)

In [129]:
metadata[['title', 'abstract']][4105:4115]

Unnamed: 0,title,abstract
4254,Replication and Clearance of Respiratory Syncytial Virus Apoptosis Is an Important Pathway of Vi...,Human respiratory syncytial virus is an important cause of severe respiratory disease in young c...
4255,Molecular and Functional Analysis of the Human Prothrombinase Gene (HFGL2) and Its Role in Viral...,"In the present studies, we report the cloning and structural characterization of the HFGL2 gene ..."
4256,Cytokine and Chemokine Profiles in Lung Tissues from Fatal Cases of 2009 Pandemic Influenza A (H...,Pathological studies on fatal cases caused by 2009 pandemic influenza H1N1 virus (2009 pH1N1) re...
4257,Discordant Biological and Toxicological Species Responses to TLR3 Activation,Toll-like receptors (TLRs) are highly conserved type 1 membrane proteins that initiate a multipl...
4258,Table of Contents,Table of Contents
4259,"Clinicopathologic, Immunohistochemical, and Ultrastructural Findings of a Fatal Case of Middle E...",Middle East respiratory syndrome coronavirus (MERS-CoV) infection causes an acute respiratory il...
4260,Value of Autopsy Emphasized in the Case Report of a Single Patient with Middle East Respiratory ...,Value of Autopsy Emphasized in the Case Report of a Single Patient with Middle East Respiratory ...
4261,This Month in AJP,This Month in AJP
4262,"Cluster of acute hemorrhagic appendicitis among high school students in Wuhan, China",Abstract Background Features of a cluster of acute appendicitis that occurred among a high schoo...
4263,Presidential address: awakening the hero within,Presidential address: awakening the hero within


### Ranking by 2nd task

In [135]:
cos_sim_titles_tasks = pd.DataFrame(cosine_sim, columns=[tasks['Task']], index=metadata.index)
combined = pd.concat([metadata[['title', 'abstract', 'publish_time']], cos_sim_titles_tasks], axis=1)
combined.sort_values(by=[euc_titles_tasks.columns[1]], axis = 0, ascending=False)

Unnamed: 0,title,abstract,publish_time,"(What is known about transmission, incubation, and environmental stability?,)","(What do we know about COVID-19 risk factors?,)","(What do we know about virus genetics, origin, and evolution?,)","(What has been published about ethical and social science considerations,)","(What do we know about diagnostics and surveillance?,)","(What has been published about medical care?,)","(What do we know about vaccines and therapeutics?,)"
42520,What are the risks of COVID-19 infection in pregnant women?,What are the risks of COVID-19 infection in pregnant women?,2020-03-13,0.910828,0.963474,0.902180,0.839450,0.875165,0.870611,0.899239
39211,"If containment is not possible, how do we minimize mortality for COVID-19 and other emerging inf...",If COVID-19 containment policies fail and social distancing measures cannot be sustained until v...,2020-03-17,0.914765,0.960245,0.921732,0.827443,0.907813,0.861027,0.914858
38952,From Isolation to Coordination: How Can Telemedicine Help Combat the COVID-19 Outbreak?,The rapid spread of Coronavirus disease 2019 (COVID-19) presents China with a critical challenge...,2020-02-23,0.895850,0.957904,0.899822,0.832923,0.893977,0.865276,0.908554
31649,What can we predict about viral evolution and emergence?,Predicting the emergence of infectious diseases has been touted as one of the most important goa...,,0.938635,0.956186,0.977765,0.891810,0.954431,0.921685,0.961587
12202,What Is the Role of Newer Molecular Tests in the Management of CAP?,What Is the Role of Newer Molecular Tests in the Management of CAP?,2013-03-31,0.921972,0.952838,0.932948,0.902719,0.925414,0.918785,0.927892
...,...,...,...,...,...,...,...,...,...,...
16247,Travel and public health,Summary Increasing international travel and migration can interfere with public health in both t...,2008-12-31,0.586913,0.541174,0.533364,0.657754,0.558289,0.611529,0.552224
4258,Table of Contents,Table of Contents,2016-01-31,0.453591,0.526695,0.494899,0.602868,0.509470,0.495276,0.524821
4234,Table of contents,Table of contents,2004-07-31,0.453591,0.526695,0.494899,0.602868,0.509470,0.495276,0.524821
34892,Lee Jong-wook,Director general of the World Health Organization,,0.412749,0.520668,0.481679,0.471733,0.493933,0.457857,0.478153
