In [1]:
import json
import os
import pandas as pd
from pprint import pprint
import spacy
import time

nlp = spacy.load('en_core_web_sm')

In [2]:
# Set current working directory
os.chdir('/home/nrs/SideProjects/COVID19_Research_Analysis/')
os.getcwd()

'/home/nrs/SideProjects/COVID19_Research_Analysis'

In [3]:
def gather_paper_data(dirs, papers_info):
    for d in dirs:
        papers = os.listdir(d)
        
        for paper in papers:
            paper_path = os.path.join(d, paper)
            
            if os.path.isdir(paper_path):
                gather_paper_data([paper_path], papers_info)
            else:
                with open(paper_path, 'rb') as f:
                    file_data = json.load(f)

                    paper_id = file_data['paper_id']
                    title = file_data['metadata']['title']

                    try:
                        abstract_paragraphs = file_data['abstract']
                    except KeyError:  # Note: this occurs for pmc_json files since none of them have an abstract
                        abstract_paragraphs = []
                    abstract = []
                    for paragraph in abstract_paragraphs:
                        abstract.append(paragraph['text'])
                    abstract = '\n'.join(abstract)

                    try:
                        body_paragraphs = file_data['body_text']
                    except KeyError:
                        body_paragraphs = []
                    body = []
                    for paragraph in body_paragraphs:
                        body.append(paragraph['text'])
                    body = '\n'.join(body)

                    papers_info.append([paper_id, title, abstract, body])

    return papers_info

In [4]:
def filter_paper(text, keywords):
    text = ([word.lower().strip() for word in text.split(' ')])
    for keyword in keywords:
        keyword_parts = keyword.split(' ')
        if any(word in text for word in keyword_parts):
            return True
    return False

In [5]:
def clean_paper(text, tokenizer):
    cleaned_text = []
    stopwords = spacy.lang.en.stop_words.STOP_WORDS

    text_doc = nlp(text)
    text_tokens = ' '.join([
        token.lemma_.lower().strip() for token in text_doc 
        if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-'
    ])
    
    return text_tokens

***Docs*** <br>
Get all papers from directories: arxiv, biorxiv_medrxiv, comm_use_subset, noncomm_use_subset <br>
Convert into a dataframe

In [6]:
papers_info = []
gather_paper_data(['arxiv', 'biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset'], papers_info)
papers_df = pd.DataFrame(papers_info, columns=['paper_id', 'title', 'abstract', 'body'])
papers_df.shape

(27299, 4)

***Docs***<br>
Filter in relevant papers depending on the task

In [7]:
treatment_task_keywords = [
    'drug', 'patients', 'therapeutic', 'vaccine', 'animal', 'clinical', 'trial', 'prophylaxis', 
    'prophylactic', 'distribution', 'studies', 'immunity', 'model', 'prioritize', 'efficacy'
]
treatment_papers_df = papers_df[
    papers_df.apply(lambda paper: filter_paper(paper['abstract'], treatment_task_keywords), axis=1)
]
treatment_papers_df.shape

(8293, 4)

#### Clean Paper Abstracts and Bodies via Multiprocessing for Further Text Mining and NLP Analysis <br>
Performance comparison: <br>
Cleaning ~8000 abstracts: <br>
267 s (apply w/o optimizations); 135 s (threads); 114 s (processes) <br>
Cleaning 2000 bodies: <br>
667 s (apply w/o optimizations); 293 s (threads); 256 s (processes) <br>
Cleaning all bodies: <br>
1080 s (processes)

In [8]:
import dask.dataframe as dd

treatment_papers_data = dd.from_pandas(treatment_papers_df, npartitions=30)
tokenizer = nlp.Defaults.create_tokenizer(nlp)

In [9]:
start = time.time()
treatment_papers_df['cleaned_abstract'] = treatment_papers_data.map_partitions(
    lambda df: df['abstract'].apply(
        lambda abstract: clean_paper(abstract, tokenizer)
    )
).compute(scheduler='processes')
end = time.time()

print('Exec time of cleaning paper abstracts (on multiple processes): ', end - start)

Exec time of cleaning paper abstracts (on multiple processes):  108.02366828918457


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
start = time.time()
treatment_papers_df['cleaned_body'] = treatment_papers_data.map_partitions(
    lambda df: df['body'].apply(
        lambda body: clean_paper(body, tokenizer)
    )
).compute(scheduler='processes')
end = time.time()

print('Exec time of cleaning paper bodies (on multiple processes): ', end - start)

Exec time of cleaning paper bodies (on multiple processes):  1085.143699645996


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# Save to CSV file; don't want to recompute cleaned text
treatment_papers_df.to_csv('treatment_papers_cleaned.csv')

In [12]:
treatment_papers_df = pd.read_csv('treatment_papers_cleaned.csv')

### Task: What do we know about vaccines or therapeutics? 
* Effectiveness of drugs being developed and used to treat patients <br>
* Potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients <br>
* Exploration of use of best animal models and their predictive value for a human vaccine <br>
* Capabilities to discover a therapeutic for the disease, and clinical effectiveness studies to discover therapeutics <br>
* Alternative models in prioritizing and distributing scarce, newly proven therapeutics and vaccines at scale <br>
* Efforts targeted at a universal coronavirus vaccine <br>
* Efforts to develop animal models and standardize challenge studies <br>
* Efforts to develop prophylaxis clinical studies and prioritize in healthcare workers <br>
* Approaches to evaluate risk for enhanced disease after vaccination <br>
* Assays to evaluate vaccine immune response and process development for vaccines <br>

### First Strategy: Dumb Filtering/Regex

In [13]:
drug_papers_df = treatment_papers_df[treatment_papers_df['cleaned_abstract'].str.contains('drug')]
drug_abstracts = drug_papers_df['abstract'].values  # numpy arrays
drug_bodies = drug_papers_df['body'].values  # numpy arrays

In [14]:
for abstract in drug_abstracts:
    sentences = abstract.split('.')
    for sentence in sentences:
        if 'drug' in sentence:
            print('Sentence: ', sentence)
    break  # Stop at 1 abstract

Sentence:  The appearance of a new dangerous and contagious disease requires the development of a drug therapy faster than what is foreseen by usual mechanisms
Sentence:   Many drug therapy developments consist in investigating through different clinical trials the effects of different specific drug combinations by delivering it into a test group of ill patients, meanwhile a placebo treatment is delivered to the remaining ill patients, known as the control group
Sentence:   We compare the above technique to a new technique in which all patients receive a different and reasonable combination of drugs and use this outcome to feed a Neural Network
Sentence:   By averaging out fluctuations and recognizing different patient features, the Neural Network learns the pattern that connects the patients initial state to the outcome of the treatments and therefore can predict the best drug therapy better than the above method
Sentence:   In contrast to many available works, we do not study any det

### Second Strategy: Productionize Elasticsearch implementation
We'll get a blazing fast implementation with easily pluggable search and text analysis functions from Elasticsearch. <br>
It's also excellent practice for deploying backends as lightweight Docker containers.

Database indexes are data structures used to speed up queries and retrievals on database records. These indexes will store a field and a reference to its corresponding record in a data structure like B-Trees. <br>
B-Trees are especially suited for fast search operations even after many insertions and deletions (due to it storing keys in sorted order and fast rebalancing operations). <br>
Insert, Delete, Search: O(t log_t(n)) where n = # keys, t = # keys in node, # disk operations = O(log_t(n)) <br> <br>

Elasticsearch, on the other hand, uses **inverted indexes**. Each word is indexed and points back to document(s) in which it was found and its location within the document. <br>

Start elasticsearch container (i.e. server) by building docker-compose.yml file: ```sudo `which docker-compose` up -d --build```

In [21]:
from elasticsearch import Elasticsearch, RequestError

es = Elasticsearch(hosts=["localhost"])
try:
    es.indices.create(index='covid19_papers')
except RequestError as e:
    print("Index covid19_papers already exists; continue with execution: ", str(e))

Index covid19_papers already exists; continue with execution:  RequestError(400, 'resource_already_exists_exception', 'index [covid19_papers/X4wA09UQRQSfmc3oxjUUig] already exists')


In [15]:
def rec_to_actions(df, index, data_type):
    for record in df.to_dict(orient="records"):
        yield('{ "index" : { "_index" : "%s", "_type" : "%s" }}'% (index, data_type))
        yield(json.dumps(record))

In [16]:
INDEX = 'covid19_papers'
DATA_TYPE = 'record'

def upload_papers_to_es_idx(es_idx, data_type):
    chunk_size = 1000
    idx = 0
    while idx < papers_df.shape[0]:
        if idx + chunk_size < papers_df.shape[0]:
            max_idx = idx + chunk_size
        else:
            max_idx = papers_df.shape[0]
        print('Range: ', idx, max_idx)
        
        r = es.bulk(rec_to_actions(papers_df[idx:max_idx], es_idx, data_type))
        print(not r["errors"])
        
        idx = max_idx

In [17]:
res = es.search(
    index="covid19_papers", 
    body={
        "from": 0,
        "size": 20,
        "query": {"match_all": {}}
    }
)
for hit in res['hits']['hits']:
    print("%(paper_id)s | %(title)s: %(abstract)s" % hit["_source"])
    print("\n\n")

which local spread has already occurred and testing availability is delayed. Since stay-at-home orders reduce infection growth rates, early implementation when infection counts are still low would be most beneficial.
Funding None.



c0b561d09eef775c862a1bee9559f1c707f42e97 | Evidence of economic segregation from mobility lockdown during COVID-19 epidemic: In response to the COVID-19 pandemic, National governments have applied lockdown restrictions to reduce the infection rate. We perform a massive analysis on near real-time Italian data provided by Facebook to investigate how lockdown strategies affect economic conditions of individuals and local governments. We model the change in mobility as an exogenous shock similar to a natural disaster. We identify two ways through which mobility restrictions affect Italian citizens. First, we find that the impact of lockdown is stronger in municipalities with higher fiscal capacity. Second, we find a segregation effect, since mobility restricti