In [11]:
import json
import os
import pandas as pd
from pprint import pprint
import spacy
nlp = spacy.load('en_core_web_sm')

In [27]:
def gather_paper_data(dirs, papers_info):
    for d in dirs:
        papers = os.listdir(d)
        
        for paper in papers:
            paper_path = os.path.join(d, paper)
            
            if os.path.isdir(paper_path):
                gather_paper_data([paper_path], papers_info)
            else:
                with open(paper_path, 'rb') as f:
                    file_data = json.load(f)

                    paper_id = file_data['paper_id']
                    title = file_data['metadata']['title']

                    try:
                        abstract_paragraphs = file_data['abstract']
                    except KeyError:  # Note: this occurs for pmc_json files since none of them have an abstract
                        abstract_paragraphs = []
                    abstract = []
                    for paragraph in abstract_paragraphs:
                        abstract.append(paragraph['text'])
                    abstract = '\n'.join(abstract)

                    try:
                        body_paragraphs = file_data['body_text']
                    except KeyError:
                        body_paragraphs = []
                    body = []
                    for paragraph in body_paragraphs:
                        body.append(paragraph['text'])
                    body = '\n'.join(body)

                    papers_info.append([paper_id, title, abstract, body])

    return papers_info

In [32]:
def filter_paper(text, keywords):
    text = ([word.lower().strip() for word in text.split(' ')])
    for keyword in keywords:
        keyword_parts = keyword.split(' ')
        if all(word in text for word in keyword_parts):
            return True
    return False

In [28]:
# TODO: Profile this method and see which part is most time consuming and find ways to optimize it
def clean_paper(text):
    cleaned_text = []
    stopwords = spacy.lang.en.stop_words.STOP_WORDS

    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    text_doc = nlp(text)
    text_tokens = ' '.join([
        token.lemma_.lower().strip() for token in text_doc 
        if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-'
    ])
    
    return text_tokens

In [29]:
papers_info = []
gather_paper_data(['arxiv', 'biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset'], papers_info)
papers_df = pd.DataFrame(papers_info, columns=['paper_id', 'title', 'abstract', 'body'])
papers_df.shape

(27299, 4)

### Task: What do we know about vaccines or therapeutics? 
* Effectiveness of drugs being developed and used to treat patients <br>
* Potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients <br>
* Exploration of use of best animal models and their predictive value for a human vaccine <br>
* Capabilities to discover a therapeutic for the disease, and clinical effectiveness studies to discover therapeutics <br>
* Alternative models in prioritizing and distributing scarce, newly proven therapeutics and vaccines at scale <br>
* Efforts targeted at a universal coronavirus vaccine <br>
* Efforts to develop animal models and standardize challenge studies <br>
* Efforts to develop prophylaxis clinical studies and prioritize in healthcare workers <br>
* Approaches to evaluate risk for enhanced disease after vaccination <br>
* Assays to evaluate vaccine immune response and process development for vaccines <br>

In [33]:
# First Strategy: Use regex to search for keywords and extract relevant information in that context
treatment_task_keywords = [
    'drug', 'patients', 'therapeutic', 'vaccine', 'animal', 'clinical', 'trial', 'prophylaxis', 'prophylactic',
    'distribution', 'studies', 'immunity', 'model', 'prioritize', 'distribute'
]
treatment_papers_df = papers_df[
    papers_df.apply(lambda paper: filter_paper(paper['abstract'], treatment_task_keywords), axis=1)
]
treatment_papers_df.shape

(8223, 4)

In [34]:
treatment_papers_df['cleaned_abstract'] = treatment_papers_df['abstract'].apply(
    lambda abstract: clean_paper(abstract)
)
treatment_papers_df['cleaned_body'] = treatment_papers_df['body'].apply(
    lambda body: clean_paper(body)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


KeyboardInterrupt: 

In [None]:
print(treatment_papers_df['abstract'].head())
print(treatment_papers_df['cleaned_abstract'].head())