# Abstract
The objective of this notebook is to perform keyword analysis on the papers in order to find those that discuss COVID-19 risk factors.

In [1]:
import re
import glob
import json
from os import path
import pandas as np
import pandas as pd
from datetime import datetime

In [None]:
# Load the pre-built papers table.
#   index_col=0           -> the first CSV column becomes the index
#   keep_default_na=False -> empty strings stay strings instead of becoming NaN
#   parse_dates           -> 'publish_time' is parsed into datetimes
papers_df = pd.read_csv(
    '../output/papers_df.csv',
    index_col=0,
    keep_default_na=False,
    parse_dates=['publish_time'],
)

In [2]:
# Risk-factor phrases of interest, one per line so additions diff cleanly.
risk_factors = [
    'diabetes',
    'hypertension',
    'heart disease',
    'cancer',
    'smoking status',
    'history of lung disease',
    'local climate',
    'elderly',
    'small children',
    'immune compromised groups',
    'age deciles among adults between the ages of 15 and 65',
    'race/ethnicity',
    'education',
    'income',
    'insurance status',
    'housing status',
    'immigration status',
    'prison inmates',
    'mental hospital inpatients',
    'long-term care facility residents',
    'health workers',
    'first responders',
    'hospital staff',
    'nursing home staff',
    'prison staff',
    'pregnancy status',
    'neonates',
]

In [3]:
# Word stems related to transmission; stems (e.g. 'transmi') are meant to be
# used as substring probes so that inflected forms match as well.
transmission_keywords = [
    'transmi',
    'sneez',
    'contact trac',
    'reproduc',
    'environ',
]

In [4]:
# Stem covering smoking-related terms (a substring of 'smoke', 'smoking', 'smoker', ...).
smoke_keywords = [
    'smok',
]

In [5]:
# Names under which COVID-19 appears in the literature, stored lower-cased so
# they can be compared against lower-cased paragraph text.
# Note: 'SARS-CoV-2' and 'SARS-COV-2' collapse to the same lower-case string,
# so the resulting list contains that entry twice.
covid_keywords = [
    keyword.lower()
    for keyword in (
        'COVID-19',
        'HCoV-19',
        'CORD-19',
        '2019-nCoV',
        'Wuhan coronavirus',
        'SARS-CoV-2',
        'SARS-COV-2',
    )
]

In [14]:
# Study-design phrases. They are matched as plain substrings of lower-cased
# paragraph text, so every entry must be lower-case and must be a phrase that
# can literally occur in running text.
design_list = [
    'retrospective cohort',
    'cross-sectional case-control',
    'prevalence survey',
    # These two were previously a single entry, 'systematic review , meta-analysis',
    # whose embedded " , " meant it would practically never occur as a substring.
    'systematic review',
    'meta-analysis',
    'matched case-control',
    'medical records review',
    'observational case series',
    'time series analysis',
    'pseudo-randomized controlled trials',
    'randomized controlled trials',
]

In [6]:
# covid_papers = papers_df[papers_df['text_body'].apply(lambda x : 
#                                                 any(key in x.lower() for
#                                                     key in covid_keywords)
#                                                )]

In [7]:
def tokenize(text):
    """Normalise a heading so that trivially different spellings compare equal.

    The text is lower-cased and stripped of surrounding whitespace, then a
    single trailing 's' (if any) is removed as a crude singularisation
    (e.g. 'Methods' -> 'method'; note that 'analysis' -> 'analysi').

    Returns '' when the input is empty or only whitespace.
    """
    normalized = text.lower().strip()
    if not normalized:
        return ''
    return normalized[:-1] if normalized.endswith('s') else normalized

## Creating a list of paper dictionaries

In [8]:
# Recursively collect every JSON file under ../data.
# glob returns paths in an unspecified order, so the list is sorted to make
# positional access (e.g. json_files[0]) and downstream ordering reproducible.
json_files = sorted(glob.glob('../data/**/*.json', recursive=True))

In [11]:
# Parse every JSON file into a dict of the form {'id': <file name>, 'data': <parsed JSON>}.
papers = []
for file in json_files:
    # The base name of the file (extension included) serves as the paper id.
    name = path.basename(file)
    # JSON is UTF-8 by specification; state it explicitly instead of relying
    # on the platform's default text encoding.
    with open(file, encoding='utf-8') as f:
        data = json.load(f)
    papers.append({'id': name, 'data': data})

### Filtering for COVID-19-related papers

In [12]:
# Keep a paper as soon as one of its body paragraphs mentions any COVID-19
# keyword (case-insensitive substring match); each paper is added at most once.
covid_papers = []
for paper in papers:
    for paragraph in paper['data']['body_text']:
        lowered = paragraph['text'].lower()
        if any(key.lower() in lowered for key in covid_keywords):
            covid_papers.append(paper)
            break  # one matching paragraph is enough for this paper

In [13]:
# Number of papers with at least one body paragraph that mentions a COVID-19 keyword.
len(covid_papers)

1367

## Analysis for different sections in the papers

In [None]:
# For every normalised section heading, count how many COVID-19 papers contain it.
sections = {}
for paper in covid_papers:
    # Normalise *before* de-duplicating: otherwise headings that differ only in
    # case, whitespace or a trailing 's' ("Method" / "Methods") would each
    # survive in the set and the same paper would be counted more than once
    # for the same normalised heading.
    paper_sections = {
        tokenize(paragraph['section'])
        for paragraph in paper['data']['body_text']
        if 'section' in paragraph  # paragraphs without a section are skipped
    }
    for section in paper_sections:
        sections[section] = sections.get(section, 0) + 1

In [None]:
# Section headings ranked by the number of papers they occur in, most common first.
sorted(
    sections.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

### Sections vs Designs

In [None]:
# For every normalised section heading, count how many COVID-19 papers mention
# at least one study design inside that section.
design_secs = {}
for paper in covid_papers:
    relevant = set()
    for paragraph in paper['data']['body_text']:
        # Lower-case once per paragraph rather than once per design phrase.
        text = paragraph['text'].lower()
        if any(design in text for design in design_list):
            # Normalise before adding to the set so that heading variants
            # ("Method" / "Methods") cannot make one paper count twice.
            # A paragraph lacking a 'section' key is filed under '' (the same
            # bucket tokenize() yields for an empty heading) instead of
            # raising KeyError.
            relevant.add(tokenize(paragraph.get('section', '')))
    for section in relevant:
        design_secs[section] = design_secs.get(section, 0) + 1

### Frequency table of sections in which at least one of the study designs is mentioned, most frequent first

In [None]:
# Sections ranked by the number of papers that mention a study design in them,
# most common first.
sorted(
    design_secs.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [18]:
# Proof of concept: search the *raw JSON text* of a single file for paragraphs
# that mention one of `words`, capturing the paragraph together with the
# "section" value that follows it.
words = ['transmission', 'the stability']
# The words are interpolated unescaped, i.e. they are treated as regex
# fragments; wrap them in re.escape() before adding terms that contain regex
# metacharacters.
# Limitations visible from the pattern itself:
#   * it expects exactly one space after the colon in "text": / "section":
#   * [^"]* cannot cross a quote character, so a paragraph containing an
#     escaped quote (\") before the keyword will not be matched.
pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(words)
print(pattern)
# DOTALL lets '.' also consume newlines between the "text" and "section" fields.
regex = re.compile(pattern, re.DOTALL)

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(json_files[0], "r", encoding="utf-8") as infile:
    filetext = infile.read()

# findall() yields one 4-tuple per hit, one item per capture group:
#   [0] paragraph text (up to its closing quote)
#   [1] the word that matched
#   [2] last character consumed before "section" ('' if none)
#   [3] section name
results = regex.findall(filetext)
#for match in results:
    #print ('Match in section "%s":\n %s \n-------------------' % (match[3], match[0]))

"text": "([^"]*(transmission|the stability)[^"]*)((?!"section").)*"section": "([^"]*)"


## Design vs Risk Analysis

In [None]:
for paper in papers:
    