# Abstract
The objective of this notebook will be to perform keyword analysis on the papers for Covid-19 risk factors.

In [1]:
import re
import glob
import json
from os import path
import pandas as np
import pandas as pd
from datetime import datetime

In [None]:
papers_df = pd.read_csv('../output/papers_df.csv', 
                        index_col=0, 
                        keep_default_na=False,
                       parse_dates=['publish_time'])

In [2]:
risk_factors = ['diabetes','hypertension', 'heart disease','cancer',
                'smoking status','history of lung disease','local climate',
                'elderly','small children','immune compromised groups', 
                'age deciles among adults between the ages of 15 and 65', 
                'race/ethnicity', 'education', 'income', 'insurance status',
                 'housing status', 'immigration status', 'prison inmates', 
                'mental hospital inpatients', 'long-term care facility residents',
                'health workers','first responders', 'hospital staff',
                'nursing home staff', 'prison staff', 'pregnancy status','neonates']

In [3]:
transmission_keywords = ['transmi', 'sneez', 'contact trac', 'reproduc', 'environ']

In [4]:
smoke_keywords = ['smok']

In [5]:
covid_keywords = ['COVID-19', 'HCoV-19', 'CORD-19' ,'2019-nCoV', 'Wuhan coronavirus', 'SARS-CoV-2', 'SARS-COV-2']
covid_keywords = [word.lower() for word in covid_keywords]

In [14]:
design_list = [ 'retrospective cohort', 'cross-sectional case-control','prevalence survey', 
               'systematic review , meta-analysis','matched case-control',
               'medical records review', 'observational case series','time series analysis',
               'pseudo-randomized controlled trials' ,'randomized controlled trials']

In [6]:
# covid_papers = papers_df[papers_df['text_body'].apply(lambda x : 
#                                                 any(key in x.lower() for
#                                                     key in covid_keywords)
#                                                )]

In [7]:
def tokenize(text):
    text = text.lower().strip()
    if len(text) == 0:
        return ''
    if text[-1] == 's':
        text = text[:-1]
    return text

## Creating a list of paper dictionaries

In [8]:
json_files = glob.glob(f'../data/**/*.json', recursive=True)

In [11]:
papers = []
for file in json_files:
    name = path.basename(file)
    with open(file) as f:
        data = json.load(f)
    paper_obj = {'id':name,
                 'data':data}
    papers.append(paper_obj)    

### Filtering for covid related papers

In [12]:
covid_papers = []
for paper in papers:
    body = paper['data']['body_text']
    for paragraph in body:
        text = paragraph['text']
        match = any(key.lower() in text.lower()
                   for key in covid_keywords)
        if match:
            covid_papers.append(paper)
            break

In [13]:
len(covid_papers)

1367

## Analysis for different sections in the papers

In [None]:
sections = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    paper_sections = set()
    for paragraph in body:
        if 'section' in paragraph:
            paper_sections.add(paragraph['section'])
    
    for section in paper_sections:
        section = tokenize(section)
        if section in sections:
            sections[section] += 1
        else:
            sections[section] = 1

In [None]:
sorted(sections.items(), key=lambda x: x[1], reverse=True)

### Sections vs Designs

In [None]:
design_secs = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    relevant = set()
    for paragraph in body:
        text = paragraph['text']
        if(any(design in text.lower() for design in design_list)):
            relevant.add(paragraph['section'])
    for section in relevant:
        section = tokenize(section)
        if section in design_secs:
            design_secs[section] += 1
        else:
            design_secs[section] = 1

### Frequency table of sections where at least one of the designs appeared the most times

In [None]:
sorted(design_secs.items(), key=lambda x: x[1], reverse=True)

In [60]:
design_pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(design_list)
risk_pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(risk_factors)
design_regex = re.compile(design_pattern, re.DOTALL)
risk_regex = re.compile(risk_pattern, re.DOTALL)


match_arr = []
for paper in covid_papers:
    body = json.dumps(paper['data']['body_text'])
    designs = design_regex.findall(body)
    risks = risk_regex.findall(body)
    match_obj = {}
    if designs:
        match_obj['design'] = [(match[3], match[0]) for match in designs]
    if risks:
        match_obj['risk'] = [(match[3], match[0]) for match in risks]
    if match_obj:
        match_obj['paper'] = paper['data']['metadata']['title']
        match_arr.append(match_obj)

In [65]:
risk_design_same_section = []
for match in match_arr:
    if 'design' not in match:
        continue
    designs = match['design']
    for design in designs:
        for factor in risk_factors:
            if factor in design[0]:
                risk_design_same_section.append(match)

In [66]:
len(risk_design_same_section)

0

There seem to be no sections that mention both a design choice and a risk factor. So, the next analysis will check if the same paper mentions it, albeit in different sections

In [64]:
for match in match_arr:
    if len(match.keys()) > 2:
        print(match)

{'design': [('Corticosteroids and NSAIDs', 'It has now been more than 70 years that corticosteroids (CS) are pivotal for RA management and their role as remission inducer and bridging therapy for the management of disease flare has recently been renewed by the latest update of EULAR recommendations for RA treatment [36] . Even though CS efficacy in rapidly suppressing inflammation during RA initial course or flares is well recognized [37] , their downside is the broad spectrum of adverse events, including severe infections and the high risk of developing comorbidities further increasing the risk of infection [27, 38] . Although RCTs conducted in the past with CS showed no higher risk of infections in RA patients [39] [40] [41] , cohort and case-control studies reported increased rates of overall infections in RA patients treated with CS, according to a dosedependent fashion [42] . The majority of these infectious events are of bacterial etiology, but RA patients receiving CS exhibit a 

## Design vs Risk Analysis

In [32]:
print(words)

['retrospective cohort', 'cross-sectional case-control', 'prevalence survey', 'systematic review , meta-analysis', 'matched case-control', 'medical records review', 'observational case series', 'time series analysis', 'pseudo-randomized controlled trials', 'randomized controlled trials']


In [50]:
print(risk_factors)

['diabetes', 'hypertension', 'heart disease', 'cancer', 'smoking status', 'history of lung disease', 'local climate', 'elderly', 'small children', 'immune compromised groups', 'age deciles among adults between the ages of 15 and 65', 'race/ethnicity', 'education', 'income', 'insurance status', 'housing status', 'immigration status', 'prison inmates', 'mental hospital inpatients', 'long-term care facility residents', 'health workers', 'first responders', 'hospital staff', 'nursing home staff', 'prison staff', 'pregnancy status', 'neonates']


In [38]:
print(match_arr[0][1])

It has now been more than 70 years that corticosteroids (CS) are pivotal for RA management and their role as remission inducer and bridging therapy for the management of disease flare has recently been renewed by the latest update of EULAR recommendations for RA treatment [36] . Even though CS efficacy in rapidly suppressing inflammation during RA initial course or flares is well recognized [37] , their downside is the broad spectrum of adverse events, including severe infections and the high risk of developing comorbidities further increasing the risk of infection [27, 38] . Although RCTs conducted in the past with CS showed no higher risk of infections in RA patients [39] [40] [41] , cohort and case-control studies reported increased rates of overall infections in RA patients treated with CS, according to a dosedependent fashion [42] . The majority of these infectious events are of bacterial etiology, but RA patients receiving CS exhibit a greater risk of developing even viral infect

In [49]:
for factor in risk_factors:
    for match in match_arr:
        if factor.lower() in match[1].lower():
            print(factor, '\n', match)
            print('=======================================================================================')