# Abstract
The objective of this notebook will be to perform keyword analysis on the papers for Covid-19 risk factors.

In [68]:
import re
import glob
import json
from os import path
import pandas as np
import pandas as pd
from datetime import datetime

In [242]:
sample = pd.read_csv('../../../Downloads/co_infect.csv', index_col = 0, keep_default_na=False)

In [245]:
sample['is_covid_related'] = sample['doc_id'].apply(lambda x: x.lower() in covid_papers['doc_id'].values)
sample = sample.merge(papers_df, how='left', on='doc_id')
sample = sample.rename(columns={'text': 'section_text'})
sample = sample.sort_values(by='is_covid_related').reset_index(drop=True)
sample.to_json('../../../Downloads/co_infect.json', orient='records')

In [7]:
def tokenize(text):
    text = text.lower().strip()
    if len(text) == 0:
        return ''
    if text[-1] == 's':
        text = text[:-1]
    return text

In [69]:
papers_df = pd.read_csv('../output/papers_df.csv', 
                        index_col=0, 
                        keep_default_na=False,
                       parse_dates=['publish_time'])

In [122]:
sample = papers_df[papers_df['doc_id'] == '5dc4268a42adf3d5c55c87b7f6518de600b057c5']['text_body'].iloc[0]
any(key.lower() in sample.lower() for key in covid_keywords)

True

In [14]:
design_list = [ 'retrospective cohort', 'cross-sectional case-control','prevalence survey', 
               'systematic review , meta-analysis','matched case-control',
               'medical records review', 'observational case series','time series analysis',
               'pseudo-randomized controlled trials' ,'randomized controlled trials']

In [70]:
risk_factors = ['diabetes','hypertension', 'heart disease','cancer',
                'smoking status','history of lung disease','local climate',
                'elderly','small children','immune compromised groups', 
                'age deciles among adults between the ages of 15 and 65', 
                'race/ethnicity', 'education', 'income', 'insurance status',
                 'housing status', 'immigration status', 'prison inmates', 
                'mental hospital inpatients', 'long-term care facility residents',
                'health workers','first responders', 'hospital staff',
                'nursing home staff', 'prison staff', 'pregnancy status','neonates']

In [71]:
transmission_keywords = ['transmi', 'sneez', 'contact trac', 'reproduc', 'environ']

In [72]:
smoke_keywords = ['smok']

In [150]:
covid_keywords = ['COVID-19', 'HCoV-19', 'CORD-19' ,'2019-nCoV', 'Wuhan coronavirus', 'SARS-CoV-2', 'covid']
covid_keywords = [word.lower() for word in covid_keywords]

In [215]:
covid_papers = papers_df[papers_df['text_body'].apply(lambda x: 
                                                     any(key.lower() in x.lower()
                                                        for key in covid_keywords))]

In [152]:
covid_papers.head(2)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
14116,8605e1247b39f6fa0d909b356eb198aacd09843c,Modeling the dynamics of novel coronavirus (20...,The present paper describes the mathematical m...,One of the greatest assignments given to human...,2020-03-14,"Khan, Muhammad Altaf; Atangana, Abdon",Alexandria Engineering Journal,10.1016/j.aej.2020.02.033,
14180,24b996db83d4d2622d87fb583e6dc6fa277a636f,Clinical and CT imaging features of 2019 novel...,,"Dear Editor, Tang JW, et al. and colleagues ha...",2020-03-03,"Zhu, Ying; Liu, Yang-Li; Li, Zi-Ping; Kuang, J...",Journal of Infection,10.1016/j.jinf.2020.02.022,90.0


In [153]:
covid_papers.shape

(1275, 9)

In [154]:
risk_df = pd.read_csv('../../../Downloads/risk_doc.csv', index_col = 0)

In [162]:
risk_df['is_covid_related'] = risk_df['doc_id'].apply(lambda x: x.lower() in covid_papers['doc_id'].values)

In [233]:
risk_covid_join = risk_df.merge(papers_df, how='left', on='doc_id')

In [235]:
risk_covid_join = risk_covid_join.rename(columns={
    'text': 'relevant_section',
    'study': 'design',
    'text_body': 'full_text'
})

In [236]:
risk_covid_join = risk_covid_join.sort_values(by='is_covid_related').reset_index(drop=True)

In [179]:
risk_df.to_csv('../../../Downloads/risk_doc.csv')

In [238]:
risk_covid_join.to_json('../../../Downloads/risk_covid_join.json', orient='records')

In [206]:
relevant_docs = []
for doc_id in set(covid_risk_df['doc_id'].values):
    for paper in covid_papers:
        if paper['id'].split('.')[0] == doc_id:
            text = json.dumps(paper['data']['body_text']).lower()
            break
    relevant_docs.append({
        'doc_id': doc_id,
        'text': text,
        'title': paper['data']['metadata']['title']
    })

In [212]:
covid_pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(covid_keywords)
covid_regex = re.compile(covid_pattern, re.DOTALL)


match_arr = []
for doc in relevant_docs:
    title = doc['title']
    doc_id = doc['doc_id']
    text = doc['text'].lower()
    matches = covid_regex.findall(text)
    match_obj = {}
    if matches:
        match_obj['title'] = title
        match_obj['matches'] = [(match[0], match[1]) for match in designs]
        match_obj['doc_id'] = doc_id
        match_arr.append(match_obj)

In [213]:
[match['title'] for match in match_arr]

['Community acquired respiratory virus infections in cancer patientsdGuideline on diagnosis and management by the Infectious Diseases Working Party of the German Society for haematology and Medical Oncology',
 "Minimising prescribing errors in the ICU DJ Melia, S Saha Queen' s Hospital, Romford, UK Critical Care",
 'Coronavirus Disease 2019 (COVID-19) Pandemic and Pregnancy',
 'P001 Sepsis impairs the capillary response within hypoxic capillaries and decreases erythrocyte oxygen-dependent ATP efflux P002 Lower serum immunoglobulin G2 level does not predispose to severe flu']

In [178]:
for doc in relevant_docs:
    doc_id = doc['doc_id']
    text = doc['text'].lower()
    for key in covid_keywords:
        if key in text:
            print(doc_id, key)

bf49df2796b5ffe29995062a4359dbb926777bfc covid
5dc4268a42adf3d5c55c87b7f6518de600b057c5 covid-19
5dc4268a42adf3d5c55c87b7f6518de600b057c5 sars-cov-2
5dc4268a42adf3d5c55c87b7f6518de600b057c5 covid
5dc4268a42adf3d5c55c87b7f6518de600b057c5 covid-19
5dc4268a42adf3d5c55c87b7f6518de600b057c5 sars-cov-2
5dc4268a42adf3d5c55c87b7f6518de600b057c5 covid
6aadca94314fe7e81e278fbbef178f2a5bf4f538 covid
6aadca94314fe7e81e278fbbef178f2a5bf4f538 covid
b30770ae30b35cdfaf0a173863e74e93edbb0329 covid
b30770ae30b35cdfaf0a173863e74e93edbb0329 covid


## Creating a list of paper dictionaries

In [180]:
json_files = glob.glob(f'../data/**/*.json', recursive=True)

In [181]:
papers = []
for file in json_files:
    name = path.basename(file)
    with open(file) as f:
        data = json.load(f)
    paper_obj = {'id':name,
                 'data':data}
    papers.append(paper_obj)    

### Filtering for covid related papers

In [182]:
covid_papers = []
for paper in papers:
    body = paper['data']['body_text']
    for paragraph in body:
        text = paragraph['text']
        match = any(key.lower() in text.lower()
                   for key in covid_keywords)
        if match:
            covid_papers.append(paper)
            break

In [13]:
len(covid_papers)

1367

## Analysis for different sections in the papers

In [None]:
sections = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    paper_sections = set()
    for paragraph in body:
        if 'section' in paragraph:
            paper_sections.add(paragraph['section'])
    
    for section in paper_sections:
        section = tokenize(section)
        if section in sections:
            sections[section] += 1
        else:
            sections[section] = 1

In [None]:
sorted(sections.items(), key=lambda x: x[1], reverse=True)

### Sections vs Designs

In [None]:
design_secs = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    relevant = set()
    for paragraph in body:
        text = paragraph['text']
        if(any(design in text.lower() for design in design_list)):
            relevant.add(paragraph['section'])
    for section in relevant:
        section = tokenize(section)
        if section in design_secs:
            design_secs[section] += 1
        else:
            design_secs[section] = 1

#### **Frequency table of sections where at least one of the designs appeared the most times**

In [None]:
sorted(design_secs.items(), key=lambda x: x[1], reverse=True)

## Analysis on risk factors alongside design choices
Extracting papers mentionning risk factors for covid-19. Eventual goal is to rank these papers by the quality of their design choices.

In [60]:
design_pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(design_list)
risk_pattern = r'"text": "([^"]*(%s)[^"]*)((?!"section").)*"section": "([^"]*)"' % "|".join(risk_factors)
design_regex = re.compile(design_pattern, re.DOTALL)
risk_regex = re.compile(risk_pattern, re.DOTALL)


match_arr = []
for paper in covid_papers:
    body = json.dumps(paper['data']['body_text'])
    designs = design_regex.findall(body)
    risks = risk_regex.findall(body)
    match_obj = {}
    if designs:
        match_obj['design'] = [(match[3], match[0]) for match in designs]
    if risks:
        match_obj['risk'] = [(match[3], match[0]) for match in risks]
    if match_obj:
        match_obj['paper'] = paper['data']['metadata']['title']
        match_arr.append(match_obj)

In [65]:
risk_design_same_section = []
for match in match_arr:
    if 'design' not in match:
        continue
    designs = match['design']
    for design in designs:
        for factor in risk_factors:
            if factor in design[0]:
                risk_design_same_section.append(match)

In [66]:
len(risk_design_same_section)

0

There seem to be no sections that mention both a design choice and a risk factor. So, the next analysis will check if the same paper mentions it, albeit in different sections

In [64]:
for match in match_arr:
    if len(match.keys()) > 2:
        print(match)

{'design': [('Corticosteroids and NSAIDs', 'It has now been more than 70 years that corticosteroids (CS) are pivotal for RA management and their role as remission inducer and bridging therapy for the management of disease flare has recently been renewed by the latest update of EULAR recommendations for RA treatment [36] . Even though CS efficacy in rapidly suppressing inflammation during RA initial course or flares is well recognized [37] , their downside is the broad spectrum of adverse events, including severe infections and the high risk of developing comorbidities further increasing the risk of infection [27, 38] . Although RCTs conducted in the past with CS showed no higher risk of infections in RA patients [39] [40] [41] , cohort and case-control studies reported increased rates of overall infections in RA patients treated with CS, according to a dosedependent fashion [42] . The majority of these infectious events are of bacterial etiology, but RA patients receiving CS exhibit a 

## Design vs Risk Analysis

In [32]:
print(words)

['retrospective cohort', 'cross-sectional case-control', 'prevalence survey', 'systematic review , meta-analysis', 'matched case-control', 'medical records review', 'observational case series', 'time series analysis', 'pseudo-randomized controlled trials', 'randomized controlled trials']


In [50]:
print(risk_factors)

['diabetes', 'hypertension', 'heart disease', 'cancer', 'smoking status', 'history of lung disease', 'local climate', 'elderly', 'small children', 'immune compromised groups', 'age deciles among adults between the ages of 15 and 65', 'race/ethnicity', 'education', 'income', 'insurance status', 'housing status', 'immigration status', 'prison inmates', 'mental hospital inpatients', 'long-term care facility residents', 'health workers', 'first responders', 'hospital staff', 'nursing home staff', 'prison staff', 'pregnancy status', 'neonates']


In [38]:
print(match_arr[0][1])

It has now been more than 70 years that corticosteroids (CS) are pivotal for RA management and their role as remission inducer and bridging therapy for the management of disease flare has recently been renewed by the latest update of EULAR recommendations for RA treatment [36] . Even though CS efficacy in rapidly suppressing inflammation during RA initial course or flares is well recognized [37] , their downside is the broad spectrum of adverse events, including severe infections and the high risk of developing comorbidities further increasing the risk of infection [27, 38] . Although RCTs conducted in the past with CS showed no higher risk of infections in RA patients [39] [40] [41] , cohort and case-control studies reported increased rates of overall infections in RA patients treated with CS, according to a dosedependent fashion [42] . The majority of these infectious events are of bacterial etiology, but RA patients receiving CS exhibit a greater risk of developing even viral infect

In [49]:
for factor in risk_factors:
    for match in match_arr:
        if factor.lower() in match[1].lower():
            print(factor, '\n', match)
            print('=======================================================================================')