# Abstract
The objective of this notebook will be to perform keyword analysis on the papers for Covid-19 risk factors.

In [19]:
import re
import glob
import json
from os import path
import pandas as np
import pandas as pd
from datetime import datetime

In [3]:
papers_df = pd.read_csv('../output/papers_df.csv', 
                        index_col=0, 
                        keep_default_na=False,
                       parse_dates=['publish_time'])

In [4]:
transmission_keywords = ['transmi', 'sneez', 'contact trac', 'reproduc', 'environ']

In [5]:
smoke_keywords = ['smok']

In [6]:
covid_keywords = ['COVID-19', 'HCoV-19', 'CORD-19' ,'2019-nCoV', 'Wuhan coronavirus', 'SARS-CoV-2', 'SARS-COV-2']
covid_keywords = [word.lower() for word in covid_keywords]

In [71]:
design_list = [ 'retrospective cohort', 'cross-sectional case-control','prevalence survey', 
               'systematic review , meta-analysis','matched case-control',
               'medical records review', 'observational case series','time series analysis',
               'pseudo-randomized controlled trials' ,'randomized controlled trials']

In [11]:
# covid_papers = papers_df[papers_df['text_body'].apply(lambda x : 
#                                                 any(key in x.lower() for
#                                                     key in covid_keywords)
#                                                )]

In [62]:
def tokenize(text):
    text = text.lower().strip()
    if len(text) == 0:
        return ''
    if text[-1] == 's':
        text = text[:-1]
    return text

## Creating a list of paper dictionaries

In [22]:
json_files = glob.glob(f'../data/**/*.json', recursive=True)

In [24]:
paper_dicts = []
for file in json_files:
    name = path.basename(file)
    with open(file) as f:
        data = json.load(f)
    paper_obj = {'id':name,
                 'data':data}
    paper_dicts.append(paper_obj)    

### Filtering for covid related papers

In [81]:
covid_papers = []
for paper in papers:
    body = paper['data']['body_text']
    for paragraph in body:
        text = paragraph['text']
        match = any(key.lower() in text.lower()
                   for key in covid_keywords)
        if match:
            covid_papers.append(paper)
            break

In [82]:
len(covid_papers)

2160

## Analysis for different sections in the papers

In [83]:
sections = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    paper_sections = set()
    for paragraph in body:
        if 'section' in paragraph:
            paper_sections.add(paragraph['section'])
    
    for section in paper_sections:
        section = tokenize(section)
        if section in sections:
            sections[section] += 1
        else:
            sections[section] = 1

In [84]:
sorted(sections.items(), key=lambda x: x[1], reverse=True)

[('introduction', 883),
 ('', 821),
 ('discussion', 699),
 ('conclusion', 372),
 ('result', 331),
 ('statistical analysi', 195),
 ('(which was not peer-reviewed)', 169),
 ('method', 168),
 ('declaration of competing interest', 118),
 ('declaration of interest', 88),
 ('data collection', 82),
 ('funding', 59),
 ('background', 58),
 ('results and discussion', 51),
 ('acknowledgment', 50),
 ('contributor', 50),
 ('role of the funding source', 50),
 ('added value of this study', 42),
 ('data', 41),
 ('conflicts of interest', 41),
 ('conflict of interest', 39),
 ('study design and participant', 39),
 ('implications of all the available evidence', 38),
 ('data source', 37),
 ('data sharing', 36),
 ('materials and method', 36),
 ('research in context', 36),
 ('11', 35),
 ('acknowledgement', 35),
 ('author contribution', 31),
 ('patient', 31),
 ('5', 30),
 ('model', 28),
 ('j o u r n a l p r e -p r o o f', 27),
 ('ethical approval', 25),
 ('limitation', 25),
 ('15', 24),
 ('data analysi', 23),

### Sections vs Designs

In [85]:
design_secs = {}
for paper in covid_papers:
    body = paper['data']['body_text']
    relevant = set()
    for paragraph in body:
        text = paragraph['text']
        if(any(design in text.lower() for design in design_list)):
            relevant.add(paragraph['section'])
    for section in relevant:
        section = tokenize(section)
        if section in design_secs:
            design_secs[section] += 1
        else:
            design_secs[section] = 1

### Frequency table of sections where at least one of the designs appeared the most times

In [86]:
sorted(design_secs.items(), key=lambda x: x[1], reverse=True)

[('', 8),
 ('discussion', 7),
 ('introduction', 3),
 ('method', 2),
 ('eligibility criteria', 2),
 ('inclusion and exclusion criteria', 2),
 ('traditional chinese medicine in the treatment of patients infected with sars-cov: clinical evidence',
  2),
 ('study design and participant', 2),
 ('treatment', 2),
 ('controls. 215', 1),
 ('literature search', 1),
 ('quality of included studie', 1),
 ('statistical analyse', 1),
 ('(which was not peer-reviewed)', 1),
 ('background', 1),
 ('ongoing human trials for covid-19', 1),
 ('strengths and limitations of this study', 1),
 ('data sources and data extraction', 1),
 ('description of studie', 1),
 ('methods:', 1),
 ('covid-19 case definitions:', 1),
 ('therapeutic', 1),
 ('study identification and selection', 1),
 ('potential therapeutic', 1),
 ('previous epidemics caused by coronaviruse', 1),
 ("guideline's structural setup and refining the topics and coverage of this guideline",
  1),
 ('drug treatment 6.3.1 antiviral treatment', 1),
 ('2.6.