# Abstract
This notebook cleans up the working DataFrames and exports them either as JSON files for the front end or as CSV files.

In [90]:
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance; used further down to normalise section titles.
lemmatizer = WordNetLemmatizer()

In [2]:
# Paper-level table: one row per paper, indexed by the first CSV column.
# keep_default_na=False keeps empty fields as empty strings instead of NaN;
# publish_time is parsed as a date so it can be compared against datetimes later.
papers_df = pd.read_csv(
    '../output/papers_df.csv',
    parse_dates=['publish_time'],
    keep_default_na=False,
    index_col=0,
)

In [63]:
# Base directory for the input CSVs and the exported JSON. It is prepended to file
# names by plain string concatenation, so the trailing slash is required.
# NOTE(review): this relative path depends on where the notebook is run from —
# consider making it configurable.
downloads = '../../../Downloads/'

In [3]:
# Terms whose presence in a paper's text body marks the paper as COVID-related.
# They are matched as case-insensitive substrings.
covid_keywords = [
    'COVID-19',
    'HCoV-19',
    'CORD-19',
    '2019-nCoV',
    'Wuhan coronavirus',
    'SARS-CoV-2',
    'covid',
]

In [4]:
# Lower-case the keywords once, instead of once per paper.
covid_keywords_lower = [keyword.lower() for keyword in covid_keywords]


def _mentions_covid(text_body):
    """Return True when any COVID keyword occurs in ``text_body``.

    The match is a case-insensitive substring test.
    """
    lowered = text_body.lower()  # lower-case each body once, not once per keyword
    return any(keyword in lowered for keyword in covid_keywords_lower)


# Papers whose text body mentions at least one COVID keyword.
covid_papers = papers_df[papers_df['text_body'].apply(_mentions_covid)]

In [77]:
def join_metadata(df, papers=None, covid_doc_ids=None, min_publish_time=datetime(2019, 1, 1)):
    """Attach paper metadata and a COVID-related flag to section-level rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Section-level rows with at least a ``doc_id`` column of strings. A
        ``text`` column, if present, is renamed to ``section_text``. The
        frame passed in is not modified.
    papers : pandas.DataFrame, optional
        Paper-level metadata with ``doc_id`` and ``publish_time`` columns.
        Defaults to the notebook-level ``papers_df``.
    covid_doc_ids : iterable of str, optional
        Ids of the papers considered COVID-related. Defaults to
        ``covid_papers['doc_id']``.
    min_publish_time : datetime, optional
        Exclusive lower bound on ``publish_time``. Rows published on or
        before it are dropped, as are rows with a missing ``publish_time``
        (for example sections whose ``doc_id`` has no match in ``papers``).

    Returns
    -------
    pandas.DataFrame
        The enriched rows with a fresh 0..n-1 index. Rows are ordered by the
        number of retained sections of their document, largest first. Ties
        put COVID-related rows first and otherwise keep the input order.
    """
    if papers is None:
        papers = papers_df
    if covid_doc_ids is None:
        covid_doc_ids = covid_papers['doc_id']

    # Lower-case BOTH sides so the membership test is genuinely case-insensitive.
    # The set replaces a linear scan of an array for every row.
    covid_ids_lower = {str(doc_id).lower() for doc_id in covid_doc_ids}

    enriched = (
        # assign() returns a new frame, so the caller's DataFrame is left untouched.
        df.assign(is_covid_related=df['doc_id'].str.lower().isin(covid_ids_lower))
        .merge(papers, how='left', on='doc_id')
        .rename(columns={'text': 'section_text'})
        # Stable sort: COVID-related rows first, input order otherwise preserved.
        .sort_values(by='is_covid_related', ascending=False, kind='mergesort')
    )

    # Comparisons against a missing publish_time are False, so unmatched rows drop out here.
    enriched = enriched[enriched['publish_time'] > min_publish_time]

    # Number of retained sections per document, aligned row-by-row with `enriched`.
    # It is kept as a separate Series so a real `count` column in the data is never
    # overwritten or dropped.
    sections_per_doc = enriched['doc_id'].map(enriched['doc_id'].value_counts())

    # A stable descending sort keeps the COVID-first ordering among equal counts.
    row_order = sections_per_doc.sort_values(ascending=False, kind='mergesort').index
    return enriched.loc[row_order].reset_index(drop=True)

In [67]:
# Enrich the cov_risk_design sections with paper metadata, then export them
# as a JSON array of records.
cov_risk_design_csv = downloads + 'cov_risk_design.csv'
cov_risk_design_json = downloads + 'cov_risk_design.json'

cov_risk_design = pd.read_csv(cov_risk_design_csv, keep_default_na=False, index_col=0)
cov_risk_design_rich = join_metadata(cov_risk_design)
cov_risk_design_rich.to_json(cov_risk_design_json, orient='records')

In [91]:
# Section-level risk_design rows; empty fields stay empty strings rather than NaN.
risk_design = pd.read_csv(
    downloads + 'risk_design.csv',
    keep_default_na=False,
    index_col=0,
)

In [92]:
# Attach paper metadata and the is_covid_related flag to every risk_design section row.
risk_design_rich = join_metadata(risk_design)

In [93]:
# Keep only the sections that belong to COVID-related papers.
covid_section_mask = risk_design_rich['is_covid_related'].eq(True)
risk_design_rich = risk_design_rich[covid_section_mask]

In [94]:
# Collect the lemmatised title of every section that has a non-empty title.
# Each title is passed to the lemmatizer as a single string.
sections = []
for section_title in risk_design_rich['section']:
    if section_title == '':
        continue
    sections.append(lemmatizer.lemmatize(section_title))

In [95]:
# Tally how often each lemmatised section title occurs.
# Counter is a dict subclass, so freq_secs can still be used like the plain dict it replaces.
freq_secs = Counter(sections)

# Display (title, count) pairs, most frequent first.
# Titles with equal counts keep first-seen order.
freq_secs.most_common()

[('discussion', 16),
 ('annex', 7),
 ('abstract', 6),
 ('result', 3),
 ('melatonin & other supportive adjuvant effects', 3),
 ('symptom', 3),
 ('acknowledgement', 3),
 ('j o u r n a l p r e -p r o o f', 2),
 ('introduction', 2),
 ('study design and participants', 2),
 ('use of drugs acting on renin angiotensin system', 2),
 ('role of underling cv comorbidities', 2),
 ('specifi c clinical management', 2),
 ('mortality', 2),
 ('prevalence of cvd in patients with covid-19', 2),
 ('dietary immunomodulatory', 2),
 ('melatonin effects in cytokine levels in human', 1),
 ('diabetes-related traits', 1),
 ('clinical characteristics and laboratory results', 1),
 ('predictors and outcomes', 1),
 ('validation of our network-based knowledge mining results', 1),
 ('plos one', 1),
 ('mers and pregnancy', 1),
 ('coronavirus disease 2019 (covid-19), a new form of respiratory and systemic',
  1),
 ('from the perspective of route of transmission', 1),
 ('figure. estimated covid-19 pediatric hospitalizatio

In [99]:
# Spot-check the text of a single enriched section (the row at position 8).
# NOTE(review): the index looks like an arbitrary sample — confirm it is not meant to be a specific row.
cov_risk_design_rich['section_text'].iloc[8]

'covid-19, a novel coronavirus outbreak starting in china, is now a rapidly developing public health emergency of international concern. the clinical spectrum of covid-19 disease is varied, and identifying factors associated with severe disease has been described as an urgent research priority. it has been noted that elderly patients with pre-existing comorbidities are more vulnerable to more severe disease. however, the specific symptoms and comorbidities that most strongly predict disease severity are unclear. we performed a systematic review and meta-analysis to identify the symptoms and comorbidities predictive of covid-19 severity.'