# Abstract
The objective of this notebook is to clean up the working dataframes and export them either as JSON files for the front end or as CSVs.

In [179]:
import re 
import numpy as np
import pandas as pd
from datetime import datetime
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Load the paper-level table. The first CSV column becomes the index,
# default NaN markers are not interpreted (blank cells stay as strings),
# and `publish_time` is requested as a parsed date column.
papers_df = pd.read_csv(
    '../output/papers_df.csv',
    index_col=0,
    keep_default_na=False,
    parse_dates=['publish_time'],
)

## Global Functions and Variables

In [2]:
# Directory (relative to this notebook) that holds the CSV/JSON files read and written below.
# It is a plain string ending in '/' because file names are appended to it with `+`.
downloads = '../../../Downloads/'

In [3]:
# Terms whose presence in a paper's text marks the paper as COVID-related.
covid_keywords = [
    'COVID-19',
    'HCoV-19',
    'CORD-19',
    '2019-nCoV',
    'Wuhan coronavirus',
    'SARS-CoV-2',
    'covid',
]

In [None]:
# Keep only the papers whose `text_body` contains at least one COVID keyword
# (case-insensitive substring match).
def _mentions_covid(text_body):
    """Return True if any entry of `covid_keywords` occurs in `text_body`, ignoring case."""
    return any(keyword.lower() in text_body.lower() for keyword in covid_keywords)


covid_papers = papers_df[papers_df['text_body'].apply(_mentions_covid)]

In [4]:
def join_metadata(df):
    """Attach paper metadata to a section-level frame and order it for export.

    Steps:
      1. Flag each row with `is_covid_related`: True when its lower-cased
         `doc_id` appears among the `doc_id` values of the global `covid_papers`.
      2. Left-join the global `papers_df` on `doc_id`.
      3. Rename the `text` column (if present) to `section_text`.
      4. Keep only rows whose `publish_time` is later than 2019-01-01.
      5. Order rows so that documents with the most remaining rows come first;
         among equal counts, COVID-related rows stay ahead of the others.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `doc_id` column. It is not modified; the flag
        column is added to a copy.

    Returns
    -------
    pandas.DataFrame
        The enriched, filtered and re-ordered frame with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    enriched = df.copy()
    # Vectorised membership test instead of scanning the id array once per row.
    enriched['is_covid_related'] = enriched['doc_id'].str.lower().isin(covid_papers['doc_id'])

    enriched = enriched.merge(papers_df, how='left', on='doc_id')
    enriched = enriched.rename(columns={'text': 'section_text'})
    enriched = enriched.sort_values(by='is_covid_related', ascending=False).reset_index(drop=True)

    # Rows without a matching paper have no publish_time and are dropped here too.
    enriched = enriched[enriched['publish_time'] > datetime(2019, 1, 1)]

    # Number of surviving rows per document, aligned with `enriched`'s index.
    rows_per_doc = enriched.groupby('doc_id')['doc_id'].transform('size')
    # A stable sort keeps the COVID-first ordering established above for ties.
    ordered_index = rows_per_doc.sort_values(ascending=False, kind='mergesort').index
    return enriched.loc[ordered_index].reset_index(drop=True)

### Merging dataframe with metadata

In [None]:
# Load cov_risk_design.csv, enrich it with paper metadata via join_metadata,
# and export the result as a JSON list of records.
cov_risk_design = pd.read_csv(
    downloads + 'cov_risk_design.csv',
    index_col=0,
    keep_default_na=False,
)
cov_risk_design_rich = join_metadata(cov_risk_design)
cov_risk_design_rich.to_json(
    downloads + 'cov_risk_design.json',
    orient='records',
)

In [None]:
# Load risk_design.csv: the first CSV column becomes the index and default NaN markers are not interpreted.
risk_design = pd.read_csv(downloads + 'risk_design.csv', index_col=0, keep_default_na=False)

In [None]:
# Add the `is_covid_related` flag and the paper metadata; join_metadata also keeps only rows published after 2019-01-01.
risk_design_rich = join_metadata(risk_design)

### Filtering sections for whether they are covid-related

In [None]:
# Keep only the rows flagged as COVID-related.
covid_only_mask = risk_design_rich['is_covid_related'].eq(True)
risk_design_rich = risk_design_rich[covid_only_mask]

In [None]:
# Collect the non-empty section titles, passing each whole title string
# through the WordNet lemmatizer.
sections = []
for section_title in list(risk_design_rich['section']):
    if section_title == '':
        continue
    sections.append(lemmatizer.lemmatize(section_title))

### Frequency table for most common sections in covid related papers

In [None]:
# Tally how many times each section title occurs.
freq_secs = {}
for section in sections:
    freq_secs[section] = freq_secs.get(section, 0) + 1

# Display (title, count) pairs, most frequent first.
sorted(freq_secs.items(), key=lambda pair: pair[1], reverse=True)

## Experiment: Filtering noise by removing sentences with hyperlinks

In [5]:
# Load cov_risk_design_summarized.json, which is read as a JSON list of row records.
cov_df = pd.read_json(downloads + 'cov_risk_design_summarized.json', orient='records')

In [192]:
def has_url(sent):
    """Return True when `sent` contains an ``http://`` or ``https://`` scheme prefix."""
    return re.search('https?://', sent) is not None


def is_valid_sent(sent):
    """Return True when `sent` has more than two tokens with over three ASCII letters.

    Each token from `word_tokenize` is reduced to its ASCII letters (digits,
    punctuation and other characters are discarded) before its length is checked.
    """
    long_word_count = 0
    for token in word_tokenize(sent):
        letters_only = "".join(re.findall("[a-zA-Z]+", token))
        if len(letters_only) > 3:
            long_word_count += 1
    return long_word_count > 2

In [227]:
def clean_bib(text, window=5, url_ratio=0.5):
    """Remove URL-dense runs of sentences (e.g. bibliography blocks) from `text`.

    The text is split into sentences and sentences rejected by `is_valid_sent`
    are discarded. The remaining sentences are scanned with a look-ahead
    window: if more than `url_ratio` of the sentences in the window contain a
    URL (per `has_url`), the whole window is skipped; otherwise the first
    sentence of the window is kept and the window advances by one sentence.
    Near the end of the text the window simply shrinks, so trailing sentences
    are evaluated as well instead of being dropped.

    Parameters
    ----------
    text : str
        Raw section text.
    window : int, optional
        Number of consecutive sentences inspected at once (default 5).
    url_ratio : float, optional
        Fraction of URL-bearing sentences above which a window is treated as
        noise and skipped (default 0.5).

    Returns
    -------
    str
        The kept sentences joined by single spaces; "" if nothing is kept.
    """
    sentences = [sent for sent in sent_tokenize(text) if is_valid_sent(sent)]
    # Evaluate each sentence once; the windows below only slice this list.
    url_flags = [has_url(sent) for sent in sentences]

    clean_sents = []
    start = 0
    total = len(sentences)
    while start < total:
        win = url_flags[start:start + window]
        if sum(win) > url_ratio * len(win):
            # URL-dense window: drop it entirely and resume right after it,
            # without skipping the sentence that follows the window.
            start += len(win)
        else:
            clean_sents.append(sentences[start])
            start += 1

    return " ".join(clean_sents)

In [247]:
# Show the column labels available on the summarized frame.
cov_df.columns

Index(['doc_id', 'covid19_in_text', 'risk_factor', 'section_text', 'section',
       'design_study', 'is_covid_related', 'title', 'abstract', 'text_body',
       'publish_time', 'authors', 'journal', 'doi', 'H index',
       'scibert_summary'],
      dtype='object')

In [250]:
# Take the `section_text` of the third-from-last row as a sample to inspect.
sample = cov_df.iloc[-3]['section_text']

In [245]:
# For each section text, count the occurrences of "http", "doi" or "www" (one count per row).
[len(re.findall("(http|doi|www)", sec)) for sec in cov_df['section_text'].values]

[0, 0, 0, 11, 11, 0, 2, 33, 0]

In [251]:
# Count the literal occurrences of "staff" in the sample text.
# The commented-out line is an unused draft that would join `risk_factors`
# (not defined in the visible cells) into a single alternation pattern.
# NOTE(review): the saved output under this cell is a text excerpt rather than
# an integer, so it looks stale; re-run to confirm.
#risk_regrs = "(" + "|".join(risk_factors) + ")"
len(re.findall("staff", sample))

'the copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org /10.1101 /10. /2020 pooled severe incidence and fatality rate is significantly lower compared with sars and mers, which may explain why the novel coronavirus has spread so widely 46 . of note, there are regional and spatial differences in the incidence rate of covid-19. in our research, the pooled severity rate and mortality caused by covid-19 was found significantly higher in wuhan than that of the infected outside of wuhan (all for p < 0.01). on the other hand, disease incidence at the early stage of outbreak was higher than that at the late stage, which may be caused by the lack of recognitions and treatment experience for covid-19. moreover, the longer time from symptoms to hospitalization, the higher incidence rate of the mortality related to covid-19, highlighting the importance of timely medical treatment 30 . in addition, among the patients with 2019-ncov, the pooled infection rate of

### Next steps
- Show H Index of paper
- Rank by the number of "http", "doi" or "www" occurrences (more occurrences → lower rank)
- 