# Abstract
## Taks for the notebook:
### What do we know about risk factors?
- Transmission dynamics of the virus, including the basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors
- Severity of disease, including risk of fatality among symptomatic hospitalized patients, and high-risk patient groups
- Mutations in virusees

# Transmission


In [2]:
import os
import re
import sys
import json
import math
import glob
from pprint import pprint
from copy import deepcopy


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import spacy
from spacy.matcher import Matcher

## Building dataframe
Initializing `corona_df` with `title`, `abstract`, `body` and `source` of papers

In [3]:
root = '../corona_challenge/'
json_filenames = glob.glob(f'{root}/**/*.json', recursive=True)

In [4]:
# Re used from Maria's coe
def return_corona_df(json_filenames, df):

    for file_name in json_filenames:

        row = {x: None for x in df.columns}
        
        with open(file_name) as json_data:
            data = json.load(json_data)

            doc_id = data['paper_id']
            row['doc_id'] = doc_id
            row['title'] = data['metadata']['title']

            # Now need all of abstract. Put it all in
            # a list then use str.join() to split it
            # into paragraphs.

            abstract_list = [abst['text'] for abst in data['abstract']]
            abstract = "\n ".join(abstract_list)

            row['abstract'] = abstract

            # And lastly the body of the text.
            body_list = [bt['text'] for bt in data['body_text']]
            body = "\n ".join(body_list)

            row['text_body'] = body


            df = df.append(row, ignore_index=True)

    return df

In [5]:
paper_features = {
    "doc_id": [None],
    "title": [None],
    "abstract": [None],
    "text_body": [None]
}
corona_df = pd.DataFrame.from_dict(paper_features)

In [6]:
corona_df = return_corona_df(json_filenames, corona_df)

In [7]:
len(corona_df)

33376

In [8]:
corona_df.head(3)

Unnamed: 0,doc_id,title,abstract,text_body
0,,,,
1,86a998617c077f4fe2ab26214995a3548fbc0fc5,Middle East Respiratory Syndrome and Severe Ac...,The recent emergence of the Middle East respir...,While most CoVs cause the common cold in human...
2,948aaeb2e0be11ad90562bf10d462531a1f00eac,"Integrated, Multi-cohort Analysis Identifies C...",Graphical Abstract Highlights d MVS is a commo...,Clinically relevant respiratory viral signatur...


## Information extraction from metadata 

In [9]:
metadata_df = pd.read_csv(root + 'metadata.csv')

In [10]:
metadata_df.head(3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866


In [11]:
metadata_df_for_join = metadata_df.loc[:, ['sha', 'publish_time', 'authors', 'journal', 'doi']]

In [12]:
papers_df = corona_df.merge(metadata_df_for_join, 
                            left_on='doc_id', right_on='sha', how='inner')

In [13]:
len(papers_df)

44227

In [14]:
papers_df.head(3)

Unnamed: 0,doc_id,title,abstract,text_body,sha,publish_time,authors,journal,doi
0,,,,,,2015-11-06,Sissel Juul; Fernando Izquierdo; Adam Hurst; X...,,10.1101/030742
1,,,,,,2016-06-23,Zulfazal Ahmed; Prasida Holla; Imran Ahmad; Sh...,,10.1101/060434
2,,,,,,2016-09-02,Fabienne Krauer; Maurane Riesen; Ludovic Revei...,,10.1101/073098


### Adding journal ratings as feature

In [15]:
journal_df = pd.read_csv('scimagoj_2018.csv', sep = ';')

In [16]:
journal_df.head(3)

Unnamed: 0,Rank,Sourceid,Title,Type,Issn,SJR,SJR Best Quartile,H index,Total Docs. (2018),Total Docs. (3years),Total Refs.,Total Cites (3years),Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Publisher,Coverage,Categories
0,1,28773,CA - A Cancer Journal for Clinicians,journal,"15424863, 00079235",72576,Q1,144,45,127,3078,20088,103,20685,6840,United States,Wiley-Blackwell,1950-ongoing,Hematology (Q1); Oncology (Q1)
1,2,19434,MMWR. Recommendations and reports : Morbidity ...,journal,"10575987, 15458601",48894,Q1,134,3,12,559,1043,12,8600,18633,United States,Centers for Disease Control and Prevention (CDC),1990-ongoing,Epidemiology (Q1); Health Information Manageme...
2,3,21100812243,Nature Reviews Materials,journal,20588437,34171,Q1,61,99,195,8124,7297,104,7016,8206,United Kingdom,Nature Publishing Group,2016-ongoing,"Biomaterials (Q1); Electronic, Optical and Mag..."


In [17]:
len(set(metadata_df['journal']))

4886

In [18]:
len(set(journal_df['Title']))

31378

In [19]:
len(set(metadata_df['journal']) & set(journal_df['Title']))

1689

In [20]:
df_smoke = papers_df.merge(journal_df.loc[:,['Title', 'H index']], 
                           left_on='journal', right_on='Title', how='left')

In [21]:
df_smoke = df_smoke.dropna(subset = ['abstract', 'text_body'])

In [22]:
df_smoke.head(3)

Unnamed: 0,doc_id,title,abstract,text_body,sha,publish_time,authors,journal,doi,Title,H index
14041,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,,The evolutionary history of humans is characte...,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,2016-12-31,"Scarpino, S.V.",Encyclopedia of Evolutionary Biology,10.1016/b978-0-12-800049-6.00293-6,,
14042,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",6599ebbef3d868afac9daa4f80fa075675cf03bc,2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy,10.1016/j.enpol.2008.08.029,Energy Policy,178.0
14043,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,2 Mechanisms of diarrhoea,,Acute infections of the gastrointestinal tract...,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,1993-06-30,"Booth, I.W.; McNeish, A.S.",Baillière's Clinical Gastroenterology,10.1016/0950-3528(93)90041-p,,


### Filtering papers for transmission details

In [23]:
transmission_keywords = [
    'transmi', 'sneez', 'contact trac', 'reproduc', 'environ'
]

In [24]:
smoke_keywords = [
    'smok', 'pulm'
]

In [25]:
# Exact copy of Maria's function
abstracts = list(df_smoke['abstract'])
df_smoke['transmission_indicator'] = df_smoke['abstract'].apply(lambda x: 
                                                                any(keyword in x for 
                                                                    keyword in transmission_keywords))
df_smoke['smoke_indicator'] = df_smoke['abstract'].apply(lambda x: 
                                                                any(keyword in x for 
                                                                    keyword in smoke_keywords))
transmission_df = df_smoke[df_smoke['transmission_indicator'] == True].reset_index(drop = True)
smoke_df = df_smoke[df_smoke['smoke_indicator'] == True].reset_index(drop = True)

In [26]:
transmission_df.shape, smoke_df.shape

((4448, 13), (778, 13))

In [27]:
transmission_df.to_csv('transmission_df.csv')
smoke_df.to_csv('smoke_df.csv')

In [30]:
transmission_colab_df = transmission_df.sort_values(by = 'H index', ascending = False).reset_index(drop = True)
transmission_colab_df = transmission_colab_df.iloc[0:100, :]

In [31]:
smoke_colab_df = smoke_df.sort_values(by = 'H index', ascending = False).reset_index(drop = True)
smoke_colab_df = smoke_colab_df.iloc[0:100, :]

In [32]:
transmission_colab_df.to_csv('transmission_colab_df.csv')
smoke_colab_df.to_csv('smoke_colab_df.csv')