# Abstract
The objective of the notebook is to obtain a `csv` file named `papers_df.csv` that will contain the following information for each paper:
- Title
- DOI
- Abstract
- Body
- Journal
- Journal Rating (H Index)
- Authors
- Date Published


Note: The `doc_id` column will be the key used to stitch the different tables together to obtain the aforementioned data.

In [1]:
#relevant imports
import os
import json
import glob
import numpy as np
import pandas as pd

## The `PaperLoader` class loads all papers for the challenge and provides an interface for obtaining Pandas `DataFrame`s to work with. The focus will be on:
- Obtaining Paper title, Abstract, Body
- Obtaining Authors, Journal of Publication, Publication Date and DOI
- Obtaining journal ratings

In [2]:
class PaperLoader():
    """
    Loads every paper (.json file) found under a root directory and exposes
    the result as a Pandas DataFrame that can then be enriched with metadata
    and journal ratings.

    Intended call order: createPaperDf() -> mergeMetadata() -> mergeJournals().

    Attributes:
        ROOT_DIR: directory given to the constructor.
        JSON_FILES: paths of all .json files found recursively under ROOT_DIR.
        PAPERS_COLUMN: dict whose keys name the columns of the papers frame.
        PAPERS_DF: the current DataFrame; None until createPaperDf() has run.
    """
    def __init__(self, root_dir):
        """
        Initializes PaperLoader class to read all .json files from root_dir.

        Args:
            root_dir: directory that contains the paper .json files (at any
                depth) and the metadata.csv file.
        """
        self.ROOT_DIR = root_dir
        # Search below the directory that was passed in, not below a
        # notebook-level global that may not exist (or may differ).
        self.JSON_FILES = glob.glob(os.path.join(root_dir, '**', '*.json'),
                                    recursive=True)
        # Only the keys are used (as column names); the [None] values are
        # kept so the attribute looks the same to any existing reader.
        self.PAPERS_COLUMN = {
                                "doc_id": [None],
                                "title": [None],
                                "abstract": [None],
                                "text_body": [None]
                                }
        self.PAPERS_DF = None

    def createPaperDf(self):
        """
        Creates a Pandas DataFrame from all json files in the root directory.
        Each json file represents a paper.
        Features extracted are: doc_id, title, abstract, text_body.

        Returns:
            The new DataFrame (also stored in self.PAPERS_DF), one row per
            json file. It is empty, with the four columns, when no file
            was found.
        """
        records = []

        for file_name in self.JSON_FILES:
            # JSON is UTF-8 by specification; do not rely on the platform
            # default encoding.
            with open(file_name, encoding='utf-8') as json_data:
                data = json.load(json_data)

            # Abstract and body are stored as lists of paragraphs; join them
            # into a single string with one paragraph per line.
            records.append({
                'doc_id': data['paper_id'],
                'title': data['metadata']['title'],
                'abstract': "\n ".join(part['text'] for part in data['abstract']),
                'text_body': "\n ".join(part['text'] for part in data['body_text']),
            })

        # Build the frame once from all records: DataFrame.append was removed
        # in pandas 2.0 and appending row by row is quadratic. This also
        # avoids the all-None placeholder row the frame used to start with.
        self.PAPERS_DF = pd.DataFrame(records, columns=list(self.PAPERS_COLUMN))
        return self.PAPERS_DF

    def mergeMetadata(self):
        """
        Joins paper information with information on journal for paper,
        authors, doi and published date.

        Reads metadata.csv from the root directory and inner-joins it on
        doc_id == sha, so papers without a metadata row are dropped.

        Returns:
            The merged DataFrame (also stored in self.PAPERS_DF).
        """
        # os.path.join works whether or not ROOT_DIR ends with a separator.
        metadata_df = pd.read_csv(os.path.join(self.ROOT_DIR, 'metadata.csv'))
        metadata_df_for_join = metadata_df.loc[:,
                                               ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df_for_join,
                            left_on='doc_id', right_on='sha', how='inner')
        return self.PAPERS_DF

    def mergeJournals(self):
        """
        Joins paper information with information on journal ratings.
        Important column: H index.

        Reads the ';'-separated scimagoj_2018.csv located one level above
        the root directory and left-joins it on journal == Title, so papers
        whose journal has no match keep a missing H index. Rows with a
        missing abstract or text_body are dropped, and the helper join
        columns 'sha' and 'Title' are removed.

        Returns:
            The resulting DataFrame (also stored in self.PAPERS_DF).
        """
        # Resolve the ratings file from the instance's own root directory
        # rather than from a notebook-level global.
        journal_df = pd.read_csv(os.path.join(self.ROOT_DIR, '..', 'scimagoj_2018.csv'),
                                 sep = ';')
        papers_ratings_df = self.PAPERS_DF.merge(journal_df.loc[:,['Title', 'H index']],
                           left_on='journal', right_on='Title', how='left')
        papers_ratings_df = papers_ratings_df.dropna(subset = ['abstract', 'text_body'])
        papers_ratings_df = papers_ratings_df.drop(['sha', 'Title'],
                                                   axis=1).reset_index(drop = True)
        self.PAPERS_DF = papers_ratings_df

        return self.PAPERS_DF

In [3]:
# Directory that holds the challenge papers (.json) and metadata.csv.
root = "data/corona_challenge/"

In [4]:
# Build the papers table step by step: raw papers, then metadata, then
# journal ratings. Each step updates paper_loader.PAPERS_DF.
paper_loader = PaperLoader(root)
paper_loader.createPaperDf()
paper_loader.mergeMetadata()
# mergeJournals() returns the same frame it stores in PAPERS_DF.
papers_df = paper_loader.mergeJournals()

In [7]:
# Peek at the first two merged papers.
papers_df.head(n=2)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP...",2020-01-11,Joseph C. Ward; Lidia Lasecka-Dykes; Chris Nei...,,10.1101/2020.01.10.901801,
1,00340eea543336d54adda18236424de6a5e91c9d,Analysis Title: Regaining perspective on SARS-...,"During the past three months, a new coronaviru...","In December 2019, a novel coronavirus, SARS-Co...",2020-03-20,Carla Mavian; Simone Marini; Costanza Manes; I...,,10.1101/2020.03.16.20034470,


In [8]:
# Persist the merged table under the file name stated in the notebook's
# objective (`papers_df.csv`); create the output folder on a fresh checkout.
os.makedirs('output', exist_ok=True)
papers_df.to_csv('output/papers_df.csv')

### Filtering papers for transmission and smoking details

In [None]:
# Word stems searched for (as plain substrings) in each abstract to flag
# transmission-related papers.
transmission_keywords = ['transmi', 'sneez', 'contact trac', 'reproduc', 'environ']

In [None]:
# Word stems searched for (as plain substrings) in each abstract to flag
# smoking / pulmonary-related papers.
smoke_keywords = ['smok', 'pulm']

In [None]:
# Matching logic copied from Maria's function: a paper is flagged when any
# keyword occurs as a (case-sensitive) substring of its abstract.
#
# `df_smoke` was never defined in this notebook, so the cell failed on a
# fresh kernel. It is derived here from `papers_df`, the merged table built
# above; a copy is used so the indicator columns are not added to
# `papers_df` itself.
df_smoke = papers_df.copy()
df_smoke['transmission_indicator'] = df_smoke['abstract'].apply(
    lambda x: any(keyword in x for keyword in transmission_keywords))
df_smoke['smoke_indicator'] = df_smoke['abstract'].apply(
    lambda x: any(keyword in x for keyword in smoke_keywords))

# The indicator columns are already boolean, so they can be used as masks.
transmission_df = df_smoke[df_smoke['transmission_indicator']].reset_index(drop = True)
smoke_df = df_smoke[df_smoke['smoke_indicator']].reset_index(drop = True)

In [None]:
# (rows, columns) of the transmission subset, then of the smoking subset.
tuple(frame.shape for frame in (transmission_df, smoke_df))

In [None]:
# Save both keyword-filtered subsets next to the notebook.
transmission_df.to_csv(path_or_buf='transmission_df.csv')
smoke_df.to_csv(path_or_buf='smoke_df.csv')

In [None]:
# Keep the 100 transmission papers with the highest journal H index
# (fewer if the subset is smaller).
transmission_colab_df = (
    transmission_df
    .sort_values(by='H index', ascending=False)
    .reset_index(drop=True)
    .iloc[0:100, :]
)

In [None]:
# Keep the 100 smoking papers with the highest journal H index
# (fewer if the subset is smaller).
smoke_colab_df = (
    smoke_df
    .sort_values(by='H index', ascending=False)
    .reset_index(drop=True)
    .iloc[0:100, :]
)

In [None]:
# Save the two top-100 selections next to the notebook.
transmission_colab_df.to_csv(path_or_buf='transmission_colab_df.csv')
smoke_colab_df.to_csv(path_or_buf='smoke_colab_df.csv')