# Abstract
The objective of the notebook is to obtain a `csv` file named `papers_df.csv` that will contain the following information for each paper:
- Title
- DOI
- Abstract
- Body
- Journal
- Journal Rating(H Index)
- Authors
- Date Published


Note: The `doc_id` column will the key to stitch different tables together to obtain aforementionned data.

In [1]:
#relevant imports
import os
import json
import glob
import numpy as np
import pandas as pd

## `PaperLoader` class will load all papers for the challenge and provide an interface for us to obtain `Pandas Dataframes` to work with. The focus will be on:
- Obtaining Paper title, Abstract, Body
- Obtaining Authors, Journal of Publication, Publication Date and Publication Date
- Obtaining journal ratings

In [2]:
class PaperLoader():
    """
    Initializes PaperLoader class to read all .json files from root_directory
    """
    def __init__(self, root_dir):
        self.ROOT_DIR = root_dir
        self.JSON_FILES = glob.glob(f'{root}/**/*.json', recursive=True)
        self.PAPERS_COLUMN = {
                                "doc_id": [None],
                                "title": [None],
                                "abstract": [None],
                                "text_body": [None]
                                }
        self.PAPERS_DF = None
        
    """
    Creates a Pandas DataFrame from all json files in root_directory
    Each json file represents a paper. 
    Features extracted are: doc_id, title, abstract, text_body
    """
    def create_paper_df(self):
        self.PAPERS_DF = pd.DataFrame.from_dict(self.PAPERS_COLUMN)
    
        for file_name in self.JSON_FILES:

            row = {x: None for x in self.PAPERS_COLUMN}

            with open(file_name) as json_data:
                data = json.load(json_data)

                doc_id = data['paper_id']
                row['doc_id'] = doc_id
                row['title'] = data['metadata']['title']

                # Now need all of abstract. Put it all in
                # a list then use str.join() to split it
                # into paragraphs.

                abstract_list = [abst['text'] for abst in data['abstract']]
                abstract = "\n ".join(abstract_list)

                row['abstract'] = abstract

                # And lastly the body of the text.
                body_list = [bt['text'] for bt in data['body_text']]
                body = "\n ".join(body_list)

                row['text_body'] = body


                self.PAPERS_DF = self.PAPERS_DF.append(row, ignore_index=True)

    
    """
    Joins paper information with information on journal for paper,
    authors, doi and published date
    """
    def merge_metadata(self):
        metadata_df = pd.read_csv(self.ROOT_DIR + 'metadata.csv')
        metadata_df_for_join = metadata_df.loc[:, 
                                               ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df_for_join, 
                            left_on='doc_id', right_on='sha', how='inner')

    
    
    """
    Joins paper information with information on journal ratings
    Important column: H_Index
    """
    def merge_journals(self):
        journal_df = pd.read_csv(root + 'scimagoj_2018.csv', sep = ';')
        papers_ratings_df = self.PAPERS_DF.merge(journal_df.loc[:,['Title', 'H index']], 
                           left_on='journal', right_on='Title', how='left')
        papers_ratings_df = papers_ratings_df.drop(['sha', 'Title'], 
                                                   axis=1).reset_index(drop = True)
        self.PAPERS_DF = papers_ratings_df

    
    def get_df(self):
        self.PAPERS_DF = self.PAPERS_DF.dropna(subset=['text_body'])
        return self.PAPERS_DF

In [3]:
root = '../data/'

In [4]:
paper_loader = PaperLoader(root)
paper_loader.create_paper_df()
paper_loader.merge_metadata()
paper_loader.merge_journals()
papers_df = paper_loader.get_df()

In [5]:
papers_df.head(2)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
14041,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,,The evolutionary history of humans is characte...,2016-12-31,"Scarpino, S.V.",Encyclopedia of Evolutionary Biology,10.1016/b978-0-12-800049-6.00293-6,
14042,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy,10.1016/j.enpol.2008.08.029,178.0


In [7]:
papers_df.to_csv('../output/papers_df.csv')