In [None]:
import os
import re
import sys
import glob
import json

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

root = '../../kaggle_data/'

### `PaperLoader` class will load all papers for the challenge and provide an interface for us to obtain `Pandas Dataframes` to work with. The focus will be on:
- Obtaining Paper title, Abstract, Body
- Obtaining Authors, Journal of Publication, Publication Date and Publication Date
- Obtaining journal ratings

In [None]:
class PaperLoader():
    """
    Initializes PaperLoader class to read all .json files from root_directory
    """
    def __init__(self, root_dir, no_bib=True):
        self.ROOT_DIR = root_dir
        self.JSON_FILES = glob.glob(f'{root}/**/*.json', recursive=True)
        self.PAPERS_COLUMN = {
                                "doc_id": [None],
                                "title": [None],
                                "abstract": [None],
                                "text_body": [None]
                                }
        self.PAPERS_DF = None
        self.NO_BIB = no_bib
    
    """
    Removes sections with more than 5 URL/DOI/HTTP instances
    """
    def __clean_bib(self, body_text):
        merged_body = []
        for segment in body_text:
            if len(merged_body) > 0:
                if merged_body[-1]['section'] == segment['section']:
                    merged_body[-1]['text'] += '\n' + segment['text']
                    continue
            merged_body.append(segment)
            
        merged_body = [segment for segment in merged_body 
                       if len(re.findall("(http|doi|www)", segment['text'])) <= 4]
        return merged_body

        
    """
    Creates a Pandas DataFrame from all json files in root_directory
    Each json file represents a paper. 
    Features extracted are: doc_id, title, abstract, text_body
    """
    def create_paper_df(self):
        self.PAPERS_DF = pd.DataFrame.from_dict(self.PAPERS_COLUMN)
    
        for i in tqdm(range(len(self.JSON_FILES))):
            file_name = self.JSON_FILES[i]
            row = {x: None for x in self.PAPERS_COLUMN}

            with open(file_name) as json_data:
                data = json.load(json_data)
            
                doc_id = data['paper_id']
                row['doc_id'] = doc_id
                row['title'] = data['metadata']['title']

                # Now need all of abstract. Put it all in
                # a list then use str.join() to split it
                # into paragraphs.

                if ('abstract' not in data or 'body_text' not in data):
                    continue
                else:
                    abstract_list = [abst['text'] for abst in data['abstract']]
                    abstract = "\n ".join(abstract_list)

                row['abstract'] = abstract

                # And lastly the body of the text.
                if self.NO_BIB:
                    body_list = self.__clean_bib(data['body_text'])
                else:
                    body_list = [bt['text'] for bt in data['body_text']]
                
                
                row['text_body'] = body_list


                self.PAPERS_DF = self.PAPERS_DF.append(row, ignore_index=True)

    
        
    """
    Joins paper information with information on journal for paper,
    authors, doi and published date
    """
    def merge_metadata(self):
        metadata_df = pd.read_csv(self.ROOT_DIR + 'metadata.csv')
        metadata_df_for_join = metadata_df.loc[:, 
                                               ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df_for_join, 
                            left_on='doc_id', right_on='sha', how='inner')

    
    
    """
    Joins paper information with information on journal ratings
    Important column: H_Index
    """
    def merge_journals(self):
        journal_df = pd.read_csv(root + 'scimagoj_2018.csv', sep = ';')
        papers_ratings_df = self.PAPERS_DF.merge(journal_df.loc[:,['Title', 'H index']], 
                           left_on='journal', right_on='Title', how='left')
        papers_ratings_df = papers_ratings_df.drop(['sha', 'Title'], 
                                                   axis=1).reset_index(drop = True)
        self.PAPERS_DF = papers_ratings_df

    
    def get_df(self):
        self.PAPERS_DF = self.PAPERS_DF.dropna(subset=['abstract', 'text_body'])
        return self.PAPERS_DF

In [None]:
paper_loader = PaperLoader(root)
paper_loader.create_paper_df()
paper_loader.merge_metadata()
paper_loader.merge_journals()
papers_df = paper_loader.get_df()

In [None]:
papers_df.head(2)

## Filtering for covid-19 related papers released after 2019
There is a lot of noise in this dataset due to information about other strains of coronavirus so we will select only the papers that are related to Covid-19. 


Also, while the older papers may contain some important insight on the variance among the  different strains of coronavirus, for our purposes, we will only be looking at papers published on 2019 or later because that is when Covid-19 became popular.

#### `RelevantFilter` class will filter the `DataFrame` from `PaperLoader` and filter for covid-19 papers published on 2019 or later. 
We will need to supply a list of covid-related keywords to filter from to the `constructor`


In [None]:
class RelevantFilter():
    """
    Takes a list of key-words that the other methods use
    to filter for relevant papers
    """
    def __init__(self, keywords, year = '2019'):
        self.KEYWORDS = keywords
        self.YEAR = year
    
    def extract_recent(self, df):
        return df[df['publish_time'] >= self.YEAR]
    
    def filter_papers(self, df):
        cov_titles = [title for title in df['title'] 
                         if any((re.search(key,title.lower())) 
                         for key in self.KEYWORDS)]
        data = df[df['title'].isin(cov_titles)].reset_index(drop = True)
        return df

In [None]:
cov_list = [
    'covid-19','covid 19','novel coronavi',
    'cord-19','cord 19','2019-nCoV','cov_2',
    '2019 ncov','2019 cov','wuhan coronavi',    
]

In [None]:
covid_filter = RelevantFilter(cov_list, '2019')
covid_df = covid_filter.filter_papers(papers_df)
covid_df = covid_filter.extract_recent(covid_df)

In [None]:
covid_df.shape

In [None]:
covid_df.head(1)

In [None]:
covid_df.head(2)['title'].values

### Keyword Analysis

In [None]:
Design_list = [
    'retrospective cohort', 'cross-sectional case-control',
    'cross sectional case control', 'prevalence survey', 'systematic review ',
    ' meta-analysis', ' meta analysis'
    'matched case-control', 'matched case control', 'medical record review',
    'observational case series', 'time series analysis',
    'pseudo-randomized controlled trials',
    'pseudo randomized controlled trial', 'randomized controlled trials',
    'randomized controlled trial'
]

In [None]:
trans_risk = [
    'diabete', 'age', 'neonat', 'elderly,', 'cancer', 'histori', 'sputum',
    'stool', 'blood', 'urine', 'house', 'environmental', 'seasonal',
    'comorbidit', ' immune deficiency', 'liver', 'smok', 'age decil', 'heart',
    'lung', 'climate,', 'PPE use', 'touching face', 'immun', 'insur',
    'compromis', 'pregnan', 'race', 'ethnic', 'hyperten', 'child', 'tubercul',
    'mtb', 'tb', 'MTB', 'TB'
]

In [None]:
trans_outcom = ['risk','range','duration','asymptomatic',
 'infecti', 'reproducti',  'route', 
 'age','transmm'
 'stratifi',
 'period,',
 'health',
 'r0','shedd', 'viral'
 'period','incub',
 'generat',
 'factor',
 'interval,',
 'serial'
               ]

In [None]:
risk_factors = [
                        {
                        'name': 'smoking',
                        'pattern': 'smok'
                        },
                        {
                        'name': 'diabetes',
                        'pattern': 'diabete'
                        },
                        {
                        'name': 'pregnancy',
                        'pattern': 'pregnan'
                        },
                        {
                        'name': 'tuberculosis',
                        'pattern': '(tubercul|MTB|TB)'
                        }
                    ]

In [70]:
design_list = [
    'mathemat', 'profil', 'cross sectional case control',
    'matched case control', 'contact', 'surviv', 'tracing,', 'time to event',
    'time-to-event', 'risk factor analysis', 'logistic regression',
    'cross-sectional case-control', 'matched case-control',
    'observational case series', 'time series analysis', 'survival analysis',
    'investigati', 'model', 'outbreak', 'stochast', 'statist', 'analysi',
    'experiment', 'excret', 'investig'
    'retrospective cohort', 'cross-sectional case-control',
    'cross sectional case control', 'prevalence survey', 'systematic review ',
    'meta-analysis', 'meta analysis', 'matched case-control',
    'matched case control', 'medical record review',
    'observational case series', 'time series analysis',
    'pseudo-randomized controlled', 'pseudo randomized controlled',
    'randomized controlled', 'retrospective analysis', 'retrospective study',
    'retrospective studies'
]

In [71]:
outcome_list = [
    'risk', 'range', 'duration', 'asymptomatic', 'infecti', 'reproducti',
    'route', 'age', 'transmm'
    'stratifi', 'period,', 'health', 'r0', 'shedd', 'viral'
    'period', 'incub', 'generat', 'factor', 'interval,', 'serial'
]

In [67]:
class PaperAnalyzer():
    def __init__(self, parent_df):
        self.working_df = parent_df
        self.__ANALYZED_RISKS = False
        self.__RISK_MESSAGE = "Need to perform risk analyze first. Try calling self.analyze_risks()"

        
    def analyze_risks(self, risk_factors):
        risk_temp_arr = []    
        for index, row in tqdm(self.working_df.iterrows(), 
                               total=self.working_df.shape[0]):        
            for section in row['text_body']:
                section_row = row.copy()
                section_row['section'] = section['section']
                section_row['text_body'] = section['text']
                body = section_row['text_body'].lower()
                
                
                for factor in risk_factors:
                    name = factor['name']
                    matches = re.findall(factor['pattern'], body)
                    section_row['has_' + name + '?'] = False
                    section_row[name + '_count'] = 0
                    section_row[name + '_in_title'] = False
                        
                    if len(matches) > 0:
                        section_row['has_' + name + '?'] = True
                        section_row[name + '_count'] = len(matches)
                    if re.findall(factor['pattern'], section_row['title']):
                        section_row[name + '_in_title'] = True
                        # If the risk factor is in title of paper we reward with higher count
                        section_row[name + '_count'] += 10
            
            risk_temp_arr.append(section_row)
        
        self.working_df = pd.DataFrame(risk_temp_arr)
        self.__ANALYZED_RISKS = True

    
    def analyze_designs(self, design_list):
        if not self.__ANALYZED_RISKS:
            raise ValueError(self.__ERROR_MESSAGE)
        
        design_temp_arr = []
        for index, row in tqdm(self.working_df.iterrows(), 
                               total=self.working_df.shape[0]):
            design_matches = [re.findall(des, row['text_body'])
                                for des in design_list]
            design_matches = np.concatenate(design_matches)
            design_rank = 0
            if len(design_matches) > 0:
                if ('meta-analysis' or 'meta analysis') in design_matches:
                    design_rank = 6
                if ('randomized controlled') in design_matches:
                    design_rank = 5
                if ('pseudo-randomized controlled'
                    or 'pseudo randomized controlled') in design_matches:
                    design_rank = 4
                design_rank = design_rank + len(design_matches)
            row['design_matches'] = design_matches
            row['design_rank'] = design_rank
            design_temp_arr.append(row)
        
        self.working_df = pd.DataFrame(design_temp_arr)

    
    def analyze_outcomes(self, outcomes):
        if not self.__ANALYZED_RISKS:
            raise ValueError(self.__ERROR_MESSAGE)
        
        outcome_arr = []
        for index, row in tqdm(self.working_df.iterrows(), 
                               total=self.working_df.shape[0]):
            
            outcome_matches = [re.findall(outcome, row['text_body'])
                                for outcome in outcomes]
            outcome_matches = np.concatenate(outcome_matches)
            outcome_rank = 0
            outcome_rank = len(outcome_matches)
            
            row['outcome_matches'] = outcome_matches
            row['outcome_rank'] = outcome_rank
            
            outcome_arr.append(row)
        
        self.working_df = pd.DataFrame(outcome_arr)
        
    def perform_analysis(self, risk_factors, design_list, outcomes):
        print("Analyzing risks")
        self.analyze_risks(risk_factors)
        print("Analyzing study designs")
        self.analyze_designs(design_list)
        print("Analyzing outcomes")
        self.analyze_outcomes(outcomes)
    
    def get_df(self, risk_factor = None):
        if risk_factor:
            if not self.__ANALYZED_RISKS:
                raise ValueError(self.__ERROR_MESSAGE)
            return self.working_df[self_working_df['has_' + risk_factor + '?'] == True]
        return self.working_df

In [72]:
covid_analysis = PaperAnalyzer(covid_df)
covid_analysis.analyze_risks(risk_factors)

HBox(children=(FloatProgress(value=0.0, max=6495.0), HTML(value='')))




In [73]:
covid_analysis.analyze_designs(design_list)

HBox(children=(FloatProgress(value=0.0, max=6495.0), HTML(value='')))




In [74]:
covid_analysis.analyze_outcomes(outcome_list)

HBox(children=(FloatProgress(value=0.0, max=6495.0), HTML(value='')))




In [77]:
enriched_covid_df = covid_analysis.get_df()

In [78]:
enriched_covid_df.to_json("../../enriched_covid_df.json", orient='records')