In [5]:
import os
import re
import sys
import glob
import json
import string

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize 

stemmer = PorterStemmer()
root = '../../kaggle_data/'
stop_words = list(set(stopwords.words('english')))
stop_words.extend(['within', 'what', 'how', 'eg', 'ie'])

### `PaperLoader` class will load all papers for the challenge and provide an interface for us to obtain `Pandas Dataframes` to work with. The focus will be on:
- Obtaining Paper title, Abstract, Body
- Obtaining Authors, Journal of Publication, Publication Date and Publication Date
- Obtaining journal ratings

In [73]:
class PaperLoader():
    """
    Initializes PaperLoader class to read all .json files from root_directory
    """
    def __init__(self, root_dir, no_bib=True):
        self.ROOT_DIR = root_dir
        self.JSON_FILES = glob.glob(f'{root}/**/*.json', recursive=True)
        self.PAPERS_COLUMN = {
                                "doc_id": [None],
                                "title": [None],
                                "abstract": [None],
                                "text_body": [None]
                                }
        self.PAPERS_DF = None
        self.NO_BIB = no_bib
    
    """
    Removes sections with more than 5 URL/DOI/HTTP instances
    """
    def __clean_bib(self, body_text):
        merged_body = []
        for segment in body_text:
            if len(merged_body) > 0:
                if merged_body[-1]['section'] == segment['section']:
                    merged_body[-1]['text'] += '\n' + segment['text']
                    continue
            merged_body.append(segment)
            
        merged_body = [segment for segment in merged_body 
                       if len(re.findall("(http|doi|www)", segment['text'])) <= 4]
        return merged_body

        
    """
    Creates a Pandas DataFrame from all json files in root_directory
    Each json file represents a paper. 
    Features extracted are: doc_id, title, abstract, text_body
    """
    def create_paper_df(self):
        self.PAPERS_DF = pd.DataFrame.from_dict(self.PAPERS_COLUMN)
        df_list = []
        for i in tqdm(range(len(self.JSON_FILES))):
            file_name = self.JSON_FILES[i]
            row = {x: None for x in self.PAPERS_COLUMN}

            with open(file_name) as json_data:
                data = json.load(json_data)
            
                doc_id = data['paper_id']
                row['doc_id'] = doc_id
                row['title'] = data['metadata']['title']
                if len(row['title']) <= 1:
                    continue

                # Now need all of abstract. Put it all in
                # a list then use str.join() to split it
                # into paragraphs.

                if ('abstract' not in data or 'body_text' not in data):
                    continue
                else:
                    abstract_list = [abst['text'] for abst in data['abstract']]
                    abstract = "\n ".join(abstract_list)

                if len(abstract) <= 1:
                    continue
                
                row['abstract'] = abstract

                # And lastly the body of the text.
                # These clauses verify whether the user wants to clean up references
                if self.NO_BIB:
                    body_list = self.__clean_bib(data['body_text'])
                else:
                    body_list = [bt for bt in data['body_text']]
                
                
                row['text_body'] = body_list

                df_list.append(row)
        self.PAPERS_DF = pd.DataFrame(df_list)

    
        
    """
    Joins paper information with information on journal for paper,
    authors, doi and published date
    """
    def merge_metadata(self):
        metadata_df = pd.read_csv(self.ROOT_DIR + 'metadata.csv')
        metadata_df_for_join = metadata_df.loc[:, 
                                               ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df_for_join, 
                            left_on='doc_id', right_on='sha', how='inner')

    
    
    """
    Joins paper information with information on journal ratings
    Important column: H_Index
    """
    def merge_journals(self):
        journal_df = pd.read_csv(root + 'scimagoj_2018.csv', sep = ';')
        papers_ratings_df = self.PAPERS_DF.merge(journal_df.loc[:,['Title', 'H index']], 
                           left_on='journal', right_on='Title', how='left')
        papers_ratings_df = papers_ratings_df.drop(['sha', 'Title'], 
                                                   axis=1).reset_index(drop = True)
        self.PAPERS_DF = papers_ratings_df

    
    def get_df(self):
        self.PAPERS_DF = self.PAPERS_DF.dropna(subset=['abstract', 'text_body'])
        return self.PAPERS_DF

In [74]:
paper_loader = PaperLoader(root)
paper_loader.create_paper_df()
paper_loader.merge_metadata()
paper_loader.merge_journals()
papers_df = paper_loader.get_df()

HBox(children=(FloatProgress(value=0.0, max=59311.0), HTML(value='')))




In [78]:
papers_df.head(2)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
0,306ef95a3a91e13a93bcc37fb2c509b67c0b5640,A Novel Approach for a Novel Pathogen: using a...,Thousands of people in the United States have ...,[{'text': 'The 2019 novel coronavirus (SARS-Co...,2020-03-12,"Bryson-Cahn, Chloe; Duchin, Jeffrey; Makarewic...",Clin Infect Dis,10.1093/cid/ciaa256,
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","[{'text': 'Sixty years ago, civil aviation was...",2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy,10.1016/j.enpol.2008.08.029,178.0


In [79]:
papers_df.shape

(25323, 9)

## Filtering for covid-19 related papers released after 2019
There is a lot of noise in this dataset due to information about other strains of coronavirus so we will select only the papers that are related to Covid-19. 


Also, while the older papers may contain some important insight on the variance among the  different strains of coronavirus, for our purposes, we will only be looking at papers published on 2019 or later because that is when Covid-19 became popular.

In [80]:
# List of keywords for covid-19
cov_list = [
    'novel coronavi','covid','cov_2',
    'cord-19','cord 19','2019-nCoV',
    '2019 ncov','2019 cov','wuhan coronavi',    
]

#### `RelevantFilter` class will filter the `DataFrame` from `PaperLoader` and filter for covid-19 papers published on 2019 or later. 
We will need to supply a list of covid-related keywords to filter from to the `constructor`


In [81]:
class RelevantFilter():
    """
    Takes a list of key-words that the other methods use
    to filter for relevant papers
    """
    def __init__(self, keywords, year='2019'):
        self.KEYWORDS = keywords
        self.YEAR = year

    def extract_recent(self, df):
        return df[df['publish_time'] >= self.YEAR]

    def filter_papers(self, df):
        cov_pattern = re.compile('(' + "|".join(self.KEYWORDS) + ')',
                                 re.IGNORECASE)
        df = df[df['title'].apply(lambda x:
                                 len(cov_pattern.findall(x)) >= 1
                                  if x else False
                                 )]
        return df

In [82]:
covid_filter = RelevantFilter(cov_list, '2019')
covid_df = covid_filter.filter_papers(papers_df)
covid_df = covid_filter.extract_recent(covid_df)

In [83]:
covid_df.shape

(932, 9)

In [84]:
covid_df.head(1)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
0,306ef95a3a91e13a93bcc37fb2c509b67c0b5640,A Novel Approach for a Novel Pathogen: using a...,Thousands of people in the United States have ...,[{'text': 'The 2019 novel coronavirus (SARS-Co...,2020-03-12,"Bryson-Cahn, Chloe; Duchin, Jeffrey; Makarewic...",Clin Infect Dis,10.1093/cid/ciaa256,


In [85]:
list(covid_df.head(2)['title'].values)

['A Novel Approach for a Novel Pathogen: using a home assessment team to evaluate patients for 2019 novel coronavirus (SARS-CoV-2)',
 'Modeling the dynamics of novel coronavirus (2019-nCov) with fractional derivative']

### Keyword Analysis

In [86]:
risk_factors = [
                        {
                        'name': 'smoking',
                        'pattern': 'smok'
                        },
                        {
                        'name': 'diabetes',
                        'pattern': 'diabete'
                        },
                        {
                        'name': 'pregnancy',
                        'pattern': 'pregnan'
                        },
                        {
                        'name': 'tuberculosis',
                        'pattern': '(tubercul|MTB|TB)'
                        }
                    ]

In [87]:
design_list = [
    'mathemat', 'profil', 'cross sectional case control',
    'matched case control', 'contact', 'surviv', 'tracing,', 'time to event',
    'time-to-event', 'risk factor analysis', 'logistic regression',
    'cross-sectional case-control', 'matched case-control',
    'observational case series', 'time series analysis', 'survival analysis',
    'investigati', 'model', 'outbreak', 'stochast', 'statist', 'analysi',
    'experiment', 'excret', 'investig'
    'retrospective cohort', 'cross-sectional case-control',
    'cross sectional case control', 'prevalence survey', 'systematic review ',
    'meta-analysis', 'meta analysis', 'matched case-control',
    'matched case control', 'medical record review',
    'observational case series', 'time series analysis',
    'pseudo-randomized controlled', 'pseudo randomized controlled',
    'randomized controlled', 'retrospective analysis', 'retrospective study',
    'retrospective studies'
]

In [88]:
outcome_list = [
    'risk', 'range', 'duration', 'asymptomatic', 'infecti', 'reproducti',
    'route', 'age', 'transmm'
    'stratifi', 'period,', 'health', 'r0', 'shedd', 'viral'
    'period', 'incub', 'generat', 'factor', 'interval,', 'serial'
]

## `PaperAnalyzer` 
Will be filled with description

In [None]:
class PaperAnalyzer():
    def __init__(self, parent_df):
        self.working_df = parent_df.explode('text_body')
        self.working_df['section'] = self.working_df['text_body'].apply(
            lambda x: x['section'] if type(x) == dict else None)
        self.working_df['text_body'] = self.working_df['text_body'].apply(
            lambda x: x['text'] if type(x) == dict else None)
        self.working_df = self.working_df[self.working_df['text_body'].notna()]
        self.__ANALYZED_RISKS = False
        self.__RISK_MESSAGE = "Need to call analyze_risk()  first. Try calling self.analyze_risks()"

    def analyze_risks(self, risk_factors):
        risk_temp_arr = []
        for index, row in tqdm(self.working_df.iterrows(),
                               total=self.working_df.shape[0]):
            body = row['text_body'].lower()
            for factor in risk_factors:
                if type(factor) == dict:
                    name = factor['name']
                    pattern = factor['pattern']
                elif type(factor) == str:
                    name = factor
                    pattern = factor
                matches = re.findall(pattern, body)
                row['has_' + name + '?'] = False
                row[name + '_count'] = 0
                row[name + '_in_title'] = False

                if len(matches) > 0:
                    row['has_' + name + '?'] = True
                    row[name + '_count'] = len(matches)
                if re.findall(pattern, row['title']):
                    row[name + '_in_title'] = True
                    # If the risk factor is in title of paper we reward with higher count
                    row[name + '_count'] += 10

            risk_temp_arr.append(row)
        self.working_df = pd.DataFrame(risk_temp_arr)
        self.__ANALYZED_RISKS = True

    def analyze_designs(self, design_list):
        if not self.__ANALYZED_RISKS:
            raise ValueError(self.__ERROR_MESSAGE)

        design_temp_arr = []
        for index, row in tqdm(self.working_df.iterrows(),
                               total=self.working_df.shape[0]):
            design_matches = [
                re.findall(des, row['text_body']) for des in design_list
            ]
            design_matches = np.concatenate(design_matches)
            design_rank = 0
            if len(design_matches) > 0:
                if ('meta-analysis' or 'meta analysis') in design_matches:
                    design_rank = 6
                if ('randomized controlled') in design_matches:
                    design_rank = 5
                if ('pseudo-randomized controlled'
                        or 'pseudo randomized controlled') in design_matches:
                    design_rank = 4
                design_rank = design_rank + len(design_matches)
            row['design_matches'] = design_matches
            row['design_rank'] = design_rank
            design_temp_arr.append(row)

        self.working_df = pd.DataFrame(design_temp_arr)

    def analyze_outcomes(self, outcomes):
        if not self.__ANALYZED_RISKS:
            raise ValueError(self.__ERROR_MESSAGE)

        outcome_arr = []
        for index, row in tqdm(self.working_df.iterrows(),
                               total=self.working_df.shape[0]):

            outcome_matches = [
                re.findall(outcome, row['text_body']) for outcome in outcomes
            ]
            outcome_matches = np.concatenate(outcome_matches)
            outcome_rank = 0
            outcome_rank = len(outcome_matches)

            row['outcome_matches'] = outcome_matches
            row['outcome_rank'] = outcome_rank

            outcome_arr.append(row)

        self.working_df = pd.DataFrame(outcome_arr)

    def perform_analysis(self, risk_factors, design_list=None, outcomes=None):
        print("Analyzing risks")
        self.analyze_risks(risk_factors)
        print("Analyzing study designs")
        self.analyze_designs(design_list)
        print("Analyzing outcomes")
        self.analyze_outcomes(outcomes)

    def question_analysis(self, question):
        keywords = []
        # resolve questions into keywords of risk factors
        ####

        self.analyze_risks(keywords)

        if design_list:
            self.analyze_design(design_list)
        if outcomes:
            self.analyze_outcomes(outcomes)

    def get_df(self, risk_factor=None):
        if risk_factor:
            if not self.__ANALYZED_RISKS:
                raise ValueError(self.__ERROR_MESSAGE)
            return self.working_df[self_working_df['has_' + risk_factor +
                                                   '?'] == True]
        return self.working_df

In [None]:
covid_analysis = PaperAnalyzer(covid_df)
covid_analysis.analyze_risks(risk_factors)

In [None]:
covid_analysis.analyze_designs(design_list)

In [None]:
covid_analysis.analyze_outcomes(outcome_list)

In [None]:
enriched_covid_df = covid_analysis.get_df()

In [None]:
enriched_covid_df.to_json("../../enriched_covid_df.json", orient='records')

## `Question`
Will be filled with description

In [None]:
question_document = pd.read_csv('../../../Downloads/Kaggle COVID-19 medical dictionary - sub.question.matching.csv', index_col = 0)

In [None]:
class Question():
    def __init__(self, question, design_list=None, outcomes=None):
        self.DESIGN_LIST = design_list
        self.OUTCOMES = outcomes
        self.RISK = question
        self.risk_factors = None
        self.design_list = None
        self.outcome_list = None
        self.__resolve_question()
        if design_list:
            self.__resolve_design()
        if outcomes:
            self.__resolve_outcomes()

    def __cov_tokenize(self, sent):
        abbvr_pattern = re.compile('(e.g.|i.e.)')
        sent = abbvr_pattern.sub('', sent)
        remove_punct = str.maketrans({key: " " for key in string.punctuation})
        sent = sent.translate(remove_punct)
        return sent.replace('R', 'R0').replace('-', ' ')

    def __resolve_question(self):
        subquestion = self.RISK
        sub_q = self.__cov_tokenize(subquestion)
        print(sub_q)
        keywords = set([
            stemmer.stem(word) for word in word_tokenize(sub_q)
            if word.lower() not in stop_words and 'cov' not in word.lower()
            and word.lower().islower() #This checks and removes numbers
        ])
        self.risk_factors = list(keywords)

    def __resolve_design(self):
        design_keys = self.__cov_tokenize(self.DESIGN_LIST)
        design_keys = set([
            stemmer.stem(word) for word in word_tokenize(design_keys)
            if word.lower() not in stop_words
            and word.lower().islower() #This checks and removes numbers
        ])
        self.design_list = list(design_keys)

    def __resolve_outcomes(self):
        outcome_keys = self.__cov_tokenize(self.OUTCOMES)
        outcome_keys = set([
            stemmer.stem(word) for word in word_tokenize(outcome_keys)
            if word.lower() not in stop_words
            and word.lower().islower() #This checks and removes numbers
        ])
        self.outcome_list = list(outcome_keys)

    def get_keywords(self):
        result = {'risk': None, 'design': None, 'outcome': None}
        result['risk'] = self.risk_factors
        if self.design_list:
            result['design'] = self.design_list
        if self.outcome_list:
            result['outcome'] = self.outcome_list
        return result

In [None]:
sample_q = question_document.iloc[127]['Subquestion']

In [None]:
sample_d = question_document.iloc[127]['Design.list']
sample_d, type(sample_d)

In [None]:
sample_o = question_document.iloc[127]['Outcome.list']
sample_o, type(sample_o)

In [None]:
def analyze_question(df, question):
    reference_df = PaperAnalyzer(df)
    keys = question.get_keywords()
    reference_df.perform_analysis(keys['risk'], keys['design'], keys['outcome'])
    return reference_df.get_df()

In [None]:
question_df = analyze_question(covid_df, Question(sample_q, sample_d, sample_o))

In [None]:
expanded_df = covid_df.explode('text_body')
expanded_df['section'] = expanded_df['text_body'].apply(lambda x: x['section'] 
                                                        if type(x) == dict else None)
expanded_df['text_body'] = expanded_df['text_body'].apply(lambda x: x['text']
                                                         if type(x) == dict else None)
expanded_df = expanded_df[expanded_df['text_body'].notna()]

In [None]:
def analyze_risks(df, risk_factors):
        risk_temp_arr = []
        for index, row in tqdm(df.iterrows(),
                               total=df.shape[0]):
            body = row['text_body'].lower()
            for factor in risk_factors:
                if type(factor) == dict:
                    name = factor['name']
                    pattern = factor['pattern']
                elif type(factor) == str:
                    name = factor
                    pattern = factor
                matches = re.findall(pattern, body)
                row['has_' + name + '?'] = False
                row[name + '_count'] = 0
                row[name + '_in_title'] = False

                if len(matches) > 0:
                    row['has_' + name + '?'] = True
                    row[name + '_count'] = len(matches)
                if re.findall(pattern, row['title']):
                    row[name + '_in_title'] = True
                    # If the risk factor is in title of paper we reward with higher count
                    row[name + '_count'] += 10

            risk_temp_arr.append(row)
        df = pd.DataFrame(risk_temp_arr)
        return df

In [None]:
covid_section_df = PaperAnalyzer(covid_df).working_df

In [None]:
%%time
trial_1 = analyze_risks(covid_section_df,risk_factors)

In [None]:
%%time
trial_2 = analyze_risks_better(covid_section_df, risk_factors)

In [None]:
def analyze_risks_better(df, risk_factors):
    for factor in risk_factors:
        if type(factor) == dict:
            name = factor['name']
            pattern = factor['pattern']
        elif type(factor) == str:
            name = factor
            pattern = factor
        df[name + '_count'] = df['text_body'].apply(
            lambda x: len(re.findall(pattern, x)))
        df['has_' + name + '?'] = df[name + '_count'].apply(lambda x: x > 0)
        df[name + '_in_title'] = df['title'].apply(
            lambda x: len(re.findall(pattern, x)) > 0)
        df[name + '_count'] = df.apply(lambda x: x[name + '_count'] + 10
                                       if x[name + '_in_title'] else x,
                                       axis=1)
    return df