In [153]:
import os
import re
import sys
import glob
import json
import string
import editdistance

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

stemmer = PorterStemmer()
root = '../../kaggle_data/'
stop_words = list(set(stopwords.words('english')))
stop_words.extend(['within', 'what', 'how', 'eg', 'ie'])

## Data Parsing and Extraction

### `PaperLoader` class will load all papers for the challenge and provide an interface for us to obtain `DataFrames` to work with. The focus will be on:
- Obtaining Paper title, Abstract, Body
    - The text body is filtered to remove sections containing lots of citations and hyperlinks
- Obtaining Authors, Journal of Publication, Publication Date and Publication Date
- Obtaining journal ratings(H index) to potentially sort paper based on journal quality
    - For the journal ratings, we use a list we obtained from **INSERT LINK HERE**

In [15]:
class PaperLoader():
    """
    Loads, parses and merges metadata for papers
    """
    
    def __init__(self, root_dir, no_bib=True):
        """
        Initializes PaperLoader class to read all .json files from root_directory
            
            no_bib: if true, clean noisy sections with bibliographies
            root_dir: root directory for papers
        """
        self.ROOT_DIR = root_dir
        self.JSON_FILES = glob.glob(f'{root}/**/*.json', recursive=True)
        self.PAPERS_COLUMN = {
            "doc_id": [None],
            "title": [None],
            "abstract": [None],
            "text_body": [None]
        }
        self.PAPERS_DF = None
        self.NO_BIB = no_bib

    
    def __clean_bib(self, body_text, thres):
        """
        Removes sections with more than 5 URL/DOI/HTTP instances
            
            body_text: array of dictionaries for text_body
            thres: number of hyperlinks tolerated before removal 
        """
        # Sometimes, the text body has duplicate sections consecutively.
        merged_body = []
        for segment in body_text:
            # We will combine these duplicate sections
            if len(merged_body) > 0:
                if merged_body[-1]['section'] == segment['section']:
                    merged_body[-1]['text'] += '\n' + segment['text']
                    continue
            merged_body.append(segment)

        merged_body = [
            segment for segment in merged_body
            if len(re.findall("(http|doi|www)", segment['text'])) <= thres
        ]
        return merged_body


    def create_paper_df(self):
        """
        Creates a Pandas DataFrame from all json files in root_directory
        Each json file represents a paper. 
        Features extracted are: doc_id, title, abstract, text_body
        """
        df_list = []
        
        for i in tqdm(range(len(self.JSON_FILES))):
            file_name = self.JSON_FILES[i]
            
            #Initialize row for returned df. Each row represents a paper
            row = {x: None for x in self.PAPERS_COLUMN}

            with open(file_name) as json_data:
                data = json.load(json_data)

                row['doc_id'] = data['paper_id']
                row['title'] = data['metadata']['title']
                
                # If title is empty, we skip the paper
                if len(row['title']) <= 2:
                    continue

                # If a paper does not have an abstract of a body, we will skip it
                if ('abstract' not in data or 'body_text' not in data):
                    continue
                else:
                    # Now need all of the abstract. Put it all in
                    # a list then use str.join() 
                    abstract_list = [abst['text'] for abst in data['abstract']]
                    abstract = "\n ".join(abstract_list)

                # Skip the paper if abstract is empty
                if len(abstract) <= 2:
                    continue

                row['abstract'] = abstract

                # And lastly the body of the text.
                # These clauses check if the user wants to clean up references
                if self.NO_BIB:
                    body_list = self.__clean_bib(data['body_text'], 4)
                else:
                    body_list = [bt for bt in data['body_text']]

                row['text_body'] = body_list

                df_list.append(row)
        # create final dataframe
        self.PAPERS_DF = pd.DataFrame(df_list)


    def merge_metadata(self, metadata = 'metadata.csv'):
        """
            Joins paper information with information on journal for paper,
            authors, doi and published date  
                metadata: path to csv file containing metadata
        """
        metadata_df = pd.read_csv(self.ROOT_DIR + metadata)
        metadata_df = metadata_df.loc[:, 
                          ['sha', 'publish_time', 'authors', 'journal', 'doi']]
        self.PAPERS_DF = self.PAPERS_DF.merge(metadata_df,
                                              left_on='doc_id',
                                              right_on='sha',
                                              how='inner')

    def merge_journals(self):
        """
        Joins paper information with information on journal ratings
        Important column: H_Index
        """
        journal_df = pd.read_csv(root + 'scimagoj_2018.csv', sep=';')
        papers_ratings_df = self.PAPERS_DF.merge(
            journal_df.loc[:, ['Title', 'H index']],
            left_on='journal',
            right_on='Title',
            how='left')
        papers_ratings_df = papers_ratings_df.drop(
            ['sha', 'Title'], axis=1).reset_index(drop=True)
        self.PAPERS_DF = papers_ratings_df

    def get_df(self):
        """
        Returns processed dataframe
        """
        self.PAPERS_DF = self.PAPERS_DF.dropna(
            subset=['abstract', 'text_body'])
        return self.PAPERS_DF

We will now parse the papers from our data(root) directory and store them in `papers_df`.

In [16]:
paper_loader = PaperLoader(root)
paper_loader.create_paper_df()
paper_loader.merge_metadata()
paper_loader.merge_journals()
papers_df = paper_loader.get_df()

HBox(children=(FloatProgress(value=0.0, max=59311.0), HTML(value='')))




In [17]:
papers_df.head(2)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
0,306ef95a3a91e13a93bcc37fb2c509b67c0b5640,A Novel Approach for a Novel Pathogen: using a...,Thousands of people in the United States have ...,[{'text': 'The 2019 novel coronavirus (SARS-Co...,2020-03-12,"Bryson-Cahn, Chloe; Duchin, Jeffrey; Makarewic...",Clin Infect Dis,10.1093/cid/ciaa256,
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","[{'text': 'Sixty years ago, civil aviation was...",2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy,10.1016/j.enpol.2008.08.029,178.0


In [18]:
papers_df.shape

(25312, 9)

## Filtering for covid-19 related papers released after 2019
There is a lot of noise in this dataset due to information about other strains of coronavirus so we will select only the papers that are related to Covid-19. 

While the older papers may contain some important insight on the variance among the  different strains of coronavirus, for our purposes, we will only be looking at papers published on 2019 or later because that is when Covid-19 was first discovered in humans.

In [19]:
# List of keywords for covid-19
cov_list = [
    'novel coronavi',
    'covid',
    'cov_2',
    'cord-19',
    'cord 19',
    '2019-nCoV',
    '2019 ncov',
    '2019 cov',
    'wuhan coronavi',
]

### `RelevantFilter` class will filter the dataframe from `PaperLoader` and filter for covid-19 papers published on 2019 or later. 
We will need to supply a list of covid-related keywords to filter from to the `constructor`


In [20]:
class RelevantFilter():
    
    def __init__(self, keywords, year='2019'):
        """
        constructor for RelevantFilter
            keywords: keywords to filter for
            year: papers written before this year will be discarded
        """
        self.KEYWORDS = keywords
        self.YEAR = year

    def extract_recent(self, df):
        """
        extracts documents published on or after self.YEAR
        """
        return df[df['publish_time'] >= self.YEAR]

    def filter_papers(self, df):
        """
        Filters for papers whose title have mention of 
        any of the terms in self.KEYWORDS
        """
        pattern = re.compile('(' + "|".join(self.KEYWORDS) + ')',
                                 re.IGNORECASE)
        # We will filter for rows with one or more matches 
        # for title and covid keywords
        df = df[df['title'].apply(lambda x: 
                                  len(pattern.findall(x)) >= 1
                                  if x else False)]
        
        return df

We will filter through `papers_df` to get only covid-19 related papers in `covid_df`

In [21]:
covid_filter = RelevantFilter(cov_list, '2019')
covid_df = covid_filter.filter_papers(papers_df)
covid_df = covid_filter.extract_recent(covid_df)

In [22]:
covid_df.shape

(929, 9)

In [23]:
covid_df.head(1)

Unnamed: 0,doc_id,title,abstract,text_body,publish_time,authors,journal,doi,H index
0,306ef95a3a91e13a93bcc37fb2c509b67c0b5640,A Novel Approach for a Novel Pathogen: using a...,Thousands of people in the United States have ...,[{'text': 'The 2019 novel coronavirus (SARS-Co...,2020-03-12,"Bryson-Cahn, Chloe; Duchin, Jeffrey; Makarewic...",Clin Infect Dis,10.1093/cid/ciaa256,


In [24]:
list(covid_df.head(2)['title'].values)

['A Novel Approach for a Novel Pathogen: using a home assessment team to evaluate patients for 2019 novel coronavirus (SARS-CoV-2)',
 'Modeling the dynamics of novel coronavirus (2019-nCov) with fractional derivative']

## Keyword Analysis (Phase 1)

We will now go through the papers to extract and rank excerpts that contain relevant information about risk factors for covid-19. 
We will do this through an analysis of:

- Risk factors for covid-19
- Study designs
    - We will use this to evaluate the quality of a paper's methodologies for our rankings
- Outcomes
    - We will incentivise excerpts to explicitly mention outcomes that we have found researchers look for(in our interviews)

**Note**: The list of keywords were all obtained from a crowdsourced medical dictionary researchers had assembled. You can find more details [here](https://docs.google.com/spreadsheets/d/1t2e3CHGxHJBiFgHeW0dfwtvCG4x0CDCzcTFX7yz9Z2E/edit#gid=1217643351)

In [114]:
risk_factors = [{
    'name': 'smoking',
    'pattern': 'smok'
}, {
    'name': 'diabetes',
    'pattern': 'diabete'
}, {
    'name': 'pregnancy',
    'pattern': 'pregnan'
}, {
    'name': 'tuberculosis',
    'pattern': '(tubercul|mtb|\btb[A-Za-z0-9]\b)'
}]

In [115]:
design_list = [
    'mathemat', 'profil', 'cross sectional case control',
    'matched case control', 'contact', 'surviv', 'tracing,', 'time to event',
    'time-to-event', 'risk factor analysis', 'logistic regression',
    'cross-sectional case-control', 'matched case-control',
    'observational case series', 'time series analysis', 'survival analysis',
    'investigati', 'model', 'outbreak', 'stochast', 'statist', 'analysi',
    'experiment', 'excret', 'investig'
    'retrospective cohort', 'cross-sectional case-control',
    'cross sectional case control', 'prevalence survey', 'systematic review ',
    'meta-analysis', 'meta analysis', 'matched case-control',
    'matched case control', 'medical record review',
    'observational case series', 'time series analysis',
    'pseudo-randomized controlled', 'pseudo randomized controlled',
    'randomized controlled', 'retrospective analysis', 'retrospective study',
    'retrospective studies'
]

In [116]:
outcome_list = [
    'risk', 'range', 'duration', 'asymptomatic', 'infecti', 'reproducti',
    'route', 'age', 'transmm'
    'stratifi', 'period,', 'health', 'r0', 'shedd', 'viral'
    'period', 'incub', 'generat', 'factor', 'interval,', 'serial'
]

### `PaperAnalyzer` class will take in a DataFrame of papers and then analyze each paper. 
The analysis is done with with its `analyze_risks()`, `analyze_designs()` and `analyze_outcomes()` methods that will analyze the risk factors, designs and outcomes respectively for excerpts in the paper. Finally, the `get_df()` method will return a new DataFrame with rankings for relevancy of excerpts. The rankings also factor in the `section` of the paper that the excerpt is from, with sections like **discussion** or **results** that seem to have pertinent, concise information ranked higher. Furthermore, these rankings are also normalized by the lenght of the excerpts

*Note: These rankings for sections were determined through our interviews with epidimiologists.*

In [129]:
class PaperAnalyzer():    
    """
    Takes in a dataframe of papers and set it up for analysis
    """
    # Setting up static constants
    DEFAULT_RISKS = risk_factors
    DEFAULT_DESIGNS = design_list
    DEFAULT_OUTCOMES = outcome_list
    
    def __init__(self, parent_df):
        """
        Explodes the passed dataframe on sections for more granular analysis
        Sets up ranks to be updated later by methods
        """
        # Section ratings
        self.section_ratings = {
                        'discus': 10,
                        'concl': 10,
                        'resul': 10,
                        'analy': 9,
                        'impli': 9,
                        'valu': 9,
                        'intro': 6
                        }
        
        self.df = parent_df.explode('text_body')
        # Extracting section headers
        self.df['section'] = self.df['text_body'].apply(lambda x: 
                                                        x['section'] 
                                                        if type(x) == dict 
                                                        else None)
        # Extracting section texts
        self.df['text_body'] = self.df['text_body'].apply(lambda x:
                                                          x['text'] 
                                                          if type(x) == dict 
                                                          else None)
        # Dropping rows where section text is empty
        self.df = self.df[self.df['text_body'].notna()]
        self.df['total_rank'] = 0
        # TQDM is used for progress bars
        tqdm.pandas()

    def analyze_risks(self, risk_factors):
        """
        Analyses papers in self.df for risk factors and returns a report df
        with columns has_{risk_factor}?, {risk_factor}_count, 
        {risk_factor}_in_title and updates {total_rank} for each row.
        The match_indices column is produced for ease of visualization
        in the web app.
        """
        if risk_factors == None:
            risk_factors = PaperAnalyzer.DEFAULT_RISKS
        
        if type(risk_factors[0]) == dict:
            patterns = [risk['pattern'] for risk in risk_factors]
        elif type(risk_factors[0]) == str:
            patterns = [risk for risk in risk_factors]
        
        self.df = self.df[self.df['text_body'].apply(lambda x:
                                                    any(re.compile(pattern, re.IGNORECASE).findall(x)
                                                       for pattern in patterns)
                                                    )]
        self.df['risk_factors'] = [[]] * len(self.df)
        self.df['match_indices'] = [[]] * len(self.df)
        for i in tqdm(range(len(risk_factors))):
            factor = risk_factors[i]
            if type(factor) == dict:
                name = factor['name']
                pattern = re.compile(factor['pattern'], re.IGNORECASE)
            elif type(factor) == str:
                name = factor
                pattern = re.compile(factor, re.IGNORECASE)
    
            self.df['_matches'] = self.df['text_body'].apply(lambda x: 
                                                                      [(m.start(), m.group()) 
                                                                       for m in pattern.finditer(x)])
            
            self.df[name + '_count'] = self.df['_matches'].apply(lambda x: len(x))
            self.df['has_' + name + '?'] = self.df[name + '_count'].apply(lambda x: x > 0)
            self.df[name + '_in_title'] = self.df['title'].apply(lambda x:
                                                                         len(pattern.findall(x)) > 0)
            self.df[name + '_count'] = self.df.apply(lambda x:
                                                             x[name + '_count'] + 10
                                                            if x[name + '_in_title'] 
                                                             else x[name + '_count'],
                                                            axis=1)
            self.df['total_rank'] += self.df[name + '_count']
            
            self.df['risk_factors'] = self.df.apply(lambda x: 
                                    x['risk_factors'] + [name] if x['has_' + name + '?']
                                    else x['risk_factors'],
                                   axis=1)
            self.df['match_indices'] = self.df.apply(lambda x: 
                                    x['match_indices'] + [n[0] for n in x['_matches']] if x['has_' + name + '?']
                                    else x['match_indices'],
                                   axis=1)
            self.df.drop('_matches', axis=1, inplace=True)
            

    def analyze_designs(self, design_list):
        """
        Analyses papers in self.df for study designs and returns a report df 
        with 'design' and 'design_rank'. 'design_rank' is decided upon from the 
        input in crowdsourced medical dictionary.
        """
        if design_list == None:
            design_list = PaperAnalyzer.DEFAULT_DESIGNS
        self.df['design'] = self.df['text_body'].progress_apply(lambda x:
                                                                      [re.findall(des, x, re.IGNORECASE) 
                                                                       for des in design_list])
        self.df['design_rank'] = self.df['design'].apply(lambda x:
                                                                len(x))
        self.df['total_rank'] += self.df['design_rank']

    def analyze_outcomes(self, outcomes):
        """
        Analyses papers in self.df for outcomes and returns a report df 
        with 'outcomes' and 'outcome_rank'. 'outcome_rank' is decided upon 
        by the frequency of mentions of outcomes in the excerpt
        """
        
        if outcomes == None:
            outcomes = PaperAnalyzer.DEFAULT_OUTCOMES
        self.df['outcomes'] = self.df['text_body'].progress_apply(lambda x:
                                                                        [re.findall(outcome, x, re.IGNORECASE)
                                                                         for outcome in outcomes])
        self.df['outcome_rank'] = self.df['outcomes'].apply(lambda x: len(x))
        self.df['total_rank'] += self.df['outcome_rank']

    def perform_analysis(self, risk_factors, design_list=None, outcomes=None):
        """
        This function is a wrapper function that provides interface
        to conduct analysis on all of risk factors, study designs and
        outcomes. Users may specify their own design_list or outcomes. If not,
        the default is used.
        """
        print("Analyzing risks")
        self.analyze_risks(risk_factors)
        print("Analyzing study designs")
        self.analyze_designs(design_list)
        print("Analyzing outcomes")
        self.analyze_outcomes(outcomes)

    def get_df(self, risk_factor=None):
        """
        Applies section ratings, updates total ratings and returns reporting df
            risk_factor: if specified, the returned df will only have excerpts
                            that mention this risk factor
        """
        self.df['section_rank'] = self.df['section'].apply(lambda x: self.section_ratings[x] 
                                                           if x in self.section_ratings else 5)
        self.df['total_rank'] += self.df['section_rank']
        
        # Normalizing total rank for length of excerpts
        self.df['total_rank'] = self.df.apply(lambda x: x['total_rank']
                                                     /(len(word_tokenize(x['text_body'])))
                                              , axis=1)
        if risk_factor:
            if not self.__ANALYZED_RISKS:
                raise ValueError(self.__ERROR_MESSAGE)
            return self.df[self.df['has_' + risk_factor +
                                                   '?'] == True]
        return self.df

In [None]:
def rank_design(design_keyword):
    design_rankings = {
        
    }
    
    min_distance = 1000000000
    for key in design_rankings.keys():
        distance = editdistance.eval(design_keyword, key)
        min_distance = min(min_distance, distance)
    
        

In [130]:
covid_analysis = PaperAnalyzer(covid_df)
covid_analysis.analyze_risks(risk_factors)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [131]:
covid_analysis.analyze_designs(design_list)

HBox(children=(FloatProgress(value=0.0, max=339.0), HTML(value='')))




In [132]:
covid_analysis.analyze_outcomes(outcome_list)

HBox(children=(FloatProgress(value=0.0, max=339.0), HTML(value='')))




In [133]:
enriched_covid_df = covid_analysis.get_df()

In [134]:
enriched_covid_df.shape

(339, 30)

In [128]:
enriched_covid_df.to_json("../../enriched_covid_df.json", orient='records')

In [150]:
risk_factors, enriched_covid_df.sort_values(by='total_rank', ascending=False).iloc[3]['text_body']

([{'name': 'smoking', 'pattern': 'smok'},
  {'name': 'diabetes', 'pattern': 'diabete'},
  {'name': 'pregnancy', 'pattern': 'pregnan'},
  {'name': 'tuberculosis', 'pattern': '(tubercul|mtb|\x08tb[A-Za-z0-9]\x08)'}],
 'According to the findings of the present study, hypertension, cardiovascular diseases, diabetes mellitus, smoking, COPD, malignancy, and chronic kidney disease were among the most prevalent underlying diseases among hospitalized patients with COVID-19, respectively.')

#### This marks the end of phase 1. The resulting dataframe will be stored as a json to be served by the web app.

## Question Search (Phase 2)
We will extend the capabilities from the `PaperAnalyzer` class and attempt to answer some questions.

### The `Question` class will decompose and resolve a question about risk factors.
The result will then be piped to an instance of `PaperAnalyzer` to conduct similar analysis. Users will be able to specify their own list of outcomes. If not specified, the default set of outcomes will be used.

In [None]:
class Question():
    """
    The purpose of this class is to resolve a question for 
    keyword searching
    """
    def __init__(self, question, design_list=None, outcomes=None):
        """
        The constuctor does most of the method-calling for question resolution
        """
        self.DESIGN_LIST = design_list
        self.OUTCOMES = outcomes
        self.RISK = question
        self.risk_factors = None
        self.design_list = None
        self.outcome_list = None
        self.__resolve_question()
        if design_list:
            self.__resolve_design()
        if outcomes:
            self.__resolve_outcomes()

    def __question_tokenize(self, sent):
        """
        Cleans the question string
        """
        abbvr_pattern = re.compile('(e.g.|i.e.)')
        sent = abbvr_pattern.sub('', sent)
        remove_punct_dict = {key: " " for key in string.punctuation}
        remove_punct_dict['.'] = ''
        remove_punct = str.maketrans(remove_punct_dict)
        sent = sent.translate(remove_punct)
        return sent.replace('R', 'R0').replace('-', ' ')

    def __resolve_question(self):
        """
        stems and removes irreleavnt words from questions
        to create keywords for keyword analysis
        """
        subquestion = self.RISK
        sub_q = self.__question_tokenize(subquestion)
        keywords = set([
            stemmer.stem(word) for word in word_tokenize(sub_q)
            if word.lower() not in stop_words and 'cov' not in word.lower()
            and word.lower().islower()  #This checks and removes numbers
        ])
        self.risk_factors = list(keywords)

    def __resolve_design(self):
        """
        Resolves study designs to allow for study-design evaluation
        """
        design_keys = self.__question_tokenize(self.DESIGN_LIST)
        design_keys = set([
            stemmer.stem(word) for word in word_tokenize(design_keys)
            if word.lower() not in stop_words
            and word.lower().islower()  #This checks and removes numbers
        ])
        self.design_list = list(design_keys)

    def __resolve_outcomes(self):
        """
        Resolves outcomes to allow for outcome evaluation
        """
        outcome_keys = self.__question_tokenize(self.OUTCOMES)
        outcome_keys = set([
            stemmer.stem(word) for word in word_tokenize(outcome_keys)
            if word.lower() not in stop_words
            and word.lower().islower()  #This checks and removes numbers
        ])
        self.outcome_list = list(outcome_keys)

    def get_keywords(self):
        """
        Returns keywords from earlier methods
        """
        result = {'risk': None, 'design': None, 'outcome': None}
        result['risk'] = self.risk_factors
        if self.design_list:
            result['design'] = self.design_list
        if self.outcome_list:
            result['outcome'] = self.outcome_list
        return result

In [None]:
def analyze_question(df, question):
    """
    Function to take in a Question instance and a 
    dataframe with covid-excerpts to perform 
    evaluation and rankings on information relevancy
    """
    reference_df = PaperAnalyzer(df)
    keys = question.get_keywords()
    reference_df.perform_analysis(keys['risk'], keys['design'],
                                  keys['outcome'])
    return reference_df.get_df()

In [None]:
question_document = pd.read_csv(
    '../../../Downloads/Kaggle COVID-19 medical dictionary - sub.question.matching.csv',
    index_col=0)

In [None]:
sample_q = question_document.iloc[127]['Subquestion']

In [None]:
sample_q = sent_tokenize(sample_q)[0]
sample_q

In [None]:
sample_d = question_document.iloc[127]['Design.list']
sample_d, type(sample_d)

In [None]:
sample_o = question_document.iloc[127]['Outcome.list']
sample_o, type(sample_o)

In [None]:
question_df = analyze_question(covid_df, Question(sample_q, sample_d,
                                                  sample_o))

In [None]:
question_df.iloc[0]['match_indices']

In [None]:
for i in range(10):
    print(question_df.sort_values(by='total_rank', ascending = False).iloc[i]['text_body'])
    print("==========================================================================")