## COVID-19 Open Research Dataset Challenge - What do we know about vaccines and therapeutics?
The following questions were analysed specifically: 
- Effectiveness of drugs being developed and tried to treat COVID-19 patients.
  - Clinical and bench trials to investigate less common viral inhibitors against COVID-19 such as naproxen, clarithromycin, and minocycline that may exert effects on viral replication.
- Methods evaluating potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients.
- Exploration of use of best animal models and their predictive value for a human vaccine.
- Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.
- Efforts targeted at a universal coronavirus vaccine.
- Efforts to develop animal models and standardize challenge studies
- Assays to evaluate vaccine immune response and process development for vaccines, alongside suitable animal models (in conjunction with therapeutics)

## Our approach - Creating a timeline visualizing the progress of vaccines/cures on COVID-19 and other similar viral diseases.
Our goal is to create an intuitive visualization of the progress of research on vaccines and therapeutics regarding COVID-19. Not only is this useful for professional researchers in having a quick overview of the clinical trial stages of each investigated vaccine/therapeutic, but also for the public, to have a better understanding of the time frame in which to expect a cure or solution. We decided to create visualizations of the research progress on other viruses as well as COVID-19, to get a better picture of the timescale and amount of research that goes into making a vaccine or therapeutic.

Several steps were taken to create the visualizations:
1. Load and preprocess the data:
    - lemmatize all texts and remove stopwords
2. Categorize papers based on keywords 
    - using either string pattern matching or word embeddings
    - relevant words were manually selected based on the research questions and on how indicative they are of the clinical trial stage (e.g. mouse vs human test subject, words expressing certainty etc.)
    - categories are: virus, clinical stage, drug type
3. Extract keywords/summaries from selected papers
    - TODO: write how we do this @Simon, @Silvan
4. Visualize extracted papers, links and summaries
    - TODO: explain how (after we know how) @Levi @Gloria


### 0.a Imports

In [None]:
# TODO: write your imports here
import os
import json

import pandas as pd
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.stem import WordNetLemmatizer

import pickle as pk
import numpy as np

#required additional nltk dependency
nltk.download('punkt')

# path to data
data_dir = '../../src'  
keyword_dir = '../../keywords'

### 0.b Functions

In [None]:
# As kaggle only allows notebook submissions, all functions should be in the notebook. Just copy your functions and paste them here.
          
def load_data(data_dir):
    """Load paper ids and body texts of all parsed papers below ``data_dir``.

    For every sub-directory ``<folder>`` of ``data_dir`` the JSON files in
    ``<data_dir>/<folder>/<folder>/pdf_json`` are read. From each file the
    ``paper_id`` is collected together with the lower-cased concatenation of
    the ``text`` fields of all ``body_text`` entries.

    Folders that cannot be listed and files that cannot be read or parsed are
    reported with a warning and skipped, so a single broken file neither
    aborts the rest of its folder nor leaves the id and text lists with
    different lengths.

    Args:
        data_dir: Path of the dataset root directory.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sha`` (paper id) and
        ``full_text`` (lower-cased body text), one row per loaded file.
    """
    sha = []
    full_text = []

    subdir = [x for x in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, x))]

    print(f"[INFO] Loading data from {data_dir}...")
    # loop through folders with json files
    for folder in tqdm(subdir, desc='reading folder'):
        path = os.path.join(data_dir, folder, folder, 'pdf_json')
        try:
            files = os.listdir(path)
        except OSError:
            print('[WARNING]', path, 'does not exist or something else is wrong lol')
            continue

        # loop through json files and scrape data
        for file in tqdm(files, desc='reading files'):
            file_path = os.path.join(path, file)

            # open file only if it is a file
            if not os.path.isfile(file_path):
                print('[WARNING]', file_path, 'not a file. Check pointed path directory in load_data().')
                continue

            try:
                with open(file_path, encoding='utf-8') as f:
                    data_json = json.load(f)
                paper_id = data_json['paper_id']
                # combine the body text paragraphs into one lower-cased string
                combined_str = ''.join(text['text'].lower() for text in data_json['body_text'])
            except (OSError, ValueError, KeyError, TypeError, AttributeError) as err:
                print('[WARNING]', file_path, f'could not be parsed ({err!r}); skipping file.')
                continue

            # append both values together so the two lists always stay aligned
            sha.append(paper_id)
            full_text.append(combined_str)

    loaded_samples = len(sha)
    print(f"[INFO] Data loaded into dataset instance. {loaded_samples} samples added.")

    return pd.DataFrame({'sha': sha, 'full_text': full_text})

def clean_time(val):
    """Parse a publication date string into a ``datetime``.

    The formats ``%Y-%m-%d``, ``%Y %b %d``, ``%Y %b`` and ``%Y`` are tried in
    that order. As a last resort only the first three space-separated parts
    of ``val`` are parsed as year, abbreviated month name and day, which
    ignores any trailing text.

    Args:
        val: The raw date value; anything that is not a string (e.g. a
            missing value) yields ``None``.

    Returns:
        The parsed ``datetime``, or ``None`` if no format matches.
    """
    # Imported locally: in the visible source ``datetime`` is only imported in
    # the visualisation cell further down, after this function is first
    # called, and the former bare ``except`` turned the resulting NameError
    # into ``None`` for every value.
    from datetime import datetime

    if not isinstance(val, str):
        return None

    for fmt in ('%Y-%m-%d', '%Y %b %d', '%Y %b', '%Y'):
        try:
            return datetime.strptime(val, fmt)
        except ValueError:
            continue

    try:
        return datetime.strptime('-'.join(val.split(' ')[:3]), '%Y-%b-%d')
    except ValueError:
        return None

In [None]:
def tokenize_check(text):
    """Return ``text`` as a list of word tokens.

    A list is assumed to be tokenized already and is returned unchanged, a
    string is split with ``word_tokenize``. Any other type raises
    ``TypeError``.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, str):
        return word_tokenize(text)
    raise TypeError
    

def remove_stopwords(text, remove_symbols=False):
    """Drop English stopwords (and optionally punctuation symbols) from a text.

    ``text`` may be a string, which is tokenized first, or an already
    tokenized list. With ``remove_symbols=True`` the tokens
    ``. , ( ) [ ]`` are removed as well.

    Returns the list of remaining tokens in their original order.
    """
    unwanted = set(stopwords.words('english'))

    # tokenize the input unless it already is a token list
    tokens = tokenize_check(text)

    if remove_symbols is True:
        # signs that are filtered out together with the stopwords
        unwanted = unwanted | {'.', ',', '(', ')', '[', ']'}

    return [token for token in tokens if token not in unwanted]

# from nltk.stem import WordNetLemmatizer 

def lemmatize(text):
    """Lemmatize every token of a string or token list.

    A string is tokenized first; a list is used as given. Returns the list of
    lemmatized tokens produced by ``WordNetLemmatizer``.
    """
    wordnet = WordNetLemmatizer()

    # tokenize the input unless it already is a token list
    tokens = tokenize_check(text)

    return list(map(wordnet.lemmatize, tokens))

def find_keywords(text, df):
    """Assign ``text`` to the best matching keyword category of ``df``.

    Every column of ``df`` is one category and its non-null cells are the key
    phrases of that category. A sentence of ``text`` matches a category if it
    contains at least one of its key phrases (compared case-insensitively).
    The score of a category is the number of matching sentences divided by
    the number of key phrases it has to choose from.

    Args:
        text: The text to categorize. Anything that is not a string (e.g. a
            missing abstract) is treated as having no match.
        df: ``pandas.DataFrame`` with one column of key phrases per category.

    Returns:
        A tuple ``(category, sentences)`` with the name of the best scoring
        category and the list of sentences that matched it, or
        ``('nan', 'nan')`` if no category matches. On a tie the category that
        comes first in ``df`` wins.
    """
    if not isinstance(text, str):
        return 'nan', 'nan'

    # Split into sentences once; keep the original sentence for the result and
    # a lower-cased copy for matching, because the key phrases are lower-cased.
    sentences = [(sentence, sentence.lower()) for sentence in sent_tokenize(text)]

    matches = {}
    scores = {}

    for k, column in df.to_dict('list').items():
        # lower all key phrases and remove null values
        phrases = [str(x).lower() for x in column if not pd.isnull(x)]

        # Remove redundant values (i.e., ['coronavirus', 'coronavirus disease']
        # can be left as ['coronavirus']; 'coronavirus disease' adds nothing)
        v = [x for x in phrases if not any(y in x for y in phrases if y != x)]

        # NOTE: plain substring matching cannot tell 'phase i' from 'phase ii';
        # a sentence about the latter also counts for the former.
        found = []
        for sentence, lowered in sentences:
            if sentence not in found and any(keyphrase in lowered for keyphrase in v):
                found.append(sentence)

        if found:
            matches[k] = found
            # score is scaled by the number of values to choose from
            scores[k] = len(found) / len(v)

    if not scores:
        return 'nan', 'nan'

    # return the key with the highest score together with its sentences
    best = max(scores, key=scores.get)
    return best, matches[best]

def summarize(text):
    """Placeholder summarizer: ignores ``text`` and returns a fixed string."""
    # TODO @Simon @Silvan: extract keywords
    placeholder = 'summary'
    return placeholder

#def visualize_data(data,keywords,summaries):
#    #TODO @Levi @Kwan: visualize data

### 0.c Relevant strings

In [None]:
# Each CSV is read into one dataframe of keywords.

# virus the paper is about (likely mentioned in the title)
virus_keywords = pd.read_csv(f'{keyword_dir}/virus_keywords.csv')

# clinical phase of the described research
clinical_stage_keywords = pd.read_csv(f'{keyword_dir}/phase_keywords.csv')

# type of treatment
drug_keywords = pd.read_csv(f'{keyword_dir}/drug_keywords.csv')

### 1. Load and Preprocess the data

In [None]:
# try the preloaded dataframe to speed up the process
# NOTE: unpickling can execute arbitrary code; only load a 'df.pkl' that was
# written by this notebook.
try:
    with open('df.pkl', 'rb') as cache_file:
        df = pk.load(cache_file)
except (OSError, EOFError, pk.UnpicklingError, AttributeError, ImportError):
    # cache missing or unreadable: rebuild the dataframe from the raw data
    meta_data = pd.read_csv(data_dir+'/metadata.csv')
    meta_data['publish_time'] = meta_data['publish_time'].apply(clean_time)
    full_texts = load_data(data_dir)

    # merge full text and metadata, so the paper selection can be performed either on full text
    # or abstract, if the full text is not available.
    df = pd.merge(meta_data, full_texts, on='sha', how='outer')
    # fall back to the abstract where no full text was loaded (a direct column
    # assignment instead of chained indexing, which may write to a copy)
    df['full_text'] = df['full_text'].fillna(df['abstract'])

    # drop papers that lack an abstract or a text (dropna removes a row as
    # soon as one of the listed columns is missing)
    df = df.dropna(subset=['abstract','full_text'])
    df = df[df['full_text'] != 'Unknown']
    with open('df.pkl', 'wb') as cache_file:
        pk.dump(df, cache_file)

In [None]:
# preview the first rows of the dataframe
df.head()

### 2. Define virus type, clinical stage and drug type

In [None]:
# reuse the cached categorisation if there is one
# NOTE: unpickling can execute arbitrary code; only load a 'df_kw.pkl' that
# was written by this notebook.
try:
    with open('df_kw.pkl', 'rb') as cache_file:
        df = pk.load(cache_file)
except (OSError, EOFError, pk.UnpicklingError, AttributeError, ImportError):
    # cache missing or unreadable: derive the categories from the abstracts
    # --> think about applying this on the full text instead of the abstract
    df['virus'], df['virus_sentence'] = zip(*df['abstract'].apply(find_keywords,df=virus_keywords))
    df['stage'], df['stage_sentence'] = zip(*df['abstract'].apply(find_keywords,df=clinical_stage_keywords))
    df['drug'], df['drug_sentence'] = zip(*df['abstract'].apply(find_keywords,df=drug_keywords))

    # drop papers with nan values?
    with open('df_kw.pkl', 'wb') as cache_file:
        pk.dump(df, cache_file)

In [None]:
# show the dataframe as the output of this cell
df

### 3. Summarize the texts

In [None]:
# one summary per paper, computed from its full text
df['summary'] = df['full_text'].map(summarize)

### 4. Visualize extracted papers, links and summaries

The next cell first does some extra data cleaning.
The final cell outputs a web app with a clickable web endpoint that you can use to display the cell content in full screen in the browser.

In [None]:
#add some columns to the data
df['publish_time_month'] = df.publish_time.apply(lambda x: x.strftime('%Y-%m') if not pd.isna(x) else np.nan)
df = df.set_index('title')

In [None]:
from jupyter_plotly_dash import JupyterDash

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px
import numpy as np
from datetime import datetime
import traceback

'''
This is a prototype webapp for the CORD19 challenge on Kaggle
This involves a demo pandas dataframe, and sample visualisations
All data here is fictional!
'''

# stylesheet URL for the web app
# NOTE(review): not passed to JupyterDash(...) anywhere in the visible code - confirm whether it is still needed
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
def make_bubbleplot(df):
    """Build the overview bubble plot.

    Papers are counted per virus, stage and publication month; the plot shows
    month against stage, coloured by virus, with the bubble size given by the
    count.
    """
    counts = pd.pivot_table(
        df.reset_index(),
        values=['title'],
        index=['virus', 'stage', 'publish_time_month'],
        aggfunc=np.count_nonzero,
    )
    counts = counts.reset_index().rename(columns={'title': 'count'})

    return px.scatter(
        counts,
        x="publish_time_month",
        y="stage",
        color="virus",
        size='count',
        hover_name="virus",
        title='Occurance of research tag per month per phase sized by #Occurances',
    )

app = JupyterDash(__name__)

# Page layout: a title and banner on top, followed by two tabs.
#   * 'Overview' shows the bubble plot of all papers.
#   * 'Discover' offers filters (virus, stage, drug, date range) and axis
#     choices for a bar chart, plus a text area for the clicked paper.
app.layout = html.Div(children=[
    html.H1(children='COVID-19: Visual Research Exploration Tool'),
    html.Marquee('The data in this tool is fictional!', draggable='true'),
    dcc.Tabs([
        dcc.Tab(label='Overview', children=[
            dcc.Graph(
                id='stage-plot',
                figure=make_bubbleplot(df)
            ),
        ]),
        dcc.Tab(label='Discover', children=[
            html.Div('virus filter'),
            dcc.Dropdown(
                id='dropdown-virus',
                options=[{'label': k, 'value': k} for k in df.virus.unique() if not pd.isna(k)],
                multi=True,
                value=[k for k in df.virus.unique()]
            ),
            html.Div('stage filter'),
            dcc.Dropdown(
                id='dropdown-stage',
                options=[{'label': k, 'value': k} for k in df.stage.unique()],
                multi=True,
                value=[k for k in df.stage.unique()]
            ),
            html.Div('drug filter'),
            dcc.Dropdown(
                id='dropdown-drug',
                options=[{'label': k, 'value': k} for k in df.drug.unique()],
                multi=True,
                value=[k for k in df.drug.unique()]
            ),
            html.Div('x-axis'),
            dcc.Dropdown(
                id='x-axis',
                options=[{'label': k, 'value': k} for k in ['stage', 'virus', 'drug']],
                value='stage'
            ),
            html.Div('hue (color)'),
            dcc.Dropdown(
                id='hue-axis',
                options=[{'label': k, 'value': k} for k in ['stage', 'virus', 'drug']],
                value='virus'
            ),
            # ADD FILTER BASED ON VIRUS TYPE
            dcc.DatePickerRange(
                id='date-range',
                # Series.min()/max() skip missing dates; the builtin min()/max()
                # compare element by element and give an order-dependent result
                # as soon as the column contains NaT.
                min_date_allowed=df.publish_time.min(),
                max_date_allowed=df.publish_time.max(),
                initial_visible_month=datetime(2020, 1, 1),
                start_date=datetime(2020, 1, 1),
                end_date=datetime(2020, 1, 31)
            ),
            dcc.Graph(
                id='discover-plot',
                figure=None,
            ),
            html.P(id='selected-element')
        ]),
    ]),
])

@app.callback(
    Output('selected-element', 'children'),
    [Input('discover-plot', 'clickData')]
    )
def show_point_data(data_dict):
    """Show summary and abstract of the paper that was clicked in the discover plot.

    Args:
        data_dict: The ``clickData`` of the discover plot; ``None`` as long as
            nothing has been clicked.

    Returns:
        The children for the 'selected-element' paragraph: summary and
        abstract of the clicked paper, or an empty list without a click.
    """
    print(data_dict)
    # the callback also runs before any bar has been clicked
    if not data_dict or not data_dict.get('points'):
        return []

    # 'title' is the first entry of the hover data passed to the bar chart
    title = data_dict['points'][0]['customdata'][0]
    abstract = df.loc[title]['abstract']
    summary = df.loc[title]['summary']
    return [f'SUMMARY {summary}',html.Br(),html.Br(), f'ABSTRACT:{abstract}']

@app.callback(
    Output('discover-plot', 'figure'),
    [Input('dropdown-virus', 'value'),
    Input('dropdown-stage', 'value'),
    Input('dropdown-drug', 'value'),
    Input('date-range', 'start_date'),
    Input('date-range', 'end_date'),
    Input('x-axis', 'value'),
    Input('hue-axis', 'value')
    ]
    )
def discover_plot(virus, stage, drug, start, end, x_ax, hue_ax):
    """Build the bar chart of the 'Discover' tab from the current filters.

    Args:
        virus: Selected virus values; ``None`` is treated as no selection.
        stage: Selected stage values; ``None`` is treated as no selection.
        drug: Selected drug values; ``None`` is treated as no selection.
        start: First publication day as ``YYYY-MM-DD`` string (a time part
            after ``T`` is ignored); if empty, no lower bound is applied.
        end: Last publication day, same format; if empty, no upper bound is
            applied.
        x_ax: Column shown on the x-axis.
        hue_ax: Column used for the bar colours.

    Returns:
        The plotly bar chart of the papers that pass all filters.
    """
    def parse_day(value):
        # keep only the date part in front of an optional 'T' time separator
        return datetime.strptime(value.split('T')[0], '%Y-%m-%d')

    data = df
    if start:
        data = data[data['publish_time'] >= parse_day(start)]
    if end:
        data = data[data['publish_time'] <= parse_day(end)]

    selected = (
        data.virus.isin(virus or [])
        & data.stage.isin(stage or [])
        & data.drug.isin(drug or [])
    )
    data = data[selected].copy(deep=True)

    # Every paper counts once. The column is added to the filtered copy: the
    # plot needs it on the data it draws, and the shared dataframe stays
    # untouched.
    data['count'] = 1
    data = data.reset_index()
    fig = px.bar(data, x=x_ax, y='count', color=hue_ax, hover_data=['title', 'publish_time_month', 'virus', 'drug', 'stage'])
    return fig

# the app object is the last expression of the cell, so it becomes the cell output
app