### Import libraries


In [None]:
# scientific and numerical libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk # NLP libraries

# Jupyter relevant packages
from ipywidgets import interact
import ipywidgets as widgets
from IPython.display import display

#general libraries
from pathlib import Path, PurePath
import requests
from requests.exceptions import HTTPError, ConnectionError
import re, os, sys

In [None]:
# Make Covid19_Search_Tool/src importable: it is expected to sit next to the
# current working directory, i.e. under the working directory's parent.
nb_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(nb_dir, 'src')
# Only append once so re-running the cell does not grow sys.path.
if data_dir not in sys.path:
    sys.path.append(data_dir)

# Import local libraries
from utils import ResearchPapers
from nlp import SearchResults, WordTokenIndex, preprocess, RankBM25Index

### Download data from local folder
Requires visiting [COVID-19 Open Research Dataset Challenge (CORD-19)](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge), downloading the data (you need a Kaggle account), then moving and unzipping the data in Covid19_Search_Tool/data

In [None]:
# Load the metadata table from the local copy of the CORD-19 dataset
data_path = os.path.join(os.getcwd(), "../data", "CORD-19-research-challenge")
metadata_path = os.path.join(data_path, 'metadata.csv')
# Read the identifier columns as strings so pandas does not infer a numeric dtype
metadata = pd.read_csv(metadata_path,
                       dtype={'Microsoft Academic Paper ID': str,
                              'pubmed_id': str})

# Set the abstract to the paper title if it is null
metadata['abstract'] = metadata['abstract'].fillna(metadata['title'])
print("Number of articles BEFORE removing duplicates: %s " % len(metadata))

# Some papers are duplicated since they were collected from separate sources. Thanks Joerg Rings
# A row counts as a duplicate only when title, abstract and publish_time are all
# present AND an earlier row carries the same (title, abstract) pair.
has_key_fields = (metadata['title'].notnull()
                  & metadata['abstract'].notnull()
                  & metadata['publish_time'].notnull())
duplicate_paper = has_key_fields & metadata.duplicated(subset=['title', 'abstract'])
# NOTE: this cell used to call metadata.dropna(subset=['publish_time', 'journal'])
# without assigning the result, which had no effect. The no-op call was removed so
# that rows lacking a journal or publish date are still kept, as before.
metadata = metadata[~duplicate_paper].reset_index(drop=True)
print("Number of articles AFTER removing duplicates: %s " % len(metadata))

### Create Data Classes for the Research Dataset and Papers
These classes make it easier to navigate through the data sources. There is a class called ResearchPapers that wraps the entire dataset and provides useful functions to navigate through it, and a class called Paper that makes it easier to view each paper.

In [None]:
# Wrap the de-duplicated metadata table in the project's ResearchPapers class
# (imported from utils) so individual papers can be accessed by index below.
papers = ResearchPapers(metadata)

#### Show a Paper

In [None]:
# Look up the paper at index 1; as the last expression in the cell, the
# notebook renders whatever object this returns.
papers[1]

#### Pull info from a paper

In [None]:
# Fetch one paper by position and print a short summary of it.
index=1
paper=papers[index]
# title() and authors(split=True) are accessor methods on the returned paper object;
# presumably split=True yields the authors as separate entries — confirm in src/utils.py
print("Example paper #%s\nTitle: %s\nAuthors: %s " % (index, paper.title(), paper.authors(split=True)))

### Text Preprocessing
To prepare the text for the search index we perform the following steps (in Covid19_Search_Tool/src/nlp.py)
1.   Remove punctuations and special characters
2.   Convert to lowercase
3.   Tokenize into individual tokens (words mostly)
4.   Remove stopwords (like "and", "to")
5.   Lemmatize

In [None]:
# Hardcode the data we want to use in search
SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal', 'publish_time']

### Create the index (This takes several minutes)

In [None]:
# Build the search index over the metadata table (slow: see the heading above).
# The resulting object exposes .search(query, n), used by the cells below.
bm25_index = RankBM25Index(metadata, SEARCH_DISPLAY_COLUMNS)

### Search by date

In [None]:
# example output
query='curise ship'
n=50
results = bm25_index.search(query,n)
results.results.sort_values(by=['publish_time'], ascending=False).head(5)

In [None]:
# example output
query='ACE spike'
n=50
results = bm25_index.search(query,n)
results.results.sort_values(by=['publish_time'], ascending=False).head(5)

### Creating an Autocomplete Search bar with ranking by score
Here we provide a search bar with autocomplete. This uses IPywidgets interactive rendering of a TextBox.

In [None]:
def search_papers(SearchTerms: str):
    """Search the index for the typed terms and show the hits in the notebook.

    SearchTerms: free-text query supplied by the interactive text box.

    Returns the object produced by ``bm25_index.search``. The hits are shown in
    the order the index returns them; no re-sorting by date is applied here.
    """
    max_hits = 200
    hits = bm25_index.search(SearchTerms, n=max_hits)
    # Nothing to render for an empty result set; still hand the object back.
    if len(hits) == 0:
        return hits
    display(hits)
    return hits

# Build an interactive widget around search_papers; the string default for
# SearchTerms makes ipywidgets generate a text box pre-filled with 'ACE spike'.
searchbar = widgets.interactive(search_papers, SearchTerms='ACE spike')
# Last expression in the cell, so the notebook renders the widget.
searchbar

### TODO

In [None]:
# Do search with option to restrict years available

### Looking at the Covid Research Tasks
This dataset has a number of tasks. We will try to organize the papers according to the tasks

What is known about transmission, incubation, and environmental stability?
What do we know about COVID-19 risk factors?
What do we know about virus genetics, origin, and evolution?
What has been published about ethical and social science considerations?
What do we know about diagnostics and surveillance?
What has been published about medical care?
What do we know about non-pharmaceutical interventions?
What has been published about information sharing and inter-sectoral collaboration?
What do we know about vaccines and therapeutics?

In [None]:
# Each entry pairs a task question (used as the dropdown label and lookup key)
# with the space-separated keywords sent to the search index for that task.
# NOTE: the task list in the text above also names "non-pharmaceutical
# interventions" and "information sharing and inter-sectoral collaboration";
# neither has a keyword entry here yet.
tasks = [('What is known about transmission, incubation, and environmental stability?',
        'transmission incubation environment coronavirus'),
        ('What do we know about COVID-19 risk factors?', 'risk factors'),
        ('What do we know about virus genetics, origin, and evolution?', 'genetics origin evolution'),
        # Trailing '?' added so this label matches the other tasks and the list above
        ('What has been published about ethical and social science considerations?', 'ethics ethical social'),
        ('What do we know about diagnostics and surveillance?', 'diagnose diagnostic surveillance'),
        ('What has been published about medical care?', 'medical care'),
        ('What do we know about vaccines and therapeutics?', 'vaccines vaccine vaccinate therapeutic therapeutics')]
tasks = pd.DataFrame(tasks, columns=['Task', 'Keywords'])

#### Research papers for each task
Here we add a dropdown that allows for selection of tasks and show the search results

In [None]:
def show_task(Task):
    """Print the selected task and return the search results for its keywords.

    Task: the task question exactly as stored in the ``Task`` column of ``tasks``.

    Raises IndexError if no row of ``tasks`` matches ``Task``.
    """
    print(Task)
    # Select the Keywords cell of the first row whose Task matches the selection.
    task_keywords = tasks.loc[tasks.Task == Task, 'Keywords'].iloc[0]
    return bm25_index.search(task_keywords, n=200)
    
# Passing the list of task questions makes ipywidgets offer them as a dropdown;
# show_task is re-run for each selection. Note this rebinds the notebook-level
# name `results` used by the earlier example cells.
results = interact(show_task, Task = tasks.Task.tolist());