In [3]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import glob
from nltk import tokenize
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from rank_bm25 import BM25Okapi
from collections import Counter


I0420 17:34:30.109962 4493012288 file_utils.py:41] PyTorch version 1.4.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def format_name(author):
    """Build an author's display name from a parsed author record.

    Args:
        author: Mapping with ``'first'`` and ``'last'`` strings and a
            ``'middle'`` sequence of strings (may be empty).

    Returns:
        ``"first middle ... last"`` when ``author['middle']`` is non-empty,
        otherwise ``"first last"``. Parts are joined with single spaces.
    """
    middle_parts = author['middle']
    joined_middle = " ".join(middle_parts)

    if not middle_parts:
        name_parts = [author['first'], author['last']]
    else:
        name_parts = [author['first'], joined_middle, author['last']]
    return " ".join(name_parts)


def format_affiliation(affiliation):
    """Render an affiliation record as a single comma-separated string.

    Args:
        affiliation: Mapping that may contain ``'institution'`` (a string)
            and ``'location'`` (a mapping whose values are joined in order).

    Returns:
        The institution (when truthy) followed by the location values,
        separated by ``", "``; an empty string when neither is present.
    """
    location_parts = []
    if affiliation.get('location'):
        location_parts = list(affiliation['location'].values())

    institution = affiliation.get('institution')
    parts = ([institution] + location_parts) if institution else location_parts
    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join a list of author records into one comma-separated string.

    Args:
        authors: Iterable of author mappings accepted by ``format_name``;
            each must also carry an ``'affiliation'`` entry when
            ``with_affiliation`` is true.
        with_affiliation: When true, append ``" (affiliation)"`` to every
            author whose formatted affiliation is non-empty.

    Returns:
        The formatted authors separated by ``", "``.
    """
    def render(author):
        # One author -> "Name" or "Name (Affiliation)".
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(render(author) for author in authors)

def format_body(body_text):
    """Concatenate body paragraphs, grouped under their section headings.

    Args:
        body_text: Sequence of mappings with ``'section'`` and ``'text'``
            string entries.

    Returns:
        One string in which each distinct section (in order of first
        appearance) is emitted as ``section``, a blank line, the
        concatenated text of all its paragraphs, and another blank line.
    """
    grouped = {}
    for entry in body_text:
        heading = entry['section']
        grouped[heading] = grouped.get(heading, "") + entry['text']

    return "".join(
        heading + "\n\n" + paragraphs + "\n\n"
        for heading, paragraphs in grouped.items()
    )

def format_bib(bibs):
    """Render bibliography entries as one semicolon-separated string.

    Args:
        bibs: Either a sequence of bibliography mappings or a dict whose
            values are such mappings. Each entry must provide ``'title'``,
            ``'authors'``, ``'venue'`` and ``'year'``.

    Returns:
        Entries formatted as ``"title, authors, venue, year"`` and joined
        with ``"; "``. The input is not modified.
    """
    if isinstance(bibs, dict):
        bibs = list(bibs.values())

    formatted = []
    for bib in bibs:
        # Format the authors into a local value instead of writing back into
        # the entry, so the caller's data is left untouched without needing a
        # deep copy of the whole bibliography.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)


def format_body_text(body_text):
    """Concatenate the ``'text'`` of every paragraph, with no separator.

    Args:
        body_text: Sequence of mappings that each have a ``'text'`` string.

    Returns:
        All paragraph texts joined back to back, in input order.
    """
    return "".join(paragraph['text'] for paragraph in body_text)
    
    
def format_corpus_text(body_text, min_len=18, max_len=128):
    """Split raw body text into the sentences worth keeping for a corpus.

    Numeric parenthesised markers such as ``(12)`` and square-bracketed
    spans such as ``[3]`` are removed first, then the text is split into
    sentences with ``tokenize.sent_tokenize``.

    Args:
        body_text: The text to clean and split.
        min_len: A sentence is kept only if it has strictly more
            whitespace-separated words than this.
        max_len: A sentence is kept only if it has strictly fewer
            whitespace-separated words than this.

    Returns:
        List of sentences that satisfy the length bounds, contain at least
        one word longer than five characters, and do not contain the exact
        (case-sensitive) word ``"copyright"``.
    """
    junk_text = "copyright"

    body_text = re.sub(r'\([0-9]+\)', '', body_text)
    # The class must exclude "]" so each bracket pair is removed on its own;
    # excluding ")" instead would greedily delete everything between the
    # first "[" and the last "]" in the text.
    body_text = re.sub(r'\[[^\]]*\]', '', body_text)

    text_lines = []
    for line in tokenize.sent_tokenize(body_text):
        words = line.split()
        # A sentence with no words cannot pass the filters, and max() would
        # raise on the empty list.
        if not words or junk_text in words:
            continue
        max_word_len = max(len(word) for word in words)
        if min_len < len(words) < max_len and max_word_len > 5:
            text_lines.append(line)

    return text_lines

def find_filenames(folder):
    """Load every ``*.json`` file found recursively under ``folder``.

    Despite the name, this returns the parsed file contents, not the paths.
    The number of matched files is printed as a side effect.

    Args:
        folder: Directory (or glob prefix) to search.

    Returns:
        List of parsed JSON documents, in the order ``glob`` returned the
        files.
    """
    all_files = glob.glob(f'{folder}/**/*.json', recursive=True)
    print("Number of articles retrieved from the folder:", len(all_files))

    files = []
    for filename in all_files:
        # Parse from the handle managed by ``with`` so it is closed promptly;
        # opening the file a second time would leak one descriptor per file.
        with open(filename, encoding='utf-8') as f:
            files.append(json.load(f))
    return files


def find_file_index(folder):
    """Build a compact index of every ``*.json`` file under ``folder``.

    Each file is stored as ``(directory_id, basename)`` so that the long
    directory prefixes are kept only once. The number of indexed files is
    printed as a side effect.

    Args:
        folder: Directory (or glob prefix) to search recursively.

    Returns:
        A pair ``(file_index, path_dict_inv)`` where ``file_index`` is a list
        of ``(directory_id, basename)`` tuples in ``glob`` order and
        ``path_dict_inv`` maps each directory id to its path prefix
        (including the trailing separator), so that
        ``path_dict_inv[directory_id] + basename`` is the original path.
    """
    all_files = glob.glob(f'{folder}/**/*.json', recursive=True)
    path_dict = {}      # directory prefix -> directory id
    path_dict_inv = {}  # directory id -> directory prefix
    file_index = []

    for filename in all_files:
        last = os.path.basename(filename)
        # Cut the basename off the end only. Replacing it everywhere in the
        # path would corrupt the prefix whenever the same text also occurs in
        # a directory name.
        first = filename[:len(filename) - len(last)]
        if first not in path_dict:
            directory_id = len(path_dict)
            path_dict[first] = directory_id
            path_dict_inv[directory_id] = first
        file_index.append((path_dict[first], last))

    print(len(file_index))
    return file_index, path_dict_inv




In [9]:
def generate_clean_data(files):
    """Flatten parsed article documents into a three-column DataFrame.

    Args:
        files: Iterable of parsed article mappings, each with a
            ``'body_text'`` entry and a ``'metadata'`` mapping that holds
            ``'title'`` and ``'authors'``.

    Returns:
        DataFrame with columns ``title``, ``authors`` (formatted with
        affiliations) and ``paragraphs`` (the full body text with newlines
        replaced by spaces), one row per input document.
    """
    column_names = ['title', 'authors', 'paragraphs']

    def to_row(doc):
        # Body first, then metadata, mirroring the order the fields are read.
        paragraphs = format_body_text(doc['body_text']).replace('\n', ' ')
        metadata = doc['metadata']
        return [
            metadata['title'],
            format_authors(metadata['authors'], with_affiliation=True),
            paragraphs,
        ]

    rows = [to_row(doc) for doc in tqdm(files)]
    return pd.DataFrame(rows, columns=column_names)


def find_index_text(file_index, path_dict, index):
    """Load the articles at the given positions of a file index.

    Args:
        file_index: List of ``(directory_id, basename)`` entries.
        path_dict: Mapping from directory id to directory path prefix.
        index: Iterable of positions into ``file_index`` to load.

    Returns:
        The DataFrame produced by ``generate_clean_data`` for the selected
        documents, in the order given by ``index``.
    """
    indexed_files = []

    for i in index:
        entry = file_index[i]
        # os.path.join copes with prefixes both with and without a trailing
        # separator.
        filename = os.path.join(path_dict[entry[0]], entry[1])

        # Parse from the handle managed by ``with`` so it is closed promptly;
        # opening the file a second time would leak a descriptor.
        with open(filename, encoding='utf-8') as f:
            indexed_files.append(json.load(f))

    return generate_clean_data(indexed_files)

In [6]:

class BM25Retriever(BM25Okapi):
    """BM25 (Okapi) retriever whose index can be built up batch by batch.

    Typical use: call ``fit_retriever`` once per batch of documents, call
    ``compute_params`` once after the last batch, then call
    ``compute_scores`` for each query. Document indices returned by
    ``compute_scores`` are positions in the order documents were fed to
    ``fit_retriever`` across all batches.

    Scoring itself relies on the inherited ``_calc_idf`` and ``get_scores``.
    """

    def __init__(self, lowercase=True, tokenizer=None, top_n=10, k1=1.5, b=0.75, epsilon=0.25):
        """Create an empty retriever.

        Args:
            lowercase: Stored on the instance; not applied anywhere in this
                class.
            tokenizer: Optional callable mapping a string to a list of
                tokens. When falsy, text is split on single spaces.
            top_n: Number of results returned by ``compute_scores``.
            k1, b, epsilon: Forwarded to the ``BM25Okapi`` constructor.
        """
        # The base constructor requires a corpus, so hand it a throwaway one
        # and then reset all index state to empty below.
        super().__init__("dummy", tokenizer=None, k1=k1, b=b, epsilon=epsilon)
        self.lowercase = lowercase
        self.top_n = top_n
        self.doc_freqs = []      # per-document term -> count
        self.idf = {}
        self.doc_len = []        # per-document token count
        self.tokenizer = tokenizer
        self.num_doc = 0         # total number of tokens indexed so far
        self.corpus_size = 0     # number of documents indexed so far
        self.nd = Counter()      # term -> number of documents containing it

    def _tokenize_text(self, text):
        """Tokenize ``text`` with the configured tokenizer, else split on spaces."""
        if self.tokenizer:
            return self.tokenizer(text)
        return text.split(" ")

    def fit_retriever(self, documents):
        """Add a batch of documents to the index.

        May be called repeatedly; statistics accumulate across calls.
        ``compute_params`` must be called afterwards before scoring.

        Args:
            documents: Iterable of document strings.
        """
        tokenized_text = [self._tokenize_text(document) for document in documents]

        # Accumulate the statistics here rather than through the inherited
        # ``_initialize``: in some rank_bm25 releases that helper also
        # rewrites ``avgdl`` / ``corpus_size`` from the current batch alone,
        # which corrupts totals that are meant to span every batch.
        batch_tokens = 0
        for doc_tokens in tokenized_text:
            term_counts = Counter(doc_tokens)
            batch_tokens += len(doc_tokens)
            self.doc_len.append(len(doc_tokens))
            self.doc_freqs.append(term_counts)
            # Each distinct term counts once per document.
            self.nd.update(term_counts.keys())

        self.corpus_size += len(tokenized_text)
        self.num_doc += batch_tokens
        if self.corpus_size:
            self.avgdl = self.num_doc / self.corpus_size

    def compute_params(self):
        """Compute IDF values from the document frequencies gathered so far."""
        self._calc_idf(self.nd)

    def compute_scores(self, query):
        """Score every indexed document against ``query``.

        Args:
            query: Query string; tokenized the same way as the documents.

        Returns:
            List of ``(document_index, score)`` tuples for the ``top_n``
            highest-scoring documents, best first.
        """
        tokenized_query = self._tokenize_text(query)
        doc_scores = self.get_scores(tokenized_query)

        # argsort is ascending, so reverse it and keep the first top_n.
        top_indices = np.argsort(doc_scores)[::-1][:self.top_n]
        return list(zip(top_indices, doc_scores[top_indices]))
           


In [None]:
# Tokenize with the WordPiece tokenizer of the BERT model named below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
retriever = BM25Retriever(tokenizer=bert_tokenizer.tokenize)

sub_folders = glob.glob('./input/CORD-19-research-challenge/*/')
# Only the first sub-folder is indexed here.
# NOTE(review): the positions returned by the retriever are later looked up
# in an index built over all sub-folders - confirm the two orderings line up.
for folder in sub_folders[:1]:
    files = find_filenames(folder)
    if not files:
        # Nothing to index in this folder. Fitting here would either fail on
        # an undefined `frame` or silently re-index the previous folder's
        # documents.
        continue
    frame = generate_clean_data(files)
    retriever.fit_retriever(frame['paragraphs'])

# Compute the IDF values once all batches have been added.
retriever.compute_params()
    

In [10]:
# Index every JSON article under all dataset sub-folders.
folders = './input/CORD-19-research-challenge/*/'
file_index, path_dict_inv = find_file_index(folders)

# Show how many files were indexed, then the directory-id -> prefix mapping.
print(len(file_index), path_dict_inv, sep="\n")


59311
59311
{0: './input/CORD-19-research-challenge/custom_license/custom_license/pmc_json/', 1: './input/CORD-19-research-challenge/custom_license/custom_license/pdf_json/', 2: './input/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pmc_json/', 3: './input/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pdf_json/', 4: './input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/', 5: './input/CORD-19-research-challenge/comm_use_subset/comm_use_subset/pmc_json/', 6: './input/CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/'}


In [None]:
# Rank the indexed documents against the query with BM25 and keep the top_n.
query = "what is covid-19"
doc_scores = retriever.compute_scores(query)

# Drop the scores, keep the document positions, and load those articles.
index = [doc_position for doc_position, _ in doc_scores]
text = find_index_text(file_index, path_dict_inv, index)



In [None]:

# NOTE(review): `DocReader` is not defined or imported in the visible cells -
# confirm where it comes from before running this cell on a fresh kernel.
# Alternative: load a locally saved model instead -> DocReader('./input/model/')
reader = DocReader('bert-large-uncased-whole-word-masking-finetuned-squad')

# Ask the reader for its 5 best candidate answers over the retrieved articles,
# then reduce them to a single best answer and print it.
ans = reader.predict(df=text, query=query, n_best=5)
b_answer = reader.best_answer(ans)
print(b_answer)