In [19]:
import pandas as pd
import numpy as np
import re

import scipy

from transformers import BertForQuestionAnswering
from transformers import BertTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Process Dataset

In [2]:
# Load the question/answer dataset. index_col=False tells pandas not to use
# the first CSV column as the row index.
df = pd.read_csv('datasets/QA.csv',index_col=False)

In [3]:
df.head()

Unnamed: 0,question,answer,source,link,last_updated
0,What is COVID-19?,COVID-19 is the disease caused by a new corona...,World Health Organization (WHO),https://www.who.int/news-room/q-a-detail,3/29/2021
1,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are fever...,World Health Organization (WHO),https://www.who.int/news-room/q-a-detail,3/29/2021
2,What happens to people who get COVID-19?,"Among those who develop symptoms, most (about ...",World Health Organization (WHO),https://www.who.int/news-room/q-a-detail,3/29/2021
3,Who is most at risk of severe illness from COV...,"People aged 60 years and over, and those with ...",World Health Organization (WHO),https://www.who.int/news-room/q-a-detail,3/29/2021
4,Are there long-term effects of COVID-19?,"Some people who have had COVID-19, whether the...",World Health Organization (WHO),https://www.who.int/news-room/q-a-detail,3/29/2021


In [4]:
df.shape

(831, 5)

Transform question column to lowercase

In [5]:
# Lower-cased, string-typed working copy of the questions; the raw 'question'
# column is left untouched.
df['qprocessed'] = df['question'].str.lower().astype(str)

Remove every token that contains a digit (e.g. the "19" in "covid-19", or a whole token such as "h1n1")

In [6]:
# Drop each token that contains a digit (the whole token, not only the digit),
# then trim leading/trailing whitespace.
df['qprocessed'] = df['qprocessed'].map(lambda text: re.sub(r'\w*\d\w*', '', text).strip())

Remove punctuation

In [7]:
# Strip everything that is not a word character or whitespace.
# regex=True must be explicit: from pandas 2.0 the default is a literal match,
# which would leave all punctuation in place. The raw string keeps the pattern
# identical while avoiding invalid-escape warnings.
df['qprocessed'] = df['qprocessed'].str.replace(r'[^ \w\s]', '', regex=True)

  df['qprocessed'] = df['qprocessed'].str.replace('[^ \w\s]','')


In [8]:
df['qprocessed']

0                                          what is covid
1                         what are the symptoms of covid
2                   what happens to people who get covid
3       who is most at risk of severe illness from covid
4                    are there longterm effects of covid
                             ...                        
826         what are the antibody levels postvaccination
827    how soon are antibody levels  formed postvacci...
828    if the vaccine will require regular or booster...
829                            will i need booster shots
830               does the vaccine require booster shots
Name: qprocessed, Length: 831, dtype: object

# Create embeddings

Load model for sentence transformer

In [9]:
# Hugging Face checkpoint id. The same checkpoint is reused below for both the
# sentence encoder (SentenceTransformer) and the extractive QA reader
# (BertForQuestionAnswering / BertTokenizer).
name = 'mrm8488/bert-medium-finetuned-squadv2'

In [10]:
stmodel = SentenceTransformer(name)

Create embeddings of the processed questions column

In [11]:
# encode() is documented to take a string or a list of strings, so hand it a
# plain list rather than a pandas Series. Row i of the result corresponds to
# row position i of df.
sentence_embeddings = stmodel.encode(df['qprocessed'].tolist())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Save embeddings as an array for future use (optional)

In [None]:
# Persist the embeddings so a later session can load them instead of re-encoding.
# NOTE(review): assumes a `files/` directory already exists relative to the
# notebook's working directory — confirm before running.
sentence_embeddings_save = np.array(sentence_embeddings)
np.save('files/sentence_embeddings.npy', sentence_embeddings_save)

In [None]:
# To reuse saved embeddings instead of re-encoding, load them from the same path np.save wrote to:
#sentence_embeddings = np.load('files/sentence_embeddings.npy')

# Create functions for the processes

Function for processing the format and creating embeddings of the input question

In [12]:
def process_query(query):
    """Normalise a user question and embed it with the sentence encoder.

    The text is lower-cased, every token containing a digit is removed, and
    all remaining characters that are neither word characters nor whitespace
    are stripped.

    Args:
        query: The raw question string.

    Returns:
        A tuple ``(queries, query_embeddings)`` where ``queries`` is a
        one-element list holding the cleaned question and ``query_embeddings``
        is whatever ``stmodel.encode`` returns for that list.
    """
    cleaned = query.lower()
    # Tokens containing digits go first, punctuation second.
    for pattern in (r'\w*\d\w*', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned).strip()

    batch = [cleaned]
    return batch, stmodel.encode(batch)

Function for getting the index of the closest question match and its cosine similarity score

In [13]:
def semantic_search(query, query_embeddings, corpus_embeddings=None):
    """Find the stored question closest to the (first) query embedding.

    Only the first query / embedding pair is considered.

    Args:
        query: Sequence of query strings; only its length is used.
        query_embeddings: Sequence of embedding vectors, one per query.
        corpus_embeddings: Optional 2-D collection of embeddings to search.
            Defaults to the notebook-level ``sentence_embeddings``.

    Returns:
        A tuple ``(index, cosine_score)``: ``index`` is the integer row
        position of the closest corpus embedding and ``cosine_score`` is
        ``1 - cosine distance`` to it. Ties resolve to the lowest index.

    Raises:
        ValueError: If there is no query / query embedding, or the corpus is
            empty.
    """
    # Import the submodule explicitly; a bare `import scipy` is not guaranteed
    # to expose `scipy.spatial.distance` on every SciPy version.
    from scipy.spatial.distance import cdist

    if corpus_embeddings is None:
        corpus_embeddings = sentence_embeddings

    if len(query) == 0 or len(query_embeddings) == 0:
        raise ValueError("semantic_search needs at least one query and one query embedding")
    if len(corpus_embeddings) == 0:
        raise ValueError("semantic_search needs a non-empty set of corpus embeddings")

    distances = cdist([query_embeddings[0]], corpus_embeddings, "cosine")[0]

    # The smallest cosine distance is the best match; argmin avoids sorting
    # every candidate just to read the first one.
    best = int(np.argmin(distances))
    cosine_score = 1 - distances[best]
    return best, cosine_score

Function for extracting the concise reply from the context

In [14]:
# Question-answering pipelines keyed by checkpoint id, so the model and
# tokenizer are loaded only once per kernel session instead of on every call.
_QA_PIPELINES = {}


def _get_qa_pipeline(model_name):
    """Return the cached question-answering pipeline for `model_name`, building it on first use."""
    if model_name not in _QA_PIPELINES:
        bertQA = BertForQuestionAnswering.from_pretrained(model_name)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        _QA_PIPELINES[model_name] = pipeline('question-answering', model=bertQA, tokenizer=tokenizer)
    return _QA_PIPELINES[model_name]


def search_reply(question, context):
    """Extract a concise answer to `question` from `context`.

    Uses the question-answering pipeline built from the notebook-level
    checkpoint id ``name``.

    Args:
        question: The question text.
        context: The passage to extract the answer from.

    Returns:
        The ``'answer'`` field of the pipeline's result.
    """
    nlp = _get_qa_pipeline(name)
    reply = nlp({'question': question,'context': context})
    return reply['answer']

# QA Process

Ask a question

In [15]:
search_input = 'What vaccines are available?'

In [17]:
# Clean the question and embed it.
query, query_embeddings = process_query(search_input)

# `index` identifies the closest stored question; `cosine_score` is 1 - cosine distance to it.
index, cosine_score = semantic_search(query, query_embeddings)

In [22]:
# Minimum cosine similarity for the closest stored question to count as a match.
MIN_COSINE_SCORE = 0.85

if cosine_score >= MIN_COSINE_SCORE:
    # `index` is a row position in the embedding matrix, so fetch the row by
    # position (one lookup) rather than by label, column by column.
    matched = df.iloc[index]

    match_question = str(matched['question'])
    context = str(matched['answer'])
    source = str(matched['source'])
    date_accessed = str(matched['last_updated'])
    link = str(matched['link'])

    # The stored question (not the raw user input) is sent to the reader model.
    reply = search_reply(match_question, context)

    print(f"Answer: {reply} \n")
    print(f"Context: {context} \n")
    print(f"Source: {source} \n")
    print(f"Link: {link} \n")
    print(f"This information was last updated on {date_accessed}. \n")

else:
    print("Sorry, I don't know the answer yet.")

Answer: Sinovac and AstraZeneca vaccines 

Context: The government is currently in the initial phase of vaccine rollout with the availability of Sinovac and AstraZeneca vaccines in the country. Likewise, the country is in the advanced stages of negotiations with the COVAX Facility and various other vaccine manufacturers. For more information on the available vaccine brands in the Philippines, please visit DOH's website: https://doh.gov.ph/covid19vaccinestracker 

Source: Philippine Department of Health (DOH) 

Link: https://doh.gov.ph/COVID-19/FAQs 

This information was last updated on 3/31/2021. 

