In [1]:
from transformers import AutoTokenizer, BertForQuestionAnswering, pipeline
import tensorflow as tf
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import string
import glob
from os import truncate
from pathlib import Path
import textwrap
import time

In [2]:
# Lift every pandas display limit (rows, columns, total width, per-cell width)
# so frames shown in this notebook are never truncated.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [4]:
# Empty feedback table: one row will be added per rated answer.
user_ratings_df = pd.DataFrame(
    columns=[
        'User_Name',
        'User_ID',
        'Question',
        'Answer',
        'Probability',
        'Rating',
        'Response_Time',
    ]
)
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Probability,Rating,Response_Time


In [5]:
# Text wrapper used to fold long answers at 120 characters per line.
wrapper = textwrap.TextWrapper(width=120)

# Model and tokenizer are loaded from the same checkpoint name and then
# combined into a single question-answering pipeline callable (`nlp`).
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')
tokenizer = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')
nlp = pipeline(task='question-answering', model=model, tokenizer=tokenizer)

In [6]:
# Words that trigger the canned greeting reply.
greetings_list = [
    'hi',
    'hello',
    'hey',
    'morning',
    'afternoon',
    'evening',
]

# Words that trigger the canned account / password-reset reply.
account_list = [
    'login',
    'account',
    'locked',
    'password',
    'forgot',
    'reset',
    'unlock',
]

In [7]:
#Loading the reference text
# NOTE(review): absolute, machine-specific path - point this at your own data
# folder (ideally via a relative/configurable location) before running.
working_dir = '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data'


def read_text_files(data_dir):
    """Read every ``*.txt`` file directly inside ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Folder that holds the reference documents.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(paths, contents)``: the matching file paths in sorted order and the
        full text of each file, in the same order.  Both lists are empty when
        nothing matches.
    """
    # glob makes no ordering promise; sorting keeps the merged context (and
    # therefore every character offset computed from it) reproducible.
    paths = sorted(glob.glob(f"{data_dir}/*.txt"))
    contents = []
    for path in paths:
        # Explicit encoding: the cleaning step below handles curly quotes, so
        # the files must not be decoded with a platform-dependent default.
        with open(path, 'r', encoding='utf-8') as handle:
            contents.append(handle.read())
    return paths, contents


def clean_reference_text(documents):
    """Merge ``documents`` into the single context string used for QA.

    Steps, in order: join the documents with one space, delete newlines
    (lines are glued together without a separator), replace punctuation other
    than the full stop with a space, and collapse runs of spaces.  Full stops
    are kept because they are later used as sentence boundaries.

    Parameters
    ----------
    documents : list[str]
        Raw file contents.

    Returns
    -------
    str
        The cleaned, merged text (``''`` for an empty list).
    """
    merged = ' '.join(documents)
    merged = merged.replace('\n', '')
    # Raw string so that \- and \] are regex escapes, not invalid Python
    # string escapes.
    merged = re.sub(r"[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", " ", merged)
    return re.sub(' +', ' ', merged)


# append the different files content to a list
txt_files, mult_text_l = read_text_files(working_dir)
print(txt_files)

#Appending all files into a single reference text
text = clean_reference_text(mult_text_l)
print(text)


['/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/General queries.txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/KNN & Regression specific queries.txt']
I am a virtual assistant here to help you with MLApps platform.To unlock your account please reset your password and if the issue still persists please email our support team.To login please reset your password and if the issue still persists please email our support team.To reset your password please click on forgot password option in login page.Please click on forgot password option in login page.The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV format here.The Overview tab provides you with relevant study resources tutorials sample datasets and a short overview to start with which helps you understand and comprehend your data correctly.On the left panel there s an option called Data selection where you can select you

In [8]:
# Character offsets of every '.' in the reference text; each one is treated
# as the end of a sentence.
full_stops = [offset for offset, char in enumerate(text) if char == '.']
print(full_stops)

[62, 174, 272, 348, 401, 533, 716, 898, 1093, 1299, 1602, 1817, 1974, 2141, 2307, 2438, 2549, 2634, 2731, 2932, 3199, 3358, 3444, 3559, 3739, 3908, 4189, 4345, 4511, 4783, 4930, 5017, 5307, 5492, 5710, 5785, 5939, 6217, 6559, 6656, 6849]


In [10]:
def user_ratings(question, answer, probability, rating, response_time):
    """Append one feedback record to the global ``user_ratings_df``.

    The new row gets the next integer label, the fixed user name ``'User'``
    and a 1-based ``User_ID`` equal to the row's position in the table.
    """
    row_label = len(user_ratings_df.index)
    feedback_row = [
        'User',
        row_label + 1,
        question,
        answer,
        probability,
        rating,
        response_time,
    ]
    user_ratings_df.loc[row_label] = feedback_row

In [14]:
def _sentence_bounds(char_pos, stops):
    """Locate the sentence around ``char_pos`` using full-stop offsets.

    Returns ``(start, end)`` slice bounds: ``start`` is one past the last
    full stop strictly before ``char_pos`` (0 when there is none) and ``end``
    is one past the first full stop at or after ``char_pos`` (``None`` when
    the text has no closing full stop after that position).
    """
    previous_stop = max((stop for stop in stops if stop < char_pos), default=-1)
    closing_stop = min((stop for stop in stops if stop >= char_pos), default=None)
    end = None if closing_stop is None else closing_stop + 1
    return previous_stop + 1, end


def question_answer(question):
    """Answer a user question from the reference text.

    Punctuation is stripped from ``question`` and the cleaned question is
    printed.  Greeting and account keywords (``greetings_list`` /
    ``account_list``) get a canned reply with probability 1.  Anything else is
    sent to the ``nlp`` pipeline with the global ``text`` as context, and the
    extracted span is widened to the whole sentence it starts in, using the
    global ``full_stops`` offsets, then wrapped with ``wrapper``.

    Parameters
    ----------
    question : str
        Raw user input.

    Returns
    -------
    tuple
        ``(answer, probability, response_time)`` where ``response_time`` is
        the elapsed ``time.perf_counter()`` seconds for this call.
    """
    #starting time counter
    start_time = time.perf_counter()

    def reply(answer_text, score):
        # Single place where the time counter is stopped.
        return answer_text, score, time.perf_counter() - start_time

    # Raw string so that \- and \] are regex escapes, not invalid Python
    # string escapes.
    question = re.sub(r"[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", "", question)
    print(f"Question: {question}")

    question_list = question.lower().split()
    if not question_list:
        # Nothing left after cleaning: do not send an empty question to the model.
        return reply("Unable to find the answer to your question.", 0)
    if any(word in question_list for word in greetings_list):
        return reply('Hi! How may I help you?', 1)
    if any(word in question_list for word in account_list):
        return reply('Please reset your password and if the issue still persists, please email our support team.', 1)

    #tokenize question and text as a pair
    output = nlp({'question': question, 'context': text})
    answer = output['answer']
    probability = output['score']

    # An empty span or one that starts on a special token / blank means the
    # model found nothing usable.
    if not answer.strip() or answer.startswith(("[CLS]", "[SEP]", " ")):
        return reply("Unable to find the answer to your question.", probability)

    # Extracting complete sentence based on the start position of the QA extract.
    # A span inside the first sentence has no preceding full stop, so the
    # sentence then starts at offset 0 (this case used to raise).
    sentence_start, sentence_end = _sentence_bounds(output['start'], full_stops)
    if sentence_end is not None:
        answer = wrapper.fill(text[sentence_start:sentence_end])
    # With no closing full stop after the span, the raw extract is returned as is.
    return reply(answer, probability)

In [14]:
# Quick manual check of the QA function; it returns an
# (answer, probability, response_time) tuple.
# NOTE(review): the saved output under this cell looks like it came from an
# earlier version of the function - re-run the cell to refresh it.
question_answer('Where to upload dataset')

Question: Where to upload dataset
in CSV format
0.4086762070655823
[]
Answer: in CSV format


'in CSV format'

In [27]:
# Display the feedback rows recorded so far through user_ratings().
# NOTE(review): the saved output has no Response_Time column, so it appears to
# predate the current table definition - re-run to refresh.
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Probability,Rating
0,User,1,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
1,User,2,Where can I upload data?,The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV\nformat here.,0.204164,10
2,User,3,I can't login,"Please reset your password and if the issue still persists, please email our support team.",100.0,10


In [15]:
#Feedback for every question
def _prompt_rating(low=1, high=10):
    """Keep asking until the user enters a whole number in ``[low, high]``."""
    while True:
        raw_rating = input(f"\nOn a scale of {low}-{high}, how was your conversation experience with us? ")
        try:
            rating = int(raw_rating)
        except ValueError:
            # Non-numeric (or empty) input used to abort the whole chat loop.
            print(f"Please enter a whole number between {low} and {high}.")
            continue
        if low <= rating <= high:
            return rating
        print(f"Please enter a whole number between {low} and {high}.")


def _wants_another_question():
    """Ask the Y/N question until answered; return True for yes, False for no."""
    while True:
        response = input("\nDo you want to ask another question(Y/N)? ").strip().upper()
        # startswith() is safe on an empty reply, unlike indexing response[0].
        if response.startswith("Y"):
            return True
        if response.startswith("N"):
            return False


question = input("\nHi! How may I help you? \n")
while True:
    answer, probability, response_time = question_answer(question)
    print(f"Answer: {answer}")
    print(f"Probability: {probability}")
    print(f"Response Time: {response_time}")

    # The greeting reply is not a real answer, so it is not rated.
    if answer != 'Hi! How may I help you?':
        rating = _prompt_rating()
        print(f"User rating is : {rating}")
        #Saving the feedback to dataframe
        user_ratings(question, answer, probability, rating, response_time)

    if not _wants_another_question():
        print("\nThankyou!")
        break
    question = input("\nPlease enter your question: \n")

Question: Hi
Answer: Hi! How may I help you?
Probability: 1
Response Time: 0.00012632900001108283
Question: What is regression analysis


  tensor = as_tensor(value)
  p_mask = np.asarray(


Answer: Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.
Probability: 0.31758931279182434
Response Time: 4.52132278800002
User rating is : 10
Question: Im unable to login
Answer: Please reset your password and if the issue still persists, please email our support team.
Probability: 1
Response Time: 7.964799999626848e-05
User rating is : 10
Question: Where can I upload data
Answer: The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV
format here.
Probability: 0.20416371524333954
Response Time: 4.246650110000019
User rating is : 10

Thankyou!
