In [2]:
from transformers import AutoTokenizer, BertForQuestionAnswering, pipeline
import tensorflow as tf
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import string
import glob
from os import truncate
from pathlib import Path
import textwrap

In [28]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
#Creating user_ratings_df dataframe to store the user feedback
user_ratings_df = pd.DataFrame(columns=['User_Name','User_ID','Question','Answer','Probability', 'Rating'])
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Probability,Rating


In [11]:
#Loading the models
tokenizer = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)
# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=120) 

In [5]:
greetings_list = ['hi','hello','hey','morning','afternoon','evening']
account_list = ['login','account','locked','password','forgot','reset','unlock']

In [19]:
#Loading the reference text
working_dir = '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data'

txt_files  =  glob.glob(working_dir+'//*.txt')
print(txt_files)
mult_text_l = []

# append the different files content to a list
for file in txt_files:
    with open (file, 'r') as f:
        s_text_list = f.read()
        mult_text_l.append(s_text_list)

#Appending all files into a single reference text   
text = ' '.join(mult_text_l)
text = text.replace('\n', '')
text = re.sub("[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", " ", text)
text = re.sub(' +', ' ', text)
print(text)


['/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/KNN & Regression specific queries (1) (1).txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/Cluster Analysis (1).txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/General queries.txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/Decision_Tree (1).txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/Binary logit queries (2).txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/Regression_Manual (1).txt']
k nearest neighbors algorithm also known as KNN or k NN is a non parametric supervised learning classifier which uses proximity to make classifications or predictions about the grouping of an individual data point.The left panel has Advanced Options section where there is option to Set test sample percentage Use the toggle bar to set required percentage of test sample.The left panel has Advanced Option

In [20]:
full_stops = []
# creating a list of all full stops (pseudo for sentence end)
for i in range(len(text)):
  if text[i] == '.':
    full_stops.append(i)
print(full_stops)

[213, 370, 537, 703, 834, 945, 1030, 1127, 1328, 1595, 1754, 1840, 1955, 2135, 2304, 2585, 2741, 2907, 3179, 3326, 3413, 3703, 3888, 4106, 4181, 4335, 4613, 4955, 5052, 5245, 5471, 5803, 6122, 6555, 6792, 7096, 7361, 7605, 7807, 7940, 8123, 8305, 8500, 8706, 9009, 9209, 9294, 9426, 9584, 9651, 9758, 9825, 9892, 9999, 10246, 10466, 10567, 10673, 10809, 10848, 11001, 11082, 11227, 11300, 11405, 11503, 11623, 11711, 11732, 11807, 12103, 12246, 12339, 12408, 12469, 12571, 12665, 12840, 12861, 12900, 13137, 13227, 13323, 13667, 13779, 13864, 13961, 14094, 14278, 14460, 14655, 14861, 15168, 15370, 15638, 15797, 15883, 15998, 16178, 16347, 16628, 16784, 16950, 17219, 17363, 17450, 17737, 17922, 18140, 18215, 18369, 18647, 18989, 19086, 19279]


In [8]:
def user_ratings(question, answer, probability, rating):
    user_ratings_df.loc[len(user_ratings_df.index)] = ['User', len(user_ratings_df)+1, question, answer, probability, rating]

In [24]:
def question_answer(question):
    question = re.sub("[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", "", question)
    print(f"Question: {question}")

    question_list = question.lower().split()
    if any(word in question_list for word in greetings_list):
        return 'Hi! How may I help you?',1;
    elif any(word in question_list for word in account_list):
        return 'Please reset your password and if the issue still persists, please email our support team.',1;
    else:
        #tokenize question and text as a pair
        output = nlp({'question': question, 'context':text})
        answer = output['answer']
        probability = output['score']
        #print(f"Pipeline Output: {answer}")
        #print(probability)
        #print(f"Start Token: {output['start']}")
        #print(f"End Token: {output['end']}")

        #Returning Answer
        if answer.startswith("[CLS]") or answer.startswith("[SEP]") or answer.startswith(" "):
            answer = "Unable to find the answer to your question."
            #print(f"Answer: {answer}")
            return answer,probability;
        else:
            try:
                # Extracting complete sentence based on start and end positions of QA extract
                for j in range(len(full_stops)):
                    if full_stops[j] < output['start']:
                    #print(stops[j],ans['start'])
                        ans_start = full_stops[j]+1
                        ans_end = full_stops[j+1]+1
                answer = wrapper.fill(text[ans_start:ans_end])
                #print(answer)
                return answer,probability;
            except IndexError:
                #print(f"Answer: {answer}")
                return answer,probability;

In [25]:
#Run this for the chatbot to start
#Feedback for every question
question = input("\nHi! How may I help you? \n")
while True:
    answer, probability = question_answer(question)
    print(f"Answer: {answer}")
    #print(f"Probaility: {probability}")
    if(answer != 'Hi! How may I help you?'):
        rating = int(input("\nOn a scale of 1-10, how was your conversation experience with us? "))
        print(f"User rating is : {rating}")
        user_ratings(question, answer, probability, rating)

    flag = True
    flag_N = False
    
    while flag:
        response = input("\nDo you want to ask another question(Y/N)? ")
        if response[0] == "Y":
            question = input("\nPlease enter your question: \n")
            flag = False
        elif response[0] == "N":
            print("\nThankyou!")
            flag = False
            flag_N = True
            
    if flag_N == True:
        break

Question: Where to upload data
Answer:  The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV
format here.
User rating is : 10
Question: 


ValueError: `question` cannot be empty

In [29]:
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Probability,Rating
0,User,1,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
1,User,2,Where can I see PCA visualization,Data visualization is the representation of data through use of common graphics such as charts plots infographics and\neven animations to communicate complex data relationships and data driven insights.,0.027532,10
2,User,3,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
3,User,4,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
4,User,5,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
5,User,6,What is regression analysis,Regression analysis is a way of mathematically sorting out which of those variables does indeed have an impact.,0.317589,10
6,User,7,What is VIF,Variance inflation factor or VIF is a measure of the amount of multicollinearity in a set of multiple regression\nvariables and for a regression model variable is equal to the ratio of the overall model variance to the variance of a\nmodel that includes only that single independent variable.,0.102602,10
7,User,8,Where can I upload data,The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV\nformat here.,0.204164,10
8,User,9,Where can I see PCA Visualization,Data visualization is the representation of data through use of common graphics such as charts plots infographics and\neven animations to communicate complex data relationships and data driven insights.,0.023745,6
9,User,10,n,Variance inflation factor or VIF is a measure of the amount of multicollinearity in a set of multiple regression\nvariables and for a regression model variable is equal to the ratio of the overall model variance to the variance of a\nmodel that includes only that single independent variable.,0.008362,0
