In [1]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import tensorflow as tf
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import string
import glob
from os import truncate
from pathlib import Path

In [2]:
# Set each pandas display limit below to None so feedback tables and
# long answer strings are not cut off when a frame is displayed.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

In [3]:
# Empty feedback table: one row is appended per rated answer by user_ratings().
feedback_columns = ['User_Name', 'User_ID', 'Question', 'Answer', 'Rating']
user_ratings_df = pd.DataFrame(columns=feedback_columns)
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Rating


In [4]:
# Load the extractive QA model with its tokenizer, plus the sentence-embedding
# model that similarity_score() uses to rank candidate sentences.
qa_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = TFAutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [5]:
# Words that make question_answer() reply with the canned greeting.
greetings_list = [
    'hi',
    'hello',
    'hey',
    'morning',
    'afternoon',
    'evening',
]
# Words that make question_answer() reply with the canned account/password help.
account_list = [
    'login',
    'account',
    'locked',
    'password',
    'forgot',
    'reset',
    'unlock',
]

In [17]:
#Loading the reference text
# NOTE(review): absolute, machine-specific path - point this at your own data
# folder before running the notebook on another machine.
working_dir = '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data'


def clean_reference_text(raw_text):
    """Normalise raw reference text into the single string used for QA.

    Newlines are removed, every punctuation character except '.' is replaced
    by a space, and runs of spaces are collapsed to one. Periods are kept
    because question_answer() uses them as sentence boundaries.

    Args:
        raw_text: The concatenated contents of the reference files.

    Returns:
        The cleaned text as a single line.
    """
    # NOTE(review): newlines are dropped without inserting a space, so this
    # assumes each line ends with a period; otherwise the last word of a line
    # fuses with the first word of the next - TODO confirm against the data files.
    cleaned = raw_text.replace('\n', '')
    # Raw string: the \- and \] escapes are meant for the regex engine, not Python.
    cleaned = re.sub(r"[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", " ", cleaned)
    return re.sub(' +', ' ', cleaned)


# glob returns files in arbitrary order; sort so the reference text (and hence
# what survives the 512-token truncation in question_answer) is reproducible.
txt_files = sorted(glob.glob(str(Path(working_dir) / '*.txt')))
print(txt_files)

# Read each file's content into a list. The encoding is explicit because the
# text contains non-ASCII characters (curly quotes) and the platform default
# is not UTF-8 everywhere.
mult_text_l = [Path(file).read_text(encoding='utf-8') for file in txt_files]

#Appending all files into a single reference text
text = clean_reference_text(' '.join(mult_text_l))
print(text)


['/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/General queries.txt', '/Users/varshini/AMPBA/Capstone/Pilot_Models/Question_Answering/Data/KNN & Regression specific queries.txt']
I am a virtual assistant here to help you with MLApps platform.To unlock your account please reset your password and if the issue still persists please email our support team.To login please reset your password and if the issue still persists please email our support team.To reset your password please click on forgot password option in login page.Please click on forgot password option in login page.The left side of the screen left panel has an option of Input data Click on the Browse option and upload dataset in CSV format here.The Overview tab provides you with relevant study resources tutorials sample datasets and a short overview to start with which helps you understand and comprehend your data correctly.On the left panel there s an option called Data selection where you can select you

In [7]:
#similarity_score(ques, answers_list) scores every candidate answer against the question and returns the most similar one.
def similarity_score(ques, answers_list):
    """Return the candidate from answers_list most similar to ques.

    Similarity is the cosine similarity between sentence embeddings produced
    by the module-level ``similarity_model``.

    Args:
        ques: The question text.
        answers_list: Candidate answer sentences (strings).

    Returns:
        The element of answers_list with the highest cosine similarity to ques.

    Raises:
        IndexError: If answers_list is empty.
    """
    candidates = list(answers_list)
    if len(candidates) == 0:
        # Same exception type the caller already handles for "no candidate".
        raise IndexError("answers_list is empty; there is no candidate to score")

    # Encode the question passed in as `ques` (not a global `question`) so the
    # function ranks against the caller's question.
    question_embedding = similarity_model.encode(ques, convert_to_tensor=True)
    # Encode all candidates in one batch instead of one model call per sentence.
    candidate_embeddings = similarity_model.encode(candidates, convert_to_tensor=True)

    # cos_sim returns a (1, len(candidates)) matrix; row 0 holds the scores.
    scores = util.cos_sim(question_embedding, candidate_embeddings)[0]
    # Compare scores numerically; formatted strings would sort lexicographically.
    best_index = int(scores.argmax())
    return candidates[best_index]

In [8]:
def user_ratings(question, answer, rating):
    """Append one feedback row to the module-level user_ratings_df.

    The row is stored under the next integer label (the current row count)
    with the fixed user name 'User' and a 1-based User_ID.
    """
    next_row = len(user_ratings_df.index)
    feedback_row = ['User', next_row + 1, question, answer, rating]
    user_ratings_df.loc[next_row] = feedback_row

In [20]:
def question_answer(question):
    """Answer a user question from canned replies or the reference text.

    Punctuation is stripped from the question first. If the question contains
    a greeting or an account-related keyword, a canned reply is returned.
    Otherwise the QA model extracts an answer span from the module-level
    ``text``, and the span is expanded to the full sentence containing it
    (the most similar sentence when several match).

    Args:
        question: The user's question.

    Returns:
        The reply string: a canned reply, a sentence from the reference text,
        the raw model span when no sentence contains it, or a fallback message
        when the model points at a special token.
    """
    # Raw string: the \- and \] escapes are meant for the regex engine, not Python.
    question = re.sub(r"[!\"#$%&'‘’()*+,\-/:;<=>?@[\]^_`{|}~]", "", question)
    print(f"Question: {question}")

    question_words = set(question.lower().split())
    if question_words.intersection(greetings_list):
        return 'Hi! How may I help you?'
    if question_words.intersection(account_list):
        return 'Please reset your password and if the issue still persists, please email our support team.'

    #tokenize question and text as a pair
    # NOTE: with max_length=512 and truncation=True the pair is cut to 512
    # tokens, so reference text beyond that limit is never seen by the model.
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf", max_length=512, truncation=True)
    input_ids = inputs["input_ids"].numpy()[0]

    #model output using input
    output = model(inputs)

    #reconstructing the answer
    # Most likely start token, and one past the most likely end token.
    answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
    answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
    answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    # A span that begins on a special token (or whitespace) is treated as "no answer".
    if answer.startswith(("[CLS]", "[SEP]", " ")):
        return "Unable to find the answer to your question. Please contact ZeroCodeLearning.com"

    #pattern matching the sentence
    # Escape the answer so regex metacharacters in the model output are matched
    # literally instead of raising re.error or matching unintended text.
    sentence_pattern = r"([^.]*?%s[^.]*\.)" % re.escape(answer)
    matching_sentences = re.findall(sentence_pattern, text.lower())

    if len(matching_sentences) > 1:
        # Several sentences contain the span: pick the one closest to the question.
        return similarity_score(question, matching_sentences)
    if matching_sentences:
        return matching_sentences[0]
    # No sentence contains the span: fall back to the raw model output.
    return answer

In [424]:
# Sample call: ask one question directly, without starting the interactive chat loop.
question_answer('Where to upload dataset')

Question: Where to upload dataset
the left side of the screen left panel has an option of input data click on the browse option and upload dataset in csv format
Answer: the left side of the screen left panel has an option of input data click on the browse option and upload dataset in csv format here.


'the left side of the screen left panel has an option of input data click on the browse option and upload dataset in csv format here.'

In [401]:
# Display the feedback rows recorded so far by user_ratings().
user_ratings_df

Unnamed: 0,User_Name,User_ID,Question,Answer,Rating
0,User,1,How to choose k,the square root of n,8
1,User,2,What is logistic regression,"a statistical analysis method to predict a binary outcome, such as yes or no, based on prior observations of a data set",10
2,User,3,What is linear regression,an attempt to model the relationship between two variables by fitting a linear equation to observed data,9
3,User,4,Where to upload input data?,Unable to find the answer to your question.,9
4,User,5,Where to upload the data,Unable to find the answer to your question.,8
5,User,6,Where to upload data,Unable to find the answer to your question.,9
6,User,7,How to choose k,Unable to find the answer to your question.,9
7,User,8,Where to upload data?,"\nto upload dataset, click on browse in the left panel and select a file in csv format.",10
8,User,9,Where should I upload data,"\nto upload dataset, click on browse in the left panel and select a file in csv format.",9
9,User,10,Where should I upload dataset,"\nto upload dataset, click on browse in the left panel and select a file in csv format.",9


In [19]:
#Run this for the chatbot to start
#Feedback for every question
# Interactive loop: answer a question, collect a 1-10 rating for it (greetings
# are not rated), then ask whether to continue. Invalid input is re-prompted
# instead of crashing the cell.
question = input("\nHi! How may I help you? \n")
while True:
    answer = question_answer(question)
    print(f"Answer: {answer}")

    if answer != 'Hi! How may I help you?':
        # Keep asking until the user enters a whole number between 1 and 10.
        while True:
            raw_rating = input("\nOn a scale of 1-10, how was your conversation experience with us? ")
            try:
                rating = int(raw_rating)
            except ValueError:
                print("Please enter a whole number between 1 and 10.")
                continue
            if 1 <= rating <= 10:
                break
            print("Please enter a whole number between 1 and 10.")
        print(f"User rating is : {rating}")
        user_ratings(question, answer, rating)

    # Re-prompt until the reply starts with Y or N (case-insensitive).
    # An empty reply is simply asked again rather than raising IndexError.
    while True:
        response = input("\nDo you want to ask another question(Y/N)? ").strip().upper()
        if response.startswith(("Y", "N")):
            break

    if response.startswith("N"):
        print("\nThankyou!")
        break

    question = input("\nPlease enter your question: \n")

Question: Where can I see PCA visualozation
Model Output: the main panel
Answer after Pattern Matching: the main panel has pca visualization tab which shows 2d visualization of clusters.
User rating is : 10

Thankyou!
