In [103]:
# This file was written in Kaggle notebook.
# This is the main file of the chatbot.
# This file contains the code for training the model and predicting the output.
# This file also contains the code for the chatbot to interact with the user.

# Importing the libraries
# if the libraries are not installed in Kaggle notebook, then install them using "!pip install <library name>"
import numpy as np # np for creating the arrays
import nltk # Natural Language Toolkit for tokenizing the words
from nltk.stem.lancaster import LancasterStemmer # Lancaster Stemmer for stemming the words
import tensorflow as tf # Tensorflow for creating the model
import random # Random for randomizing the responses
import json # JSON for reading the intents file
import tflearn # TFLearn for training the model
import pickle # Pickle for saving the model

In [105]:
# Stemmer shared by preprocessing and by inference-time tokenizing, so both
# reduce words to the same stems.
stemmer = LancasterStemmer()
# Root directory under which Kaggle mounts the attached datasets.
kaggle_directory = "/kaggle/input/"
# Read the intents file. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, which may not be
# UTF-8 outside of Kaggle and would corrupt or reject non-ASCII text.
with open(kaggle_directory + "article/intents.json", encoding="utf-8") as file:
    data = json.load(file)

In [106]:
# Accumulators filled while walking the intents:
#   words  - every token seen in any pattern (later stemmed and de-duplicated)
#   labels - the distinct intent tags
#   docs_x - the token list of each pattern
#   docs_y - the tag of the pattern at the same position in docs_x
words, labels, docs_x, docs_y = [], [], [], []

# data["intents"] is a list of dictionaries, each carrying the keys
# "tag", "patterns" and "responses".
for intent in data["intents"]:
    patterns = intent["patterns"]
    tag = intent["tag"]
    for pattern in patterns:
        pattern_tokens = nltk.word_tokenize(pattern)
        words += pattern_tokens
        # keep the tokens and their tag aligned by position
        docs_x.append(pattern_tokens)
        docs_y.append(tag)
    # record each tag once, in first-seen order (sorted below)
    if tag not in labels:
        labels.append(tag)

# Vocabulary: drop the question mark, lower-case and stem every remaining
# token, remove duplicates, and sort so that column order is deterministic.
words = sorted({stemmer.stem(token.lower()) for token in words if token != "?"})
labels = sorted(labels)

# print debugging statements
# print(words)
# print(labels)
# print(docs_x)
# print(docs_y)

# Training inputs (one bag-of-words row per pattern) and targets (one one-hot
# row per pattern), built as lists first and converted to arrays at the end.
training = []
output = []
# template row of zeros, one slot per label
out_empty = [0] * len(labels)

for doc_tokens, doc_tag in zip(docs_x, docs_y):
    stemmed_tokens = [stemmer.stem(token) for token in doc_tokens]

    # bag[i] is how many times vocabulary word i occurs in this pattern
    bag = [stemmed_tokens.count(vocab_word) for vocab_word in words]

    # one-hot target: 1 at the position of this pattern's tag, 0 elsewhere
    output_row = list(out_empty)
    output_row[labels.index(doc_tag)] = 1

    training.append(bag)
    output.append(output_row)

training = np.array(training)
output = np.array(output)

# Persist the vocabulary, the labels and both training matrices together so
# they can be restored later with a single pickle.load call.
preprocessed = (words, labels, training, output)
with open("data.pickle", "wb") as pickle_file:
    pickle.dump(preprocessed, pickle_file)
    
# Optional caching (currently disabled, see the commented-out code below):
# preprocessing does not have to be repeated if its result has already been saved,
# so the preprocessed data could be loaded from the pickle file instead.
# If the pickle file is not present, the data would be preprocessed and saved to the pickle file.
# Note that the pickle file is not refreshed when the intents file changes; in that case it must be deleted manually.
# try:
#     with open("data.pickle", "rb") as f:
#         words, labels, training, output = pickle.load(f)
# except: put the preprocessing in the except.

In [107]:
# Start from a clean default graph so that re-running this cell does not
# stack a second network on top of the previous one.
tf.compat.v1.reset_default_graph()

# Fully connected network:
#   input  - one neuron per vocabulary word (width of a training row)
#   hidden - 3 layers of 8 neurons each
#   output - one neuron per label, softmax so the outputs form probabilities
# NOTE(review): the hidden layers are created without an explicit activation,
# so tflearn's default is used - confirm whether a non-linearity was intended.
n_features = len(training[0])
n_classes = len(output[0])

net = tflearn.input_data(shape=[None, n_features])
for _ in range(3):
    net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, n_classes, activation='softmax')
net = tflearn.regression(net)

model = tflearn.DNN(net)

In [108]:
# Train on the bag-of-words rows against their one-hot tags, then store the
# trained weights on disk under the name "model_first".
model.fit(
    training,
    output,
    n_epoch=1000,
    batch_size=8,
    show_metric=True,
)
model.save("model_first")
# if the model is already trained, then load it from the file
# model.load("model_first")

Training Step: 19999  | total loss: 0.00034 | time: 0.074s
| Adam | epoch: 1000 | loss: 0.00034 - acc: 1.0000 -- iter: 152/158
Training Step: 20000  | total loss: 0.00035 | time: 0.078s
| Adam | epoch: 1000 | loss: 0.00035 - acc: 1.0000 -- iter: 158/158
--


In [109]:
def tokenizing(sentence, words):
    """Encode a sentence as a bag-of-words vector over the vocabulary ``words``.

    The sentence is tokenized with ``nltk.word_tokenize``; question marks are
    dropped and every remaining token is lower-cased and stemmed with the
    notebook-level ``stemmer``.

    Each position of the result holds the number of times the corresponding
    vocabulary word occurs in the sentence. Counts (rather than 0/1 flags) are
    used because the training rows are built with ``tokens.count(w)``; encoding
    user input differently would feed the model features it was not trained on
    whenever a word is repeated.

    Args:
        sentence: the raw text to encode.
        words: the vocabulary list created during preprocessing; its order
            defines the order of the returned vector.

    Returns:
        A 1-D numpy array with ``len(words)`` entries.
    """
    stems = [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(sentence)
        if token != "?"
    ]
    # Same counting scheme as the training bags; stems that are not in the
    # vocabulary simply contribute nothing.
    return np.array([stems.count(vocab_word) for vocab_word in words])

In [114]:
def chat_with_bot(threshold=0.6):
    """Run an interactive question/answer loop on standard input/output.

    Each question is encoded with ``tokenizing`` and passed to the trained
    ``model``. The label with the highest predicted probability is looked up
    in ``data["intents"]`` and one of its responses is printed at random.
    The loop ends when the user types ``quit`` (case-insensitive, surrounding
    whitespace ignored).

    Args:
        threshold: minimum probability the best label must exceed for its
            response to be used. At or below this value the bot answers
            "I don't understand your question" instead of guessing.
    """
    print("Lets start chatting...(type quit to stop)")
    # main loop of the chatbot
    while True:
        inp = input("ask your question: ")
        # if the user types quit, then break out of the loop
        if inp.strip().lower() == "quit":
            break
        # the prediction is a list of probabilities, one per label
        result = model.predict([tokenizing(inp, words)])[0]
        # we pick the index of the maximum probability
        result_index = np.argmax(result)
        tag = labels[result_index]

        responses = None
        if result[result_index] > threshold:
            # First intent whose tag matches the prediction, or None if the
            # intents file has no such tag.
            responses = next(
                (intent["responses"] for intent in data["intents"] if intent["tag"] == tag),
                None,
            )

        # A low-confidence prediction, an unknown tag and an empty response
        # list all end in the same fallback message rather than an error or
        # silence.
        if responses:
            print(random.choice(responses))
        else:
            print("I don't understand your question")


chat_with_bot()

Lets start chatting...(type quit to stop)


ask your question:  what is the title of the paper?


An Automated Approach to Estimating Code Coverage Measures via Execution Logs


ask your question:  what is the paper about?


Automating Code Coverage using Execution Logs


ask your question:  what is the overall message of the paper?


It introduces LogCoCo, an automated code coverage estimation approach.


ask your question:  give me a brief abstract of the paper


HBase was selected for its widespread usage, serving millions of users in companies like Facebook and Twitter.


ask your question:  give me one strength of LogCoCO


The excessive instrumentation guarantees accurate measurements of code coverage. However, problems like deployment challenges and performance overhead are imposed. LogCoCo on the other hand, is easy to setup and imposes little performance overhead by analyzing the execution logs.


ask your question:  give me one weakness of LogCoCo


The excessive instrumentation guarantees accurate measurements of code coverage. However, problems like deployment challenges and performance overhead are imposed. LogCoCo on the other hand, is easy to setup and imposes little performance overhead by analyzing the execution logs.


ask your question:  What are the challenges of LogCoCO?


Generalizability: LogCoCo primarily focuses on server-side systems with extensive logging. To enhance its applicability, research studies can explore ways to adapt LogCoCo to mobile applications and client/desktop-based systems with limited or no logging. This expansion would make LogCoCo a more versatile tool. One way to do this expansion can be through focusing on developing automated tools or techniques that strategically insert logging statements into source code where there are limited logging available. This technique can also be used to reduce the amount of May labels.


ask your question:  can LogCoCo be used in mobile applications?


LogCoCo's approach of inferring system execution contexts is significant because it allows indirect detection of covered methods.


ask your question:  What are the issues associated with code coverage tools like JaCoCo in practice?


JaCoCo did not report code coverage measures for some modules, particularly those not directly invoked by the YCSB benchmark suite.


ask your question:  how many experiments were conducted?


Measuring code coverage in a DevOps-like environment is challenging due to rapid deployment processes.


ask your question:  what is the Hbase Experiment?


The experiment used the YCSB benchmark suite, originally developed by Yahoo!.


ask your question:  what are the engineering challenges of using JaCoCO?


The paper discusses software testing and code coverage measures.


ask your question:  what are the engineering challenges?


I don't understand your question


ask your question:  in the paper a breadth for search algorithm was used explain


The HBase experiment used a three-machine-cluster with specific hardware specs.


ask your question:  quit
