In [53]:
# import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import json
import pickle
import string
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, GRU
from keras.optimizers import Adam

The dataset comes from this link: https://www.kaggle.com/datasets/gondimalladeepesh/nvidia-documentation-question-and-answer-pairs


In [54]:
# download the NLTK resources that preprocess_text relies on
nltk.download('punkt')      # tokenizer models used by word_tokenize
# NOTE: recent NLTK releases (3.9+) load the tokenizer data from 'punkt_tab'
# instead of 'punkt'; fetching both keeps the notebook runnable on either.
nltk.download('punkt_tab')
nltk.download('wordnet')    # data used by WordNetLemmatizer
nltk.download('stopwords')  # stopword lists, including English

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# Load the question/answer dataframe from a pickle file.
# NOTE(review): the path is absolute ('/content/...' looks like a Colab
# location - confirm), so the notebook only runs where that file exists.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle
# file that comes from a trusted source.
df = pd.read_pickle('/content/NVIDIA_dataframe.pkl')

In [57]:
# drop rows containing missing values, then drop fully duplicated rows
df = df.dropna().drop_duplicates()
df

Unnamed: 0,Question,Answer,Category
0,What is Hybridizer?,Hybridizer is a compiler from Altimesh that en...,0
1,How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express p...,0
2,What are some parallelization patterns mention...,The text mentions using parallelization patter...,0
3,How can you benefit from accelerators without ...,You can benefit from accelerators' compute hor...,0
4,What is an example of using Hybridizer?,An example in the text demonstrates using Para...,0
...,...,...,...
985,Describe the role of PyGDF in the realm of GPU...,PyGDF is a Python library offering GPU DataFra...,6
986,Where can one find resources to delve into adv...,For more in-depth knowledge of advanced Numba ...,0
997,What kind of problem does graph analysis aim t...,Graph analysis addresses problems involving re...,5
998,What is PageRank and how is it utilized in gra...,PageRank is an influential algorithm used for ...,5


In [23]:
# display dataset information: index range, column names, non-null counts,
# dtypes and memory usage of the cleaned dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 963 entries, 0 to 999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  963 non-null    object
 1   Answer    963 non-null    object
 2   Category  963 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 30.1+ KB


In [24]:
def preprocess_text(text):
  """
  Normalise raw text so it is easier to use in the NLP steps of this notebook.

  The text is lowercased and split into tokens with NLTK's word_tokenize.
  Tokens that are English stopwords (NLTK stopword list) or that are not
  purely alphabetical (punctuation, numbers, mixed tokens) are discarded.
  Each remaining token is lemmatized with WordNetLemmatizer.

  Inputs:
  - text(str): The text to be preprocessed

  Returns:
  - str: The kept, lemmatized tokens joined back into a single string
  separated by spaces
  """
  english_stopwords = set(stopwords.words('english'))
  wordnet = WordNetLemmatizer()

  kept = []
  for word in word_tokenize(text.lower()):
    # stopwords are filtered before lemmatization
    if word in english_stopwords:
      continue
    # isalpha() rejects punctuation-only tokens as well as digits, so no
    # separate punctuation test is required
    if not word.isalpha():
      continue
    kept.append(wordnet.lemmatize(word))

  return ' '.join(kept)

# overwrite every question with its preprocessed form
df['Question'] = df['Question'].map(preprocess_text)

In [25]:
# the questions are the features, the category ids are the target
X, y = df['Question'], df['Category']

# hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# tokenizer limited to 1,000 words, fitted on the training questions only
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

# training data: words -> integer ids from the fitted vocabulary, then
# padded to a fixed length of 80
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=80)

# testing data: encoded with the same fitted tokenizer and padded the same way
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=80)

In [27]:
# build the model: embedding -> two stacked GRU layers (each followed by
# dropout) -> dense hidden layer -> 10-way softmax output
model = Sequential([
    Embedding(input_dim=1000, output_dim=64, input_length=80),
    GRU(128, return_sequences=True),  # full sequence is passed on to the next GRU
    Dropout(0.1),
    GRU(64),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(10, activation='softmax'),
])

In [28]:
# compile the model: adam optimizer, sparse categorical cross-entropy loss,
# accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# model summary: prints each layer with its output shape and parameter count
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 80, 64)            64000     
                                                                 
 gru_4 (GRU)                 (None, 80, 128)           74496     
                                                                 
 dropout_2 (Dropout)         (None, 80, 128)           0         
                                                                 
 gru_5 (GRU)                 (None, 64)                37248     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 10)               

In [30]:
# train the model for 10 epochs with a batch size of 8
# NOTE: the test split is also passed as validation data here, so the
# validation metrics and the later evaluation are computed on the same rows
history = model.fit(
    x=X_train_padded,
    y=y_train,
    validation_data=(X_test_padded, y_test),
    batch_size=8,
    epochs=10,
    verbose=2,
)

Epoch 1/10
97/97 - 15s - loss: 1.9338 - accuracy: 0.3558 - val_loss: 1.6057 - val_accuracy: 0.5751 - 15s/epoch - 154ms/step
Epoch 2/10
97/97 - 8s - loss: 1.1999 - accuracy: 0.6312 - val_loss: 0.8792 - val_accuracy: 0.7047 - 8s/epoch - 87ms/step
Epoch 3/10
97/97 - 8s - loss: 0.6390 - accuracy: 0.7649 - val_loss: 0.7896 - val_accuracy: 0.7202 - 8s/epoch - 78ms/step
Epoch 4/10
97/97 - 9s - loss: 0.3898 - accuracy: 0.8662 - val_loss: 0.6639 - val_accuracy: 0.8031 - 9s/epoch - 88ms/step
Epoch 5/10
97/97 - 9s - loss: 0.2093 - accuracy: 0.9338 - val_loss: 0.7289 - val_accuracy: 0.7617 - 9s/epoch - 92ms/step
Epoch 6/10
97/97 - 7s - loss: 0.1392 - accuracy: 0.9584 - val_loss: 0.8074 - val_accuracy: 0.8135 - 7s/epoch - 77ms/step
Epoch 7/10
97/97 - 8s - loss: 0.1641 - accuracy: 0.9558 - val_loss: 0.8330 - val_accuracy: 0.8290 - 8s/epoch - 88ms/step
Epoch 8/10
97/97 - 8s - loss: 0.0555 - accuracy: 0.9909 - val_loss: 0.8896 - val_accuracy: 0.8135 - 8s/epoch - 87ms/step
Epoch 9/10
97/97 - 7s - loss:

In [31]:
# evaluate the model on the padded test split and print the resulting
# loss and metric values
losses_and_metrics = model.evaluate(
    x=X_test_padded,
    y=y_test,
    verbose=2,
)
print(losses_and_metrics)

7/7 - 0s - loss: 0.9075 - accuracy: 0.8394 - 230ms/epoch - 33ms/step
[0.9075230956077576, 0.8393782377243042]


In [32]:
# one {question: answer} dictionary per category; the category id is the
# position in the list
categoryDicts = [{} for _ in range(10)]

# file every question-answer pair under its category (a question repeated
# within a category keeps the answer seen last)
for question, answer, category in zip(df['Question'], df['Answer'], df['Category']):
    categoryDicts[category][question] = answer

# fit a separate TF-IDF vectorizer on the questions of each category
category_vectorizers = {}
for category_id, qa_pairs in enumerate(categoryDicts):
    category_vectorizers[category_id] = TfidfVectorizer().fit(list(qa_pairs))

In [33]:
def cosine(query_vec, doc_vec):
  """
  Compute the cosine similarity between the vectorized user query and
  vectorized question(s) from the knowledge base.

  Inputs:
    query_vec: user query vector
    doc_vec: question vector

  Returns:
    array: cosine similarities between query_vec and doc_vec, as produced
    by sklearn's cosine_similarity

  """
  similarities = cosine_similarity(X=query_vec, Y=doc_vec)
  return similarities

In [35]:
def query_knowledge_base(query):
  """
    Answer a user query from the knowledge base.

    The query is preprocessed with preprocess_text, encoded with the fitted
    tokenizer, padded to length 80 and classified by the trained model. The
    category with the highest predicted probability is selected. Within that
    category, the stored question with the highest TF-IDF cosine similarity
    to the query is chosen and its answer is returned. When several stored
    questions tie, the one stored first wins.

    Inputs:
        query (str): User's query or question.

    Returns:
        str: The most relevant answer from the knowledge base.
  """
  # Preprocess the user's question the same way the stored questions were
  query = preprocess_text(query)

  # Encode and pad the query so it matches the model's input format
  new_question_seq = tokenizer.texts_to_sequences([query])
  new_question_padded = pad_sequences(new_question_seq, maxlen=80)

  # Predict the probability distribution over categories and keep the most
  # likely category
  predicted_probs = model.predict(new_question_padded)
  predicted_category = int(np.argmax(predicted_probs))

  category_answers = categoryDicts[predicted_category]
  questions = list(category_answers)
  vectorizer = category_vectorizers[predicted_category]

  # Vectorize all stored questions in a single call instead of one
  # transform() call per question, then compare them to the query at once
  query_vec = vectorizer.transform([query])
  question_matrix = vectorizer.transform(questions)
  similarities = cosine(query_vec, question_matrix)[0]

  # np.argmax returns the first index of the maximum, so ties resolve to the
  # earliest stored question
  best_question = questions[int(np.argmax(similarities))]

  return category_answers[best_question]

In [36]:
def generate_feedback_response(feedback):
  """
  Selects and returns a random comment from a predefined
  list.

  If the user was satisfied with the answer ("yes"), the bot returns a
  positive comment. If the user was not satisfied ("no"), the bot returns
  a negative-feedback comment. The comparison ignores case and
  surrounding whitespace. Any other input returns a fixed message asking
  for a Yes/No answer instead of raising an error.

  Inputs:
  - feedback (str): user's feedback, expected to be "yes" or "no"

  Returns:
  - str: A random positive or negative comment from a predefined list of
    comments, or the fixed clarification message for unrecognised input
  """
  positive_comments = [
      "Great to hear that helped!",
      "I'm glad that was useful!",
      "Awesome!",
      "Glad I could assist!"
  ]
  negative_comments = [
      "Apologies for that. We will try to improve.",
      "Thanks for your honesty. We will try to improve",
      "Sorry that wasn't what you were looking for. We will try to improve",
      "Thanks for your feedback. We will try to improve"
  ]

  normalized = feedback.strip().lower()

  if normalized == "yes":
    return random.choice(positive_comments)

  if normalized == "no":
    return random.choice(negative_comments)

  # Unrecognised feedback used to leave the comment list unassigned and
  # crash with UnboundLocalError; answer with a fixed message instead.
  return "Sorry, I didn't understand that. Please answer 'Yes' or 'No'."

In [37]:
def load_user_model(username):
    """
    Load the stored model of a user from the 'user_models' directory.

    The model is expected in 'user_models/<username>.json'. When that path
    exists, the file is read and the parsed JSON content is returned. When
    it does not exist, None is returned to signal that no user model was
    found for the given username.

    Inputs:
    - username (str): The username of the user whose model is to be loaded.

    Returns:
    - dict or None: The user model as a dictionary if the file exists and is
    successfully loaded, or None if the file does not exist
    """
    model_path = f"user_models/{username}.json"

    # Guard clause: nothing stored for this user
    if not os.path.exists(model_path):
        return None

    with open(model_path, "r") as model_file:
        stored_model = json.load(model_file)
    return stored_model

In [38]:
def save_user_model(user_model):
  """
  Save a user model to a JSON file in the 'user_models' directory

  The 'user_models' directory is created if it does not exist yet. The
  user model (passed as a dictionary) is then written as indented JSON to
  'user_models/<username>.json', overwriting any previous file of that name.

  The name of the file is derived from the 'username' key in the
  user_model dictionary.

  Inputs:
  - user_model (dict): The user model to be saved. This dictionary must include
  a key 'username' which is used to name the file.

  Returns:
  None
  """
  directory_path = "user_models"

  # exist_ok=True creates the directory only when needed, without a separate
  # exists() check that could race with another process creating it
  os.makedirs(directory_path, exist_ok=True)

  # Construct file path
  file_path = os.path.join(directory_path, f"{user_model['username']}.json")

  # Write the JSON data to a file
  with open(file_path, "w") as file:
    json.dump(user_model, file, indent=4)

In [39]:
def new_user_flow():
    """
    Handles the flow for new users interacting with Chris, the NVIDIA assistant.

    The bot introduces itself and asks for the user's name, then repeatedly
    reads a question, prints the answer returned by
    query_knowledge_base(question) and asks whether the answer was helpful.
    A "yes"/"no" reply (case and surrounding whitespace are ignored) is
    acknowledged with generate_feedback_response(feedback) and recorded in
    the user model under 'likes' or 'dislikes'. Any other reply is not
    recorded and the user is told that only Yes/No can be stored.

    The user can leave at any prompt by typing 'exit'. Once a name has been
    given, the user model is written with save_user_model(user_model) when
    the conversation ends.

    Inputs:
    None

    Returns:
    None
    """
    print("Hi, I'm Chris, an NVIDIA assistant! What is your name?")

    # user enters their name
    username = input()
    if username.lower() == 'exit':
        print("Goodbye! Feel free to come back anytime.")
        # Return instead of calling exit(): exit() is an interactive-shell
        # helper and is not meant for ending a function inside a program.
        return

    print(f"Hi {username}! I can help answer about any information related to NVIDIA. Feel free to ask your questions.")

    # create the user model
    user_model = {'username': username, 'likes': [], 'dislikes': []}

    # Interactive phase with bot
    while True:
        question = input()
        if question.lower() == 'exit':
            print("Goodbye! Feel free to come back anytime.")
            break

        answer = query_knowledge_base(question)
        print("Chris:", answer)

        feedback = input("Was this answer helpful? (Yes/No): ").strip().lower()

        # 'exit' must also work at the feedback prompt
        if feedback == 'exit':
            print("Goodbye! Feel free to come back anytime.")
            break

        # Only yes/no can be acknowledged and stored; anything else is
        # skipped so the conversation (and the feedback collected so far)
        # is not lost to an error.
        if feedback not in ("yes", "no"):
            print("Chris: Sorry, I can only record 'Yes' or 'No' as feedback.")
            continue

        # generate and print feedback response
        print("Chris:", generate_feedback_response(feedback))

        bucket = 'likes' if feedback == "yes" else 'dislikes'
        user_model[bucket].append({'question': question, 'answer': answer, 'helpful': feedback})

    # save the user model
    save_user_model(user_model)

In [40]:
def returning_user_flow(user_model):
    """
    Manages the interaction flow for returning users, letting them ask more
    questions and add feedback on the answers they receive.

    The user is welcomed back by name. The function then repeatedly reads a
    question, prints the answer returned by query_knowledge_base(question)
    and asks whether the answer was helpful. A "yes"/"no" reply (case and
    surrounding whitespace are ignored) is acknowledged with
    generate_feedback_response(feedback) and appended to the model's 'likes'
    or 'dislikes' list. Any other reply is not recorded. Typing 'exit' at
    either prompt ends the conversation, after which the model is written
    with save_user_model(user_model).

    Inputs:
    - user_model (dict): A dictionary containing the user's information, including
    their username, likes and dislikes on answers they received to questions.
    Missing 'likes'/'dislikes' lists are created empty.

    Returns:
    None
    """
    print(f"Welcome back, {user_model['username']}! It's Chris here, feel free to ask any NVIDIA related questions.")

    # A stored model without these lists would otherwise fail on the first
    # recorded feedback
    user_model.setdefault('likes', [])
    user_model.setdefault('dislikes', [])

    # Interactive phase with bot
    while True:
        question = input()
        if question.lower() == 'exit':
            print("Goodbye! Feel free to come back anytime.")
            break

        answer = query_knowledge_base(question)
        print("Chris:", answer)

        feedback = input("Was this answer helpful? (Yes/No): ").strip().lower()

        # 'exit' must also work at the feedback prompt
        if feedback == 'exit':
            print("Goodbye! Feel free to come back anytime.")
            break

        # Only yes/no can be acknowledged and stored; anything else is
        # skipped so the session is not lost to an error.
        if feedback not in ("yes", "no"):
            print("Chris: Sorry, I can only record 'Yes' or 'No' as feedback.")
            continue

        # generate and print feedback response
        print("Chris:", generate_feedback_response(feedback))

        bucket = 'likes' if feedback == "yes" else 'dislikes'
        user_model[bucket].append({'question': question, 'answer': answer, 'helpful': feedback})

    save_user_model(user_model)

In [41]:
def main():
  """
  The main entry point of the chatbot program focused on information in NVIDIA blogs

  This function initiates the conversation with the user, asking them to identify
  themselves as either a new user or a returning user. Based on the user's input, it
  directs the flow of conversation accordingly:

  - For new users, it hands over to new_user_flow.
  - For returning users, it asks for the username and tries to load the stored user model
    with load_user_model. If a model is found, it continues with returning_user_flow;
    otherwise it reports that no model was found and starts new_user_flow.
  - Entering 'exit' at either prompt ends the conversation.
  - Any other status prints an error message and asks again.

  The status and 'exit' commands are matched case-insensitively.

  Inputs: None

  Returns: None
  """
  # Loop instead of recursing into main() so repeated invalid input does not
  # grow the call stack.
  while True:
    print("Enter 'exit' at any time to end the conversation.")
    user_status = input("Enter 'New' for New User or 'Returning' for Returning User: (New/Returning) ").strip().lower()

    if user_status == 'exit':
      print("Goodbye! Feel free to come back anytime.")
      # Return instead of calling exit(): exit() is an interactive-shell
      # helper and is not meant for ending a function inside a program.
      return

    if user_status == "returning":
      username = input("Enter your username: ")
      if username.lower() == 'exit':
        print("Goodbye! Feel free to come back anytime.")
        return

      user_model = load_user_model(username)
      if user_model:
        returning_user_flow(user_model)
      else:
        print("User model not found. Let's start as a new user.")
        new_user_flow()
      return

    if user_status == "new":
      new_user_flow()
      return

    print("Invalid command. Please try again.")

In [52]:
# driver program executes here: main() starts the interactive chatbot
# session when this code runs as the top-level module
if __name__ == "__main__":
   main()

Enter 'exit' at any time to end the conversation.
Enter 'New' for New User or 'Returning' for Returning User: (New/Returning) returning
Enter your username: Nikhil
Welcome back, Nikhil! It's Chris here, feel free to ask any NVIDIA related questions.
What is XGBoost?
Chris: XGBoost is a popular implementation of gradient boosting that enhances its performance by using techniques like CUDA and parallel algorithms to speed up the training process.
Was this answer helpful? (Yes/No): Yes
Chris: Great to hear that helped!
What is the Cooperative Groups programming model?
Chris: The Cooperative Groups programming model extends the CUDA programming model by enabling synchronization patterns within and across CUDA thread blocks. It offers APIs for defining, partitioning, and synchronizing groups of threads, providing a more flexible approach to thread cooperation.
Was this answer helpful? (Yes/No): Yes
Chris: Awesome!
What has recently sparked interest in ray tracing?
Chris: The recent introduc