# CARREFOUR ChatBot Models (BoW and Embedding)

## About Us

### Group 1

> #### Members
>
> - Collins Ndung'u
> - Nancy Daniel
> - Baptiste Billy Nitunga
> - Gideon Mutuku


View our poster [here](https://www.overleaf.com/read/cbzmjmbfhvzq#f10dff)

## Import libraries

In [None]:
import os
import json
import random
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch NLTK resources before any tokenizing/lemmatizing happens.
# NOTE(review): 'all' downloads the complete NLTK collection; the code visible
# in this notebook only calls word_tokenize and WordNetLemmatizer, so a much
# smaller set of packages is probably enough — confirm before narrowing.
nltk.download('all')

# Initialize lemmatizer globally
# (shared by the preprocessing functions defined later in the notebook)
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

## Data loading and preprocessing

In [None]:
import requests

def download_file_from_google_drive(id, destination):
    """
    Downloads a shared Google Drive file and stores it at `destination`.

    Args:
        id: Google Drive file id. The parameter name is kept for backward
            compatibility even though it shadows the builtin `id`.
        destination: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If the final response carries an error status.
    """
    URL = "https://docs.google.com/uc"
    # Pass every query argument through `params` so that `id` is sent exactly
    # once, also on the follow-up request that carries the confirm token.
    params = {'export': 'download', 'id': id}

    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        response = session.get(URL, params=params, stream=True)
        token = get_confirm_token(session, response)
        if token:
            # A 'download*' cookie was set: repeat the request with its value
            # as the `confirm` argument.
            params['confirm'] = token
            response = session.get(URL, params=params, stream=True)

        # Fail loudly instead of saving an error page as the target file.
        response.raise_for_status()
        save_response_content(response, destination)

def get_confirm_token(session, response):
    """
    Returns the value of the first response cookie whose name starts with
    'download', or None when no such cookie exists.

    `session` is accepted for call compatibility but is not consulted.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """
    Streams the body of `response` into the binary file `destination`,
    reading it in 32768-byte chunks and skipping empty chunks.
    """
    chunk_size = 32768
    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the empty chunks
        for piece in filter(None, response.iter_content(chunk_size)):
            out_file.write(piece)


# Google Drive id of the shared intents file and the local file name to save it under
file_id = '1mzt8cPO1_cJMcnYAdoF3Lx1rO-1sfWxw'
destination = 'intents.json'
# Performs network I/O; the relative destination resolves against the current working directory
download_file_from_google_drive(file_id, destination)

print(f"Downloaded {destination}")

Downloaded intents.json


In [None]:
# Data Loading and Preprocessing
def _build_bow_rows(documents, words, classes):
    """Turns (tokens, tag) documents into bag-of-words rows and one-hot label rows."""
    rows_x = []
    rows_y = []
    for pattern_tokens, tag in documents:
        # A set gives O(1) membership tests for every vocabulary word
        pattern_words = {lemmatizer.lemmatize(token.lower()) for token in pattern_tokens}
        rows_x.append([1 if w in pattern_words else 0 for w in words])

        output_row = [0] * len(classes)
        output_row[classes.index(tag)] = 1
        rows_y.append(output_row)
    return rows_x, rows_y


def load_and_preprocess_data(intents_file="intents.json", max_sequence_length=20):
    """
    Loads data from an intents.json file, tokenizes, lemmatizes, and creates
    vocabulary, classes, and training data (Bag of Words and Embeddings).
    Splits the data into training (90%) and testing (10%) sets for both.

    Args:
        intents_file: Path of the intents JSON file, resolved against the
            current working directory. It must hold an 'intents' list whose
            entries provide 'tag' and 'patterns'.
        max_sequence_length: Fixed length of every embedding index sequence.

    Returns:
        A tuple (train_x_bow, train_y_bow, test_x_bow, test_y_bow,
        train_x_emb, train_y_emb, test_x_emb, test_y_emb, words, classes,
        intents). `words` is the sorted lemmatized vocabulary, `classes` the
        sorted unique tags, and `intents` the parsed JSON content.

    Note:
        The vocabulary is built from all patterns before the split, so words
        that only occur in test documents are part of it. The split depends
        on the state of the global `random` generator.
    """

    # Load data from JSON (explicit encoding: do not depend on the locale default)
    intents_file_path = os.path.join(os.getcwd(), intents_file)
    with open(intents_file_path, 'r', encoding='utf-8') as f:
        intents = json.load(f)

    words = []
    classes = []
    documents = []

    # Iterate through intents to extract patterns and tags
    for intent in intents['intents']:
        tag = intent['tag']
        classes.append(tag)
        for pattern in intent['patterns']:
            # Tokenize each word in the pattern
            w = word_tokenize(pattern)
            words.extend(w)
            # Keep the token list together with its tag
            documents.append((w, tag))

    # Lemmatize and lower each word, drop basic punctuation and duplicates
    words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ['?', '!', '.', ',']]
    words = sorted(set(words))
    classes = sorted(set(classes))

    print(f"{len(documents)} documents")
    print(f"{len(classes)} classes: {classes}")
    print(f"{len(words)} unique lemmatized words: {words}")

    # Shuffle documents so the split is not ordered by intent
    random.shuffle(documents)

    # Split documents into training (90%) and testing (10%)
    split_index = int(len(documents) * 0.9)
    train_documents = documents[:split_index]
    test_documents = documents[split_index:]

    # Create Bag of Words data for train and test sets
    train_x_bow, train_y_bow = _build_bow_rows(train_documents, words, classes)
    test_x_bow, test_y_bow = _build_bow_rows(test_documents, words, classes)

    # Generate Embeddings data for train and test sets
    train_x_emb, train_y_emb = generate_embeddings(train_documents, words, classes, max_sequence_length)
    test_x_emb, test_y_emb = generate_embeddings(test_documents, words, classes, max_sequence_length)

    return train_x_bow, train_y_bow, test_x_bow, test_y_bow, train_x_emb, train_y_emb, test_x_emb, test_y_emb, words, classes, intents


def generate_embeddings(documents, words, classes, max_sequence_length=20):
    """
    Encodes tokenized documents as fixed-length index sequences plus one-hot
    labels.

    Index 0 is reserved for "<PAD>", vocabulary words take 1..len(words) and
    any other word maps to "<UNK>" at len(words) + 1. Sequences shorter than
    `max_sequence_length` are padded on the left; longer ones keep only their
    first `max_sequence_length` tokens.

    Returns:
        (sequences, labels): two parallel lists of lists of ints.
    """
    unk_index = len(words) + 1
    vocab_lookup = {"<PAD>": 0}
    for position, vocab_word in enumerate(words, start=1):
        vocab_lookup[vocab_word] = position
    vocab_lookup['<UNK>'] = unk_index

    sequences, labels = [], []
    for tokens, tag in documents:
        # Lower-case and lemmatize each token before the vocabulary lookup
        token_ids = [
            vocab_lookup.get(lemmatizer.lemmatize(token.lower()), unk_index)
            for token in tokens
        ]

        missing = max_sequence_length - len(token_ids)
        if missing > 0:
            # Left-pad so the real tokens sit at the end of the sequence
            fixed_length = [vocab_lookup["<PAD>"]] * missing + token_ids
        else:
            # Too long (or exact fit): keep the leading tokens only
            fixed_length = token_ids[:max_sequence_length]
        sequences.append(fixed_length)

        # One-hot label at the position of the tag within `classes`
        one_hot = [0] * len(classes)
        one_hot[classes.index(tag)] = 1
        labels.append(one_hot)

    return sequences, labels

## Model definitions

In [None]:
# The definition of the BoW model
class ChatbotModel(nn.Module):
    """
    Feed-forward intent classifier for bag-of-words vectors.

    Layout: input_size -> 128 -> 64 -> output_size, with ReLU followed by
    dropout (p=0.5) after each of the two hidden layers. `forward` returns
    the raw output of the last linear layer (no softmax).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Attribute names double as state_dict key prefixes
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        hidden = x
        # Both hidden layers share the same activation + dropout treatment
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.fc3(hidden)

# The definition of the Embedding model
class EmbeddingChatbotModel(nn.Module):
    """
    Intent classifier: embedding layer -> LSTM -> linear layer.

    `forward` takes a batch of index sequences and returns the linear layer's
    output for the LSTM state at the last time step. Index 0 is the padding
    index of the embedding. `max_sequence_length` is accepted by the
    constructor but not used by any layer, and `self.relu` is registered but
    not applied in `forward`.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, max_sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        per_step_states, _ = self.lstm(self.embedding(x))
        # Only the state at the final time step feeds the classifier
        final_step = per_step_states[:, -1, :]
        return self.fc(final_step)

## Model training

In [None]:
# Training the BoW model
def train_model(model, train_x, train_y, epochs=200, batch_size=8, learning_rate=0.001):
    """
    Trains the Bag of Words chatbot model in place and returns it.

    Args:
        model: Module that maps a float32 batch of BoW rows to class scores.
        train_x: Sequence of equally long numeric rows (bag-of-words vectors).
        train_y: Sequence of one-hot label rows, parallel to `train_x`.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size of the shuffling DataLoader.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The same `model` object, left in training mode.
    """
    # Convert data to PyTorch tensors
    X_train = torch.tensor(train_x, dtype=torch.float32)
    y_train = torch.tensor(train_y, dtype=torch.float32)

    # Create DataLoader for batching
    dataset = TensorDataset(X_train, y_train)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Loss and optimizer (CrossEntropyLoss for multi-class classification)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # A model that was evaluated before arrives in eval mode; switch back so
    # that dropout is active while training.
    model.train()

    print("\nTraining the BoW model...")
    for epoch in range(epochs):
        loss = None  # stays None when the loader yields no batch
        for inputs, labels in dataloader:
            # Forward pass; CrossEntropyLoss needs class indices, not one-hot rows
            outputs = model(inputs)
            target_indices = torch.max(labels, 1)[1]
            loss = criterion(outputs, target_indices)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Report the loss of the epoch's last batch every 10 epochs
        if (epoch+1) % 10 == 0 and loss is not None:
            print (f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    print("Training complete!")
    return model

# Training the embedding model
def train_embedding_model(model, train_x_emb, train_y_emb, epochs=200, batch_size=8, learning_rate=0.001):
    """
    Trains the embedding chatbot model in place and returns it.

    Args:
        model: Module that maps a long batch of index sequences to class scores.
        train_x_emb: Sequence of equally long integer index sequences.
        train_y_emb: Sequence of one-hot label rows, parallel to `train_x_emb`.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size of the shuffling DataLoader.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The same `model` object, left in training mode.
    """
    # Convert data to PyTorch tensors
    X_train = torch.tensor(train_x_emb, dtype=torch.long) # Use long for indices
    y_train = torch.tensor(train_y_emb, dtype=torch.float32)

    # Create DataLoader for batching
    dataset = TensorDataset(X_train, y_train)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Loss and optimizer (CrossEntropyLoss for multi-class classification)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Make sure training-only behaviour is enabled even if the model was
    # switched to eval mode earlier.
    model.train()

    print("\nTraining the embedding model...")
    for epoch in range(epochs):
        loss = None  # stays None when the loader yields no batch
        for inputs, labels in dataloader:
            # Forward pass; CrossEntropyLoss needs class indices, not one-hot rows
            outputs = model(inputs)
            target_indices = torch.max(labels, 1)[1]
            loss = criterion(outputs, target_indices)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Report the loss of the epoch's last batch every 10 epochs
        if (epoch+1) % 10 == 0 and loss is not None:
            print (f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    print("Embedding model training complete!")
    return model

## Model evaluation

In [None]:
def evaluate_model(model, test_x, test_y, classes, is_embedding_model=False):
    """
    Evaluates the trained chatbot model on the test set and calculates accuracy.
    Can handle both Bag of Words (is_embedding_model=False) and
    Embedding (is_embedding_model=True) model inputs.

    Args:
        model: Trained module; it is switched to evaluation mode.
        test_x: Test inputs (BoW rows, or index sequences for the embedding model).
        test_y: One-hot label rows, parallel to `test_x`.
        classes: Accepted for call compatibility; not used in the computation.
        is_embedding_model: Selects long (True) or float32 (False) inputs.

    Returns:
        Fraction of correctly classified samples, or 0 for an empty test set.
    """
    # Set model to evaluation mode
    model.eval()

    # An empty test set cannot be pushed through the model (the tensors would
    # be one-dimensional), so report 0 before building any tensor.
    if len(test_y) == 0:
        return 0

    # Convert test data to PyTorch tensors (long indices vs. float32 BoW rows)
    input_dtype = torch.long if is_embedding_model else torch.float32
    X_test = torch.tensor(test_x, dtype=input_dtype)
    y_test = torch.tensor(test_y, dtype=torch.float32)

    # Disable gradient calculations
    with torch.no_grad():
        outputs = model(X_test)

    # Determine predicted and true class indices
    predicted_indices = torch.argmax(outputs, dim=1)
    true_indices = torch.argmax(y_test, dim=1)

    # Calculate accuracy
    correct_predictions = (predicted_indices == true_indices).sum().item()
    return correct_predictions / y_test.size(0)

## Prediction and response

In [None]:
def clean_up_sentence(sentence):
    """Splits a sentence into tokens and returns their lower-cased, lemmatized forms."""
    # A fresh lemmatizer is created on every call
    word_normalizer = WordNetLemmatizer()
    return [
        word_normalizer.lemmatize(token.lower())
        for token in word_tokenize(sentence)
    ]

def bag_of_words(sentence, words):
    """
    Builds the bag-of-words vector of a sentence over the vocabulary `words`.

    Returns a NumPy array with one entry per vocabulary word: 1 when the word
    occurs among the cleaned-up sentence tokens, otherwise 0.
    """
    present_tokens = set(clean_up_sentence(sentence))
    return np.array([1 if vocab_word in present_tokens else 0 for vocab_word in words])

def sentence_to_embedding_sequence(sentence, words, max_sequence_length, word_index):
    """
    Converts a single sentence to a fixed-length sequence of vocabulary indices.

    Args:
        sentence: Raw user text.
        words: Vocabulary list; accepted for call compatibility, the lookup
            itself goes through `word_index`.
        max_sequence_length: Length of the returned sequence.
        word_index: Mapping from word to index. It must contain "<PAD>";
            words missing from it map to its "<UNK>" entry (0 if absent).

    Returns:
        A list of ints: left-padded with the "<PAD>" index when the sentence
        is short, cut to its first `max_sequence_length` tokens otherwise.
    """
    # clean_up_sentence already lower-cases and lemmatizes every token once,
    # which matches how generate_embeddings prepares the training patterns.
    # Lemmatizing a second time here can map a token to a different
    # vocabulary entry than the one used during training.
    sentence_words = clean_up_sentence(sentence)
    unknown_index = word_index.get('<UNK>', 0)
    indexed_pattern = [word_index.get(word, unknown_index) for word in sentence_words]

    pad_count = max_sequence_length - len(indexed_pattern)
    if pad_count > 0:
        # Pad at the beginning so the real tokens end the sequence
        padded_sequence = [word_index["<PAD>"]] * pad_count + indexed_pattern
    else:
        # Keep the first max_sequence_length tokens, drop the rest
        padded_sequence = indexed_pattern[:max_sequence_length]

    return padded_sequence


def predict_class(sentence, model, words, classes, threshold=0.7):
    """
    Predicts the intent class of a sentence using the Bag of Words model.
    User input is preprocessed consistently with training data.

    Args:
        sentence: Raw user text.
        model: Trained BoW model. Its train/eval mode is not changed here, so
            callers should have put it in eval mode to disable dropout.
        words: Vocabulary the model was trained on.
        classes: Class names, indexed like the model outputs.
        threshold: Minimum softmax probability an intent needs to be reported.

    Returns:
        A list of {'intent': name, 'probability': float} dicts for every class
        whose probability exceeds `threshold`, sorted by descending
        probability. The list is empty when no class is confident enough.
    """
    # Generate BoW for the user input using the same vocabulary as the model was trained on
    bag = bag_of_words(sentence, words)

    # Convert to tensor and unsqueeze for batch dimension
    input_data = torch.tensor(bag, dtype=torch.float32).unsqueeze(0)

    # Get prediction from the BoW model
    with torch.no_grad():
        output = model(input_data)

    # Probabilities of the single sentence in the batch
    probabilities = torch.softmax(output, dim=1)[0]

    # Keep only the classes above the confidence threshold
    results = [
        {'intent': classes[i], 'probability': p.item()}
        for i, p in enumerate(probabilities)
        if p > threshold
    ]

    # Sort by probability in descending order
    results.sort(key=lambda x: x['probability'], reverse=True)
    return results

def predict_class_embedding(sentence, model, words, classes, max_sequence_length, threshold=0.7):
    """
    Predicts the intent class of a sentence using the Embedding model.
    User input is converted to an embedding sequence.

    Args:
        sentence: Raw user text.
        model: Trained embedding model. Its train/eval mode is not changed here.
        words: Vocabulary the model was trained on; determines the word indices.
        classes: Class names, indexed like the model outputs.
        max_sequence_length: Length the index sequence is padded/truncated to.
        threshold: Minimum softmax probability an intent needs to be reported.

    Returns:
        A list of {'intent': name, 'probability': float} dicts for every class
        whose probability exceeds `threshold`, sorted by descending
        probability. The list is empty when no class is confident enough.
    """
    # Rebuild the word -> index mapping: 0 is padding, vocabulary words take
    # 1..len(words), unknown words take len(words) + 1
    word_index = {"<PAD>": 0}
    word_index.update({word: i + 1 for i, word in enumerate(words)})
    word_index['<UNK>'] = len(words) + 1

    # Convert sentence to embedding sequence
    embedding_sequence = sentence_to_embedding_sequence(sentence, words, max_sequence_length, word_index)

    # Convert to tensor and unsqueeze for batch dimension
    input_data = torch.tensor(embedding_sequence, dtype=torch.long).unsqueeze(0)

    # Get prediction from the Embedding model
    with torch.no_grad():
        output = model(input_data)

    # Probabilities of the single sentence in the batch
    probabilities = torch.softmax(output, dim=1)[0]

    # Keep only the classes above the confidence threshold
    results = [
        {'intent': classes[i], 'probability': p.item()}
        for i, p in enumerate(probabilities)
        if p > threshold
    ]

    # Sort by probability in descending order
    results.sort(key=lambda x: x['probability'], reverse=True)
    return results


def get_response(intents_list, intents_data):
    """
    Picks a reply for the top-ranked predicted intent.

    `intents_list` holds prediction dicts with an 'intent' key (best first);
    `intents_data` is the parsed intents file. Returns a randomly chosen
    response of the first intent whose tag matches, or a fixed fallback
    sentence when there is no prediction or no matching tag.
    """
    if intents_list:
        top_tag = intents_list[0]['intent']
        # First entry of the intents data carrying the predicted tag, if any
        matching_intent = next(
            (entry for entry in intents_data['intents'] if entry['tag'] == top_tag),
            None,
        )
        if matching_intent is None:
            return "I couldn't find a specific answer for that."
        return random.choice(matching_intent['responses'])

    return "Sorry, I don't understand that."

## Main execution

In [None]:
if __name__ == "__main__":
    # Sequence length used for the embedding inputs. Defined once so that
    # preprocessing and model construction cannot drift apart.
    max_sequence_length = 20

    # Load and preprocess data, splitting into train and test sets for both BoW and Embeddings
    train_x_bow, train_y_bow, test_x_bow, test_y_bow, \
    train_x_emb, train_y_emb, test_x_emb, test_y_emb, \
    words, classes, intents = load_and_preprocess_data(
        intents_file="intents.json", max_sequence_length=max_sequence_length
    )

    # Determine input and output sizes for the models
    # BoW input size is the vocabulary size
    input_size_bow = len(words)
    # Embedding input size is the max sequence length
    input_size_emb = max_sequence_length
    output_size = len(classes) # Output size is the number of classes for all models

    # Instantiate the Bag of Words model
    model_bow = ChatbotModel(input_size_bow, output_size)

    # Instantiate the Embedding model
    vocab_size = len(words) + 2 # +2 for padding and unknown words
    embedding_dim = 100 # Choose an appropriate embedding dimension
    hidden_size = 128 # Choose an appropriate hidden size for LSTM (Long short-term memory)
    model_emb = EmbeddingChatbotModel(vocab_size, embedding_dim, hidden_size, output_size, max_sequence_length=input_size_emb)

    # Train the Bag of Words model on the training data
    trained_model_bow = train_model(model_bow, train_x_bow, train_y_bow)

    # Train the embedding model on the training data
    trained_model_emb = train_embedding_model(model_emb, train_x_emb, train_y_emb)

    # Evaluate the Bag of Words model on the test data
    accuracy_bow = evaluate_model(trained_model_bow, test_x_bow, test_y_bow, classes, is_embedding_model=False)
    print(f"\nBag of Words model accuracy on the test set: {accuracy_bow:.4f}")

    # Evaluate the Embedding model on the test data
    accuracy_emb = evaluate_model(trained_model_emb, test_x_emb, test_y_emb, classes, is_embedding_model=True)
    print(f"\nEmbedding model accuracy on the test set: {accuracy_emb:.4f}")

    # Persist only the learned parameters; loading requires re-creating the
    # models with the same sizes first
    torch.save(trained_model_bow.state_dict(), "chatbot_model_bow.pth")
    torch.save(trained_model_emb.state_dict(), "chatbot_model_emb.pth")
    print("\nTrained models saved as chatbot_model_bow.pth and chatbot_model_emb.pth")
    print("\nChatbot is ready!")

1177 documents
266 classes: ['App_Download', 'App_Download_2', 'App_Download_3', 'App_Download_4', 'App_Download_5', 'App_Download_6', 'App_Download_7', 'App_Login', 'App_Login_2', 'App_Login_3', 'App_SignUp', 'App_SignUp_2', 'App_SignUp_3', 'App_SignUp_4', 'Basket_Modify', 'Basket_Modify_2', 'Basket_Modify_3', 'Basket_Modify_4', 'Contact_Us', 'Contact_Us_2', 'Contact_Us_3', 'Contact_Us_4', 'Contact_Us_5', 'Contact_Us_6', 'Contact_Us_7', 'Contact_Us_8', 'Defective_Product', 'Defective_Product_2', 'Defective_Product_3', 'Defective_Product_4', 'Defective_Product_5', 'Defective_Product_6', 'Defective_Product_7', 'Delivery_Area', 'Delivery_Area_10', 'Delivery_Area_2', 'Delivery_Area_3', 'Delivery_Area_4', 'Delivery_Area_5', 'Delivery_Area_6', 'Delivery_Area_7', 'Delivery_Area_8', 'Delivery_Area_9', 'Delivery_Charge', 'Delivery_Charge_10', 'Delivery_Charge_11', 'Delivery_Charge_12', 'Delivery_Charge_2', 'Delivery_Charge_3', 'Delivery_Charge_4', 'Delivery_Charge_5', 'Delivery_Charge_6', 'Del

## Use the chatbot

In [None]:
# Load the trained models
# The sizes below must match the ones used when the state dicts were saved,
# otherwise load_state_dict rejects the checkpoints.
# `words`, `classes` and `intents` are expected to exist already from the
# training cell.
input_size_bow = len(words)
max_sequence_length = 20
output_size = len(classes)
vocab_size = len(words) + 2
embedding_dim = 100
hidden_size = 128

# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model_bow = ChatbotModel(input_size_bow, output_size)
model_bow.load_state_dict(torch.load("chatbot_model_bow.pth"))
# eval() disables dropout so predictions are deterministic
model_bow.eval()

model_emb = EmbeddingChatbotModel(vocab_size, embedding_dim, hidden_size, output_size, max_sequence_length=max_sequence_length)
model_emb.load_state_dict(torch.load("chatbot_model_emb.pth"))
model_emb.eval()

print("Chatbot is ready!")

# Allow user to choose which model to use
# (the answer is compared case-insensitively; the loop repeats until it is valid)
while True:
    model_choice = input("Choose model (BoW/Embedding): ").lower()
    if model_choice in ['bow', 'embedding']:
        break
    else:
        print("Invalid choice. Please enter 'BoW' or 'Embedding'.")

if model_choice == 'bow':
    print("\nUsing the Bag of Words model for interactive chat.")
    current_model = model_bow
    is_embedding = False
else:
    print("\nUsing the Embedding model for interactive chat.")
    current_model = model_emb
    is_embedding = True

# Chat loop: read a message, predict its intent, print a response
print("Type 'quit' to exit.")
while True:
    message = input("You: ")
    if message.lower() == "quit":
        break

    # Predict intent using the chosen model
    if is_embedding:
        results = predict_class_embedding(message, current_model, words, classes, max_sequence_length)
    else:
        # Predict intent using the Bag of Words model and the vocabulary built from the intents data
        results = predict_class(message, current_model, words, classes)


    # An empty result list means no intent passed the confidence threshold
    if results:
        # Get response based on predicted intent from the intents data
        response = get_response(results, intents)
        print(f"Bot: {response}")
    else:
        print("Bot: Sorry, I don't understand that.")

Chatbot is ready!

Using the Bag of Words model for interactive chat.
Type 'quit' to exit.
Bot: Sorry, I don't understand that.
Bot: Yes
Bot: Sorry, I don't understand that.
Bot: Sorry, I don't understand that.
Bot: Sorry, I don't understand that.
Bot: No-under 100 MB on both stores
Bot: No
Bot: No-Visa & MasterCard only
Bot: Sorry, I don't understand that.
Bot: Sorry, I don't understand that.
Bot: Yes-free download
