<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Natural-Language-Processing/Text-Generation-Mini-ChatBot/text_generation_mini_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets==3.6.0

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m491.5/491.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.5/491.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import tensorflow as tf
import numpy as np
import re
import pandas as pd
from datasets import load_dataset

In [None]:
def load_data():
    """Fetch the ``empathetic_dialogues`` dataset through ``load_dataset``.

    ``trust_remote_code=True`` is forwarded to ``load_dataset`` unchanged.

    Returns:
        The object produced by ``load_dataset``. Callers in this file index
        it with the ``"train"``, ``"test"`` and ``"validation"`` keys.
    """
    return load_dataset("empathetic_dialogues", trust_remote_code=True)

def format_chat_data(dataset):
    """Flatten all dataset splits into a list of tagged chat turns.

    The ``"train"``, ``"test"`` and ``"validation"`` entries of ``dataset``
    are each turned into a DataFrame and concatenated. Rows are ordered by
    ``conv_id`` and ``utterance_idx`` and the ``utterance`` column is then
    collected per conversation.

    Inside a conversation, utterances are consumed two at a time: the first
    of each pair is treated as the user side and the second as the bot side.
    A trailing utterance without a partner is dropped, so conversations with
    fewer than two utterances contribute nothing. The ``_comma_`` placeholder
    is replaced by a literal comma in both sides.

    Args:
        dataset: Mapping-like object whose ``"train"``, ``"test"`` and
            ``"validation"`` values can be passed to ``pd.DataFrame`` and
            provide ``conv_id``, ``utterance_idx`` and ``utterance`` columns.

    Returns:
        list[str]: One ``"<USER> ... <BOT> ... <END>"`` string per pair.
    """
    split_frames = [
        pd.DataFrame(dataset[split_name])
        for split_name in ("train", "test", "validation")
    ]
    all_rows = pd.concat(split_frames, ignore_index=True)
    ordered_rows = all_rows.sort_values(by=['conv_id', 'utterance_idx'])
    conversations = (
        ordered_rows.groupby("conv_id")["utterance"].apply(list).tolist()
    )

    turns = []
    for utterances in conversations:
        # Even positions are user lines, odd positions are bot replies;
        # zip stops at the shorter slice, discarding an unpaired last line.
        for user_raw, bot_raw in zip(utterances[0::2], utterances[1::2]):
            user_text = user_raw.replace("_comma_", ",")
            bot_text = bot_raw.replace("_comma_", ",")
            turns.append(f"<USER> {user_text} <BOT> {bot_text} <END>")

    return turns

@tf.keras.utils.register_keras_serializable()
def chat_standardization(input_string):
    """Normalise raw chat text before tokenisation.

    The input is lower-cased and then run through a fixed sequence of regex
    substitutions. The order matters because each pass works on the output
    of the previous one.

    Args:
        input_string: String tensor (or value accepted by ``tf.strings``
            ops) holding the text to clean.

    Returns:
        The cleaned string tensor.
    """
    # (pattern, rewrite) pairs, applied top to bottom.
    substitution_passes = (
        # Put spaces around the special tokens so they stand alone.
        (r"(<user>|<bot>|<end>)", r" \1 "),
        # Put spaces around sentence punctuation.
        (r"([.,!?])", r" \1 "),
        # Remove every character outside letters, digits, the punctuation
        # above, angle brackets and the plain space.
        (r"[^a-zA-Z0-9.,!?<> ]", ""),
        # Collapse runs of whitespace into a single space.
        (r"\s{2,}", " "),
    )

    text = tf.strings.lower(input_string)
    for pattern, rewrite in substitution_passes:
        text = tf.strings.regex_replace(text, pattern, rewrite)
    return text

def get_vectorizer(text, max_tokens):
    """Create a ``TextVectorization`` layer and fit it on ``text``.

    The layer uses ``chat_standardization`` for text clean-up, emits integer
    token ids and leaves the output sequence length unset.

    Args:
        text: Corpus passed to the layer's ``adapt`` method.
        max_tokens: Forwarded as the layer's ``max_tokens`` argument.

    Returns:
        tuple: ``(vectorizer, vocab, vocab_size)`` where ``vocab`` is the
        result of ``get_vocabulary()`` and ``vocab_size`` is its length.
    """
    layer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        standardize=chat_standardization,
        output_mode="int",
        output_sequence_length=None,
    )
    layer.adapt(text)

    vocabulary = layer.get_vocabulary()
    return layer, vocabulary, len(vocabulary)

def prepare_dataset(vectorizer, text_corpus, batch_size, seq_len):
    """Build a next-token-prediction ``tf.data`` pipeline from the corpus.

    All entries of ``text_corpus`` are joined with single spaces into one
    string, vectorised in a single call, and the resulting id sequence is cut
    into consecutive, non-overlapping windows of ``seq_len + 1`` ids. Each
    window yields an ``(input, target)`` pair where the target is the input
    shifted one position to the left. Incomplete windows and incomplete
    batches are dropped.

    Args:
        vectorizer: Callable mapping a list of strings to token ids.
        text_corpus: Iterable of strings to train on.
        batch_size: Number of windows per batch.
        seq_len: Length of each input (and target) sequence.

    Returns:
        A shuffled (buffer of 10,000), batched and prefetched dataset of
        ``(input, target)`` pairs.
    """
    joined_corpus = " ".join(text_corpus)
    token_ids = vectorizer([joined_corpus])[0]

    # One extra id per window so that input and target both get seq_len ids.
    window_size = seq_len + 1
    id_stream = tf.data.Dataset.from_tensor_slices(token_ids)
    windows = id_stream.batch(window_size, drop_remainder=True)

    pairs = windows.map(lambda window: (window[:-1], window[1:]))
    pairs = pairs.shuffle(10_000)
    pairs = pairs.batch(batch_size, drop_remainder=True)
    return pairs.prefetch(tf.data.AUTOTUNE)

def build_model(vocab_size, embedding_dim=256, rnn_units=512, batch_size=64, stateful=False):
    """Assemble the two-layer GRU language model.

    Layer order: input, embedding, then two GRU layers each followed by a
    ``Dropout(0.3)``, and a final dense layer that outputs one value per
    vocabulary entry (no activation is applied).

    Args:
        vocab_size: Embedding input dimension and size of the output layer.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of units in each GRU layer.
        batch_size: Fixed batch dimension; only used when ``stateful`` is
            true, where the input is declared with a full batch shape.
        stateful: Passed to both GRU layers.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    input_layer = (
        tf.keras.Input(batch_shape=(batch_size, None))
        if stateful
        else tf.keras.Input(shape=(None,))
    )

    stack = [
        input_layer,
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    ]

    # Two identical recurrent stages, each followed by dropout.
    for _ in range(2):
        stack.append(
            tf.keras.layers.GRU(units=rnn_units, return_sequences=True, stateful=stateful)
        )
        stack.append(tf.keras.layers.Dropout(0.3))

    stack.append(tf.keras.layers.Dense(units=vocab_size))

    return tf.keras.Sequential(stack)

def generate_reply(model, vectorizer, user_input, max_generate=50, temp=0.6):
    """Generate a bot reply for ``user_input`` one token at a time.

    The prompt ``"<user> {user_input} <bot>"`` is vectorised and fed to the
    model once; after that only the most recently predicted id is fed back.
    This relies on the recurrent layers carrying their state between calls,
    as the stateful inference model built in ``main`` does. Layer states are
    reset before generation starts so earlier conversations do not leak in.

    Generation stops when ``<end>`` is predicted or after ``max_generate``
    steps. Ids 0 and 1 are never added to the reply text, but they are still
    fed back as the next input.

    Args:
        model: Model returning per-position logits over the vocabulary.
        vectorizer: Fitted vectorizer; also supplies the vocabulary list.
        user_input: Raw user message.
        max_generate: Maximum number of tokens to sample.
        temp: Sampling temperature. Positive values divide the logits before
            sampling; ``0`` selects the most likely token at every step.

    Returns:
        str: The generated reply with spaces before ``, . ? !`` removed.

    Raises:
        ValueError: If ``temp`` is negative.
    """
    if temp < 0:
        raise ValueError(f"temp must be non-negative, got {temp}")

    prompt = f"<user> {user_input} <bot>"

    input_eval = vectorizer([prompt])

    vocab = vectorizer.get_vocabulary()
    generated_tokens = []

    # Start from a clean recurrent state for every reply.
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    for _ in range(max_generate):
        # Logits for the last position of the only batch element.
        logits = model(input_eval)[0, -1, :]

        if temp == 0:
            # Dividing by zero would produce inf/NaN logits; temperature 0
            # is treated as greedy decoding instead.
            predicted_id = tf.argmax(logits).numpy()
        else:
            scaled_logits = tf.expand_dims(logits / temp, 0)
            predicted_id = tf.random.categorical(scaled_logits, num_samples=1)[0, 0].numpy()

        predicted_word = vocab[predicted_id]

        if predicted_word == "<end>":
            break

        # Skip ids 0 and 1 in the visible reply.
        if predicted_id > 1:
            generated_tokens.append(predicted_word)

        # Only the new token is fed next; earlier context is assumed to be
        # held in the recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)

    bot_reply = " ".join(generated_tokens)
    bot_reply = bot_reply.replace(" ,", ",").replace(" .", ".").replace(" ?", "?").replace(" !", "!")

    return bot_reply

def start_chat(model, vectorizer):
    """Run an interactive console chat loop until the user leaves.

    Each non-empty line read from standard input is passed to
    ``generate_reply`` (60 tokens max, temperature 0.6) and the reply is
    printed. The loop ends when the user types ``quit`` or ``exit``
    (case-insensitive, surrounding whitespace ignored), or when input ends
    (EOF) or is interrupted (Ctrl-C). Blank lines are ignored.

    Errors raised while generating a reply are reported on the console and
    the loop continues, so one failed reply does not end the session.

    Args:
        model: Inference model forwarded to ``generate_reply``.
        vectorizer: Fitted vectorizer forwarded to ``generate_reply``.
    """
    # The banner strings previously contained a mis-encoded robot emoji;
    # the named escape keeps the source ASCII-safe.
    robot = "\N{ROBOT FACE}"
    farewell = f"{robot} AI: Goodbye! Have a great day."

    print("=" * 50)
    print(f"{robot} AI Chatbot is ONLINE! (Type 'quit' or 'exit' to stop)")
    print("=" * 50)

    while True:
        try:
            user_text = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of a traceback.
            print(f"\n{farewell}")
            break

        if not user_text:
            # Nothing to answer; prompt again.
            continue

        if user_text.lower() in ('quit', 'exit'):
            print(farewell)
            break

        try:
            reply = generate_reply(
                model=model,
                vectorizer=vectorizer,
                user_input=user_text,
                max_generate=60,
                temp=0.6
            )
        except Exception as e:
            # Top-level boundary of the chat loop: report and keep chatting.
            print(f"[ERROR] Something went wrong: {e}")
        else:
            print(f"{robot} AI: {reply}")

def main():
    """Run the full pipeline: load data, train the model, then chat.

    Steps, in order:
      1. Load the dataset and format it into tagged chat turns.
      2. Fit the text vectorizer on those turns.
      3. Build the training ``tf.data`` pipeline and print one batch's shapes.
      4. Build, compile and train the model with early stopping, best-weight
         checkpointing and learning-rate reduction, all monitoring the
         training loss.
      5. Save the trained weights, load them into a second model built with
         ``batch_size=1`` and ``stateful=True``, and start the console chat.
    """
    MAX_TOKENS = 12_000
    SEQ_LENGTH = 40
    BATCH_SIZE = 64

    # --- Data loading ---
    print("[INFO] - Load the dataset...")
    raw_splits = load_data()
    print("[INFO] - Dataset loaded...")

    # --- Formatting into chat turns ---
    print("[INFO] - Grouping conversations...")
    formatted_corpus = format_chat_data(raw_splits)
    print(f"[INFO] - Successfully created {len(formatted_corpus)} chat turns from all splits.")

    # --- Vectorization (the vocabulary list itself is not needed here) ---
    print("[INFO] - Apply vectorization...")
    vectorizer, _, vocab_size = get_vectorizer(formatted_corpus, max_tokens=MAX_TOKENS)
    print("[INFO] - Vectorization applied...")
    print(f"[INFO] - Vocabulary size: {vocab_size}")

    # --- Training pipeline ---
    print("[INFO] - Creating tf.data.Dataset..")
    train_batches = prepare_dataset(vectorizer, formatted_corpus, BATCH_SIZE, SEQ_LENGTH)
    print("[INFO] - Dataset created...")
    # Peek at a single batch to confirm the tensor shapes.
    for inputs, targets in train_batches.take(1):
        print("\nInput shape:", inputs.shape)
        print("Target shape:", targets.shape)

    # --- Training model ---
    print("\n[INFO] - Building model...")
    model = build_model(vocab_size=vocab_size)
    print("[INFO] - Model Created Successfuly...")
    print("\n\nCheck Summary:")
    model.summary()

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer="adam", loss=loss_fn)
    print("\n[INFO] - Model Compiled...")

    checkpoint_path = "chatbot_model_checkpoint.weights.h5"
    # Stop after 7 epochs without loss improvement and restore the best weights.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="loss", patience=7, restore_best_weights=True)
    # Keep only the weights of the epoch with the lowest loss.
    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, save_weights_only=True, monitor="loss", save_best_only=True)
    # Halve the learning rate when the loss has not improved for 3 epochs.
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="loss", factor=0.5, patience=3)

    print("[INFO] - Starting training..")
    model.fit(
        train_batches,
        epochs=100,
        callbacks=[early_stopping, best_checkpoint, lr_scheduler],
    )
    print("[INFO] - Training Finished")

    final_weights_path = "final_weights.weights.h5"
    model.save_weights(final_weights_path)

    # --- Inference model ---
    # Same architecture, but stateful with a batch of one so that replies can
    # be generated token by token.
    print("\n[INFO] - Building Inference Model...")
    inference_model = build_model(
        vocab_size=vocab_size,
        batch_size=1,
        stateful=True,
    )
    inference_model.load_weights(final_weights_path)
    inference_model.build(tf.TensorShape([1, None]))
    print("[INFO] - Inference Model Ready.")

    # --- Interactive chat ---
    start_chat(inference_model, vectorizer)

# Execution: run the pipeline only when this file is executed directly
# (as a notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()