<a href="https://www.kaggle.com/code/nourish/s-chatbot?scriptVersionId=244448176" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>


**Model Choice: DistilGPT2**
We use `distilgpt2` for this task because it offers a good balance of performance, smaller size, and faster training/inference times compared to larger models. It's a practical choice for a commercially usable chatbot without requiring immense computational resources.


In [35]:
# Cell 1: Imports and Global Setup
# This cell imports all necessary libraries and sets up global configurations like random seeds and device (CPU/GPU).

import pandas as pd
import numpy as np
import torch
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from transformers import default_data_collator as DataCollatorForCausalLM
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
from datetime import datetime
import re
from collections import Counter
import string
import warnings
warnings.filterwarnings('ignore') # Ignore warnings for cleaner output

# --- Global Configurations ---
# Set random seeds for reproducibility across runs.
torch.manual_seed(42)
np.random.seed(42)


mp.set_start_method('spawn', force=True)

# Determine the device to use (GPU if available, otherwise CPU).
# Kaggle P100 GPU is automatically detected if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Define global constants for paths and model names.
CSV_FILE = "/kaggle/input/ptoject/combined_dataset.csv" # Path to your dataset on Kaggle.
MODEL_NAME = "distilgpt2"  # The pre-trained model to fine-tune.
OUTPUT_DIR = "./chatbot_model"  # Directory to save the fine-tuned model and logs.

# Training parameters
NUM_EPOCHS = 5   # Number of full passes through the training dataset.
BATCH_SIZE = 32   # Number of samples processed in one forward/backward pass.
LEARNING_RATE = 2e-5 # Initial learning rate for the optimizer.
MAX_LENGTH = 512 # Maximum sequence length for tokenization.

print("Global settings initialized:")
print(f"  Device: {device}")
if torch.cuda.is_available():
    print(f"  GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("  GPU not available. Training and inference will use CPU (slower).")
print(f"  CSV File Path: {CSV_FILE}")
print(f"  Base Model: {MODEL_NAME}")
print(f"  Output Directory: {OUTPUT_DIR}")
print("All necessary libraries imported and configurations set.")

Global settings initialized:
  Device: cuda
  GPU Name: Tesla P100-PCIE-16GB
  GPU Memory: 15.9 GB
  CSV File Path: /kaggle/input/ptoject/combined_dataset.csv
  Base Model: distilgpt2
  Output Directory: ./chatbot_model
All necessary libraries imported and configurations set.


In [36]:
# Cell 2: Data Loading and Preprocessing
# This cell loads your conversational data from the specified CSV file and prepares it for model training.

print("PHASE 1: Data Loading and Preprocessing")
print("=" * 40)

MODEL_NAME = "gpt2"  # or your chosen model name

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer for model '{MODEL_NAME}' loaded successfully.")

try:
    def clean_text(text):
        text = str(text)
        text = text.strip()
        text = re.sub(r"\s+", " ", text)  # collapse multiple spaces/newlines
        return text
        
    df = pd.read_csv(CSV_FILE)
    print(f"Successfully loaded CSV from: {CSV_FILE}")
    print(f"Initial DataFrame shape: {df.shape}")
    
    if 'question' not in df.columns or 'support' not in df.columns:
        raise ValueError("CSV must contain 'question' and 'support' columns. Please check your dataset.")
    
    df = df.dropna(subset=['question', 'support']) # Remove rows with missing Q/A
    print(f"DataFrame shape after dropping NaNs: {df.shape}")

    # Format each question-answer pair into a single string.
    # The <|endoftext|> token acts as a conversation turn separator for models like DistilGPT2.
    formatted_texts = []
    token_lens = []
    for _, row in df.iterrows():
        question = clean_text(row['question'])
        support = clean_text(row['support'])
        formatted_text = f"Question: {question} Answer: {support}<|endoftext|>"

        tokens = tokenizer.encode(formatted_text)
        token_lens.append(len(tokens))
    
        formatted_texts.append((formatted_text, len(tokens)))

    
    print(f"Formatted {len(formatted_texts)} question-answer pairs for training.")

    min_len = 32
    max_len = int(MAX_LENGTH * 0.9)

    filtered_texts = [
        text for text, l in formatted_texts
        if min_len <= l <= max_len
    ]

    print(f"Kept {len(filtered_texts)} / {len(formatted_texts)} pairs after length filtering.")
    
    # Split the formatted data into training and validation sets.
    train_texts, val_texts = train_test_split(
        filtered_texts,
        test_size=0.2,
        random_state=42
    )
    
    print(f"Data split: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

except FileNotFoundError:
    print(f"ERROR: The CSV file '{CSV_FILE}' was not found. Please ensure it's in the correct Kaggle input path.")
except Exception as e:
    print(f"An error occurred during data loading/preprocessing: {e}")

print("Data loading and preprocessing complete.")

numeric_df = df.select_dtypes(include=[np.number])
print(f"Contains NaN: {numeric_df.isna().any().any()} | Contains Inf: {np.isinf(numeric_df.values).any()}")


PHASE 1: Data Loading and Preprocessing
Tokenizer for model 'gpt2' loaded successfully.
Successfully loaded CSV from: /kaggle/input/ptoject/combined_dataset.csv
Initial DataFrame shape: (13679, 2)
DataFrame shape after dropping NaNs: (12252, 2)
Formatted 12252 question-answer pairs for training.
Kept 11496 / 12252 pairs after length filtering.
Data split: 9196 training samples, 2300 validation samples.
Data loading and preprocessing complete.
Contains NaN: False | Contains Inf: False


In [37]:
print("PHASE 2: Tokenizer Setup")
print("=" * 40)

try:
    # Add a padding token if the tokenizer doesn't have one
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer's pad token set to EOS token (ID: {tokenizer.pad_token_id}).")
    else:
        print(f"Tokenizer already has a pad token (ID: {tokenizer.pad_token_id}).")
    
    # Force right padding
    tokenizer.padding_side = "right"
    print(f"Tokenizer padding_side set to: {tokenizer.padding_side}")
    
    # Print special tokens
    print(f"Special tokens:")
    print(f"  pad_token: '{tokenizer.pad_token}' -> {tokenizer.pad_token_id}")
    print(f"  eos_token: '{tokenizer.eos_token}' -> {tokenizer.eos_token_id}")
    print(f"  bos_token: '{tokenizer.bos_token}' -> {tokenizer.bos_token_id if tokenizer.bos_token else 'None'}")
    print(f"  unk_token: '{tokenizer.unk_token}' -> {tokenizer.unk_token_id}")

    print(f"Tokenizer vocabulary size: {len(tokenizer)}")

except Exception as e:
    print(f"An error occurred during tokenizer setup: {e}")

print("Tokenizer setup complete.")


PHASE 2: Tokenizer Setup
Tokenizer's pad token set to EOS token (ID: 50256).
Tokenizer padding_side set to: right
Special tokens:
  pad_token: '<|endoftext|>' -> 50256
  eos_token: '<|endoftext|>' -> 50256
  bos_token: '<|endoftext|>' -> 50256
  unk_token: '<|endoftext|>' -> 50256
Tokenizer vocabulary size: 50257
Tokenizer setup complete.


In [38]:
class ChatbotDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Function to create dataset instance (no longer defines the class internally)
def create_chatbot_dataset(encodings, labels):
    return ChatbotDataset(encodings, labels)


# --- PHASE 3: Dataset Creation ---
print("PHASE 3: Dataset Creation")
print("=" * 40)

try:
    # Tokenize training texts
    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Tokenize validation texts
    val_encodings = tokenizer(
        val_texts,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Prepare labels
    # def prepare_labels(encodings):
    #     labels = encodings['input_ids'].clone()
    #     # Set attention_mask == 0 to -100 for ignore_index in loss calculation
    #     labels[encodings['attention_mask'] == 0] = -100
    #     return labels

    # train_labels = prepare_labels(train_encodings)
    # val_labels = prepare_labels(val_encodings)

    train_labels = train_encodings['input_ids'].clone() # No -100 assignment here
    val_labels = val_encodings['input_ids'].clone()   # No -100 assignment here

    # Create datasets using your new function
    train_dataset = create_chatbot_dataset(train_encodings, train_labels)
    val_dataset = create_chatbot_dataset(val_encodings, val_labels)

    print(f"Training Dataset created with {len(train_dataset)} samples.")
    print(f"Validation Dataset created with {len(val_dataset)} samples.")

    # Optional check
    sample = train_dataset[0]
    print("\nSample from training dataset (first entry):")
    print(f"  Input IDs shape: {sample['input_ids'].shape}")
    print(f"  Attention Mask shape: {sample['attention_mask'].shape}")
    print(f"  Labels shape: {sample['labels'].shape}")
    print(f"  Decoded Input: {tokenizer.decode(sample['input_ids'], skip_special_tokens=True)[:100]}...")

except Exception as e:
    print(f"An error occurred during dataset creation: {e}")
    import traceback
    traceback.print_exc()

print("Dataset creation complete.")



PHASE 3: Dataset Creation
Training Dataset created with 9196 samples.
Validation Dataset created with 2300 samples.

Sample from training dataset (first entry):
  Input IDs shape: torch.Size([512])
  Attention Mask shape: torch.Size([512])
  Labels shape: torch.Size([512])
  Decoded Input: Question: What are made on ribosomes in the cytoplasm? Answer: DNA is located in the nucleus. Protei...
Dataset creation complete.


In [39]:
# Cell 5: Model Setup
# This cell loads the pre-trained DistilGPT2 model and moves it to the appropriate device (GPU/CPU).

print("PHASE 4: Model Setup")
print("=" * 40)

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        # Use mixed precision (float16) if a GPU is available to save memory and speed up training.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        # Automatically map model to available devices (e.g., multiple GPUs if configured).
        device_map="auto" if torch.cuda.is_available() else None
    ).to(device)  # Ensure the model is explicitly moved to the selected device.
    
    print(f"Model '{MODEL_NAME}' loaded successfully and moved to {device}.")
    print(f"Model total parameters: {sum(p.numel() for p in model.parameters()):,}")

    # --- Safety: Check that tokenizer + model embeddings are aligned ---
    vocab_size_tokenizer = len(tokenizer)
    vocab_size_model = model.get_input_embeddings().num_embeddings
    
    print(f"Tokenizer vocab size: {vocab_size_tokenizer}")
    print(f"Model embedding size: {vocab_size_model}")
    
    if vocab_size_tokenizer != vocab_size_model:
        print("Mismatch detected! Resizing model embeddings...")
        model.resize_token_embeddings(vocab_size_tokenizer)
        print("Resized model embeddings.")

    # --- Safety: Check that pad token and endoftext token exist ---
    if tokenizer.pad_token is None:
        print("WARNING: Tokenizer pad_token is None! Setting pad_token to eos_token...")
        tokenizer.pad_token = tokenizer.eos_token
        model.resize_token_embeddings(len(tokenizer))
        print(f"Pad token set to eos_token. New vocab size: {len(tokenizer)}")

    if "<|endoftext|>" not in tokenizer.get_vocab():
        print("WARNING: <|endoftext|> token not found in tokenizer vocab!")
    else:
        print(f"<|endoftext|> token ID: {tokenizer.convert_tokens_to_ids('<|endoftext|>')}")

    # --- Debugging: Print any frozen parameters ---
    print("\nChecking if any model parameters are frozen...")
    frozen_count = 0
    for name, param in model.named_parameters():
        if not param.requires_grad:
            print(f"  FROZEN PARAM: {name}")
            frozen_count += 1

    if frozen_count == 0:
        print("All model parameters are trainable.")
    else:
        print(f"{frozen_count} parameters are frozen! This may affect training.")

except Exception as e:
    print(f"An error occurred during model setup: {e}")

print("Model setup complete.")


PHASE 4: Model Setup
Model 'gpt2' loaded successfully and moved to cuda.
Model total parameters: 124,439,808
Tokenizer vocab size: 50257
Model embedding size: 50257
<|endoftext|> token ID: 50256

Checking if any model parameters are frozen...
All model parameters are trainable.
Model setup complete.


In [40]:
print("PHASE 5: Training Arguments Configuration (FP16 = False)")
print("=" * 40)

import os
# Create output dirs
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "logs"), exist_ok=True)

# Define gradient accumulation steps
GRADIENT_ACCUMULATION_STEPS = 4

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=False,  # We are training + validating, not predicting here
    eval_strategy="epoch",  # Do eval at end of each epoch
    save_strategy="epoch",        # Save at end of each epoch
    save_total_limit=2,           # Keep only last 2 checkpoints
    save_safetensors=True,        # Use safetensors format
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_strategy="steps",      # Log every N steps
    logging_steps=10,              # Log every 10 steps
    logging_nan_inf_filter=True,   # Helps debug NaN/Inf issues
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False,                    # We are turning FP16 off explicitly
    dataloader_pin_memory=True,
    remove_unused_columns=False,   # Important for GPT-style models
    gradient_checkpointing=True,   # Saves memory for large models
    run_name=f"{MODEL_NAME}_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    disable_tqdm=False,
    report_to="none",              # No W&B or other tracking
    seed=42,
    max_steps=-1,                  # No limit on steps (train fully)
    save_steps=None,               # Not used, as we save per epoch
    eval_steps=None,               # Not used, as we evaluate per epoch
    logging_first_step=True,       # Log first step for debug clarity
    dataloader_num_workers=0,      # You can increase this if needed (2 is safe default)
    gradient_checkpointing_kwargs=None, # Default
    optim="adamw_torch_fused",     # Use PyTorch fused AdamW (faster if supported)
)

# Print args
print("Training arguments configured (FP16 = False):")
for arg, value in training_args.to_dict().items():
    if arg not in ["__post_init__", "_setup_devices"]:
        print(f"  {arg}: {value}")
print(f"  Effective Training Batch Size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

print("Training arguments setup complete.")


PHASE 5: Training Arguments Configuration (FP16 = False)
Training arguments configured (FP16 = False):
  output_dir: ./chatbot_model
  overwrite_output_dir: True
  do_train: True
  do_eval: True
  do_predict: False
  eval_strategy: epoch
  prediction_loss_only: False
  per_device_train_batch_size: 32
  per_device_eval_batch_size: 32
  per_gpu_train_batch_size: None
  per_gpu_eval_batch_size: None
  gradient_accumulation_steps: 4
  eval_accumulation_steps: None
  eval_delay: 0
  torch_empty_cache_steps: None
  learning_rate: 2e-05
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1e-08
  max_grad_norm: 1.0
  num_train_epochs: 5
  max_steps: -1
  lr_scheduler_type: linear
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.0
  warmup_steps: 100
  log_level: passive
  log_on_each_node: True
  logging_dir: ./chatbot_model/logs
  logging_strategy: steps
  logging_first_step: True
  logging_steps: 10
  logging_nan_inf_filter: True
  save_strategy: epoch
  save_steps: None
  

In [41]:
# Cell 7: Data Collator Setup
# This cell sets up the data collator, which batches and dynamically pads sequences during training.

print("PHASE 6: Data Collator Setup")
print("=" * 40)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
# Initialize the data collator for causal language modeling (CLM).
# Setting mlm=False means we are doing next-token prediction, not masked language modeling.
# data_collator = DataCollatorForCausalLM(tokenizer=tokenizer) 

print(f"Data collator for causal language modeling initialized using tokenizer '{MODEL_NAME}'.")
print("Data collator setup complete.")


PHASE 6: Data Collator Setup
Data collator for causal language modeling initialized using tokenizer 'gpt2'.
Data collator setup complete.


In [42]:
print("PHASE 7: Model Training")
print("=" * 40)

# IMPORTANT: Wrap the training part inside if __name__ == '__main__':
# This ensures that when multiprocessing spawns new processes, this code
# is not re-executed, which can lead to issues with how modules are loaded.

import transformers
if __name__ == '__main__':
    try:
        print("Initializing Hugging Face Trainer...")
        transformers.logging.set_verbosity_error()  # Suppress unnecessary logging

        # Patch model config to avoid warnings if needed
        if not hasattr(model.config, "loss_type") or model.config.loss_type is None:
            model.config.loss_type = "ForCausalLMLoss"
            print("Patched model.config.loss_type to 'ForCausalLMLoss' to avoid warning.")

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        print("Hugging Face Trainer initialized. Starting training process...")

        # Train the model
        train_result = trainer.train()

        # Save model & tokenizer
        final_model_save_path = os.path.join(OUTPUT_DIR, "prototype_model")
        print(f"Saving final model to {final_model_save_path}...")
        trainer.save_model(output_dir=final_model_save_path)
        tokenizer.save_pretrained(save_directory=final_model_save_path)

        print(f"Training completed! Final model and tokenizer saved to: {final_model_save_path}")

        # Display training metrics
        metrics = train_result.metrics
        print("\nTraining Metrics:")
        for key, value in metrics.items():
            if isinstance(value, (float, int)):
                print(f"  {key}: {value:.4f}")
            else:
                print(f"  {key}: {value}")

        # Save training metadata
        training_info = {
            "model_name": MODEL_NAME,
            "num_epochs": NUM_EPOCHS,
            "batch_size": BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "train_samples": len(train_dataset),
            "val_samples": len(val_dataset),
            "final_train_loss": metrics.get('train_loss'),
            "final_eval_loss": metrics.get('eval_loss')
        }
        with open(f"{OUTPUT_DIR}/training_info.json", "w") as f:
            json.dump(training_info, f, indent=2)
        print(f"Training metadata saved to: {OUTPUT_DIR}/training_info.json")

    except Exception as e:
        import traceback
        print(f"An error occurred during model training: {e}")
        traceback.print_exc()

print("Model training phase complete.")

PHASE 7: Model Training
Initializing Hugging Face Trainer...
Hugging Face Trainer initialized. Starting training process...


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,
4,0.0,
5,0.0,


Saving final model to ./chatbot_model/prototype_model...
Training completed! Final model and tokenizer saved to: ./chatbot_model/prototype_model

Training Metrics:
  train_runtime: 4065.6954
  train_samples_per_second: 11.3090
  train_steps_per_second: 0.0890
  total_flos: 12014207631360000.0000
  train_loss: 80.7115
  epoch: 5.0000
Training metadata saved to: ./chatbot_model/training_info.json
Model training phase complete.


In [43]:
print("PHASE 8: Inference Model Loading")
print("=" * 40)

try:
    inference_model_path = os.path.join(OUTPUT_DIR, "prototype_model")
    print(f"Attempting to load fine-tuned model from: {inference_model_path}")
    
    # Load tokenizer used for the trained model
    inference_tokenizer = AutoTokenizer.from_pretrained(inference_model_path)
    
    # Load the fine-tuned causal language model
    inference_model = AutoModelForCausalLM.from_pretrained(
        inference_model_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
    
    # Create a Hugging Face pipeline for easy text generation.
    generator = pipeline(
        "text-generation",
        model=inference_model,
        tokenizer=inference_tokenizer
        # No need to specify device anymore!
    )
    
    print("Fine-tuned model and tokenizer loaded successfully for inference.")
    print(f"Inference pipeline initialized.")

except Exception as e:
    print(f"An error occurred during inference model loading: {e}")
    print("Please ensure the training phase completed successfully and model files exist.")

print("Inference model loading complete.")


PHASE 8: Inference Model Loading
Attempting to load fine-tuned model from: ./chatbot_model/prototype_model
Fine-tuned model and tokenizer loaded successfully for inference.
Inference pipeline initialized.
Inference model loading complete.


In [44]:
# Cell 10: KeywordMatcher Setup
# This cell sets up the KeywordMatcher, which is used as a fallback or complementary mechanism to the LLM.

print("PHASE 9: Keyword Matcher Setup")
print("=" * 40)

# Helper functions for KeywordMatcher
def preprocess_text(text):
    """Cleans and preprocesses a given text string for better matching."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join(text.split())
    return text

def extract_keywords(text, top_n=10):
    """Extracts the most important keywords from a text based on frequency, after preprocessing."""
    processed_text = preprocess_text(text)
    words = processed_text.split()
    stop_words = {'what', 'how', 'when', 'where', 'why', 'who', 'which', 'is', 'are', 'was', 'were', 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'do', 'does', 'did', 'can', 'could', 'should', 'would', 'will'}
    keywords = [word for word in words if word not in stop_words and len(word) > 2]
    keyword_counts = Counter(keywords)
    return [word for word, count in keyword_counts.most_common(top_n)]

class KeywordMatcher:
    """
    Class for keyword-based matching and similarity calculation using TF-IDF.
    """
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),  
            max_features=5000,
            lowercase=True
        )
        self.question_vectors = None
        self.original_questions = None
        self.original_answers = None
        
    def fit(self, questions, answers):
        """Fits the TF-IDF vectorizer to a list of questions and stores original questions and answers."""
        self.original_questions = questions
        self.original_answers = answers
        processed_questions = [preprocess_text(q) for q in questions]
        self.question_vectors = self.vectorizer.fit_transform(processed_questions)
        print(f"  Keyword matcher fitted on {len(questions)} questions.")
    
    def find_best_match(self, user_question, similarity_threshold=0.3, keyword_threshold=0.4):
        """Finds the best matching question from the dataset based on TF-IDF cosine similarity and keyword overlap."""
        processed_user_question = preprocess_text(user_question)
        user_vector = self.vectorizer.transform([processed_user_question])
        similarities = cosine_similarity(user_vector, self.question_vectors).flatten()
        top_indices = np.argsort(similarities)[::-1][:5] 
        user_keywords = set(extract_keywords(user_question))
        
        best_match_idx = None
        best_score = 0
        match_details = []
        
        for idx in top_indices:
            similarity_score = similarities[idx]
            dataset_keywords = set(extract_keywords(self.original_questions[idx]))
            common_keywords = user_keywords.intersection(dataset_keywords)
            keyword_score = len(common_keywords) / max(len(user_keywords), 1) if user_keywords else 0
            combined_score = 0.6 * similarity_score + 0.4 * keyword_score
            match_details.append({
                'index': idx,
                'question': self.original_questions[idx],
                'answer': self.original_answers[idx],
                'similarity_score': similarity_score,
                'keyword_score': keyword_score,
                'combined_score': combined_score,
                'common_keywords': list(common_keywords)
            })
            
            if combined_score > best_score and (similarity_score >= similarity_threshold or keyword_score >= keyword_threshold):
                best_score = combined_score
                best_match_idx = idx
        match_details.sort(key=lambda x: x['combined_score'], reverse=True)
        return best_match_idx, best_score, match_details

# Initialize and fit the KeywordMatcher
keyword_matcher = KeywordMatcher()
use_fallback = True # Flag to enable/disable keyword-based fallback

try:
    df_full = pd.read_csv(CSV_FILE) # Load full dataset for keyword matcher
    questions_for_matcher = df_full['question'].fillna('').tolist()
    answers_for_matcher = df_full['support'].fillna('').tolist()
    keyword_matcher.fit(questions_for_matcher, answers_for_matcher)
    print("Keyword matcher successfully set up and fitted with data.")

except Exception as e:
    print(f"WARNING: Could not set up keyword matcher: {e}. Keyword fallback will be disabled.")
    use_fallback = False

print("Keyword matcher setup complete.")

PHASE 9: Keyword Matcher Setup
  Keyword matcher fitted on 13679 questions.
Keyword matcher successfully set up and fitted with data.
Keyword matcher setup complete.


In [45]:
# Cell 11: Inference Helper Functions
# These helper functions assist the main `generate_response` logic by filtering out low-quality outputs.

print("PHASE 10: Defining Inference Helper Functions")
print("=" * 40)

def _is_generic_response(response: str) -> bool:
    """
    Check if a generated response is too generic or indicates the model couldn't find a good answer.
    Useful to decide when to trigger a fallback (like keyword matching).

    Args:
        response (str): The generated text from the model.

    Returns:
        bool: True if the response is generic/insufficient, False otherwise.
    """
    generic_phrases = [
        "i'm sorry", "i don't know", "i can't help", "i'm not sure",
        "i apologize", "sorry, i couldn't", "coherent response", "generate a proper response"
    ]
    response_lower = response.lower()
    
    # Short responses are likely generic
    if len(response.split()) < 5:
        return True
    
    # Check if any generic phrase is present
    return any(phrase in response_lower for phrase in generic_phrases)

def _is_all_padding(response: str) -> bool:
    """
    Check if the generated response is mostly padding or empty after decoding.

    Args:
        response (str): The generated text from the model.

    Returns:
        bool: True if response is empty or too short (likely padding), False otherwise.
    """
    return len(response.strip()) < 2

print("Inference helper functions `_is_generic_response` and `_is_all_padding` defined.")
print("Inference helper functions setup complete.")


PHASE 10: Defining Inference Helper Functions
Inference helper functions `_is_generic_response` and `_is_all_padding` defined.
Inference helper functions setup complete.


In [46]:
# Cell 12: Main Inference Function (`generate_response`)
# This function combines LLM generation and keyword matching for a hybrid response strategy.

print("PHASE 11: Defining Main Inference Logic")
print("=" * 40)

def generate_response(question, max_length=150, temperature=0.2, top_p=0.9, use_keyword_fallback=True):
    """
    Generates a response using a hybrid approach:
    1. Tries keyword matching first for strong, direct answers.
    2. Falls back to LLM generation if keyword match is moderate or weak,
       and can use keyword match if LLM response is generic.
    """
    global generator, keyword_matcher, use_fallback # Access global variables

    # Step 1: Attempt keyword matching if enabled
    if use_fallback and use_keyword_fallback:
        match_idx, match_score, match_details = keyword_matcher.find_best_match(question)
        
        # If a strong keyword match is found, return its answer directly.
        if match_idx is not None and match_score > 0.7:
            print(f"******************[Strategy: KEYWORD MATCH] Direct hit! Score: {match_score:.3f}*********************")
            return keyword_matcher.original_answers[match_idx]
        
        # If a moderate keyword match is found, try LLM first, but keep fallback as an option.
        elif match_idx is not None and match_score > 0.4:
            print(f"******************[Strategy: HYBRID MODE] Moderate keyword match (score: {match_score:.3f}). Trying LLM first.*******************")
            
            # Try to generate a response using the LLM
            llm_response = _generate_llm_response_llm(question, max_length, temperature, top_p)
            
            # If the LLM's response is generic or unhelpful, use the keyword match as a fallback.
            if _is_generic_response(llm_response) and not _is_all_padding(llm_response):
                print("*****************[Fallback Action] LLM response was generic. Using keyword match.****************")
                return keyword_matcher.original_answers[match_idx]
            
            # Otherwise, if LLM generated a good response, use it.
            return llm_response
    
    # Step 2: If no strong keyword match or fallback is disabled, default to LLM generation.
    print("########################[Strategy: LLM MODE] No strong keyword match or fallback disabled. Generating with LLM.#################")
    return _generate_llm_response_llm(question, max_length, temperature, top_p)

def _generate_llm_response_llm(question, max_length, temperature, top_p):
    """
    Generates a response using the fine-tuned Large Language Model (LLM) via the Hugging Face pipeline.
    """
    global generator, inference_tokenizer # Access global pipeline and tokenizer
    input_text = f"Question: {question} Answer:"
    
    response = generator(
        input_text,
        max_length=max_length,
        num_beams=5,
        do_sample=False,
        no_repeat_ngram_size=2,
        pad_token_id=inference_tokenizer.eos_token_id,
        num_return_sequences=1
    )
    
    generated_text = response[0]['generated_text']
    
    if "Answer:" in generated_text:
        answer = generated_text.split("Answer:")[-1].strip()
        answer = answer.replace("<|endoftext|>", "").strip()
        return answer
    
    return "I'm sorry, I couldn't generate a proper response from the LLM."

print("Main inference function `generate_response` and LLM specific generation `_generate_llm_response_llm` defined.")

print("Inference logic setup complete.")

PHASE 11: Defining Main Inference Logic
Main inference function `generate_response` and LLM specific generation `_generate_llm_response_llm` defined.
Inference logic setup complete.


In [47]:
# Cell 13: Debugging Function (`get_match_details`)
# This function provides detailed insights into how the keyword matcher found its matches.

print("PHASE 12: Defining Debugging Function")
print("=" * 40)

def get_match_details(question):
    """
    Provides detailed information about the keyword matching process for a given question.
    Useful for debugging and understanding why a certain keyword match was or wasn't chosen.
    """
    global keyword_matcher, use_fallback # Access global variables
    if not use_fallback:
        return "Keyword matching not available or disabled."
    
    match_idx, match_score, match_details = keyword_matcher.find_best_match(question)
    
    result = f"User Question: {question}\n"
    result += f"Best Match Score: {match_score:.3f}\n\n"
    result += "Top 3 Matches (ordered by combined score):\n"
    result += "-" * 50 + "\n"
    
    for i, detail in enumerate(match_details[:3]):
        result += f"{i+1}. Question (from dataset): {detail['question']}\n"
        result += f"   Cosine Similarity Score: {detail['similarity_score']:.3f}\n"
        result += f"   Keyword Overlap Score: {detail['keyword_score']:.3f}\n"
        result += f"   Combined Score: {detail['combined_score']:.3f}\n"
        result += f"   Common Keywords with User Query: {', '.join(detail['common_keywords'])}\n"
        result += f"   Corresponding Answer: {detail['answer'][:100]}...\n\n" # Truncate answer for brevity
    
    return result

print("Debugging function `get_match_details` defined.")

print("Debugging function setup complete.")

PHASE 12: Defining Debugging Function
Debugging function `get_match_details` defined.
Debugging function setup complete.


In [48]:
# Cell 14: Interactive Chat Function (`chat_interactive`)
# This cell defines an interactive command-line interface for testing the chatbot.

print("PHASE 13: Defining Interactive Chat Function")
print("=" * 40)

def chat_interactive():
    """
    Provides an enhanced interactive command-line chat interface for testing the chatbot.
    Supports 'quit', 'exit', 'help', and 'debug' commands.
    """
    print("\n=== Chatbot Interactive Mode ===")
    print("Type 'quit' or 'exit' to end the conversation.")
    print("Type 'debug' to see keyword matching details for the last question asked.")
    print("Type 'help' for available commands.")
    print("=" * 45 + "\n")
    
    last_question = "" # Stores the last question asked for 'debug' command
    
    while True:
        user_input = input("You: ").strip()
        
        if user_input.lower() in ['quit', 'exit']:
            print("Goodbye! Exiting chat mode.")
            break
        
        if user_input.lower() == 'help':
            print("\nAvailable commands:")
            print("- 'debug': Show detailed keyword matching information for your last question.")
            print("- 'quit' or 'exit': Terminate the interactive chat session.")
            print("- Any other text: Ask a question to the chatbot.")
            print("\n")
            continue
        
        if user_input.lower() == 'debug':
            if last_question:
                print("\n" + "="*50)
                print("KEYWORD MATCHING DEBUG INFO")
                print("="*50)
                print(get_match_details(last_question)) # Call global function
                print("="*50 + "\n")
            else:
                print("No previous question to debug yet. Ask a question first.\n")
            continue
        
        if not user_input:
            print("Please type something to ask the chatbot.\n")
            continue
        
        last_question = user_input # Store the current question for potential debugging
        print("Bot: ", end="")
        response = generate_response(user_input) # Call global function
        print(response)
        print()

print("Interactive chat function `chat_interactive` defined.")

print("Interactive chat function setup complete.")

PHASE 13: Defining Interactive Chat Function
Interactive chat function `chat_interactive` defined.
Interactive chat function setup complete.


In [49]:
# Cell 15: Batch Testing Function (`batch_test`)
# This cell defines a function to test the chatbot with a predefined list of questions.

print("PHASE 14: Defining Batch Testing Function")
print("=" * 40)

def batch_test(test_questions):
    """
    Tests the chatbot with a list of predefined questions and prints results.
    """
    print("\n" + "="*60)
    print("BATCH TESTING RESULTS")
    print("="*60)
    
    for i, question in enumerate(test_questions):
        print(f"\nTest {i+1} - Q: {question}")
        response = generate_response(question) # Call global generate_response
        print(f"A: {response}")
        # Optionally, show keyword match details for each batch test question
        # Uncomment the line below if you want to see verbose details for each test
        # print(get_match_details(question))
        print("--- End of Test ---")
    print("All predefined test questions processed in batch.")

print("Batch testing function `batch_test` defined.")

print("Batch testing function setup complete.")

PHASE 14: Defining Batch Testing Function
Batch testing function `batch_test` defined.
Batch testing function setup complete.


In [50]:
# Cell 16: Main Execution - Testing and Interactive Chat
# This cell executes the testing and interactive chat functions to demonstrate the chatbot's capabilities.

print("PHASE 15: Executing Testing and Interactive Chat")
print("=" * 40)

if 'generator' not in locals() or generator is None:
    print("ERROR: Inference model (generator) not loaded. Please run previous cells.")
else:
    # Define test questions for batch testing
    batch_test_questions = [
    "What is carbon emission?",
    "How does AI work?",
    "Tell me about deep learning.",
    "What are neural networks?",
    "Who invented the light bulb?",  # Question potentially outside training data domain
    "Hello, how are you?",

    # Environment-related questions:
    "What is climate change?",
    "How does deforestation impact the environment?",
    "What are renewable energy sources?",
    "Explain the greenhouse effect.",
    "What is global warming?",
    "What are the effects of plastic pollution?",
    "How can individuals reduce their carbon footprint?",
    "What is biodiversity and why is it important?",
    "What are the main causes of air pollution?",
    "How do electric vehicles help the environment?"
]
    
    batch_test(batch_test_questions)
    
    print("\n" + "="*60)
    print("Starting interactive chat session...")
    print("="*60)
    chat_interactive()
    print("Interactive chat session ended.")

print("All phases of the chatbot workflow finished.")
print("\n--- Chatbot Pipeline Finished ---")

PHASE 15: Executing Testing and Interactive Chat

BATCH TESTING RESULTS

Test 1 - Q: What is carbon emission?
******************[Strategy: HYBRID MODE] Moderate keyword match (score: 0.532). Trying LLM first.*******************
A: '
--- End of Test ---

Test 2 - Q: How does AI work?
******************[Strategy: KEYWORD MATCH] Direct hit! Score: 0.896*********************
A: 
--- End of Test ---

Test 3 - Q: Tell me about deep learning.
########################[Strategy: LLM MODE] No strong keyword match or fallback disabled. Generating with LLM.#################
A: '
--- End of Test ---

Test 4 - Q: What are neural networks?
******************[Strategy: HYBRID MODE] Moderate keyword match (score: 0.459). Trying LLM first.*******************
A: '
--- End of Test ---

Test 5 - Q: Who invented the light bulb?
******************[Strategy: HYBRID MODE] Moderate keyword match (score: 0.610). Trying LLM first.*******************
A: '
--- End of Test ---

Test 6 - Q: Hello, how are you?
######

You:  exit


Goodbye! Exiting chat mode.
Interactive chat session ended.
All phases of the chatbot workflow finished.

--- Chatbot Pipeline Finished ---
