In [1]:
!pip install transformers torch datasets scikit-learn pandas openpyxl imbalanced-learn --upgrade




In [3]:
import pandas as pd
import re
import torch
import random
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [5]:
# Load dataset
file_path = "chatbot_dataset.xlsx"
df = pd.read_excel(file_path)

# Keep only AI-generated responses
df_ai = df[df["response_source"] == "Generative AI"].copy()

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower().strip()
        text = re.sub(r"[^\w\s.,!?]", "", text)  # Retain important punctuation
    return text

# Apply text cleaning
df_ai["user_message"] = df_ai["user_message"].apply(clean_text)
df_ai["chatbot_response"] = df_ai["chatbot_response"].apply(clean_text)
df_ai["combined_text"] = df_ai["user_message"] + " " + df_ai["chatbot_response"]

# Encode categories
label_encoder = LabelEncoder()
df_ai["category_encoded"] = label_encoder.fit_transform(df_ai["categories"])

# Check class distribution before oversampling
print("Category distribution before oversampling:")
print(df_ai["categories"].value_counts())

# Apply RandomOverSampler **before** splitting
ros = RandomOverSampler(random_state=SEED)
X_resampled, y_resampled = ros.fit_resample(df_ai[["combined_text"]], df_ai["category_encoded"])

# Convert DataFrame back to list for tokenization
X_resampled = X_resampled["combined_text"].tolist()

# Split the resampled data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                    test_size=0.2, random_state=SEED, stratify=y_resampled)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize datasets efficiently
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128)

print("Data preprocessing complete!")


Category distribution before oversampling:
categories
General conversation    93
Other                   82
Japeto Chat             71
Services                68
Compliance              51
About Japeto            48
Project management      47
Paige                   36
Apps                    28
AI services             25
Billing                 23
Contact                 23
Technical stack         21
Support                 20
Partnerships            19
Feedback                19
Pat                     18
Recruitment             11
Managed hosting          3
Websites                 3
Name: count, dtype: int64
Data preprocessing complete!


In [6]:
class ChatbotDataset(Dataset):
    def __init__(self, encodings, labels, device="cpu"):
        self.encodings = encodings
        self.labels = labels
        self.device = device  # Store device

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(self.device) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).to(self.device)
        return item

# Convert to PyTorch Dataset (ensure labels are correct)
train_dataset = ChatbotDataset(train_encodings, y_train.tolist(), device)
test_dataset = ChatbotDataset(test_encodings, y_test.tolist(), device)

print("Dataset successfully converted to PyTorch format!")


Dataset successfully converted to PyTorch format!


In [9]:
# Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_)).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=4,  # More stable training
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
)

# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model (Only run ONCE)
trainer.train()

# Save trained model
trainer.save_model("./trained_model_v2")
print("Model training complete and saved successfully!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.662126
2,No log,0.811693
3,No log,0.489338
4,No log,0.411133


Model training complete and saved successfully!


In [41]:
import os
import numpy as np
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

# Set the correct model path
model_path = "./trained_model_v2"



# Ensure model exists
if not os.path.exists(model_path):
    raise FileNotFoundError("Trained model not found! Make sure you trained and saved it correctly.")

# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

# Recreate TrainingArguments for evaluation
training_args = TrainingArguments(
    output_dir="./results_eval",
    eval_strategy="epoch",  # future-safe version of evaluation_strategy
    per_device_eval_batch_size=16,
    report_to="none",
)

# Recreate Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,  # test_dataset must be defined earlier
)

# Run evaluation
print("Running evaluation...")
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0
)

# Print results
print(f"\nModel Accuracy (DistilBERT + Optimized Data): {accuracy * 100:.2f}%\n")
print("Classification Report:\n")
print(classification_rep)


Running evaluation...



Model Accuracy (DistilBERT + Optimized Data): 91.67%

Classification Report:

                      precision    recall  f1-score   support

         AI services       0.86      1.00      0.92        18
        About Japeto       0.94      0.94      0.94        18
                Apps       1.00      1.00      1.00        19
             Billing       1.00      1.00      1.00        18
          Compliance       0.86      0.95      0.90        19
             Contact       1.00      1.00      1.00        18
            Feedback       0.86      1.00      0.93        19
General conversation       0.94      0.79      0.86        19
         Japeto Chat       1.00      0.89      0.94        18
     Managed hosting       1.00      1.00      1.00        19
               Other       0.82      0.47      0.60        19
               Paige       0.70      0.78      0.74        18
        Partnerships       0.86      1.00      0.93        19
                 Pat       1.00      1.00      1.00 