In [1]:
!pip install -q transformers datasets accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/

In [2]:
import pandas as pd
import numpy as np
import torch
import time
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    pipeline,
    DataCollatorWithPadding,
    EvalPrediction
)
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


2025-08-06 04:27:37.607773: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754454457.828468      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754454457.892600      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Model saving configuration
MODEL_SAVE_DIR = "./saved_ticket_classifier_downsampled"
TOKENIZER_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, "tokenizer")
MODEL_WEIGHTS_DIR = os.path.join(MODEL_SAVE_DIR, "model")
METADATA_FILE = os.path.join(MODEL_SAVE_DIR, "metadata.json")
LABEL_ENCODER_FILE = os.path.join(MODEL_SAVE_DIR, "label_encoder.pkl")

In [4]:
def create_save_directories():
    """Create directories for saving model components"""
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
    os.makedirs(TOKENIZER_SAVE_DIR, exist_ok=True)
    os.makedirs(MODEL_WEIGHTS_DIR, exist_ok=True)
    print(f"Created save directories: {MODEL_SAVE_DIR}")

def save_model_components(model, tokenizer, label_encoder, training_config, results):
    """Save all model components needed for inference"""
    create_save_directories()
    
    print("Saving model components...")
    
    # Save the trained model
    model.save_pretrained(MODEL_WEIGHTS_DIR)
    print(f"✓ Model saved to: {MODEL_WEIGHTS_DIR}")
    
    # Save the tokenizer
    tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)
    print(f"✓ Tokenizer saved to: {TOKENIZER_SAVE_DIR}")
    
    # Save label encoder
    with open(LABEL_ENCODER_FILE, 'wb') as f:
        pickle.dump(label_encoder, f)
    print(f"✓ Label encoder saved to: {LABEL_ENCODER_FILE}")
    
    # Save metadata and configuration
    metadata = {
        "model_name": "distilbert-base-uncased",
        "num_labels": len(label_encoder.classes_),
        "label_classes": label_encoder.classes_.tolist(),
        "max_length": 512,
        "training_config": training_config,
        "results": results,
        "save_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "preprocessing_params": {
            "method": "downsampling_for_balance",
            "tokenizer_model": "distilbert-base-uncased"
        }
    }
    
    with open(METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)
    print(f"✓ Metadata saved to: {METADATA_FILE}")
    
    print(f"\n✅ All model components saved successfully to: {MODEL_SAVE_DIR}")
    return MODEL_SAVE_DIR

In [5]:
def load_model_for_inference(model_dir=MODEL_SAVE_DIR):
    """
    Load all model components for inference
    """
    print(f"Loading model components from: {model_dir}")
    
    # Load metadata
    with open(os.path.join(model_dir, "metadata.json"), 'r') as f:
        metadata = json.load(f)
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir, "tokenizer"))
    
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(os.path.join(model_dir, "model"))
    
    # Load label encoder
    with open(os.path.join(model_dir, "label_encoder.pkl"), 'rb') as f:
        label_encoder = pickle.load(f)
    
    print("✅ Model components loaded successfully!")
    print(f"Model: {metadata['model_name']}")
    print(f"Number of classes: {metadata['num_labels']}")
    print(f"Classes: {metadata['label_classes']}")
    
    return model, tokenizer, label_encoder, metadata

In [6]:
def predict_ticket_category(text, model, tokenizer, label_encoder, device='cpu', max_length=512):
    """
    Predict the category of a single ticket text
    """
    model.eval()
    model.to(device)
    
    # Tokenize input
    encoding = tokenizer(
        str(text),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    
    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).cpu().numpy()[0]
        confidence = torch.max(predictions).cpu().numpy()
    
    # Convert back to original label
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    
    return {
        'predicted_class': predicted_label,
        'confidence': float(confidence),
        'class_probabilities': {
            label_encoder.classes_[i]: float(predictions[0][i].cpu().numpy()) 
            for i in range(len(label_encoder.classes_))
        }
    }

In [7]:
def downsample_dataframe(df, target_column='queue'):
    """
    Downsample the majority classes to match the size of the smallest class.
    """
    class_counts = df[target_column].value_counts()
    min_class_size = class_counts.min()
    
    print(f"Original class distribution:\n{class_counts}")
    print(f"\nSmallest class size: {min_class_size}. Downsampling all classes to this size.")
    
    downsampled_df = pd.DataFrame()
    for class_name in class_counts.index:
        class_subset = df[df[target_column] == class_name]
        downsampled_subset = class_subset.sample(min_class_size, random_state=42)
        downsampled_df = pd.concat([downsampled_df, downsampled_subset], ignore_index=True)
    
    # Shuffle the final dataset to mix the classes
    downsampled_df = downsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"\nNew (downsampled) class distribution:\n{downsampled_df[target_column].value_counts()}")
    return downsampled_df

In [9]:
# Load and Prepare Dataset
print("Loading dataset...")
df = pd.read_csv("/kaggle/input/processedtickets2/processed_tickets.csv")

# Basic data cleaning
df = df.dropna(subset=['full_text', 'queue'])
df['full_text'] = df['full_text'].astype(str)
print(f"Original dataset shape: {df.shape}")
print(f"Number of unique queues: {df['queue'].nunique()}")


Loading dataset...
Original dataset shape: (11923, 7)
Number of unique queues: 5


In [10]:
# --- Downsample the data to create a balanced dataset ---
downsampled_df = downsample_dataframe(df)

Original class distribution:
queue
Technical Support    5245
Product Support      2814
Customer Service     2027
Billing Support      1302
Sales & HR            535
Name: count, dtype: int64

Smallest class size: 535. Downsampling all classes to this size.

New (downsampled) class distribution:
queue
Product Support      535
Sales & HR           535
Technical Support    535
Billing Support      535
Customer Service     535
Name: count, dtype: int64


In [11]:
# Encode target labels
label_encoder = LabelEncoder()
downsampled_df['label'] = label_encoder.fit_transform(downsampled_df['queue'])
num_labels = len(label_encoder.classes_)
print(f"\nNumber of classes: {num_labels}")
print(f"Classes: {list(label_encoder.classes_)}")

# Select required columns
df_final = downsampled_df[['full_text', 'label']].copy()


Number of classes: 5
Classes: ['Billing Support', 'Customer Service', 'Product Support', 'Sales & HR', 'Technical Support']


In [12]:
# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_final['full_text'], 
    df_final['label'], 
    test_size=0.2, 
    stratify=df_final['label'], 
    random_state=42
)
print(f"\nDataset split:")
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")



Dataset split:
Training samples: 2140
Validation samples: 535


In [13]:
# Custom Dataset Class
class TicketDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts.reset_index(drop=True) if hasattr(texts, 'reset_index') else texts
        self.labels = labels.reset_index(drop=True) if hasattr(labels, 'reset_index') else labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx] if hasattr(self.texts, 'iloc') else self.texts[idx])
        label = int(self.labels.iloc[idx] if hasattr(self.labels, 'iloc') else self.labels[idx])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [14]:
def compute_metrics(eval_pred: EvalPrediction):
    """Compute accuracy metrics for evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'eval_samples': len(labels)
    }

In [15]:
# Training and Evaluation Function
def train_distilbert_with_downsampling(train_texts, val_texts, train_labels, val_labels, num_labels):
    print(f"\n--- Training DistilBERT with Downsampled Text ---")
    
    model_name = "distilbert-base-uncased"
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=num_labels
    )
    
    # Create datasets
    train_dataset = TicketDataset(train_texts, train_labels, tokenizer)
    val_dataset = TicketDataset(val_texts, val_labels, tokenizer)
    
    print(f"Created datasets - Train: {len(train_dataset)}, Val: {len(val_dataset)}")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    print(f"Using device: {device}")
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results/distilbert_downsampled",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=30,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        greater_is_better=True,
        report_to="none",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        eval_steps=None,
    )
    
    # Initialize trainer
    # NOTE: Using standard Trainer as dataset is now balanced via downsampling.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )
    
    # Train model
    print("Starting training...")
    start_train = time.time()
    trainer.train()
    end_train = time.time()
    training_time = end_train - start_train
    
    print(f"Training completed in {training_time:.2f} seconds")
    
    # Evaluate on validation set
    print("Evaluating on validation set...")
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE_TRAINING)
    
    model.eval()
    all_preds, all_labels = [], []
    inference_times = []
    
    with torch.no_grad():
        for batch in val_loader:
            start_inf = time.time()
            
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels']
            outputs = model(**inputs)
            
            end_inf = time.time()
            inference_times.append(end_inf - start_inf)
            
            all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            all_labels.extend(labels.numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    avg_inference_time = np.mean(inference_times)
    
    print(f"\nResults:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Average inference time per batch: {avg_inference_time:.4f} seconds")
    
    # Detailed classification report
    print(f"\nDetailed Classification Report:")
    target_names = [f"Class_{i}" for i in range(num_labels)]
    print(classification_report(all_labels, all_preds, target_names=target_names))
    
    # Prepare training configuration for saving
    training_config = {
        "model_name": model_name,
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
        "learning_rate": training_args.learning_rate,
        "fp16": training_args.fp16
    }
    
    results = {
        'model': 'DistilBERT + Downsampling',
        'training_time': training_time,
        'inference_time': avg_inference_time,
        'accuracy': accuracy,
    }
    
    # Save all model components
    save_path = save_model_components(model, tokenizer, label_encoder, training_config, results)
    
    # Add save path to results
    results['save_path'] = save_path
    
    return results, model, tokenizer

In [16]:
# Run the training
print("="*50)
print("STARTING TRAINING AND SAVING PROCESS")
print("="*50)

results, trained_model, trained_tokenizer = train_distilbert_with_downsampling(
    train_texts, val_texts, train_labels, val_labels, num_labels
)

STARTING TRAINING AND SAVING PROCESS

--- Training DistilBERT with Downsampled Text ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created datasets - Train: 2140, Val: 535
Using device: cuda
Starting training...


Epoch,Training Loss,Validation Loss,Samples,Accuracy
1,1.5261,1.385713,535,0.379439
2,1.3072,1.290287,535,0.452336
3,1.1898,1.269825,535,0.428037
4,1.0569,1.260724,535,0.46729
5,0.8879,1.313666,535,0.48972
6,0.7199,1.326769,535,0.48785
7,0.5516,1.376821,535,0.517757
8,0.3786,1.468429,535,0.493458
9,0.2654,1.632356,535,0.495327
10,0.1844,1.751629,535,0.51215


SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [1]:
# Display results
print(f"\n{'='*50}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*50}")
print(f"Model: {results['model']}")
print(f"Training Time: {results['training_time']:.2f} seconds")
print(f"Average Inference Time: {results['inference_time']:.4f} seconds per batch")
print(f"Final Accuracy: {results['accuracy']:.4f}")
print(f"Model saved to: {results['save_path']}")


FINAL RESULTS SUMMARY


NameError: name 'results' is not defined