# Customer IT Support Text Classification with DistilBERT (3 Classes)

This notebook performs text classification using DistilBERT with 3 consolidated classes:
1. Technical/IT Support (Technical Support, IT Support)
2. Customer & Product Support (Customer Service, Product Support)
3. Financial/Other (Billing and Payments, Returns and Exchanges, Service Outages and Maintenance, Sales and Pre-Sales, Human Resources, General Inquiry)

In [None]:
import sys
import os

# Add the parent directory to the path to import local modules
sys.path.append(os.path.abspath('..'))

# Initialize TensorFlow correctly to avoid import conflicts
import tensorflow as tf
from tensorflow import keras

# Turn on memory growth for every visible GPU (intended to avoid OOM errors).
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Configuration failed; report it and continue with TensorFlow defaults.
        print(f"Error configuring GPUs: {e}")
    else:
        print(f"GPUs available: {len(gpus)}")

# Regular imports after TensorFlow is initialized
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.metrics import confusion_matrix, classification_report
from transformers import DistilBertTokenizer

# Import the DistilBERT implementation after TensorFlow is properly initialized
from src.model import DistilBertClassifier, save_model, load_model, model_predict
from src.utils import encode_texts, encode_labels, load_training_conf
from src.train import define_callbacks, train_model

## 1. Data Loading and Class Consolidation

In [None]:
# Locations of the train / validation / test splits (relative to this notebook)
train_path, valid_path, test_path = (
    "../data/ticket_train.csv",
    "../data/ticket_valid.csv",
    "../data/ticket_test.csv",
)

# Names of the CSV columns holding the ticket text and its queue label
text_column, label_column = "text_en", "queue"

# Project training defaults; `conf` is not referenced again in the visible cells
conf = load_training_conf()

# Maximum sequence length, raised to 192 to give the model more context
max_length = 192

In [None]:
# Names of the three consolidated classes listed in the notebook header.
# load_dataset passes this list to encode_labels as the label vocabulary, so a
# class index is expected to be a position in this list
# (assumes encode_labels indexes by position in unique_labels -- TODO confirm).
consolidated_categories = [
    "Technical/IT Support",
    "Customer & Product Support",
    "Financial/Other",
]

# Raw queue name -> consolidated class. Queues that are not listed here fall
# into the catch-all "Financial/Other" class.
_QUEUE_TO_CATEGORY = {
    "Technical Support": "Technical/IT Support",
    "IT Support": "Technical/IT Support",
    "Customer Service": "Customer & Product Support",
    "Product Support": "Customer & Product Support",
}


def map_to_consolidated_category(queue):
    """Return the consolidated class name for a raw queue value.

    Args:
        queue: Raw value of the queue column. It is converted to ``str`` and
            stripped of surrounding whitespace before the lookup.

    Returns:
        One of the names in ``consolidated_categories``. Any queue without an
        explicit mapping (including missing values) maps to "Financial/Other".
    """
    return _QUEUE_TO_CATEGORY.get(str(queue).strip(), "Financial/Other")


def read_ticket_csv(file_path, required_columns=()):
    """Read a CSV file, trying ';' and then ',' as the delimiter.

    Reading with the wrong delimiter usually does not raise; it produces a
    frame with a single merged column. A delimiter is therefore accepted only
    when every name in ``required_columns`` is present in the parsed frame.

    Args:
        file_path: Path of the CSV file.
        required_columns: Column names that must exist after parsing.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If neither delimiter yields a frame with the required
            columns, or the file cannot be read at all.
    """
    problems = []
    cause = None
    for delimiter in (";", ","):
        try:
            frame = pd.read_csv(file_path, delimiter=delimiter)
        except (OSError, ValueError) as err:
            # pandas parser/decoding errors are ValueError subclasses.
            problems.append(f"{delimiter!r}: {err}")
            cause = err
            continue
        missing = [col for col in required_columns if col not in frame.columns]
        if not missing:
            return frame
        problems.append(f"{delimiter!r}: missing columns {missing}")
    raise ValueError(
        f"Could not read file {file_path} with delimiter ',' or ';': "
        + "; ".join(problems)
    ) from cause


def load_dataset(file_path, text_column, label_column, tokenizer=None,
                 unique_labels=None, consolidate=True):
    """Load a ticket CSV and encode its texts and (consolidated) labels.

    Args:
        file_path: Path of the CSV file (';' or ',' delimited).
        text_column: Name of the column holding the ticket text.
        label_column: Name of the column holding the raw queue label.
        tokenizer: Tokenizer to reuse. When ``None`` a cased DistilBERT
            tokenizer is created.
        unique_labels: Label vocabulary passed to ``encode_labels``. When
            ``None`` it defaults to ``consolidated_categories`` (or, with
            ``consolidate=False``, to the sorted labels found in this file).
        consolidate: When true (default) raw queues are mapped to the three
            consolidated classes before encoding.

    Returns:
        Tuple ``(encoded_texts, encoded_labels, tokenizer, unique_labels)``
        where the first two items are whatever ``encode_texts`` and
        ``encode_labels`` return.

    Raises:
        ValueError: If the file cannot be parsed with either delimiter.
    """
    df = read_ticket_csv(file_path, (text_column, label_column))
    print(f"Loaded {file_path} with {len(df)} rows")

    texts = df[text_column].tolist()
    labels = df[label_column].tolist()
    if consolidate:
        labels = [map_to_consolidated_category(queue) for queue in labels]

    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
        # NOTE(review): whether encode_texts reads these attributes is not
        # visible from this notebook -- confirm in src.utils.
        tokenizer.max_length = max_length
        tokenizer.pad_to_max_length = True

    if unique_labels is None:
        if consolidate:
            # A fixed vocabulary keeps class indices identical across splits,
            # even when a split happens to miss one of the classes.
            unique_labels = list(consolidated_categories)
        else:
            unique_labels = sorted(set(labels))

    encoded_labels = encode_labels(labels, unique_labels)

    print(f"\nTokenizing texts from {file_path}...")
    encoded_texts = encode_texts(tokenizer, texts)

    return encoded_texts, encoded_labels, tokenizer, unique_labels

In [None]:
# Load the training split first; it supplies the tokenizer and the label
# vocabulary that the other two splits must share.
x_train, y_train, tokenizer, unique_labels = load_dataset(train_path, text_column, label_column)

# Reuse the training tokenizer AND label vocabulary so that a class index
# means the same class in every split, even if a split lacks some class.
x_valid, y_valid, _, _ = load_dataset(valid_path, text_column, label_column, tokenizer, unique_labels)
x_test, y_test, _, _ = load_dataset(test_path, text_column, label_column, tokenizer, unique_labels)

# Print dataset info
print(f"Number of classes: {len(unique_labels)}")
print(f"Unique labels: {unique_labels}")
print(f"\nTraining samples: {len(y_train)}")
print(f"Validation samples: {len(y_valid)}")
print(f"Test samples: {len(y_test)}")

## 2. Model Training with Optimized Hyperparameters

In [None]:
# Clear memory before model creation
gc.collect()
tf.keras.backend.clear_session()

# Create the DistilBERT classifier. The size of the output layer is taken from
# the label vocabulary used to encode the data (3 for the consolidated
# categories) so the model can never disagree with the encoded labels.
model = DistilBertClassifier(
    num_labels=len(unique_labels),
    learning_rate=3e-5,  # Slightly lower learning rate for better generalization
    dropout_rate=0.2,
    metrics=["accuracy"],
)
print("Model created successfully")

In [None]:
# Build the project's training callbacks with the chosen patience / min_delta
callback_settings = {"patience": 3, "min_delta": 0.003}
callbacks = define_callbacks(**callback_settings)

In [None]:
# Learning-rate schedule: hold, then decay
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The rate passed in is returned unchanged for epoch indices 0 and 1. From
    epoch index 2 onwards it is multiplied by 0.8, so when the result is fed
    back in on the next call the decay compounds.
    """
    hold_epochs = 2
    decay_factor = 0.8
    return lr if epoch < hold_epochs else lr * decay_factor

# Wrap the schedule in a Keras callback and register it with the others
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=lr_scheduler)
callbacks.append(lr_callback)

In [None]:
# Batch sizes used for the training steps and for evaluation
BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32

# Keyword settings forwarded to train_model
training_options = dict(
    epochs=8,
    batch_size=BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    callbacks=callbacks,
)

# NOTE(review): the two returned values are reported below as validation
# metrics although they are named train_* -- confirm against src.train.
train_loss, train_accuracy = train_model(
    model, x_train, x_valid, y_train, y_valid, **training_options
)

print(f"Validation Loss: {train_loss:.4f}, Validation Accuracy: {train_accuracy:.4f}")

## 3. Evaluate on Test Set

In [None]:
# Free memory before running inference on the test set
gc.collect()
tf.keras.backend.clear_session()

# Predict in batches, then reduce scores and encoded labels to class indices
y_pred_logits = model.predict_in_batches(x_test, batch_size=16)
y_pred = np.argmax(y_pred_logits, axis=1)
y_true = np.argmax(y_test, axis=1)  # assumes y_test holds one row of class scores per sample

# Fraction of test samples whose predicted class equals the true class
test_accuracy = (y_pred == y_true).mean()
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1
report = classification_report(y_true, y_pred, target_names=consolidated_categories)
print("\nClassification Report:")
print(report)

In [None]:
# Plot a row-normalized confusion matrix annotated with "<ratio>\n(<count>)".

# Fix the label set so the matrix always has one row/column per category,
# even when a category never occurs in y_true or y_pred.
class_indices = list(range(len(consolidated_categories)))
cm = confusion_matrix(y_true, y_pred, labels=class_indices)

# Normalize each row by its number of true samples; rows without any true
# sample stay at 0.0 instead of turning into NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Cell text: normalized value on the first line, raw count in parentheses below
cell_labels = np.array(
    [
        [f"{cm_norm[i, j]:.2f}\n({cm[i, j]})" for j in class_indices]
        for i in class_indices
    ]
)

plt.figure(figsize=(10, 8))
# fmt="" makes seaborn draw the prepared strings as they are
ax = sns.heatmap(cm_norm, annot=cell_labels, fmt="", cmap="Blues",
                 xticklabels=consolidated_categories, yticklabels=consolidated_categories,
                 annot_kws={"size": 14}, linewidths=0.5, cbar=False)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('True', fontsize=14)
plt.title('Normalized Confusion Matrix', fontsize=16)
plt.tight_layout()

plt.show()

## 4. Error Analysis to Improve Further

In [None]:
# Load the original test data to analyze misclassifications.
# Reading with the wrong delimiter usually does not raise (it yields a single
# merged column), so a delimiter is accepted only if both columns are present.
test_df = None
for delimiter in (";", ","):
    try:
        candidate_df = pd.read_csv(test_path, delimiter=delimiter)
    except pd.errors.ParserError:
        continue
    if text_column in candidate_df.columns and label_column in candidate_df.columns:
        test_df = candidate_df
        break
if test_df is None:
    raise ValueError(
        f"Could not read {test_path} with delimiter ';' or ',' "
        f"(columns {text_column!r} and {label_column!r} are required)"
    )

# Add consolidated categories and predictions
test_df['consolidated_queue'] = test_df[label_column].apply(map_to_consolidated_category)
test_df['predicted_queue'] = [consolidated_categories[i] for i in y_pred]
test_df['correctly_classified'] = test_df['consolidated_queue'] == test_df['predicted_queue']

# Find misclassified examples
misclassified = test_df[~test_df['correctly_classified']]

# Count every (true category, predicted category) pair among the errors
print("Most common misclassification patterns:")
error_patterns = misclassified.groupby(['consolidated_queue', 'predicted_queue']).size().reset_index()
error_patterns.columns = ['True Category', 'Predicted Category', 'Count']
error_patterns = error_patterns.sort_values('Count', ascending=False)

if error_patterns.empty:
    # Nothing to inspect: every test sample was classified correctly.
    print("No misclassified examples found.")
else:
    display(error_patterns.head(5))

    # Show some examples of the most common misclassification
    most_common_error = error_patterns.iloc[0]
    true_cat = most_common_error['True Category']
    pred_cat = most_common_error['Predicted Category']

    print(f"\nExamples of {true_cat} tickets misclassified as {pred_cat}:")
    examples = misclassified[
        (misclassified['consolidated_queue'] == true_cat) &
        (misclassified['predicted_queue'] == pred_cat)
    ].head(5)

    for i, (_, row) in enumerate(examples.iterrows()):
        print(f"\nExample {i+1}:")
        # str() keeps the slice safe when the text cell is missing (NaN)
        print(f"Text: {str(row[text_column])[:200]}...")
        print(f"Original Queue: {row[label_column]}")
        print(f"True Category: {row['consolidated_queue']}")
        print(f"Predicted: {row['predicted_queue']}")

## 5. Save the Model

In [None]:
# Persist the trained model together with its tokenizer
model_folder = "../models/distilbert_3class"
save_model(model, tokenizer, model_folder=model_folder)

# Store the category names, one per line, next to the model for later prediction
os.makedirs(model_folder, exist_ok=True)
categories_path = f"{model_folder}/categories.txt"
categories_text = "\n".join(consolidated_categories)
with open(categories_path, "w") as categories_file:
    categories_file.write(categories_text)

print(f"Model, tokenizer, and category labels saved to {model_folder}/ directory")

## 6. Make Predictions on New Examples

In [None]:
# Try the trained model on a handful of hand-written tickets.
# Free memory first.
gc.collect()
tf.keras.backend.clear_session()

test_examples = [
    "I need help with my email account, I can't login to the system",
    "My server is down and customers can't access our website. This is urgent!",
    "I would like to request a refund for the software purchase I made last week",
    "Can you help me understand how to use the new reporting feature?",
    "We need to update our subscription to include more licenses for the team",
    "The network connection in the east wing office is very slow",
    "I have a question about my recent invoice. There seems to be an extra charge.",
]

# Predicted class index per example (small batches), translated to category names
pred_indices = model_predict(model, tokenizer, test_examples, batch_size=2)
predictions = [consolidated_categories[class_index] for class_index in pred_indices]

# Second pass over the same examples to obtain per-class probabilities
encoded_examples = encode_texts(tokenizer, test_examples)
pred_logits = model.predict_in_batches(encoded_examples, batch_size=2)
pred_probs = tf.nn.softmax(pred_logits, axis=1).numpy()

separator = "\n" + "-" * 50 + "\n"

print("Prediction Results:\n")
results = zip(test_examples, predictions, pred_probs)
for number, (example, prediction, probs) in enumerate(results, start=1):
    # Confidence is the largest class probability, shown as a percentage
    confidence = probs.max() * 100
    print(f"Example {number}:\nText: {example}\n")
    print(f"Predicted category: {prediction} (Confidence: {confidence:.2f}%)\n")

    # Full probability breakdown for this example
    print("Probabilities for each category:")
    for position, category in enumerate(consolidated_categories):
        print(f"  {category}: {probs[position]*100:.2f}%")
    print(separator)