# Customer IT Support Text Classification with DistilBERT

This notebook demonstrates how to use DistilBERT for text classification of customer IT support tickets.

In [None]:
import sys
import os

# Add the parent directory to the path to import local modules.
# The membership check keeps this cell idempotent: re-running it does not
# append the same directory to sys.path again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Initialize TensorFlow correctly to avoid import conflicts
from src.tf_init import tf, keras

# Turn on memory growth for every detected GPU to avoid OOM errors.
# Nothing is printed when no GPU is found.
# Alternative that is intentionally not enabled here: cap the memory of gpus[0]
# via tf.config.experimental.set_virtual_device_configuration(...) with a
# VirtualDeviceConfiguration(memory_limit=4096).
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        # Configuration problems are reported but do not stop the notebook
        print(f"Error configuring GPUs: {e}")
    else:
        print(f"GPUs available: {len(gpus)}")

# Regular imports after TensorFlow is initialized
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Now import the DistilBERT implementation after TensorFlow is properly initialized
from src.model import DistilBertClassifier, save_model, load_model, model_predict
from src.utils import encode_texts, encode_labels, load_training_conf
from src.train import training_data, define_callbacks, train_model

## 1. Configuration and Data Loading

Let's start by loading our configuration and preprocessing our data for the model.

In [None]:
# Read the training configuration (per the project note, default values are
# used when no config file exists) and echo it so the run is self-documenting.
conf = load_training_conf()
print("Configuration loaded:", conf, sep="\n")

In [None]:
# Split the configuration into its "training" and "data" sections
conf_train, conf_data = (conf[section] for section in ("training", "data"))

# Point the data section at the ticket dataset - replace with your actual file path.
# The CSV may use either ';' or ',' as its separator.
# NOTE: this mutates `conf` in place, so the configuration printed earlier
# no longer reflects the values below.
dataset_path = conf_data["dataset_path"] = "../data/dataset_filtered_tickets_english.csv"

# Keep sequences short (128) to reduce memory usage
conf_data["max_words_per_message"] = 128

In [None]:
# Prepare data for training and testing.
# training_data() returns the train/test split, the tokenizer and the label list.
# Every later cell depends on these names, so a failure is reported and then
# re-raised: swallowing it would only surface later as unrelated NameErrors.
try:
    (x_train, x_test, y_train, y_test), tokenizer, unique_labels = training_data(
        conf_data["dataset_path"],
        conf_data["text_column"],
        conf_data["label_column"],
        test_size=conf_train.get("test_set_size", 0.2),
        subset_size=-1,  # Use all data
        max_length=conf_data["max_words_per_message"],
        pad_to_max_length=conf_data.get("pad_to_max_length", True),
    )
except Exception as e:
    print(f"Error loading data: {e}")
    raise
else:
    # Display information about our dataset
    print(f"Number of classes: {len(unique_labels)}")
    print(f"Unique labels: {unique_labels}")
    print(f"Training samples: {len(y_train)}")
    print(f"Test samples: {len(y_test)}")

## 2. Model Training

Now we'll initialize and train our DistilBERT model with memory-efficient settings.

In [None]:
# Create the DistilBERT model.
# num_labels is taken from the second dimension of the encoded training labels.
try:
    # Clear memory before model creation
    gc.collect()
    tf.keras.backend.clear_session()

    model = DistilBertClassifier(
        num_labels=y_train.shape[1],
        learning_rate=conf_train.get("learning_rate", 5e-5),
        dropout_rate=0.2,
        metrics=["accuracy"]
    )
    print("Model created successfully")
except Exception as e:
    print(f"Error creating model: {e}")
    # Without `model` no later cell can run, so stop here instead of
    # continuing and failing with a misleading NameError further down.
    raise

In [None]:
# Build the training callbacks from the early-stopping settings in the
# training configuration (falling back to patience=2, min_delta=0.01).
early_stopping_options = {
    "patience": conf_train.get("early_stopping_patience", 2),
    "min_delta": conf_train.get("early_stopping_min_delta_acc", 0.01),
}
callbacks = define_callbacks(**early_stopping_options)

In [None]:
# Use smaller batch sizes to avoid OOM errors
BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8

# Train the model with memory-efficient settings.
# train_model() returns the loss and accuracy measured on the test split.
try:
    test_loss, test_accuracy = train_model(
        model,
        x_train,
        x_test,
        y_train,
        y_test,
        epochs=conf_train.get("epochs", 3),
        batch_size=BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        callbacks=callbacks,
    )
except Exception as e:
    print(f"Error during training: {e}")
    # Re-raise so that a failed training run is not followed by evaluating
    # and saving an untrained (or partially trained) model.
    raise
else:
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

## 3. Model Evaluation

Let's evaluate our model's performance on the test set using batched prediction.

In [None]:
# Get predictions for the test set using batched prediction and report
# per-class metrics. Defines y_pred / y_true (class indices) for later cells.
try:
    # Clear memory before prediction
    # NOTE(review): clear_session() resets Keras global state while `model` is
    # still in use - confirm this is intended rather than gc.collect() alone.
    gc.collect()
    tf.keras.backend.clear_session()

    # Use batched prediction to avoid OOM errors; reuse the evaluation batch
    # size defined with the training settings instead of a second literal.
    y_pred_logits = model.predict_in_batches(x_test, batch_size=EVAL_BATCH_SIZE)
    y_pred = np.argmax(y_pred_logits, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Pass the full list of class indices explicitly: without `labels`,
    # classification_report infers the classes from y_true/y_pred and raises
    # when a class is absent from the test split, because the inferred count
    # no longer matches len(target_names).
    label_ids = list(range(len(unique_labels)))
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=unique_labels))
except Exception as e:
    print(f"Error during prediction: {e}")
    # Re-raise: the following cells need y_true / y_pred
    raise

In [None]:
# Plot confusion matrix (rows: true class, columns: predicted class).
# Plotting is a diagnostic only, so errors are reported without stopping the notebook.
try:
    fig, ax = plt.subplots(figsize=(10, 8))

    # Fix the matrix to the full set of class indices so that its rows/columns
    # always line up with the tick labels, even when a class never occurs in
    # y_true or y_pred (otherwise the matrix would be smaller than the label list).
    label_ids = list(range(len(unique_labels)))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=unique_labels,
        yticklabels=unique_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error plotting confusion matrix: {e}")

## 4. Save the Model

Now we'll save our trained model, the tokenizer, and the list of class labels for future use.

In [None]:
# Save the model, the tokenizer and the label list
try:
    model_folder = "../models"  # Save one level up from notebooks
    save_model(model, tokenizer, model_folder=model_folder)

    # Save unique labels for prediction, one label per line.
    # The encoding is explicit so the file does not depend on the platform
    # default, and labels are converted with str() so non-string labels
    # (e.g. integers) can be written as well.
    os.makedirs(model_folder, exist_ok=True)
    with open(f"{model_folder}/unique_labels.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(str(label) for label in unique_labels))

    print(f"Model, tokenizer, and labels saved to {model_folder}/ directory")
except Exception as e:
    print(f"Error saving model: {e}")
    # Re-raise so a failed save cannot be mistaken for a successful one
    raise

## 5. Make Predictions on New Examples

Let's use our model to make some predictions on new examples.

In [None]:
# Classify a few hand-written tickets with the trained model.
# Errors are reported but not raised; this is the closing demo cell.
try:
    # Free memory ahead of inference
    gc.collect()
    tf.keras.backend.clear_session()

    test_examples = [
        "I need help with my email account, I can't login",
        "My server is down and customers can't access our website",
        "I would like to request a refund for the software purchase",
    ]

    # One example per batch keeps memory usage low
    pred_indices = model_predict(model, tokenizer, test_examples, batch_size=1)

    # Translate each predicted class index into its label
    predictions = []
    for label_index in pred_indices:
        predictions.append(unique_labels[label_index])

    # Show every input text next to its predicted queue
    for example, prediction in zip(test_examples, predictions):
        print(f"Text: {example}\nPredicted queue: {prediction}\n")
except Exception as e:
    print(f"Error during prediction: {e}")