# Text Classification for Support Tickets

This notebook demonstrates text classification using traditional NLP techniques with scikit-learn. We'll implement both Naive Bayes and SVM classifiers to predict ticket types from text content.

## Imports

In [None]:
import sys
import os
import nltk
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

# Add parent directory to path to import local modules
sys.path.append("..")

# Import local modules
from src.models import (
    create_naive_bayes_pipeline, create_svm_pipeline, 
    get_grid_search_params, create_grid_search, train_ml_model, predict
)
from src.evaluate_model import (
    evaluate_model, print_confusion_matrix, print_classification_report,
    plot_confusion_matrix, plot_class_distribution, compare_models
)
from src.utils import save_model, load_model, save_config, load_config
from src.config import get_config, update_config

# For Jupyter Notebook
%matplotlib inline
sns.set()

## Set Parameters

In [None]:
from pprint import pprint

# Dataset file paths and column names used by this notebook
data_settings = {
    'train_path': '../data/ticket_train.csv',
    'valid_path': '../data/ticket_valid.csv',
    'test_path': '../data/ticket_test.csv',
    'text_column': 'body',  # Column containing the text
    'label_column': 'ticket_type',  # Column to predict
}

# Fetch the default configuration and hand it to update_config together
# with the notebook-specific `data` section defined above
config = update_config(get_config(), data=data_settings)

# Show the configuration that the remaining cells will read from
print("Configuration:")
pprint(config)

## Prepare Data and Labels

In [None]:
# Read the train and test splits from their separate files.
# NOTE(review): `load_data_ml` is not among the names imported in the
# Imports cell of this notebook -- confirm where it lives and import it.
data_cfg = config['data']
text_col, label_col = data_cfg['text_column'], data_cfg['label_column']

# Training split: the third return value is discarded
train_data, train_labels, _ = load_data_ml(data_cfg['train_path'], text_col, label_col)

# Test split: the third return value is kept as `unique_labels`.
# NOTE(review): the class list reported below therefore comes from the
# test-split call, not the training one -- presumably both files contain
# the same label set; confirm.
test_data, test_labels, unique_labels = load_data_ml(data_cfg['test_path'], text_col, label_col)

# Summary of what was loaded
print(f"Training examples: {len(train_data)}")
print(f"Testing examples: {len(test_data)}")
print(f"Number of classes: {len(unique_labels)}")
print(f"Classes: {unique_labels}")

## Feature Extraction

In [None]:
# Build the two feature-extraction stages and fit them on the training text
# to inspect the resulting matrix shapes. The same `count_vect` and
# `tfidf_transformer` objects are handed to the pipeline factories in the
# training cells further down.
# NOTE(review): `create_vectorizer` and `create_tfidf_transformer` are not
# among the names imported in the Imports cell -- confirm their module.
count_vect = create_vectorizer()

# Second stage: TF-IDF transformer applied on top of the vectorizer output
tfidf_transformer = create_tfidf_transformer()

# Fit the vectorizer on the training text and report the matrix shape
vectorized_data = count_vect.fit_transform(train_data)
print(f"Vectorized data shape: {vectorized_data.shape}")

# Fit the TF-IDF transformer on the vectorized matrix and report its shape
features = tfidf_transformer.fit_transform(vectorized_data)
print(f"TF-IDF features shape: {features.shape}")

## Train Naive Bayes Classifier

In [None]:
# Assemble the Naive Bayes pipeline from the vectorizer and TF-IDF
# transformer created in the Feature Extraction cell, plus the `fit_prior`
# value from the training section of the configuration
nb_pipeline = create_naive_bayes_pipeline(
    count_vect,
    tfidf_transformer,
    config['training']['fit_prior']
)

# Fit on the raw training text and labels; the name is rebound to whatever
# train_ml_model returns
print("Training Naive Bayes model...")
nb_pipeline = train_ml_model(nb_pipeline, train_data, train_labels)

In [None]:
# Score the fitted Naive Bayes pipeline on the held-out test split.
# Note the argument order: evaluate_model takes (predictions, labels) while
# the reporting helpers below take (labels, predictions).
nb_predictions = predict(nb_pipeline, test_data)
nb_accuracy = evaluate_model(nb_predictions, test_labels)

# Text reports: confusion matrix first, then the classification report
for show_report in (print_confusion_matrix, print_classification_report):
    show_report(test_labels, nb_predictions)

In [None]:
# Plot the confusion matrix for the Naive Bayes predictions on the test split
# (true labels first, predictions second)
plot_confusion_matrix(test_labels, nb_predictions)

## Grid Search for Naive Bayes

In [None]:
# Optional hyper-parameter search for Naive Bayes, enabled by the
# `use_grid_search` flag in the training configuration
if config['training']['use_grid_search']:
    print("Performing grid search for Naive Bayes...")

    training_cfg = config['training']

    # Parameter grid registered under the 'NB' key
    nb_params = get_grid_search_params('NB')

    # Wrap the Naive Bayes pipeline in a grid search configured with the
    # `grid_search_jobs` and `grid_search_cv` values from the config
    nb_gs = create_grid_search(
        nb_pipeline,
        nb_params,
        training_cfg['grid_search_jobs'],
        training_cfg['grid_search_cv'],
    )

    # Run the search on the training split and show the winning parameters
    nb_gs = train_ml_model(nb_gs, train_data, train_labels)
    print(f"Best parameters: {nb_gs.best_params_}")

    # Score the searched model on the test split
    nb_gs_predictions = predict(nb_gs, test_data)
    nb_gs_accuracy = evaluate_model(nb_gs_predictions, test_labels)

    # Confusion matrix (text), classification report, confusion matrix (plot)
    for show_report in (print_confusion_matrix, print_classification_report, plot_confusion_matrix):
        show_report(test_labels, nb_gs_predictions)

## Train SVM Classifier

In [None]:
# Assemble the SVM pipeline from the same vectorizer and TF-IDF transformer
# objects that were created in the Feature Extraction cell
svm_pipeline = create_svm_pipeline(
    count_vect,
    tfidf_transformer
)

# Fit on the raw training text and labels; the name is rebound to whatever
# train_ml_model returns
print("Training SVM model...")
svm_pipeline = train_ml_model(svm_pipeline, train_data, train_labels)

In [None]:
# Score the fitted SVM pipeline on the held-out test split.
# Note the argument order: evaluate_model takes (predictions, labels) while
# the reporting helpers below take (labels, predictions).
svm_predictions = predict(svm_pipeline, test_data)
svm_accuracy = evaluate_model(svm_predictions, test_labels)

# Text reports: confusion matrix first, then the classification report
for show_report in (print_confusion_matrix, print_classification_report):
    show_report(test_labels, svm_predictions)

In [None]:
# Plot the confusion matrix for the SVM predictions on the test split
# (true labels first, predictions second)
plot_confusion_matrix(test_labels, svm_predictions)

## Grid Search for SVM

In [None]:
# Optional hyper-parameter search for the SVM, enabled by the
# `use_grid_search` flag in the training configuration
if config['training']['use_grid_search']:
    print("Performing grid search for SVM...")

    training_cfg = config['training']

    # Parameter grid registered under the 'SVM' key
    svm_params = get_grid_search_params('SVM')

    # Wrap the SVM pipeline in a grid search configured with the
    # `grid_search_jobs` and `grid_search_cv` values from the config
    svm_gs = create_grid_search(
        svm_pipeline,
        svm_params,
        training_cfg['grid_search_jobs'],
        training_cfg['grid_search_cv'],
    )

    # Run the search on the training split and show the winning parameters
    svm_gs = train_ml_model(svm_gs, train_data, train_labels)
    print(f"Best parameters: {svm_gs.best_params_}")

    # Score the searched model on the test split
    svm_gs_predictions = predict(svm_gs, test_data)
    svm_gs_accuracy = evaluate_model(svm_gs_predictions, test_labels)

    # Confusion matrix (text), classification report, confusion matrix (plot)
    for show_report in (print_confusion_matrix, print_classification_report, plot_confusion_matrix):
        show_report(test_labels, svm_gs_predictions)

## Compare Models

In [None]:
# Collect (display name, test accuracy) pairs in presentation order:
# each base model is followed by its grid-search variant when that ran
grid_search_ran = config['training']['use_grid_search']

results = [('Naive Bayes', nb_accuracy)]
if grid_search_ran:
    results.append(('Naive Bayes (Grid Search)', nb_gs_accuracy))
results.append(('SVM', svm_accuracy))
if grid_search_ran:
    results.append(('SVM (Grid Search)', svm_gs_accuracy))

# Split the pairs into the two parallel lists that later cells index into
model_names = [name for name, _acc in results]
accuracies = [acc for _name, acc in results]

# Compare models
compare_models(model_names, accuracies)

## Save the Best Model

In [None]:
# Pick the best model by test accuracy. This runs regardless of the
# `save_model` flag because the prediction cell at the end of the notebook
# uses `best_model`; selecting it only when saving was enabled left the name
# undefined otherwise.
best_model_idx = int(np.argmax(accuracies))
best_model_name = model_names[best_model_idx]
print(f"Best model: {best_model_name} with accuracy {accuracies[best_model_idx]:.4f}")

# Map every display name used in `model_names` to its fitted estimator.
# The grid-search estimators only exist when grid search was enabled, so
# they are referenced only in that case.
trained_models = {
    'Naive Bayes': nb_pipeline,
    'SVM': svm_pipeline,
}
if config['training']['use_grid_search']:
    trained_models['Naive Bayes (Grid Search)'] = nb_gs
    trained_models['SVM (Grid Search)'] = svm_gs

# A KeyError here means `model_names` contains a name this table does not know
best_model = trained_models[best_model_name]

if config['model']['save_model']:
    # Save the model under <model_dir>/<model_name>
    model_path = os.path.join(config['data']['model_dir'], config['model']['model_name'])
    save_model(best_model, model_path)

    # Optionally save the configuration next to the model
    if config['model']['save_config']:
        config_path = os.path.join(config['data']['model_dir'], config['model']['config_name'])
        save_config(config, config_path)

## Test Predictions on New Text

In [None]:
def predict_on_text(model, text):
    """Classify a single text with *model*, print the result, and return it.

    Args:
        model: Object exposing ``predict``; it is called with a one-element
            list containing ``text``.
        text: The text to classify.

    Returns:
        The first element of what ``model.predict`` returned.
    """
    # predict receives a batch, so wrap the single text and unwrap the result
    batch_result = model.predict([text])
    prediction = batch_result[0]

    print(f"Text: {text}")
    print(f"Predicted class: {prediction}")
    return prediction

In [None]:
# Hand-written sample ticket texts used to try the chosen model.
# `best_model` is assigned in the "Save the Best Model" cell, so that cell
# must have been run before this one.
test_texts = [
    "I am having issues with my email. It's not sending messages.",
    "The software crashes every time I try to save my work.",
    "Can you help me understand how to use the new feature?",
    "I need to reset my password for the system access."
]

# predict_on_text prints each text with its predicted class; the returned
# prediction is not kept
for text in test_texts:
    predict_on_text(best_model, text)