In [19]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
aniketg11_supportticketsclassification_path = kagglehub.dataset_download('aniketg11/supportticketsclassification')

print('Data source import complete.')


Data source import complete.


In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/supportticketsclassification/all_tickets.csv


In [21]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
try:
    df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')

    # Prepare the data by combining title and body
    # Fill potential missing titles with an empty string
    df['text'] = df['title'].fillna('') + ' ' + df['body']

    # Define features (X) and target (y)
    X = df['text']
    y = df['category']

    # Perform the train-test split (80% train, 20% test)
    # Using stratify=y to ensure the same distribution of categories in train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create new DataFrames for the split data
    train_df = pd.DataFrame({'text': X_train, 'category': y_train})
    test_df = pd.DataFrame({'text': X_test, 'category': y_test})

    # Save the split data to new CSV files
    train_df.to_csv('train_tickets.csv', index=False)
    test_df.to_csv('test_tickets.csv', index=False)

    print("Data splitting is complete.")
    print(f"Training set shape: {train_df.shape}")
    print(f"Testing set shape: {test_df.shape}")
    print("\nCreated 'train_tickets.csv' and 'test_tickets.csv'.")

except FileNotFoundError:
    print("Error: 'all_tickets.csv' not found. Please make sure the file is uploaded.")
except Exception as e:
    print(f"An error occurred: {e}")

Data splitting is complete.
Training set shape: (38839, 2)
Testing set shape: (9710, 2)

Created 'train_tickets.csv' and 'test_tickets.csv'.


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')

# Combine title and body into a single 'text' feature
df['text'] = df['title'].fillna('') + ' ' + df['body']

# Select the features (X) and the target (y)
X = df[['text']]
y = df['category']

# Perform an 80/20 split, stratifying by category to maintain distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create new DataFrames for the training and testing sets
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Save the split data to new CSV files
train_df.to_csv('train_tickets.csv', index=False)
test_df.to_csv('test_tickets.csv', index=False)

print("The data has been split into training and testing sets.")
print("\nTraining set preview:")
print(train_df.head())
print(f"\nTraining set shape: {train_df.shape}")

print("\nTesting set preview:")
print(test_df.head())
print(f"\nTesting set shape: {test_df.shape}")

The data has been split into training and testing sets.

Training set preview:
                                                    text  category
2030                                            la la si         5
34036  laptop needed tuesday july pm re needed hi kin...         5
14815  laptop partitions tuesday pm partitions hi par...         5
33103  new starter access card enabling tower wednesd...         7
28341  monitor thursday october pm monitor switch rol...         5

Training set shape: (38839, 2)

Testing set preview:
                                                    text  category
27265  oracle error november error hello please find ...         4
9805   re incident can open shared document from has ...         4
16302    wednesday pm hi has under please log install...         5
22962  oracle project change wednesday pm change hi p...         4
13532    wednesday good morning has accessed purchase...         5

Testing set shape: (9710, 2)


In [24]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# Enable Mixed Precision Training
policy = Policy('mixed_float16')
set_global_policy(policy)
print(f'Compute dtype: {policy.compute_dtype}')
print(f'Variable dtype: {policy.variable_dtype}')


# --- 1. Load The Pre-split Data ---
print("Loading pre-split training and testing data...")
train_df = pd.read_csv('train_tickets.csv')
test_df = pd.read_csv('test_tickets.csv')

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']


# --- 2. Text Preprocessing ---
print("Preprocessing text data...")
vocab_size = 10000
max_length = 200
embedding_dim = 128
batch_size = 64

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')


# --- Create efficient tf.data pipelines ---
print("Creating efficient tf.data pipelines...")
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# --- 3. Build the Deep Learning Model ---
print("Building the model...")
num_classes = y_train.max() + 1

model = Sequential([
    # THE FIX: Explicitly set the Embedding layer to use float32 for stability
    Embedding(vocab_size, embedding_dim, input_length=max_length, dtype='float32'),

    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    # The final layer should also use float32 for numerical stability
    Dense(num_classes, activation='softmax', dtype='float32')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



# --- 4. Train the Model using the tf.data.Dataset ---
print("\nTraining the model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=test_dataset,
    callbacks=[early_stopping],
    verbose=1
)
model.summary()

# --- 5. Evaluate the Model ---
print("\nEvaluating the model on the test set...")
loss, accuracy = model.evaluate(test_dataset)
print("========================================")
print(f"Final Test Loss: {loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")
print("========================================")

Compute dtype: float16
Variable dtype: float32
Loading pre-split training and testing data...
Preprocessing text data...
Creating efficient tf.data pipelines...
Building the model...

Training the model...
Epoch 1/20




[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 41ms/step - accuracy: 0.7616 - loss: 0.8841 - val_accuracy: 0.8511 - val_loss: 0.4728
Epoch 2/20
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 41ms/step - accuracy: 0.8596 - loss: 0.4410 - val_accuracy: 0.8660 - val_loss: 0.4257
Epoch 3/20
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.8837 - loss: 0.3654 - val_accuracy: 0.8682 - val_loss: 0.4186
Epoch 4/20
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 39ms/step - accuracy: 0.8954 - loss: 0.3209 - val_accuracy: 0.8639 - val_loss: 0.4264
Epoch 5/20
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.9070 - loss: 0.2836 - val_accuracy: 0.8560 - val_loss: 0.4515
Epoch 6/20
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 37ms/step - accuracy: 0.9160 - loss: 0.2526 - val_accuracy: 0.8667 - val_loss: 0.4516



Evaluating the model on the test set...
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8700 - loss: 0.4223
Final Test Loss: 0.4186
Final Test Accuracy: 0.8682


In [25]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# --- 1. Setup Environment for Acceleration ---
# This helps speed up training on compatible GPUs
try:
    policy = Policy('mixed_float16')
    set_global_policy(policy)
    print(f'Compute dtype: {policy.compute_dtype}')
    print(f'Variable dtype: {policy.variable_dtype}')
except Exception as e:
    print(f"Could not set mixed precision policy: {e}")

# --- 2. Define Hyperparameters ---
vocab_size = 10000
max_length = 200
embedding_dim = 128
batch_size = 64
learning_rate = 0.0005
dropout_rate = 0.4

# --- 3. Load and Preprocess Data ---
print("Loading and preprocessing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting all_tickets.csv.")
    all_tickets_df = pd.read_csv('all_tickets.csv')
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body']
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post', truncating='post')

# --- 4. Create tf.data pipelines for performance ---
print("Creating efficient tf.data pipelines...")
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# --- 5. Build the Improved Model ---
print("Building the improved GRU model...")
num_classes = y_train.max() + 1

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length, dtype='float32'),
    Bidirectional(GRU(80, return_sequences=True)),
    Dropout(dropout_rate),
    Bidirectional(GRU(40)),
    Dropout(dropout_rate),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax', dtype='float32')
])

optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


# --- 6. Train the Model ---
print("\nTraining the model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit(
    train_dataset,
    epochs=25,
    validation_data=test_dataset,
    callbacks=[early_stopping],
    verbose=1
)
model.summary()

# --- 7. Evaluate the Final Model ---
print("\nEvaluating the final model...")
final_loss, final_accuracy = model.evaluate(test_dataset)
print("========================================")
print(f"Final Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("========================================")

Compute dtype: float16
Variable dtype: float32
Loading and preprocessing data...
Creating efficient tf.data pipelines...
Building the improved GRU model...

Training the model...
Epoch 1/25




[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 46ms/step - accuracy: 0.7465 - loss: 0.9948 - val_accuracy: 0.8462 - val_loss: 0.4654
Epoch 2/25
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.8541 - loss: 0.4574 - val_accuracy: 0.8648 - val_loss: 0.4318
Epoch 3/25
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.8767 - loss: 0.3861 - val_accuracy: 0.8657 - val_loss: 0.4345
Epoch 4/25
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 38ms/step - accuracy: 0.8888 - loss: 0.3453 - val_accuracy: 0.8635 - val_loss: 0.4336
Epoch 5/25
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 37ms/step - accuracy: 0.9004 - loss: 0.3126 - val_accuracy: 0.8629 - val_loss: 0.4461
Epoch 6/25
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 37ms/step - accuracy: 0.9089 - loss: 0.2863 - val_accuracy: 0.8623 - val_loss: 0.4667



Evaluating the final model...
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8656 - loss: 0.4348
Final Test Loss: 0.4318
Final Test Accuracy: 0.8648


In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# --- 1. Load Data ---
print("Loading pre-split training and testing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting from all_tickets.csv")
    # Fallback to splitting the main file if pre-split files aren't available
    all_tickets_df = pd.read_csv('all_tickets.csv')
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body']
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# --- 2. Feature Extraction with TF-IDF ---
print("\nPerforming TF-IDF Vectorization...")
# Using n-grams (1,2) includes word pairs (bigrams), which can capture more context.
# Limiting features to the top 20,000 is a common practice.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))

# Fit the vectorizer on the training data and transform it
print("Fitting TF-IDF on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Only transform the test data with the already-fitted vectorizer
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF matrix shape (Train): {X_train_tfidf.shape}")
print(f"TF-IDF matrix shape (Test): {X_test_tfidf.shape}")

# --- 3. Define, Train, and Evaluate Models ---
# A dictionary of the models we want to test
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Linear SVC": LinearSVC(random_state=42)
}

# Loop through each model
for name, model in models.items():
    print("\n" + "="*50)
    print(f"Training {name}...")

    # Train the model
    model.fit(X_train_tfidf, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_tfidf)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)

    # Generate a detailed classification report
    # set zero_division=0 to handle cases where a class has no predictions
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("="*50)

Loading pre-split training and testing data...

Performing TF-IDF Vectorization...
Fitting TF-IDF on training data...
Transforming test data...
TF-IDF matrix shape (Train): (38839, 20000)
TF-IDF matrix shape (Test): (9710, 20000)

Training Multinomial Naive Bayes...

Results for Multinomial Naive Bayes:
Accuracy: 0.8386

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00        27
           4       0.84      0.98      0.90      6812
           5       0.85      0.69      0.76      1927
           6       0.73      0.14      0.24       526
           7       0.75      0.21      0.32       184
           8       0.97      0.69      0.80        48
           9       0.00      0.00      0.00        38
          11       0.93      0.23      0.37       123
     

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

# --- 1. Load Data ---
print("Loading pre-split training and testing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting from all_tickets.csv")
    all_tickets_df = pd.read_csv('all_tickets.csv')
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body']
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# --- 2. Label Encoding ---
# Gradient boosting models require class labels to be zero-indexed (0, 1, 2,...)
print("\nPerforming Label Encoding on the target variable...")
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
print("Label encoding complete.")

# --- 3. Feature Extraction with TF-IDF ---
print("\nPerforming TF-IDF Vectorization...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"TF-IDF matrix shape (Train): {X_train_tfidf.shape}")

# --- 4. Define, Train, and Evaluate Advanced Models ---
models = {
    # UPDATED: Using the modern API for GPU training in XGBoost
    "XGBoost": xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        tree_method='hist',
        device='cuda',
        random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        boosting_type='goss',
        device='gpu',
        random_state=42
    )
}

for name, model in models.items():
    print("\n" + "="*50)
    print(f"Training {name}...")

    # Train the model on the encoded labels
    model.fit(X_train_tfidf, y_train_encoded)

    # Make predictions on the test set
    y_pred_encoded = model.predict(X_test_tfidf)

    # Evaluate the model
    accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

    # For the report, we can use the original class names for better readability
    y_pred_original = label_encoder.inverse_transform(y_pred_encoded)
    report = classification_report(y_test, y_pred_original, zero_division=0)

    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("="*50)

Loading pre-split training and testing data...

Performing Label Encoding on the target variable...
Label encoding complete.

Performing TF-IDF Vectorization...
TF-IDF matrix shape (Train): (38839, 20000)

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




Results for XGBoost:
Accuracy: 0.5747

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00        27
           4       0.71      0.76      0.74      6812
           5       0.25      0.18      0.21      1927
           6       0.06      0.08      0.07       526
           7       0.05      0.01      0.01       184
           8       0.00      0.00      0.00        48
           9       0.00      0.00      0.00        38
          11       0.01      0.02      0.01       123
          12       0.00      0.00      0.00         9

    accuracy                           0.57      9710
   macro avg       0.09      0.09      0.09      9710
weighted avg       0.55      0.57      0.56      9710


Training LightGBM...




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 336642
[LightGBM] [Info] Number of data points in the train set: 38839, number of used features: 9037
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 3 dense feature groups (0.15 MB) transferred to GPU in 0.000657 secs. 1 sparse feature groups
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score -9.468568
[LightGBM] [Info] Start training from score -6.506737
[LightGBM] [Info] Start training from score -9.874033
[LightGBM] [Info] Start training from score -5.866700
[LightGBM] [Info] Start training from score -0.354408
[LightGBM] [Info] Start training from score -1.617296
[LightGBM] [Info] Start training from score -2.916536
[LightGBM] [Info] Start training from score -3.964592
[LightGBM] [Info] Start 




Results for LightGBM:
Accuracy: 0.8152

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.03      0.07      0.04        14
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00        27
           4       0.88      0.91      0.89      6812
           5       0.81      0.69      0.74      1927
           6       0.53      0.37      0.43       526
           7       0.51      0.58      0.55       184
           8       0.27      0.56      0.36        48
           9       0.18      0.37      0.24        38
          11       0.65      0.46      0.54       123
          12       0.04      0.33      0.07         9

    accuracy                           0.82      9710
   macro avg       0.32      0.36      0.32      9710
weighted avg       0.82      0.82      0.82      9710



In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib # Used for saving and loading the model and tools

# --- 1. Load the FULL training dataset ---
# We use all available data to train the final model
print("Loading all_tickets.csv data...")
all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body']

X_train = all_tickets_df['text'].astype(str)
y_train = all_tickets_df['category']

# --- 2. Fit the necessary tools (Encoder and Vectorizer) ---
print("Fitting LabelEncoder and TfidfVectorizer...")

# Label Encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# --- 3. Train the Final Model ---
# We'll use good hyperparameters similar to what a search would find.
# For a real project, you would plug in the "best_params_" from the previous step.
print("Training the final LightGBM model...")
final_model = lgb.LGBMClassifier(
    objective='multiclass',
    metric='multi_logloss',
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    colsample_bytree=0.8,
    boosting_type='goss',
    device='gpu',
    random_state=42
)
final_model.fit(X_train_tfidf, y_train_encoded)
print("Model training complete.")

# --- 4. Save the Artifacts ---
# Save the three components to disk for our prediction app
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')
joblib.dump(final_model, 'lgbm_model.joblib')

print("\n" + "="*50)
print("✅ Final model and tools have been saved successfully!")
print("   - tfidf_vectorizer.joblib")
print("   - label_encoder.joblib")
print("   - lgbm_model.joblib")
print("="*50)

Loading all_tickets.csv data...
Fitting LabelEncoder and TfidfVectorizer...
Training the final LightGBM model...




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 411928
[LightGBM] [Info] Number of data points in the train set: 48549, number of used features: 10883
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 3 dense feature groups (0.19 MB) transferred to GPU in 0.000746 secs. 1 sparse feature groups
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score -9.404035
[LightGBM] [Info] Start training from score -6.513663
[LightGBM] [Info] Start training from score -9.691717
[LightGBM] [Info] Start training from score -5.870348
[LightGBM] [Info] Start training from score -0.354421
[LightGBM] [Info] Start training from score -1.617275
[LightGBM] [Info] Start training from score -2.916350
[LightGBM] [Info] Start training from score -3.964869
[LightGBM] [Info] Start

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
import nltk
from nltk.stem.porter import PorterStemmer
import re

# --- 1. Setup Stemming Function ---
# NLTK's word tokenizer is required for stemming
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    print("NLTK 'punkt' tokenizer not found. Downloading...")
    nltk.download('punkt')

stemmer = PorterStemmer()

def stem_text(text):
    """
    Takes a string, tokenizes it, stems each token, and returns the joined string.
    """
    # Remove non-alphabetic characters and split into words
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    # Stem each word
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the words back into a single string
    return " ".join(stemmed_words)

# --- 2. Load and Prepare Data ---
print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
df['text'] = (df['title'].fillna('') + ' ' + df['body']).astype(str)

print("\nApplying stemming to all ticket text...")
# Apply the stemming function to the 'text' column. This can take a moment.
df['stemmed_text'] = df['text'].apply(stem_text)
print("Stemming complete.")

# Define features (X) and target (y)
X = df['stemmed_text']
y = df['category']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing records.")

# --- 3. Two-Step TF-IDF Vectorization ---
print("\nPerforming two-step TF-IDF vectorization...")

# Step 3a: Use CountVectorizer to get word counts
count_vectorizer = CountVectorizer(stop_words='english', max_features=20000)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Step 3b: Use TfidfTransformer to get TF-IDF weights
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print("Vectorization complete.")
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# --- 4. Train and Evaluate Models ---
# As seen in the notebook, we'll test these classifiers
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}

for name, model in models.items():
    print("\n" + "="*50)
    print(f"Training {name}...")

    # Train the model
    model.fit(X_train_tfidf, y_train)

    # Make predictions
    y_pred = model.predict(X_test_tfidf)

    # Evaluate and print results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted') # Use 'weighted' for multi-class f1

    print(f"\nResults for {name}:")
    print(f"Accuracy Score: {accuracy:.4f}")
    print(f"Weighted F1 Score: {f1:.4f}")
    print("="*50)

Loading all_tickets.csv data...

Applying stemming to all ticket text...
Stemming complete.

Data split into 38839 training and 9710 testing records.

Performing two-step TF-IDF vectorization...
Vectorization complete.
TF-IDF matrix shape: (38839, 6631)

Training Multinomial Naive Bayes...

Results for Multinomial Naive Bayes:
Accuracy Score: 0.8271
Weighted F1 Score: 0.7890

Training Logistic Regression...

Results for Logistic Regression:
Accuracy Score: 0.8611
Weighted F1 Score: 0.8503


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
import nltk
from nltk.stem.porter import PorterStemmer
import re
import warnings

# Suppress warnings from the solver for cleaner output
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

# --- 1. Setup Stemming Function ---
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    print("NLTK 'punkt' tokenizer not found. Downloading...")
    nltk.download('punkt')

stemmer = PorterStemmer()
def stem_text(text):
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

# --- 2. Load and Prepare Data ---
print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
df['text'] = (df['title'].fillna('') + ' ' + df['body']).astype(str)

print("\nApplying stemming to all ticket text...")
df['stemmed_text'] = df['text'].apply(stem_text)
print("Stemming complete.")

X = df['stemmed_text']
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing records.")

# --- 3. Two-Step TF-IDF Vectorization ---
print("\nPerforming two-step TF-IDF vectorization...")
count_vectorizer = CountVectorizer(stop_words='english', max_features=20000)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print("Vectorization complete.")

# --- 4. Hyperparameter Tuning with GridSearchCV ---
print("\n" + "="*50)
print("Starting Hyperparameter Tuning for Logistic Regression...")

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],    # Regularization type
    'solver': ['saga']          # 'saga' solver supports both l1 and l2
}

# Instantiate the model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Set up the Grid Search with 5-fold cross-validation
# n_jobs=-1 uses all available CPU cores to speed up the search
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3
)

# Run the search
grid_search.fit(X_train_tfidf, y_train)

# --- 5. Evaluate the Best Found Model ---
print("\n" + "="*50)
print("Hyperparameter tuning complete.")
print(f"Best cross-validation score (Accuracy): {grid_search.best_score_:.4f}")
print("Best parameters found:")
print(grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the final optimized model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\n" + "="*50)
print("Results for the Best Tuned Logistic Regression Model on the Test Set:")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Weighted F1 Score: {f1:.4f}")
print("="*50)

Loading all_tickets.csv data...

Applying stemming to all ticket text...
Stemming complete.

Data split into 38839 training and 9710 testing records.

Performing two-step TF-IDF vectorization...
Vectorization complete.

Starting Hyperparameter Tuning for Logistic Regression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
