In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input mount.
input_file_paths = (
    os.path.join(folder, leaf)
    for folder, _subfolders, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
# Download the NLTK resources up front.
# 'punkt' is looked up again by the Porter-stemming cell further down; 'stopwords' is
# presumably what SnowballStemmer(ignore_stopwords=True) relies on — TODO confirm.
# NOTE(review): 'wordnet' is not referenced by any code visible in this notebook.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw tickets and write a stratified 80/20 train/test split to disk.
# A missing input file is reported and then re-raised, and any other error
# propagates with its traceback: swallowing failures here would let the following
# cells run against missing or stale train_tickets.csv / test_tickets.csv files.
try:
    df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
except FileNotFoundError:
    print("Error: 'all_tickets.csv' not found. Please make sure the file is uploaded.")
    raise

# Combine title and body into a single text feature. Both columns are filled with
# an empty string so a missing value on either side cannot turn the whole
# concatenation into NaN (which `.astype(str)` in later cells would read as "nan").
df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')

# Define features (X) and target (y)
X = df['text']
y = df['category']

# 80% train / 20% test; stratify=y keeps the category distribution the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Re-assemble each part as a two-column frame and persist it for the later cells.
train_df = pd.DataFrame({'text': X_train, 'category': y_train})
test_df = pd.DataFrame({'text': X_test, 'category': y_test})
train_df.to_csv('train_tickets.csv', index=False)
test_df.to_csv('test_tickets.csv', index=False)

print("Data splitting is complete.")
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")
print("\nCreated 'train_tickets.csv' and 'test_tickets.csv'.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')

# Combine title and body into a single 'text' feature. Missing values on either
# side are replaced by an empty string; otherwise a missing body would make the
# whole text NaN, which later cells turn into the literal string "nan".
df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')

# Select the features (X, kept as a one-column frame) and the target (y)
X = df[['text']]
y = df['category']

# Perform an 80/20 split, stratifying by category to maintain distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Glue features and labels back together for each part of the split
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
assert len(train_df) + len(test_df) == len(df), "split lost or duplicated rows"

# Save the split data to new CSV files (read back by the modelling cells)
train_df.to_csv('train_tickets.csv', index=False)
test_df.to_csv('test_tickets.csv', index=False)

print("The data has been split into training and testing sets.")
for split_name, split_df in (("Training", train_df), ("Testing", test_df)):
    print(f"\n{split_name} set preview:")
    print(split_df.head())
    print(f"\n{split_name} set shape: {split_df.shape}")

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# Switch Keras to the global 'mixed_float16' policy and report the resulting dtypes.
policy = Policy('mixed_float16')
set_global_policy(policy)
for dtype_role, dtype_name in (('Compute', policy.compute_dtype), ('Variable', policy.variable_dtype)):
    print(f'{dtype_role} dtype: {dtype_name}')


# --- 1. Load The Pre-split Data ---
# Both CSVs are expected in the working directory (written by the split cells above).
print("Loading pre-split training and testing data...")
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_tickets.csv', 'test_tickets.csv'))

X_train, y_train = train_df['text'].astype(str), train_df['category']
X_test, y_test = test_df['text'].astype(str), test_df['category']


# --- 2. Text Preprocessing ---
print("Preprocessing text data...")
vocab_size = 10_000    # tokenizer keeps only the most frequent words
max_length = 200       # every sequence is padded/truncated to this many tokens
embedding_dim = 128
batch_size = 64

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad and truncate at the end of each sequence.
padding_options = dict(maxlen=max_length, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_sequences, **padding_options)
X_test_padded = pad_sequences(X_test_sequences, **padding_options)


# --- Create efficient tf.data pipelines ---
print("Creating efficient tf.data pipelines...")

# Hold out 10% of the training rows for validation. EarlyStopping below picks the
# epoch (and restores its weights) by val_loss, so validating on the test set
# would tune the model on the very data used for the final score.
# The mask is positional, so it works for any index on y_train.
val_mask = y_train.index.isin(y_train.sample(frac=0.1, random_state=42).index)
y_train_array = y_train.to_numpy()

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[~val_mask], y_train_array[~val_mask]))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[val_mask], y_train_array[val_mask]))
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# --- 3. Build the Deep Learning Model ---
print("Building the model...")
# Assumes 'category' holds zero-based integer ids — TODO confirm against the CSV.
# Cast to a plain int so the layer size is not a numpy scalar.
num_classes = int(y_train.max()) + 1

model = Sequential([
    # Embedding is pinned to float32 even though the global policy is mixed_float16
    Embedding(vocab_size, embedding_dim, input_length=max_length, dtype='float32'),

    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    # The softmax output is also computed in float32 for numerical stability
    Dense(num_classes, activation='softmax', dtype='float32')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# --- 4. Train the Model using the tf.data.Dataset ---
print("\nTraining the model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=val_dataset,  # held-out training rows, never the test set
    callbacks=[early_stopping],
    verbose=1
)
model.summary()

# --- 5. Evaluate the Model ---
# The test set is touched only here, after model selection is finished.
print("\nEvaluating the model on the test set...")
loss, accuracy = model.evaluate(test_dataset)
print("========================================")
print(f"Final Test Loss: {loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")
print("========================================")

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# --- 1. Setup Environment for Acceleration ---
# Best effort: if the mixed-precision policy cannot be applied, report it and carry on.
try:
    policy = Policy('mixed_float16')
    set_global_policy(policy)
    for dtype_role, dtype_name in (('Compute', policy.compute_dtype), ('Variable', policy.variable_dtype)):
        print(f'{dtype_role} dtype: {dtype_name}')
except Exception as policy_error:
    print(f"Could not set mixed precision policy: {policy_error}")

# --- 2. Define Hyperparameters ---
vocab_size = 10_000      # tokenizer vocabulary cap
max_length = 200         # tokens per padded sequence
embedding_dim = 128
batch_size = 64
learning_rate = 5e-4
dropout_rate = 0.4

# --- 3. Load and Preprocess Data ---
print("Loading and preprocessing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting all_tickets.csv.")
    # Fall back to the raw file at the same Kaggle input path the other cells read;
    # a bare 'all_tickets.csv' would be resolved against the working directory,
    # where nothing in this notebook writes that file.
    all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
    # Fill both columns so a missing body cannot turn the combined text into NaN.
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body'].fillna('')
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# Vocabulary is learned from the training texts only; sequences are padded and
# truncated at the end to max_length tokens.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post', truncating='post')

# --- 4. Create tf.data pipelines for performance ---
print("Creating efficient tf.data pipelines...")

# Hold out 10% of the training rows for validation. EarlyStopping below selects and
# restores the best epoch by val_loss, so using the test set as validation data
# would bias the final test score. The mask is positional, so it is also correct
# when train_df comes from the fallback split and carries a non-contiguous index.
val_mask = y_train.index.isin(y_train.sample(frac=0.1, random_state=42).index)
y_train_array = y_train.to_numpy()

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[~val_mask], y_train_array[~val_mask]))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[val_mask], y_train_array[val_mask]))
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# --- 5. Build the Improved Model ---
print("Building the improved GRU model...")
# Assumes 'category' holds zero-based integer ids — TODO confirm against the CSV.
num_classes = int(y_train.max()) + 1

model = Sequential([
    # Embedding and the softmax output are pinned to float32 under the mixed-precision policy
    Embedding(vocab_size, embedding_dim, input_length=max_length, dtype='float32'),
    Bidirectional(GRU(80, return_sequences=True)),
    Dropout(dropout_rate),
    Bidirectional(GRU(40)),
    Dropout(dropout_rate),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax', dtype='float32')
])

optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


# --- 6. Train the Model ---
print("\nTraining the model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit(
    train_dataset,
    epochs=25,
    validation_data=val_dataset,  # held-out training rows, never the test set
    callbacks=[early_stopping],
    verbose=1
)
model.summary()

# --- 7. Evaluate the Final Model ---
# The test set is touched only here, after model selection is finished.
print("\nEvaluating the final model...")
final_loss, final_accuracy = model.evaluate(test_dataset)
print("========================================")
print(f"Final Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("========================================")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# --- 1. Load Data ---
print("Loading pre-split training and testing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting from all_tickets.csv")
    # Fall back to the raw file at the same Kaggle input path the other cells read;
    # a bare 'all_tickets.csv' would be resolved against the working directory,
    # where nothing in this notebook writes that file.
    all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
    # Fill both columns so a missing body cannot turn the combined text into NaN.
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body'].fillna('')
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# --- 2. Feature Extraction with TF-IDF ---
print("\nPerforming TF-IDF Vectorization...")
# Unigrams plus bigrams, English stop words removed, vocabulary capped at 20,000 terms.
tfidf_settings = {'stop_words': 'english', 'max_features': 20000, 'ngram_range': (1, 2)}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Vocabulary and IDF weights are learned from the training texts only ...
print("Fitting TF-IDF on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# ... and merely applied to the test texts.
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

for split_label, tfidf_matrix in (("Train", X_train_tfidf), ("Test", X_test_tfidf)):
    print(f"TF-IDF matrix shape ({split_label}): {tfidf_matrix.shape}")

# --- 3. Define, Train, and Evaluate Models ---
# Candidate classifiers, keyed by the display name used in the report.
models = dict([
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Linear SVC", LinearSVC(random_state=42)),
])

banner = "=" * 50
for name, model in models.items():
    print(f"\n{banner}")
    print(f"Training {name}...")

    # Fit on the training matrix, then score the untouched test matrix.
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: classes that never get predicted report 0 instead of warning
    report = classification_report(y_test, y_pred, zero_division=0)

    print(
        f"\nResults for {name}:",
        f"Accuracy: {accuracy:.4f}",
        "\nClassification Report:",
        report,
        banner,
        sep="\n",
    )

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

# --- 1. Load Data ---
print("Loading pre-split training and testing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting from all_tickets.csv")
    # Fall back to the raw file at the same Kaggle input path the other cells read;
    # a bare 'all_tickets.csv' would be resolved against the working directory,
    # where nothing in this notebook writes that file.
    all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
    # Fill both columns so a missing body cannot turn the combined text into NaN.
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body'].fillna('')
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# --- 2. Label Encoding ---
# Map the category values to consecutive integer codes; the mapping is learned
# from the training labels and reused unchanged for the test labels.
print("\nPerforming Label Encoding on the target variable...")
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)
print("Label encoding complete.")

# --- 3. Feature Extraction with TF-IDF ---
print("\nPerforming TF-IDF Vectorization...")
tfidf_settings = {'stop_words': 'english', 'max_features': 20000, 'ngram_range': (1, 2)}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)
# Fit on the training texts only; the test texts are transformed with that vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"TF-IDF matrix shape (Train): {X_train_tfidf.shape}")

# --- 4. Define, Train, and Evaluate Advanced Models ---
models = {
    # GPU training via tree_method='hist' + device='cuda' (the XGBoost 2.x API).
    # The obsolete `use_label_encoder` argument is dropped: it belongs to older
    # releases than the ones that accept `device`, and labels are already
    # integer-encoded by LabelEncoder before fitting.
    "XGBoost": xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        tree_method='hist',
        device='cuda',
        random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        boosting_type='goss',
        device='gpu',
        random_state=42
    )
}

for name, model in models.items():
    print("\n" + "="*50)
    print(f"Training {name}...")

    # Train the model on the encoded labels
    model.fit(X_train_tfidf, y_train_encoded)

    # Make predictions on the test set (still in encoded label space)
    y_pred_encoded = model.predict(X_test_tfidf)

    # Accuracy is computed directly on the encoded labels
    accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

    # For the report, map predictions back to the original category values
    y_pred_original = label_encoder.inverse_transform(y_pred_encoded)
    report = classification_report(y_test, y_pred_original, zero_division=0)

    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("="*50)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib # Used for saving and loading the model and tools

# --- 1. Load the FULL training dataset ---
# The final model is fitted on every available ticket (no train/test split here).
print("Loading all_tickets.csv data...")
all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# Fill both columns: a missing body would otherwise make the whole text NaN,
# which the astype(str) below would turn into the literal token "nan".
all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body'].fillna('')

X_train = all_tickets_df['text'].astype(str)
y_train = all_tickets_df['category']

# --- 2. Fit the necessary tools (Encoder and Vectorizer) ---
print("Fitting LabelEncoder and TfidfVectorizer...")

# Label Encoding: category values -> consecutive integer codes
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# TF-IDF Vectorization: unigrams + bigrams, English stop words removed, 20,000-term cap
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# --- 3. Train the Final Model ---
# Hand-picked hyperparameters; in a real project these would be replaced by the
# best parameters found by a hyperparameter search.
print("Training the final LightGBM model...")
final_model_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'n_estimators': 500,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'colsample_bytree': 0.8,
    'boosting_type': 'goss',
    'device': 'gpu',
    'random_state': 42,
}
final_model = lgb.LGBMClassifier(**final_model_params)
final_model.fit(X_train_tfidf, y_train_encoded)
print("Model training complete.")

# --- 4. Save the Artifacts ---
# The vectorizer, label encoder and model are written side by side so that a
# prediction app can load all three.
artifacts = {
    'tfidf_vectorizer.joblib': tfidf_vectorizer,
    'label_encoder.joblib': label_encoder,
    'lgbm_model.joblib': final_model,
}
for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, artifact_file)

banner = "=" * 50
print(f"\n{banner}")
print("✅ Final model and tools have been saved successfully!")
for artifact_file in artifacts:
    print(f"   - {artifact_file}")
print(banner)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
import nltk
from nltk.stem.porter import PorterStemmer
import re

# --- 1. Setup Stemming Function ---
# Make sure the NLTK 'punkt' tokenizer data is present, downloading it if needed.
# nltk.data.find() raises LookupError when a resource is missing, so that is the
# exception to catch here. The previous clause named nltk.downloader.DownloadError,
# which meant a missing resource was never handled by this block.
# NOTE(review): stem_text below splits words with a regex and never calls the
# punkt tokenizer, so this check may be unnecessary — confirm before removing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("NLTK 'punkt' tokenizer not found. Downloading...")
    nltk.download('punkt')

# Module-level Porter stemmer used by stem_text.
stemmer = PorterStemmer()

def stem_text(text):
    """
    Return `text` reduced to space-separated word stems.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and each word is passed
    through the module-level `stemmer`.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    return " ".join(map(stemmer.stem, letters_only.lower().split()))

# --- 2. Load and Prepare Data ---
print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# Fill both columns before concatenating: a missing body would otherwise make the
# whole text NaN, which astype(str) turns into the literal token "nan".
df['text'] = (df['title'].fillna('') + ' ' + df['body'].fillna('')).astype(str)

print("\nApplying stemming to all ticket text...")
# Apply the stemming function to the 'text' column. This can take a moment.
df['stemmed_text'] = df['text'].apply(stem_text)
print("Stemming complete.")

# Define features (X) and target (y)
X = df['stemmed_text']
y = df['category']

# Stratified 80/20 split so both parts keep the category distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing records.")

# --- 3. Two-Step TF-IDF Vectorization ---
print("\nPerforming two-step TF-IDF vectorization...")

# Step 3a: raw term counts (English stop words removed, vocabulary capped at 20,000);
# the vocabulary is learned from the training texts only.
count_vectorizer = CountVectorizer(stop_words='english', max_features=20000)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Step 3b: re-weight the counts as TF-IDF, again fitting on the training part only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print("Vectorization complete.")
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# --- 4. Train and Evaluate Models ---
# Classifiers to compare, keyed by the display name used in the output.
models = dict([
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
])

banner = "=" * 50
for name, model in models.items():
    print(f"\n{banner}")
    print(f"Training {name}...")

    # Fit on the training matrix and predict the held-out test matrix.
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' averages the per-class F1 scores by class support
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(
        f"\nResults for {name}:",
        f"Accuracy Score: {accuracy:.4f}",
        f"Weighted F1 Score: {f1:.4f}",
        banner,
        sep="\n",
    )

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.stem.snowball import SnowballStemmer

# --- 1. Define the Custom Stemming Vectorizer ---
# The StemmedCountVectorizer class defined right below stems tokens inside the
# vectorizer's analyzer, so no separate stemming pass over the DataFrame is needed.
print("Defining StemmedCountVectorizer class...")
# Module-level stemmer used by StemmedCountVectorizer. This rebinds the global name
# `stemmer`, which the Porter-stemming cell above also assigns.
# NOTE(review): ignore_stopwords=True presumably requires the NLTK 'stopwords'
# corpus downloaded at the top of the notebook — confirm.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer passes every produced token through `stemmer.stem`."""

    def build_analyzer(self):
        """Return the parent's analyzer wrapped so that each token it yields is stemmed.

        NOTE(review): stemming is applied to whatever the parent analyzer emits; with
        an ngram_range above 1 that presumably includes multi-word n-gram strings,
        which would be stemmed as a single string — confirm this is intended.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# --- 2. Load and Prepare Data ---
print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# No separate stemming pass here: the vectorizer used below stems tokens itself.
# Both columns are filled before concatenating, since a missing body would
# otherwise make the whole text NaN and astype(str) would turn it into "nan".
df['text'] = (df['title'].fillna('') + ' ' + df['body'].fillna('')).astype(str)

X = df['text']
y = df['category']

# Stratified 80/20 split so both parts keep the category distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing records.")

# --- 3. Vectorization using the Custom Stemmer ---
print("\nPerforming two-step TF-IDF vectorization with Snowball Stemming...")

# Step 3a: stemmed term counts (unigrams + bigrams, English stop words removed,
# vocabulary capped at 20,000), learned from the training texts only.
count_settings = {'stop_words': 'english', 'max_features': 20000, 'ngram_range': (1, 2)}
stemmed_count_vect = StemmedCountVectorizer(**count_settings)
X_train_counts = stemmed_count_vect.fit_transform(X_train)

# Step 3b: TF-IDF re-weighting fitted on the training counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# The test texts go through the already-fitted vectorizer and transformer.
test_counts = stemmed_count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(test_counts)
print("Vectorization complete.")
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# --- 4. Train and Evaluate Models ---
# Classifiers to compare, keyed by the display name used in the report.
models = dict([
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("SGD Classifier", SGDClassifier(random_state=42)),
])

banner = "=" * 50
for name, model in models.items():
    print(f"\n{banner}")
    print(f"Training {name}...")

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: classes that never get predicted report 0 instead of warning
    report = classification_report(y_test, y_pred, zero_division=0)

    print(
        f"\nResults for {name}:",
        f"Accuracy: {accuracy:.4f}",
        "\nClassification Report:",
        report,
        banner,
        sep="\n",
    )

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalAveragePooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# --- 1. Setup Environment for Acceleration ---
# Best effort: if the mixed-precision policy cannot be applied, report it and carry on.
try:
    policy = Policy('mixed_float16')
    set_global_policy(policy)
    for dtype_role, dtype_name in (('Compute', policy.compute_dtype), ('Variable', policy.variable_dtype)):
        print(f'{dtype_role} dtype: {dtype_name}')
except Exception as policy_error:
    print(f"Could not set mixed precision policy: {policy_error}")

# --- 2. Define Hyperparameters ---
vocab_size = 10_000      # tokenizer vocabulary cap
max_length = 200         # tokens per padded sequence
embedding_dim = 128
batch_size = 64
learning_rate = 1e-3     # CNNs can often handle a slightly higher learning rate

# --- 3. Load and Preprocess Data ---
print("Loading and preprocessing data...")
try:
    train_df = pd.read_csv('train_tickets.csv')
    test_df = pd.read_csv('test_tickets.csv')
except FileNotFoundError:
    print("train_tickets.csv or test_tickets.csv not found. Splitting all_tickets.csv.")
    # Fall back to the raw file at the same Kaggle input path the other cells read;
    # a bare 'all_tickets.csv' would be resolved against the working directory,
    # where nothing in this notebook writes that file.
    all_tickets_df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
    # Fill both columns so a missing body cannot turn the combined text into NaN.
    all_tickets_df['text'] = all_tickets_df['title'].fillna('') + ' ' + all_tickets_df['body'].fillna('')
    train_df, test_df = train_test_split(all_tickets_df, test_size=0.2, random_state=42, stratify=all_tickets_df['category'])

X_train = train_df['text'].astype(str)
y_train = train_df['category']
X_test = test_df['text'].astype(str)
y_test = test_df['category']

# Vocabulary is learned from the training texts only; sequences are padded and
# truncated at the end to max_length tokens.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post', truncating='post')

# --- 4. Create tf.data pipelines for performance ---
print("Creating efficient tf.data pipelines...")

# Hold out 10% of the training rows for validation. EarlyStopping below selects and
# restores the best epoch by val_loss, so using the test set as validation data
# would bias the final test score. The mask is positional, so it is also correct
# when train_df comes from the fallback split and carries a non-contiguous index.
val_mask = y_train.index.isin(y_train.sample(frac=0.1, random_state=42).index)
y_train_array = y_train.to_numpy()

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[~val_mask], y_train_array[~val_mask]))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded[val_mask], y_train_array[val_mask]))
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# --- 5. Build the 1D Inception-Style Model ---
print("Building the 1D Inception-style model...")
# Assumes 'category' holds zero-based integer ids — TODO confirm against the CSV.
num_classes = int(y_train.max()) + 1

# Integer token ids in, float32 embeddings out (pinned despite the mixed-precision policy)
input_layer = Input(shape=(max_length,), dtype='int32')
embedding_layer = Embedding(vocab_size, embedding_dim, dtype='float32')(input_layer)

# Parallel convolutional branches over the same embeddings, one per window width
# (3-, 4- and 5-grams), each reduced to a fixed-size vector by global average pooling.
branch_outputs = []
for kernel_size in (3, 4, 5):
    conv = Conv1D(filters=64, kernel_size=kernel_size, activation='relu')(embedding_layer)
    branch_outputs.append(GlobalAveragePooling1D()(conv))

# Concatenate the branch vectors, regularize, then classify
concatenated = Concatenate()(branch_outputs)
dropout = Dropout(0.5)(concatenated)
dense = Dense(128, activation='relu')(dropout)

# The softmax output is computed in float32 for numerical stability
output_layer = Dense(num_classes, activation='softmax', dtype='float32')(dense)

# Create the Keras Model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

# --- 6. Train the Model ---
print("\nTraining the model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit(
    train_dataset,
    epochs=25,
    validation_data=val_dataset,  # held-out training rows, never the test set
    callbacks=[early_stopping],
    verbose=1
)

# --- 7. Evaluate the Final Model ---
# The test set is touched only here, after model selection is finished.
print("\nEvaluating the final model...")
final_loss, final_accuracy = model.evaluate(test_dataset)
print("========================================")
print(f"Final Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("========================================")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Import the models we will use in our ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Import the Voting Classifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, classification_report

# --- 1. Load and Prepare Data ---
print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# Fill both columns before concatenating: a missing body would otherwise make the
# whole text NaN, which astype(str) turns into the literal token "nan".
df['text'] = (df['title'].fillna('') + ' ' + df['body'].fillna('')).astype(str)

X = df['text']
y = df['category']

# Stratified 80/20 split so both parts keep the category distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing records.")

# --- 2. Feature Extraction with TF-IDF ---
print("\nPerforming TF-IDF Vectorization...")
# Unigrams plus bigrams, English stop words removed, vocabulary capped at 20,000 terms;
# fitted on the training texts only.
tfidf_settings = {'stop_words': 'english', 'max_features': 20000, 'ngram_range': (1, 2)}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print("Vectorization complete.")

# --- 3. Create the Ensemble Model ---
print("\nBuilding the Ensemble Voting Classifier...")

# The three base learners that will vote.
clf1 = LogisticRegression(max_iter=1000, C=10, penalty='l2', solver='saga', random_state=42)
clf2 = LinearSVC(random_state=42)
clf3 = MultinomialNB()

# Hard voting: the final label is the majority vote of the base learners' predictions.
named_estimators = [('lr', clf1), ('svc', clf2), ('nb', clf3)]
ensemble_model = VotingClassifier(estimators=named_estimators, voting='hard')

# --- 4. Train and Evaluate the Ensemble Model ---
print("Training the ensemble model...")
# Fitting the VotingClassifier fits each base learner on the same training matrix.
ensemble_model.fit(X_train_tfidf, y_train)

print("\nEvaluating the ensemble model...")
y_pred = ensemble_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

banner = "=" * 50
print(
    f"\n{banner}",
    "Results for the Ensemble Voting Classifier:",
    f"Accuracy: {accuracy:.4f}",
    "\nClassification Report:",
    report,
    banner,
    sep="\n",
)

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Import the necessary Hugging Face libraries
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from datasets import Dataset

# --- 1. Load Data and Encode Labels ---
SEED = 42
VAL_FRACTION = 0.1  # share of the training portion held out for validation / early stopping

# Seed Python, NumPy and TensorFlow so shuffling and the freshly initialised
# classification head are as repeatable as the hardware allows.
tf.keras.utils.set_random_seed(SEED)

print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# Fill BOTH columns before concatenating: string + NaN is NaN, which
# `.astype(str)` would turn into the literal text "nan" and drop the title.
df['text'] = (df['title'].fillna('') + ' ' + df['body'].fillna('')).astype(str)

# Map category values to contiguous integer ids 0..num_labels-1 for the model head.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
num_labels = len(label_encoder.classes_)

# 80/20 train/test split, stratified on the label. The test set is touched only
# once, in the final evaluation at the bottom of this cell.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])

# Carve a validation set out of the training portion. It drives early stopping,
# so the test metrics stay an unbiased estimate. Stratify when every class has
# at least two rows left (scikit-learn refuses to stratify otherwise).
val_stratify = train_val_df['label'] if train_val_df['label'].value_counts().min() >= 2 else None
train_df, val_df = train_test_split(
    train_val_df, test_size=VAL_FRACTION, random_state=SEED, stratify=val_stratify
)
print(f"Split sizes -> train: {len(train_df)}, validation: {len(val_df)}, test: {len(test_df)}")

# --- 2. Tokenization ---
MODEL_NAME = 'distilbert-base-uncased'
print(f"\nLoading tokenizer for '{MODEL_NAME}'...")
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating/padding every text to 256 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 3. Load the Pre-trained Model ---
print(f"Loading pre-trained model '{MODEL_NAME}'...")
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# --- 4. Prepare TensorFlow Datasets using the Recommended Method ---
# The model's own helper builds the tf.data pipelines and keeps only the columns
# the model accepts. Only the training pipeline is shuffled.
print("Preparing TensorFlow datasets...")
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_train_dataset,
    batch_size=16,
    shuffle=True,
    tokenizer=tokenizer
)

tf_val_dataset = model.prepare_tf_dataset(
    tokenized_val_dataset,
    batch_size=16,
    shuffle=False,
    tokenizer=tokenizer
)

tf_test_dataset = model.prepare_tf_dataset(
    tokenized_test_dataset,
    batch_size=16,
    shuffle=False,
    tokenizer=tokenizer
)

# --- 5. Compile the Model ---
# The model outputs raw logits, hence from_logits=True on the loss.
optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# --- 6. Train (Fine-Tune) the Model ---
print("\nStarting fine-tuning...")
# Early stopping (and best-weight restoration) is driven by the validation set,
# never by the test set, so model selection cannot leak into the final score.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=5,
    callbacks=[early_stopping]
)

# --- 7. Final Evaluation ---
print("\nEvaluating the fine-tuned model...")
# evaluate() returns [loss, accuracy] in the order given to compile().
final_loss, final_accuracy = model.evaluate(tf_test_dataset)
print("========================================")
print(f"Final Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("========================================")

In [None]:
import os
import joblib
print("\nSaving model and associated components...")
output_model_dir = "./fine_tuned_ticket_classifier"
os.makedirs(output_model_dir, exist_ok=True)

# The model and the tokenizer both serialise themselves via save_pretrained.
for component in (model, tokenizer):
    component.save_pretrained(output_model_dir)

# The scikit-learn label encoder is persisted next to them with joblib.
label_encoder_path = os.path.join(output_model_dir, 'label_encoder.joblib')
joblib.dump(label_encoder, label_encoder_path)

print(f"\n✅ Model, tokenizer, and label encoder saved to '{output_model_dir}'")

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Import the specific classes for RoBERTa
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from datasets import Dataset

# --- 1. Load Data and Encode Labels ---
SEED = 42
VAL_FRACTION = 0.1  # share of the training portion held out for validation / early stopping

# Seed Python, NumPy and TensorFlow so shuffling and the freshly initialised
# classification head are as repeatable as the hardware allows.
tf.keras.utils.set_random_seed(SEED)

print("Loading all_tickets.csv data...")
df = pd.read_csv('/kaggle/input/supportticketsclassification/all_tickets.csv')
# Fill BOTH columns before concatenating: string + NaN is NaN, which
# `.astype(str)` would turn into the literal text "nan" and drop the title.
df['text'] = (df['title'].fillna('') + ' ' + df['body'].fillna('')).astype(str)

# Map category values to contiguous integer ids 0..num_labels-1 for the model head.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
num_labels = len(label_encoder.classes_)

# 80/20 train/test split, stratified on the label. The test set is touched only
# once, in the final evaluation at the bottom of this cell.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])

# Carve a validation set out of the training portion. It drives early stopping,
# so the test metrics stay an unbiased estimate. Stratify when every class has
# at least two rows left (scikit-learn refuses to stratify otherwise).
val_stratify = train_val_df['label'] if train_val_df['label'].value_counts().min() >= 2 else None
train_df, val_df = train_test_split(
    train_val_df, test_size=VAL_FRACTION, random_state=SEED, stratify=val_stratify
)
print(f"Split sizes -> train: {len(train_df)}, validation: {len(val_df)}, test: {len(test_df)}")

# --- 2. Tokenization ---
# Compared with the DistilBERT cell, only the checkpoint and its classes change.
MODEL_NAME = 'roberta-base'
print(f"\nLoading tokenizer for '{MODEL_NAME}'...")
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating/padding every text to 256 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 3. Load the Pre-trained Model ---
print(f"Loading pre-trained model '{MODEL_NAME}'...")
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# --- 4. Prepare TensorFlow Datasets ---
# Only the training pipeline is shuffled; validation and test keep their order.
print("Preparing TensorFlow datasets...")
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_train_dataset,
    batch_size=16, # RoBERTa is larger, so a small batch size is important
    shuffle=True,
    tokenizer=tokenizer
)

tf_val_dataset = model.prepare_tf_dataset(
    tokenized_val_dataset,
    batch_size=64,
    shuffle=False,
    tokenizer=tokenizer
)

tf_test_dataset = model.prepare_tf_dataset(
    tokenized_test_dataset,
    batch_size=64,
    shuffle=False,
    tokenizer=tokenizer
)

# --- 5. Compile and Train the Model ---
# The model outputs raw logits, hence from_logits=True on the loss.
optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

print("\nStarting fine-tuning for RoBERTa...")
# Early stopping (and best-weight restoration) is driven by the validation set,
# never by the test set, so model selection cannot leak into the final score.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=5,
    callbacks=[early_stopping]
)

# --- 6. Final Evaluation ---
print("\nEvaluating the fine-tuned RoBERTa model...")
# evaluate() returns [loss, accuracy] in the order given to compile().
final_loss, final_accuracy = model.evaluate(tf_test_dataset)
print("========================================")
print(f"Final Test Loss: {final_loss:.4f}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("========================================")

In [None]:
# Imported here so this cell does not rely on the DistilBERT save cell having
# been executed first (running only the RoBERTa cells would otherwise raise
# NameError on `joblib`).
import os
import joblib

# --- 7. Save the Final Model, Tokenizer, and Encoder ---
print("\nSaving model and associated components...")
output_model_dir = "./fine_tuned_roberta_classifier"
os.makedirs(output_model_dir, exist_ok=True)

# Use save_pretrained for the model and tokenizer
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

# Use joblib for the scikit-learn label encoder, so predicted label ids can be
# mapped back to the original category values at inference time.
joblib.dump(label_encoder, os.path.join(output_model_dir, 'label_encoder.joblib'))

print(f"\n✅ Model, tokenizer, and label encoder saved to '{output_model_dir}'")


In [None]:
# List the current working directory in long format, e.g. to check that the
# saved model folder is present.
!ls -l

In [None]:
# Recursively pack the saved RoBERTa model folder into a single zip archive
# written to the current working directory.
!zip -r fine_tuned_roberta_classifier.zip ./fine_tuned_roberta_classifier

In [None]:
# There is no standard `download` shell command, so `!download <path>` fails
# with "command not found". Instead, confirm that the archive exists and show
# its size; the file can then be fetched from the notebook's output files or
# through an IPython FileLink.
!ls -lh /kaggle/working/fine_tuned_roberta_classifier.zip

In [None]:
from IPython.display import FileLink

# Build a link object for the zip archive and render it in this cell's output,
# where it appears as a clickable download link.
archive_link = FileLink(r'fine_tuned_roberta_classifier.zip')
display(archive_link)