In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# New Imports required for the new models
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

# Ensemble Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'lightgbm'

Load processed data

In [None]:
# Read the already-processed (one-hot / True-False encoded) feature tables.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_processed.csv", "test_processed.csv")
)

# The submission files need the original 'trip_id' column. It is taken from
# 'test_raw.csv' when that file exists; otherwise a copy of the processed
# test frame is used and is assumed to still contain 'trip_id'.
try:
    test_raw = pd.read_csv('test_raw.csv')
except FileNotFoundError:
    print("Warning: 'test_raw.csv' not found. Assuming 'test_processed.csv' contains 'trip_id'.")
    test_raw = test_df.copy()

# Quick sanity check of what was loaded.
print(f"Train Data Shape: {train_df.shape}")
print(f"Test Data Shape: {test_df.shape}")

In [None]:
# --- FIX: Clean column names for LightGBM compatibility ---
import re

def clean_feature_names(df):
    """
    Rename the columns of ``df`` so every name contains only ``[A-Za-z0-9_]``
    (done for LightGBM compatibility).

    Every run of other characters (commas, brackets, spaces, ...) becomes one
    underscore, underscore runs of any length are collapsed to a single
    underscore, and leading/trailing underscores are stripped. If two columns
    end up with the same cleaned name, later ones receive a numeric suffix
    (``_1``, ``_2``, ...) so that column names stay unique. Suffixes depend on
    column order, so frames that must share names should share column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the cleaned column names.
    """
    new_cols = []
    taken = set()
    for col in df.columns:
        # str() keeps non-string labels (e.g. integer column names) from crashing.
        cleaned_col = re.sub(r'[^A-Za-z0-9_]+', '_', str(col))
        # A regex collapses underscore runs of ANY length; a single
        # str.replace('__', '_') pass would leave '___' as '__'.
        cleaned_col = re.sub(r'_+', '_', cleaned_col).strip('_')

        # Keep names unique: different raw names can clean to the same string.
        unique_col = cleaned_col
        suffix = 0
        while unique_col in taken:
            suffix += 1
            unique_col = f"{cleaned_col}_{suffix}"
        taken.add(unique_col)
        new_cols.append(unique_col)

    df.columns = new_cols
    return df

# Run the same name clean-up on the train and the test frame (train first),
# before any feature/target separation happens.
train_df, test_df = (clean_feature_names(frame) for frame in (train_df, test_df))

print("Feature names cleaned for LightGBM compatibility.")
# -----------------------------------------------------------

Feature / Target Split (cross-validation happens inside each grid search)

In [None]:
TARGET = 'spend_category'
ID_COLUMN = 'trip_id'  # row identifier; must never be used as a feature

y_train = train_df[TARGET]

# Remove the identifier from BOTH frames. errors='ignore' turns the drop into a
# no-op for a frame that does not contain the column, so this works whether or
# not the processed files still carry 'trip_id' (or, for test, the target).
X_train = train_df.drop(columns=[TARGET, ID_COLUMN], errors='ignore')
X_test = test_df.drop(columns=[TARGET, ID_COLUMN], errors='ignore')

# Fail early, before any long-running training, if the two feature sets differ.
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
unexpected_in_test = [col for col in X_test.columns if col not in X_train.columns]
if missing_in_test or unexpected_in_test:
    raise ValueError(
        "Train/test feature mismatch. "
        f"Missing in test: {missing_in_test}; only in test: {unexpected_in_test}"
    )

# Same columns but a different order: put test into the training order.
if list(X_test.columns) != list(X_train.columns):
    X_test = X_test[X_train.columns]

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

n_classes = y_train.nunique()
print(f"Number of target classes: {n_classes}")


Naive Bayes (Baseline, no tuning)

In [None]:
# Baseline: standardise the features, then fit a Gaussian Naive Bayes
# classifier with its default settings. No hyperparameter search is run here.
nb_steps = [
    ('scaler', StandardScaler()),
    ('nb', GaussianNB()),
]
nb_pipe = Pipeline(nb_steps)

# Pipeline.fit returns the pipeline itself, so best_nb and nb_pipe are the
# same fitted object.
best_nb = nb_pipe.fit(X_train, y_train)
print("Naive Bayes (GaussianNB) fitted.")

Logistic Regression

In [None]:
# Standardise the features, then fit a logistic regression.
# `multi_class` is intentionally NOT passed: it is deprecated since
# scikit-learn 1.5, and with the 'lbfgs' solver the multinomial (softmax) loss
# is already what is used for a target with more than two classes.
# NOTE(review): assumes spend_category has more than two classes — confirm.
# max_iter is raised above the library default for convergence robustness.
lr_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', random_state=42, max_iter=1000))
])

# Hyperparameter tuning grid for Logistic Regression (regularisation strength only)
lr_param_grid = {
    'lr__C': [0.1, 1.0, 10.0]
}

# 5-fold stratified CV over the grid, scored by accuracy, using all cores.
lr_grid_search = GridSearchCV(lr_pipe, lr_param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, verbose=1)
lr_grid_search.fit(X_train, y_train)
best_lr = lr_grid_search.best_estimator_
print(f"\nBest LR Parameters: {lr_grid_search.best_params_}")


SVM

In [None]:
# Linear SVM through LinearSVC, chosen over SVC(kernel='linear') for speed and
# memory use. LinearSVC offers neither probability=True nor an 'rbf' kernel.
lsvc_steps = [
    ('scaler', StandardScaler()),
    # dual='auto' lets scikit-learn pick the formulation; max_iter is raised
    # well above the default.
    ('lsvc', LinearSVC(random_state=42, dual='auto', max_iter=10000)),
]
lsvc_pipe = Pipeline(lsvc_steps)

# Only the regularisation strength C is searched.
lsvc_param_grid = {'lsvc__C': [0.1, 1.0, 10.0]}

# Shuffled, seeded 5-fold stratified CV. The worker count is capped at 4
# (instead of -1) to limit memory consumption.
lsvc_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lsvc_grid_search = GridSearchCV(
    estimator=lsvc_pipe,
    param_grid=lsvc_param_grid,
    scoring='accuracy',
    cv=lsvc_cv,
    n_jobs=4,
    verbose=1,
)

lsvc_grid_search.fit(X_train, y_train)
best_svm = lsvc_grid_search.best_estimator_
print(f"\nOptimized Best SVM Parameters (LinearSVC): {lsvc_grid_search.best_params_}")

K-Means Clustering

In [None]:
# Unsupervised grouping of the training features. The number of clusters is
# set to the number of target classes so clusters and classes can be compared.
# The model is fitted on X_train as-is (no scaling step is applied here).
kmeans_model = KMeans(
    n_clusters=n_classes,
    n_init='auto',
    random_state=42,
)

kmeans_model.fit(X_train)

Gaussian Mixture Model

In [None]:
# Probabilistic clustering: the data is modelled as a mixture of Gaussian
# components, one component per target class. Fitted on X_train as-is.
gmm_model = GMM(
    n_components=n_classes,
    random_state=42,
)

# Typically slower to fit than K-Means.
gmm_model.fit(X_train)

XGBoost

In [None]:
# Gradient-boosted trees; no scaler step is used for this model.
xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes,
    eval_metric='mlogloss',   # standard multi-class metric
    use_label_encoder=False,  # suppresses a warning on older xgboost versions
    random_state=42,
)
xgb_pipe = Pipeline([('xgb', xgb_classifier)])

# Deliberately small grid (2 x 2 x 2 candidates) for faster initial results.
xgb_param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [3, 5],
}

# 3-fold stratified CV, accuracy scoring, 4 parallel workers.
xgb_grid_search = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=xgb_param_grid,
    cv=StratifiedKFold(n_splits=3),
    scoring='accuracy',
    n_jobs=4,
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)
best_xgb = xgb_grid_search.best_estimator_
print(f"\nBest XGBoost Parameters: {xgb_grid_search.best_params_}")

LightGBM

In [None]:
# LightGBM multi-class classifier; no scaler step is used for this model.
# NOTE(review): the estimator requests n_jobs=-1 while the grid search below
# also runs 4 workers — possible CPU oversubscription; confirm this is intended.
lgbm_classifier = LGBMClassifier(
    objective='multiclass',
    num_class=n_classes,
    metric='multi_logloss',
    random_state=42,
    n_jobs=-1,
)
lgbm_pipe = Pipeline([('lgbm', lgbm_classifier)])

# Deliberately small grid (2 x 2 x 2 candidates) for faster initial results.
lgbm_param_grid = {
    'lgbm__n_estimators': [100, 200],
    'lgbm__learning_rate': [0.05, 0.1],
    'lgbm__num_leaves': [20, 31],
}

# 3-fold stratified CV, accuracy scoring, 4 parallel workers.
lgbm_grid_search = GridSearchCV(
    estimator=lgbm_pipe,
    param_grid=lgbm_param_grid,
    cv=StratifiedKFold(n_splits=3),
    scoring='accuracy',
    n_jobs=4,
    verbose=1,
)
lgbm_grid_search.fit(X_train, y_train)
best_lgbm = lgbm_grid_search.best_estimator_
print(f"\nBest LightGBM Parameters: {lgbm_grid_search.best_params_}")

CatBoost

In [None]:
# --- CATBOOST-SPECIFIC DATA PREPARATION (Using Raw Files) ---
import re

# 1. Re-define helper functions used in original preprocessing (crucial for consistency)
def map_range_to_midpoint(range_str):
    """
    Convert a range label such as ``"1-6"`` into its numeric midpoint.

    Parameters
    ----------
    range_str : str or number
        Either a ``"start-end"`` range (integer or decimal bounds), a string
        holding a single number, or an already-numeric value.

    Returns
    -------
    float
        The midpoint of the range, the number itself for single values, or
        ``np.nan`` when the value cannot be parsed (e.g. ``None``, ``"90+"``,
        free text), leaving the gap for CatBoost to handle.
    """
    # Already-numeric cells (including float NaN) pass through unchanged
    # instead of being discarded by the string handling below.
    if isinstance(range_str, (int, float, np.integer, np.floating)):
        return float(range_str)
    try:
        if '-' in range_str:
            # Exactly two numbers are expected; any other count fails the
            # unpacking with ValueError and yields NaN.
            start, end = map(float, re.findall(r'\d+(?:\.\d+)?', range_str))
            return (start + end) / 2
        # No dash: treat the whole string as one number.
        return float(range_str)
    except (TypeError, ValueError):
        # TypeError: non-string input such as None; ValueError: unparsable text.
        return np.nan  # Use NaN for CatBoost to impute

# Representative numeric value for every age bracket.
age_mapping = {'<18': 10, '18-24': 21, '25-44': 35, '45-64': 55, '65+': 70}

# CatBoost does not accept NaN inside categorical columns, so missing
# categories are replaced by this explicit string level.
CBT_MISSING_LABEL = 'missing'

# 2. Load Raw Data
raw_train_df = pd.read_csv("train.csv")
raw_test_df = pd.read_csv("test.csv")
TARGET = 'spend_category'

# 3. Clean Target: rows without a label cannot be used for training.
#    (The test IDs are not copied anywhere; they stay in raw_test_df['trip_id'].)
raw_train_df.dropna(subset=[TARGET], inplace=True)
y_train_cbt = raw_train_df[TARGET].astype(int)

# 4. Apply Feature Engineering (Ordinal Mappings) to both raw frames in place
for df in [raw_train_df, raw_test_df]:
    # Range labels -> numeric midpoints (unparsable values become NaN)
    df['days_booked_midpoint'] = df['days_booked_before_trip'].apply(map_range_to_midpoint)
    df['total_trip_days_midpoint'] = df['total_trip_days'].apply(map_range_to_midpoint)
    # Age bracket -> number; brackets missing from age_mapping become NaN
    df['age_group_ordinal'] = df['age_group'].map(age_mapping)

    # Drop original ordinal columns now that numeric versions exist
    df.drop(columns=['days_booked_before_trip', 'total_trip_days', 'age_group'], inplace=True)

# 5. Separate X and Identify Categorical Features
X_train_cbt = raw_train_df.drop(columns=[TARGET, 'trip_id'])
X_test_cbt = raw_test_df.drop(columns=['trip_id'])

# CatBoost needs the list of *string* column names that are categorical
CBT_CAT_FEATURES = X_train_cbt.select_dtypes(include=['object']).columns.tolist()

# Replace missing categories in both frames; numeric columns keep their NaN.
if CBT_CAT_FEATURES:
    cat_fill_values = {col: CBT_MISSING_LABEL for col in CBT_CAT_FEATURES}
    X_train_cbt = X_train_cbt.fillna(cat_fill_values)
    X_test_cbt = X_test_cbt.fillna(cat_fill_values)

print(f"\nCatBoost Data Ready.")
print(f"Number of string categorical features identified: {len(CBT_CAT_FEATURES)}")
# --------------------------------------------------------------------------------


# CatBoost multi-class classifier. The categorical columns are passed by name
# so CatBoost encodes them itself; it runs on the raw-data frames
# (X_train_cbt / y_train_cbt), not on the one-hot encoded X_train.
cbt_classifier = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    cat_features=CBT_CAT_FEATURES,
    random_state=42,
    thread_count=4,
    verbose=0,
)
cbt_pipe = Pipeline([('cbt', cbt_classifier)])

# Hyperparameter tuning grid for CatBoost (2 x 2 candidates)
cbt_param_grid = {
    'cbt__n_estimators': [100, 200],
    'cbt__learning_rate': [0.05, 0.1],
}

# 3-fold stratified CV, accuracy scoring, 4 parallel workers.
cbt_grid_search = GridSearchCV(
    estimator=cbt_pipe,
    param_grid=cbt_param_grid,
    cv=StratifiedKFold(n_splits=3),
    scoring='accuracy',
    n_jobs=4,
    verbose=1,
)

# Fit on the CatBoost-specific frames prepared from the raw files.
cbt_grid_search.fit(X_train_cbt, y_train_cbt)
best_cbt = cbt_grid_search.best_estimator_
print(f"\nBest CatBoost Parameters: {cbt_grid_search.best_params_}")

Evaluate models on Training data

In [None]:
def evaluate_model(model, X, y, model_name):
    """
    Predict with ``model`` on ``X`` and print accuracy plus a per-class report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features passed to ``model.predict``
    y : true labels the predictions are compared against
    model_name : str
        Label printed in the section header.

    Returns
    -------
    Whatever ``model.predict(X)`` returned.
    """
    predicted = model.predict(X)
    accuracy = accuracy_score(y, predicted)

    print(f"\n--- {model_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    # zero_division=0 reports 0 for classes that were never predicted.
    report = classification_report(y, predicted, zero_division=0)
    print("Classification Report:\n", report)

    # A normalised confusion matrix could additionally be drawn with
    # ConfusionMatrixDisplay.from_estimator(model, X, y, normalize='true').

    return predicted

# (model, features, labels, report title) for every fitted model, evaluated in
# this order. CatBoost is scored on its own raw-data frames.
# These are scores on the training data itself, not held-out estimates.
_train_eval_jobs = [
    (best_nb, X_train, y_train, "Gaussian Naive Bayes (Fixed)"),
    (best_lr, X_train, y_train, "Logistic Regression (Fixed)"),
    (best_svm, X_train, y_train, "Support Vector Machine"),
    (best_xgb, X_train, y_train, "XGBoost Classifier"),
    (best_lgbm, X_train, y_train, "LightGBM Classifier"),
    (best_cbt, X_train_cbt, y_train_cbt, "CatBoost Classifier"),
]

(train_pred_nb, train_pred_lr, train_pred_svm,
 train_pred_xgb, train_pred_lgbm, train_pred_cbt) = [
    evaluate_model(fitted, features, labels, title)
    for fitted, features, labels, title in _train_eval_jobs
]

Make predictions on Test Data and Save Submission

In [None]:
# Models trained on the processed features all predict on X_test, in this order.
test_pred_nb, test_pred_lr, test_pred_svm, test_pred_xgb, test_pred_lgbm = (
    fitted.predict(X_test)
    for fitted in (best_nb, best_lr, best_svm, best_xgb, best_lgbm)
)

# CatBoost predicts on its own raw-data test frame. Its multi-class
# prediction is flattened to 1-D for display and for the submission file.
test_pred_cbt = best_cbt.predict(X_test_cbt)
test_pred_cbt_flat = test_pred_cbt.flatten()

print("\nPredictions generated for the test set.")

print("\nSample predictions on test set:")
for _label, _sample in (
    ("Naive Bayes:", test_pred_nb),
    ("Logistic   :", test_pred_lr),
    ("SVM        :", test_pred_svm),
    ("XGBoost    :", test_pred_xgb),
    ("LightGBM   :", test_pred_lgbm),
    ("CatBoost   :", test_pred_cbt_flat),
):
    print(_label, _sample[:5])


# SAVE SUBMISSION FILES (WITH trip_id)

def _save_submission(trip_ids, predictions, csv_path):
    """
    Write a two-column submission CSV (``trip_id``, ``spend_category``).

    Parameters
    ----------
    trip_ids : array-like
        Identifiers, in the same row order as ``predictions``.
    predictions : array-like
        Predicted classes; flattened to 1-D and cast to int.
    csv_path : str
        Output file name.

    Returns
    -------
    pandas.DataFrame
        The frame that was written.

    Raises
    ------
    ValueError
        If the number of identifiers and predictions differ.
    """
    ids = np.asarray(trip_ids)
    labels = np.asarray(predictions).ravel().astype(int)
    if len(ids) != len(labels):
        raise ValueError(
            f"{csv_path}: {len(ids)} trip_ids but {len(labels)} predictions"
        )
    submission = pd.DataFrame({"trip_id": ids, "spend_category": labels})
    submission.to_csv(csv_path, index=False)
    print(f"{csv_path} saved.")
    return submission


# Models trained on the processed features: their IDs come from test_raw.
if 'trip_id' not in test_raw.columns:
    print("Error: 'trip_id' column not found in test_raw/test_df! Cannot generate submission file.")
else:
    processed_trip_ids = test_raw["trip_id"]
    submission_nb = _save_submission(processed_trip_ids, test_pred_nb, "submission_nb.csv")
    submission_lr = _save_submission(processed_trip_ids, test_pred_lr, "submission_lr.csv")
    submission_svm = _save_submission(processed_trip_ids, test_pred_svm, "submission_svm.csv")
    submission_xgb = _save_submission(processed_trip_ids, test_pred_xgb, "submission_xgb.csv")
    submission_lgbm = _save_submission(processed_trip_ids, test_pred_lgbm, "submission_lgbm.csv")

# CatBoost predicted on rows read from "test.csv" (raw_test_df), so its IDs are
# taken from that same frame; this guarantees IDs and predictions share one
# row order regardless of how test_raw was obtained.
submission_cbt = _save_submission(raw_test_df["trip_id"], test_pred_cbt_flat, "submission_cbt.csv")