In [None]:
import os 
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline  # <- Use imblearn's Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'c:\\Users\\Multimatics\\Documents\\halalina-ml\\sourceCode'

In [3]:
# Read dataset
# NOTE(review): machine-specific absolute path; consider making the project
# root configurable so the notebook runs on other machines.
os.chdir("c:/Users/Multimatics/Documents/halalina-ml/")
clustered_df = pd.read_csv("data/clustered_data_cluster_minmax_scaler.csv")

# Clean label: normalise the "gen-z" spelling and fold boomers into gen_x.
age_group_map = {
    'gen_x': 'gen_x',
    'millennials': 'millennials',
    'gen-z': 'gen_z',
    'boomers': 'gen_x'  # Treat boomers as gen_x
}

# Series.map converts every value that is missing from the dict into NaN, so
# an unexpected label would be erased silently. Fail loudly instead; values
# that are already missing are left alone (they stay NaN after the map).
unmapped_labels = set(clustered_df["age_group"].dropna().unique()) - set(age_group_map)
if unmapped_labels:
    raise ValueError(
        f"Unexpected age_group labels: {sorted(map(str, unmapped_labels))}"
    )

clustered_df["age_group"] = clustered_df["age_group"].map(age_group_map)

In [4]:
# Display the cleaned frame (pandas renders a truncated preview of the rows).
clustered_df

Unnamed: 0,job,marital,balance,age_group,is_having_debt,Cluster
0,white-collar,married,36362424,gen_x,1,2
1,white-collar,single,492072,gen_x,1,1
2,entrepreneur,married,33936,millennials,2,0
3,blue-collar,married,25553808,gen_x,1,2
4,others,single,16968,millennials,0,1
...,...,...,...,...,...,...
45206,white-collar,married,13998600,gen_x,0,2
45207,others,single,29337672,gen_x,0,1
45208,others,married,96972120,gen_x,0,2
45209,blue-collar,married,11334624,gen_x,0,2


In [5]:
# List the distinct age_group labels present after the label clean-up.
clustered_df["age_group"].unique()

array(['gen_x', 'millennials', 'gen_z'], dtype=object)

In [6]:
# Target is the cluster id; the features are every remaining column.
y = clustered_df["Cluster"]
X = clustered_df.drop(columns="Cluster")

In [None]:
# Feature groups: the three columns below are one-hot encoded, every other
# column of X is forwarded unchanged (original column order is kept).
cat_cols = ['job', 'marital', 'age_group']
num_cols = list(filter(lambda name: name not in cat_cols, X.columns))

# One-hot encoder: dense output, first level of each feature dropped
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

# Column-wise preprocessing: encoded categoricals first, then the passthrough columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', ohe, cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

# Random undersampling step, seeded for reproducibility
undersampler = RandomUnderSampler(random_state=42)

# End-to-end estimator: preprocess -> robust-scale -> undersample -> SVM.
# `Pipeline` is the imblearn class here (it is the last one imported).
# NOTE(review): the scaler runs on the whole preprocessed matrix, i.e. on the
# one-hot columns as well as the passthrough ones - confirm this is intended.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('scaler', RobustScaler()),
    ('undersampler', undersampler),
    ('classifier', SVC(random_state=42, probability=True)),
])

# Hyper-parameter grid for the SVM; the `classifier__` prefix targets the
# pipeline step named 'classifier'.
param_grid = dict(
    classifier__kernel=['rbf', 'sigmoid'],
    classifier__C=[0.1, 1, 10],
    classifier__gamma=['scale', 'auto'],  # kernel coefficient, used by both kernels above
)

# Cross-validation: 5 stratified folds, shuffled with a fixed seed
cv_strategy = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Exhaustive search over param_grid, scored by accuracy on the cv_strategy folds
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    cv=cv_strategy,
    verbose=1,
    n_jobs=-1,
)

# Fit all candidates on the full feature matrix and target
grid_search.fit(X, y)

# Report the winning configuration and its cross-validation score
print("\n✅ Best Parameters:", grid_search.best_params_, sep="\n")
print("\n🎯 Best Cross-Validation Accuracy:", grid_search.best_score_, sep="\n")