In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
# Load dataset again
df = pd.read_csv("Clustered_Customer_Data.csv")

# Drop unnecessary column
df.drop(columns=["Unnamed: 0"], inplace=True)

# Separate features and target variable
X = df.drop(columns=["Cluster"])
y = df["Cluster"]

In [15]:
# Handling Imbalanced Data
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Tree Based Models

In [16]:
# Initialize models
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)

# Train models
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Make predictions
dt_preds = dt_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)

# Evaluate models
print("📌 Decision Tree Results")
print("Accuracy:", accuracy_score(y_test, dt_preds))
print(classification_report(y_test, dt_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_preds))

print("\n📌 Random Forest Results")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))

print("\n📌 XGBoost Results")
print("Accuracy:", accuracy_score(y_test, xgb_preds))
print(classification_report(y_test, xgb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_preds))

Parameters: { "use_label_encoder" } are not used.



📌 Decision Tree Results
Accuracy: 0.9619736015084852
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       796
           1       0.98      0.99      0.99       795
           2       0.96      0.96      0.96       796
           3       0.95      0.94      0.95       795

    accuracy                           0.96      3182
   macro avg       0.96      0.96      0.96      3182
weighted avg       0.96      0.96      0.96      3182

Confusion Matrix:
 [[760   0  20  16]
 [  0 786   1   8]
 [ 18   1 765  12]
 [ 25  11   9 750]]

📌 Random Forest Results
Accuracy: 0.9798868636077939
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       796
           1       0.98      1.00      0.99       795
           2       0.97      1.00      0.99       796
           3       0.98      0.96      0.97       795

    accuracy                           0.98      3182
   macro avg       0.98      0.98   

In [6]:
# Hyperparameter Tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define hyperparameter grids
dt_params = {
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 6, 9],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1],
    "colsample_bytree": [0.8, 1]
}

# Run RandomizedSearchCV for each model
dt_search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), dt_params, n_iter=10, cv=3, n_jobs=-1, verbose=1, random_state=42)
rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=10, cv=3, n_jobs=-1, verbose=1, random_state=42)
xgb_search = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42), xgb_params, n_iter=10, cv=3, n_jobs=-1, verbose=1, random_state=42)

# Fit the models
print("🔍 Tuning Decision Tree...")
dt_search.fit(X_train, y_train)

print("🔍 Tuning Random Forest...")
rf_search.fit(X_train, y_train)

print("🔍 Tuning XGBoost...")
xgb_search.fit(X_train, y_train)

# Best parameters and accuracy
print("\n📌 Best Decision Tree Params:", dt_search.best_params_)
print("Best DT Accuracy:", dt_search.best_score_)

print("\n📌 Best Random Forest Params:", rf_search.best_params_)
print("Best RF Accuracy:", rf_search.best_score_)

print("\n📌 Best XGBoost Params:", xgb_search.best_params_)
print("Best XGB Accuracy:", xgb_search.best_score_)


🔍 Tuning Decision Tree...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
🔍 Tuning Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
🔍 Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.




📌 Best Decision Tree Params: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 15}
Best DT Accuracy: 0.9555241238409554

📌 Best Random Forest Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20}
Best RF Accuracy: 0.9754046833254755

📌 Best XGBoost Params: {'subsample': 1, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1}
Best XGB Accuracy: 0.9849127769919849


In [8]:
# Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Updated parameter grids (smaller search space around best params)
dt_grid = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 2, 3]
}

rf_grid = {
    "n_estimators": [150, 200, 250],
    "max_depth": [15, 20, 25],
    "min_samples_split": [4, 5, 6],
    "min_samples_leaf": [1, 2]
}

xgb_grid = {
    "n_estimators": [250, 300, 350],
    "max_depth": [2, 3, 4],
    "learning_rate": [0.08, 0.1, 0.12],
    "subsample": [0.9, 1],
    "colsample_bytree": [0.9, 1]
}

# Initialize models with best parameters from RandomizedSearchCV
dt_model = DecisionTreeClassifier(random_state=42, criterion="entropy")
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)

# GridSearchCV for each model
dt_search = GridSearchCV(dt_model, dt_grid, cv=5, n_jobs=-1, verbose=1)
rf_search = GridSearchCV(rf_model, rf_grid, cv=5, n_jobs=-1, verbose=1)
xgb_search = GridSearchCV(xgb_model, xgb_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the models
print("🔍 Tuning Decision Tree with GridSearchCV...")
dt_search.fit(X_train, y_train)

print("🔍 Tuning Random Forest with GridSearchCV...")
rf_search.fit(X_train, y_train)

print("🔍 Tuning XGBoost with GridSearchCV...")
xgb_search.fit(X_train, y_train)

# Best parameters and accuracy
print("\n📌 Optimized Decision Tree Params:", dt_search.best_params_)
print("Best DT Accuracy:", dt_search.best_score_)

print("\n📌 Optimized Random Forest Params:", rf_search.best_params_)
print("Best RF Accuracy:", rf_search.best_score_)

print("\n📌 Optimized XGBoost Params:", xgb_search.best_params_)
print("Best XGB Accuracy:", xgb_search.best_score_)

🔍 Tuning Decision Tree with GridSearchCV...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
🔍 Tuning Random Forest with GridSearchCV...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
🔍 Tuning XGBoost with GridSearchCV...
Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.




📌 Optimized Decision Tree Params: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best DT Accuracy: 0.9603965695254469

📌 Optimized Random Forest Params: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
Best RF Accuracy: 0.9765050149932788

📌 Optimized XGBoost Params: {'colsample_bytree': 1, 'learning_rate': 0.12, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.9}
Best XGB Accuracy: 0.9875844847729093


# converting it to model
import joblib

# Save the best XGBoost model
joblib.dump(xgb_search.best_estimator_, "customer_model.pkl")

print("✅ XGBoost Model saved successfully!")


In [17]:
import pickle
filename = 'customer_model.sav'
pickle.dump(dt_model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result,'% Acuuracy')

0.9619736015084852 % Acuuracy
