In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/data.csv")
print("Initial shape:", df.shape)

# Drop duplicates
df = df.drop_duplicates()

# Drop rows where essential columns are NaN
df = df.dropna(subset=["Customer_ID","Amount","Product_Category"])

# Fill optional columns
df = df.fillna({"Income":"Unknown","Feedback":"No Feedback"})

print("Remaining missing values:", df.isnull().sum().sum())


Initial shape: (302010, 30)
Remaining missing values: 6941


In [2]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ["Gender", "Income", "Customer_Segment", "Product_Category"]
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

print(df[cat_cols].head())


   Gender  Income  Customer_Segment  Product_Category
0       1       1                 2                 1
1       0       1                 1                 2
2       1       1                 2                 0
3       1       0                 1                 4
4       1       1                 1                 3


In [3]:
# Features and target
X = df[["Age", "Gender", "Income", "Customer_Segment", "Amount", "Total_Purchases"]]
y = df["Product_Category"]

# Example of interaction features
X["Amount_per_Purchase"] = X["Amount"] / (X["Total_Purchases"] + 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Amount_per_Purchase"] = X["Amount"] / (X["Total_Purchases"] + 1)


In [4]:
from sklearn.impute import SimpleImputer

num_cols = ["Age", "Amount", "Total_Purchases", "Amount_per_Purchase"]
imputer = SimpleImputer(strategy="median")
X[num_cols] = imputer.fit_transform(X[num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = imputer.fit_transform(X[num_cols])


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (240847, 7) Test shape: (60212, 7)


In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("After SMOTE:", X_train_res.shape, y_train_res.value_counts())


After SMOTE: (284105, 7) Product_Category
1    56821
2    56821
0    56821
3    56821
4    56821
Name: count, dtype: int64


In [7]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# RandomForest
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
rf.fit(X_train_res, y_train_res)

# XGBoost
xgb = XGBClassifier(n_estimators=200, random_state=42, eval_metric='mlogloss')
xgb.fit(X_train_res, y_train_res)



0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
models = {"RandomForest": rf, "XGBoost": xgb}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))



RandomForest Test Accuracy: 0.2333
              precision    recall  f1-score   support

           0       0.19      0.20      0.20     10904
           1       0.19      0.20      0.20     10925
           2       0.28      0.27      0.27     14205
           3       0.29      0.28      0.28     13328
           4       0.19      0.20      0.19     10850

    accuracy                           0.23     60212
   macro avg       0.23      0.23      0.23     60212
weighted avg       0.24      0.23      0.23     60212


XGBoost Test Accuracy: 0.2577
              precision    recall  f1-score   support

           0       0.20      0.20      0.20     10904
           1       0.19      0.19      0.19     10925
           2       0.30      0.39      0.33     14205
           3       0.39      0.28      0.32     13328
           4       0.20      0.19      0.20     10850

    accuracy                           0.26     60212
   macro avg       0.26      0.25      0.25     60212
weighted a

In [9]:
# Proper preprocessing (handle NaNs, encode categoricals, scale features)
# Class imbalance handling (SMOTE or class_weight)
# Feature engineering (ratios, interactions)
# Ensemble models (RandomForest, XGBoost/LightGBM)
# Hyperparameter tuning

In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_features = ["Age", "Amount", "Total_Purchases"]
cat_features = ["Gender", "Income", "Customer_Segment"]

# Column Transformer for scaling + encoding
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

X_prepared = preprocessor.fit_transform(X)


In [11]:
from imblearn.combine import SMOTETomek

smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X_prepared, y)

print("Resampled shape:", X_res.shape)


Resampled shape: (206746, 14)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)


In [17]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

lgbm = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    class_weight='balanced',
    random_state=42
)

lgbm.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 841
[LightGBM] [Info] Number of data points in the train set: 165396, number of used features: 14
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[10]	training's multi_logloss: 1.54602	valid_1's multi_logloss: 1.54947
[20]	training's multi_logloss: 1.52535	valid_1's multi_logloss: 1.53074
[30]	training's multi_logloss: 1.5163	valid_1's multi_logloss: 1.52311
[40]	training's multi_logloss: 1.51127	valid_1's multi_logloss: 1.51956
[50]	training's multi_logloss: 1.50748	valid_1's multi_logloss: 1.51727
[60]	training's mu



Test Accuracy: 0.32440145102781137
Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.29      0.29      8457
           1       0.29      0.28      0.28      8413
           2       0.32      0.47      0.38      7838
           3       0.51      0.31      0.39      8157
           4       0.29      0.28      0.29      8485

    accuracy                           0.32     41350
   macro avg       0.34      0.33      0.33     41350
weighted avg       0.34      0.32      0.32     41350



In [15]:
y_pred = lgbm.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))




Test Accuracy: 0.32440145102781137
              precision    recall  f1-score   support

           0       0.29      0.29      0.29      8457
           1       0.29      0.28      0.28      8413
           2       0.32      0.47      0.38      7838
           3       0.51      0.31      0.39      8157
           4       0.29      0.28      0.29      8485

    accuracy                           0.32     41350
   macro avg       0.34      0.33      0.33     41350
weighted avg       0.34      0.32      0.32     41350

