In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("../data/raw/data.csv")
df = df.drop_duplicates()
df = df.dropna(subset=["Customer_ID", "Amount", "Product_Category"])
df = df.fillna({"Income": "Unknown", "Feedback": "No Feedback"})
df['Date'] = pd.to_datetime(df['Date'])

cat_cols = ["Gender", "Income", "Customer_Segment", "Product_Category"]
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))


In [2]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

X = df[["Age", "Gender", "Income", "Amount", "Total_Purchases"]]
y = df["Customer_Segment"]

num_cols = ["Age", "Amount", "Total_Purchases"]
X.loc[:, num_cols] = SimpleImputer(strategy="median").fit_transform(X[num_cols])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [3]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)


In [4]:
from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.4674317411811599

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.39      0.39     18180
           1       0.28      0.28      0.28     12834
           2       0.59      0.60      0.60     29155
           3       0.00      0.00      0.00        43

    accuracy                           0.47     60212
   macro avg       0.32      0.32      0.32     60212
weighted avg       0.47      0.47      0.47     60212



In [None]:
#Class 3 has extremely few samples (43) → model literally cannot learn it.
#Classes are heavily imbalanced → simple RandomForest mostly predicts the majority class (2).