In [9]:
# 1️⃣ Import Libraries
#----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# 2️⃣ Load Dataset
#----------------------------
# Read the raw CSV and clean it in a single chain:
#   * exact duplicate rows are removed,
#   * rows lacking any of the required columns are removed,
#   * gaps in Income / Feedback are filled with placeholder labels.
required_cols = ["Customer_ID", "Amount", "Product_Category"]
placeholder_values = {"Income": "Unknown", "Feedback": "No Feedback"}
df = (
    pd.read_csv("../data/raw/data.csv")
    .drop_duplicates()
    .dropna(subset=required_cols)
    .fillna(placeholder_values)
)

# errors="coerce" turns unparseable dates into NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

print("Data shape after preprocessing:", df.shape)
print(df.head())

# 3️⃣ Feature Selection
#----------------------------
# Columns excluded from the feature matrix (IDs, personal info, etc.).
drop_cols = [
    "Transaction_ID", "Customer_ID", "Name", "Email", "Phone",
    "Address", "Zipcode", "products", "Date",
]

# Target: the product category of each row.
y = df["Product_Category"]

# Features: every remaining column except the target itself.
# NOTE(review): the recorded run of this script scores 1.00 for every class;
# results that clean often mean one of the kept columns encodes the target.
# Confirm that no remaining feature is derived from Product_Category.
X = df.drop(columns=[*drop_cols, "Product_Category"])

# 4️⃣ Handle Missing / Infinite Values
#----------------------------
# Replace inf/-inf with NaN so they are imputed together with the real gaps.
X = X.replace([np.inf, -np.inf], np.nan)

# Fill numeric columns with their per-column median.
# num_cols is captured here, before the categorical columns are
# label-encoded, so it lists only the originally numeric columns; the
# scaling step later in the script reuses it.
num_cols = X.select_dtypes(include=[np.number]).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# Fill whatever is still missing: mode for object columns, median otherwise.
# The result is assigned back to X[col] rather than calling
# fillna(inplace=True) on X[col]: that chained form mutates a temporary,
# emits a FutureWarning on current pandas, and no longer updates X at all
# once copy-on-write is the default (pandas 3.0).
for col in X.columns:
    if not X[col].isna().any():
        continue
    if X[col].dtype == "object":
        fill_value = X[col].mode()[0]
    else:
        fill_value = X[col].median()
    X[col] = X[col].fillna(fill_value)

print("Any NaN left in X?", X.isna().sum().sum())

# 5️⃣ Encode Categorical Variables
#----------------------------
# Every object-dtype column is mapped to integer codes with its own
# LabelEncoder; the fitted encoders are kept in le_dict, keyed by column
# name, so the codes can be mapped back with inverse_transform if needed.
le_dict = {}
cat_cols = X.select_dtypes(include=["object"]).columns
for col in cat_cols:
    le = LabelEncoder().fit(X[col])
    le_dict[col] = le
    X[col] = le.transform(X[col])

# 6️⃣ Train-Test Split
#----------------------------
# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions of the target the same in both partitions, and the fixed
# random_state makes the split reproducible.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = splits

# 7️⃣ SMOTE Oversampling
#----------------------------
# Oversample the training partition only; the test partition is untouched.
# NOTE(review): X_train contains label-encoded categorical columns, and
# SMOTE synthesises samples by interpolating between neighbours, so it can
# produce in-between category codes. Consider SMOTENC if that matters.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Report the class counts before and after resampling.
for label, target in (("Original", y_train), ("Resampled", y_train_res)):
    print(f"{label} class distribution:\n", target.value_counts())

# 8️⃣ Feature Scaling (Optional for tree-based models)
#----------------------------
# Learn mean/std from the resampled training data only, then apply that
# same transformation to both partitions. Only the columns listed in
# num_cols are scaled; all other columns are left as they are.
scaler = StandardScaler().fit(X_train_res[num_cols])
X_train_res[num_cols] = scaler.transform(X_train_res[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 9️⃣ Model Training
#----------------------------
# Random forest with 200 trees, trained on the resampled and scaled
# training data. The fixed random_state makes the fitted model reproducible.
# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train_res, y_train_res
)

# 10️⃣ Evaluation
#----------------------------
# Predict on the held-out test partition and print the confusion matrix
# followed by the per-class precision / recall / F1 report.
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Data shape after preprocessing: (301059, 30)
   Transaction_ID  Customer_ID                 Name                Email  \
0       8691788.0      37249.0  Michelle Harrington    Ebony39@gmail.com   
1       2174773.0      69749.0          Kelsey Hill     Mark36@gmail.com   
2       6679610.0      30192.0         Scott Jensen    Shane85@gmail.com   
3       7232460.0      62101.0        Joseph Miller     Mary34@gmail.com   
4       4983775.0      27901.0        Debra Coleman  Charles30@gmail.com   

          Phone                      Address        City            State  \
0  1.414787e+09            3959 Amanda Burgs    Dortmund           Berlin   
1  6.852900e+09           82072 Dawn Centers  Nottingham          England   
2  8.362160e+09            4133 Young Canyon     Geelong  New South Wales   
3  2.776752e+09  8148 Thomas Creek Suite 100    Edmonton          Ontario   
4  9.098268e+09    5813 Lori Ports Suite 269     Bristol          England   

   Zipcode    Country  ...  Total_A

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mode()[0], inplace=True)


Any NaN left in X? 0
Original class distribution:
 Product_Category
Electronics    56821
Grocery        53310
Clothing       43700
Books          43618
Home Decor     43398
Name: count, dtype: int64
Resampled class distribution:
 Product_Category
Clothing       56821
Electronics    56821
Books          56821
Grocery        56821
Home Decor     56821
Name: count, dtype: int64
Confusion Matrix:
 [[10901     0     0     3     0]
 [    1 10920     0     4     0]
 [    1     0 14196     8     0]
 [    6     0     0 13322     0]
 [   11     0     0     2 10837]]

Classification Report:
               precision    recall  f1-score   support

       Books       1.00      1.00      1.00     10904
    Clothing       1.00      1.00      1.00     10925
 Electronics       1.00      1.00      1.00     14205
     Grocery       1.00      1.00      1.00     13328
  Home Decor       1.00      1.00      1.00     10850

    accuracy                           1.00     60212
   macro avg       1.00      1.0