In [None]:
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

# Load and clean data
df = pd.read_csv("../data/raw/data.csv")
df = df.drop_duplicates()
# Drop rows that lack a required field. "Customer_Segment" is included because
# it is label-encoded as the prediction target later in this script: a row
# without a label cannot be used for supervised training, and a NaN mixed in
# with string labels makes LabelEncoder raise.
df = df.dropna(
    subset=["Customer_ID", "Amount", "Product_Category", "Customer_Segment"]
)
# Remaining gaps in these two text columns are replaced by explicit placeholders.
df = df.fillna({"Income": "Unknown", "Feedback": "No Feedback"})
# Parse dates so the .dt accessor can be used for calendar features below.
df['Date'] = pd.to_datetime(df['Date'])

# Outlier handling for numerical columns: clip each column to the Tukey
# fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] instead of dropping rows.
num_cols = ["Age", "Amount", "Total_Purchases"]
for column in num_cols:
    # One quantile call yields both quartiles, in the order requested.
    first_quartile, third_quartile = df[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    df[column] = df[column].clip(
        lower=first_quartile - 1.5 * spread,
        upper=third_quartile + 1.5 * spread,
    )

# Feature engineering
# Average spend per purchase; a purchase count of 0 is treated as 1 so the
# division never hits zero.
safe_purchase_count = df['Total_Purchases'].replace(0, 1)
df['Avg_Amount'] = df['Amount'].div(safe_purchase_count)

# Calendar features taken from the parsed transaction date.
date_parts = df['Date'].dt
df['Month'] = date_parts.month
df['Day_of_Week'] = date_parts.dayofweek

# Encode categorical variables
ordinal_cols = ["Income"]  # treated as ordinal (e.g., Low, Medium, High)
nominal_cols = ["Gender", "Product_Category"]  # Customer_Segment is the target, not a feature

# Integer-code each ordinal column after casting its values to strings.
# NOTE(review): LabelEncoder numbers classes in sorted (alphabetical) order,
# so the codes do not necessarily follow the intended Low < Medium < High
# ranking -- confirm whether an explicit category order is needed.
for ordinal in ordinal_cols:
    as_text = df[ordinal].astype(str)
    encoder = LabelEncoder()
    df[ordinal] = encoder.fit(as_text).transform(as_text)

# One-hot encode the nominal columns, dropping the first level of each, and
# swap the raw columns for the indicator columns (aligned on df's index).
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded = ohe.fit_transform(df[nominal_cols])
ohe_data = pd.DataFrame(
    encoded,
    index=df.index,
    columns=ohe.get_feature_names_out(nominal_cols),
)
df = pd.concat([df.drop(columns=nominal_cols), ohe_data], axis=1)

# Prepare features and target
feature_cols = (
    ["Age", "Income", "Amount", "Total_Purchases", "Avg_Amount", "Month", "Day_of_Week"]
    + list(ohe.get_feature_names_out(nominal_cols))
)
# Explicit copy so the column assignments below write to X itself rather than
# to a selection derived from df.
X = df[feature_cols].copy()
y = LabelEncoder().fit_transform(df["Customer_Segment"])  # Encode target

# Train-test split comes FIRST so the imputer and scaler below can be fitted
# on training rows only; fitting them on the full matrix would leak test-set
# medians, means and variances into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_test = X_test.copy()

# Impute missing values and scale numerical features.
# Statistics are learned from the training split and then applied unchanged
# to the test split.
scaled_cols = num_cols + ["Avg_Amount", "Month", "Day_of_Week"]
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(imputer.fit_transform(X_train[scaled_cols]))
X_test[scaled_cols] = scaler.transform(imputer.transform(X_test[scaled_cols]))

# Keep the full feature matrix in the same (train-fitted) feature space so any
# later code that uses X sees imputed, scaled values.
X[scaled_cols] = scaler.transform(imputer.transform(X[scaled_cols]))

# Handle class imbalance with SMOTE
sm = SMOTE(random_state=42)
# Resampled training set, kept for inspection. The search below does NOT use
# it: resampling before cross-validation would place synthetic points
# interpolated from validation-fold samples into the training folds, which
# inflates CV scores and biases hyperparameter selection.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Hyperparameter tuning for RandomForest
# SMOTE is a pipeline step, so it is re-fitted on the training part of every
# CV fold and is skipped at predict time (validation folds stay untouched).
rf_model = RandomForestClassifier(random_state=42)
model_pipeline = Pipeline([("smote", sm), ("rf", rf_model)])
# The "rf__" prefix routes each parameter to the forest step of the pipeline.
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__class_weight': ['balanced', None]
}
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
# best_pipeline is the refit SMOTE + forest pipeline; best_model is the fitted
# forest inside it, so it still exposes feature_importances_.
best_pipeline = grid_search.best_estimator_
best_model = best_pipeline.named_steps["rf"]
y_pred = best_model.predict(X_test)

# Evaluation
# Strip the pipeline step prefix so the reported parameter names are the
# plain RandomForestClassifier argument names.
best_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print("Best Parameters:", best_params)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cross-validation score
# Scored on the training split only, using the full pipeline: the held-out
# test rows stay out of the estimate and every fold is trained the same way
# the final model was (SMOTE applied to that fold's training part).
cv_scores = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
print("\nCross-Validation F1 Scores:", cv_scores)
print("Mean CV F1 Score:", cv_scores.mean())

# Feature importance
importances = best_model.feature_importances_
feature_imp = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:\n", feature_imp)