In [2]:
# 04_model.ipynb

# -------------------------
# 1. Import Libraries
# -------------------------
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -------------------------
# 2. Load Processed Data
# -------------------------
# The path is relative to the directory the notebook is run from.
processed_csv = "../data/processed/amazon_sales_processed.csv"
data = pd.read_csv(processed_csv)

# Sanity check: table dimensions followed by the first five rows.
print("Data Shape:", data.shape)
print(data.head())

# -------------------------
# 3. Define Features & Target
# -------------------------
# Change 'discounted_price_num' to your target column if needed
target = "discounted_price_num"

# The processed file still carries the raw `discounted_price` text column
# (values such as "₹399"), which is the target written as a string. If it stays
# in X it is one-hot encoded later and hands the model the answer, so it is
# removed together with the target. Update this list if `target` changes.
# NOTE(review): `actual_price` combined with `discount_percentage` may also
# determine the discounted price — confirm whether both should stay as features.
leakage_cols = ["discounted_price"]

# A missing target still raises KeyError; a missing leakage column is tolerated.
X = data.drop(columns=[target]).drop(columns=leakage_cols, errors="ignore")
y = data[target]

# -------------------------
# 4. Train/Test Split
# -------------------------
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each feature partition.
for split_label, split_frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(split_label, split_frame.shape)

# -------------------------
# 5. Separate Categorical & Numerical Columns
# -------------------------
# Split by dtype family instead of an exact dtype list so that every feature
# column lands in exactly one group. Matching only "object" / "int64" /
# "float64" silently dropped columns of any other dtype (e.g. int32, float32,
# category) from the final feature matrix.
# NOTE(review): bool or datetime columns, if any exist, are now treated as
# categorical rather than being dropped — confirm this is the desired handling.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

print("Categorical Columns:", list(cat_cols))
print("Numerical Columns:", list(num_cols))

# -------------------------
# 6. Impute Missing Values
# -------------------------
# The frames produced by train_test_split are row subsets of X. Assigning
# columns into them directly can trigger pandas' SettingWithCopyWarning, so
# take explicit copies first and mutate those instead.
X_train = X_train.copy()
X_test = X_test.copy()

# Categorical gaps get the most frequent value; numerical gaps get the column mean.
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="mean")

# Fit on the training rows only, then reuse the learned statistics on the test
# rows so no information from the test set influences preprocessing.
X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# -------------------------
# 7. Encode Categorical Columns
# -------------------------
# handle_unknown="ignore" lets transform() accept categories that were not seen
# while fitting; dense output so the matrices can be wrapped in DataFrames.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# --- training split: learn the category vocabulary and encode ---
X_train_encoded = encoder.fit_transform(X_train[cat_cols])
encoded_cols = encoder.get_feature_names_out(cat_cols)
X_train_encoded_df = pd.DataFrame(
    X_train_encoded, index=X_train.index, columns=encoded_cols
)

# --- test split: encode with the vocabulary learned above ---
X_test_encoded = encoder.transform(X_test[cat_cols])
X_test_encoded_df = pd.DataFrame(
    X_test_encoded, index=X_test.index, columns=encoded_cols
)

# Final design matrices: numerical columns first, one-hot columns after,
# aligned on the original row index.
X_train_final = pd.concat([X_train[num_cols], X_train_encoded_df], axis="columns")
X_test_final = pd.concat([X_test[num_cols], X_test_encoded_df], axis="columns")

for shape_label, design_matrix in (
    ("Final Train Shape:", X_train_final),
    ("Final Test Shape:", X_test_final),
):
    print(shape_label, design_matrix.shape)

# -------------------------
# 8. Train Random Forest Model
# -------------------------
# 200 trees, a fixed seed for repeatable results, and all available CPU cores.
rf_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
rf_model = RandomForestRegressor(**rf_params)

# Fit on the encoded training matrix.
rf_model.fit(X_train_final, y_train)

# -------------------------
# 9. Evaluate Model
# -------------------------
# Score the held-out rows.
y_pred = rf_model.predict(X_test_final)

# RMSE is the square root of the mean squared error.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R²:", r2)):
    print(metric_label, metric_value)

# -------------------------
# 10. Save Model & Preprocessing Objects
# -------------------------
# Persist the fitted model together with every fitted preprocessing step so the
# same transformations can be replayed at prediction time. Paths are relative
# to the current working directory.
artifacts = {
    "./random_forest_model.pkl": rf_model,
    "./cat_imputer.pkl": cat_imputer,
    "./num_imputer.pkl": num_imputer,
    "./onehot_encoder.pkl": encoder,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("✅ Model and preprocessing objects saved successfully in notebooks/")


Data Shape: (1465, 26)
   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹34