In [4]:
# 05_predict.ipynb

# -------------------------
# 1. Import Libraries
# -------------------------
import os

import joblib
import numpy as np
import pandas as pd

# -------------------------
# 2. Load New Product Data
# -------------------------
# Prefer a dedicated file of new products. When it is absent, fall back to a
# small, reproducible sample of the processed training data so the remaining
# steps can still be run end to end.
try:
    new_data = pd.read_csv("../data/processed/amazon_sales_new.csv")
except FileNotFoundError:
    print("New data file not found. Using a sample of processed training data instead.")
    full_data = pd.read_csv("../data/processed/amazon_sales_processed.csv")
    # Cap the sample size at the number of available rows: DataFrame.sample
    # raises ValueError when asked for more rows than exist (without replacement).
    new_data = full_data.sample(min(5, len(full_data)), random_state=42)
else:
    # Only the read itself is guarded; the success message lives outside the try body.
    print("Loaded new product data successfully!")

print("New Data Shape:", new_data.shape)
print(new_data.head())

# -------------------------
# 3. Drop Columns Not Needed for Prediction
# -------------------------
# The target is the value being predicted, so it is removed from the inputs.
# errors="ignore" turns the drop into a no-op when the column is not present.
target_col = "discounted_price_num"
new_data = new_data.drop(columns=[target_col], errors="ignore")

# -------------------------
# 4. Load Saved Model & Preprocessing Objects
# -------------------------
# NOTE(review): joblib artifacts are pickle-based; only load files from a
# trusted location.
# The paths are listed in the same order as the names they are unpacked into.
artifact_paths = (
    "./random_forest_model.pkl",
    "./cat_imputer.pkl",
    "./num_imputer.pkl",
    "./onehot_encoder.pkl",
)
rf_model, cat_imputer, num_imputer, encoder = map(joblib.load, artifact_paths)

# -------------------------
# 5. Use Preprocessed Training Column Names
# -------------------------
# Read the processed training data to recover which feature columns are
# categorical vs. numerical, and in which order they appear.
training_data = pd.read_csv("../data/processed/amazon_sales_processed.csv")
X_train_sample = training_data.drop(columns=[target_col])

cat_cols = X_train_sample.select_dtypes(include=["object"]).columns
num_cols = X_train_sample.select_dtypes(include=["int64", "float64"]).columns

# Keep only the columns present in training (categorical first, then numerical).
# Columns missing from new_data are created as NaN (the reindex default) rather
# than 0, so the imputers applied afterwards treat them as missing instead of
# seeing a literal 0 / an unseen category 0.
new_data = new_data.reindex(columns=list(cat_cols) + list(num_cols))

# A categorical column that had to be created is all-NaN and therefore float;
# keep every categorical column as object dtype so they are handled uniformly.
new_data = new_data.astype({col: object for col in cat_cols})

# -------------------------
# 6. Impute Missing Values
# -------------------------
# Apply each previously saved imputer to its own column group:
# categorical columns first, then numerical columns.
for cols, imputer in ((cat_cols, cat_imputer), (num_cols, num_imputer)):
    new_data[cols] = imputer.transform(new_data[cols])

# -------------------------
# 7. Encode Categorical Columns
# -------------------------
new_encoded = encoder.transform(new_data[cat_cols])
# Depending on how the encoder was configured, transform may return a SciPy
# sparse matrix, which pd.DataFrame cannot lay out against the given column
# names. Densify only in that case; dense output is left untouched.
if hasattr(new_encoded, "toarray"):
    new_encoded = new_encoded.toarray()

encoded_cols = encoder.get_feature_names_out(cat_cols)
# Reuse new_data's index so the concat below aligns rows correctly.
new_encoded_df = pd.DataFrame(new_encoded, columns=encoded_cols, index=new_data.index)

# Combine numerical + encoded categorical
new_X = pd.concat([new_data[num_cols], new_encoded_df], axis=1)
print("Processed New Data Shape:", new_X.shape)

# -------------------------
# 8. Predict Prices
# -------------------------
# Run the loaded model on the assembled feature matrix and attach the result
# to new_data as a new column, one prediction per row.
predictions = rf_model.predict(new_X)
new_data = new_data.assign(predicted_discounted_price=predictions)

# -------------------------
# 9. Show & Save Predictions
# -------------------------
# Always report the prediction; include the product name alongside it when
# that column is available.
output_cols = ["predicted_discounted_price"]
if "product_name" in new_data.columns:
    output_cols = ["product_name"] + output_cols

print(new_data[output_cols])

# Save to CSV
# to_csv does not create parent directories, so make sure the output folder
# exists first (no-op when it is already there).
os.makedirs("../outputs", exist_ok=True)
new_data[output_cols].to_csv("../outputs/predicted_prices.csv", index=False)
print("✅ Predictions saved to ../outputs/predicted_prices.csv")


New data file not found. Using a sample of processed training data instead.
New Data Shape: (5, 26)
     product_id                                       product_name  \
976  B09XXZXQC1  Xiaomi Pad 5| Qualcomm Snapdragon 860| 120Hz R...   
175  B09LHXNZLR  Skadioo WiFi Adapter for pc | Car Accessories,...   
275  B09LV13JFB  LOHAYA Voice Assistant Remote Compatible for A...   
548  B08M66K48D  POPIO Tempered Glass Screen Protector Compatib...   
869  B07KRCW6LZ  TP-Link Nano AC600 USB Wi-Fi Adapter(Archer T2...   

                                              category discounted_price  \
976                      Computers&Accessories|Tablets          ₹26,999   
175  Computers&Accessories|NetworkingDevices|Networ...             ₹199   
275  Electronics|HomeTheater,TV&Video|Accessories|R...             ₹399   
548  Electronics|Mobiles&Accessories|MobileAccessor...             ₹299   
869  Computers&Accessories|NetworkingDevices|Networ...             ₹999   

    actual_price discount_pe