In [1]:
# ============================================================
# AMAZON PRODUCT RATING PREDICTION MODEL
# ============================================================

import os
import pickle
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor

# Run banner: title and the modelling strategy followed below.
print(
    "="*70,
    "AMAZON PRODUCT RATING PREDICTION",
    "Strategy: Baseline → Tree-based → Advanced",
    "="*70,
    sep="\n",
)

# ============================================================
# 1. LOAD DATA
# ============================================================
print("\n" + "="*70, "1. DATA LOADING", "="*70, sep="\n")

# Path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed/amazon.csv')

n_rows, n_cols = df.shape
print(f"✓ Loaded: {n_rows} rows, {n_cols} columns")

# Quick summary of the regression target so the error metrics reported
# later can be read against its spread.
rating_col = df['rating']
print("\nTarget: rating")
print(f"  Range: {rating_col.min():.1f} - {rating_col.max():.1f}")
print(f"  Mean:  {rating_col.mean():.2f}")
print(f"  Std:   {rating_col.std():.2f}")

# ============================================================
# 2. FEATURE ENGINEERING (NO REVIEW DATA)
# ============================================================
print("\n" + "="*70)
print("2. FEATURE ENGINEERING")
print("="*70)

# Parse numeric features. The raw columns may arrive as display strings
# (currency symbol, thousands separators, percent sign) instead of numbers.
numeric_cols = ['discounted_price', 'actual_price', 'discount_percentage', 'rating_count']

for col in numeric_cols:
    # Check "not numeric" rather than "== object" so string-typed columns
    # (e.g. pandas' dedicated string dtype) are parsed as well.
    if not pd.api.types.is_numeric_dtype(df[col]):
        cleaned = df[col].astype(str)
        for symbol in ('₹', ',', '%'):
            # regex=False: these are literal characters, not patterns.
            cleaned = cleaned.str.replace(symbol, '', regex=False)
        # Anything still unparseable becomes NaN and is imputed below.
        df[col] = pd.to_numeric(cleaned, errors='coerce')

# Handle missing
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Derived features
# price_ratio is ±inf / NaN when actual_price is 0; those values are turned
# into NaN and median-imputed when X is assembled at the end of this section.
df['price_ratio'] = df['discounted_price'] / df['actual_price']
df['discount_amount'] = df['actual_price'] - df['discounted_price']
df['log_rating_count'] = np.log1p(df['rating_count'])

# discount_percentage is on a 0-100 scale when parsed from "64%" strings but
# may already be a 0-1 fraction in a pre-processed file. Normalise to a
# fraction (for this flag only) so "> 0.5" always means "more than half off"
# instead of being true for nearly every row on the 0-100 scale.
discount_fraction = df['discount_percentage']
if discount_fraction.max() > 1:
    discount_fraction = discount_fraction / 100
df['is_high_discount'] = (discount_fraction > 0.5).astype(int)

# Price category
# Bins are right-closed: (0, 500], (500, 2000], (2000, 10000], (10000, inf).
# A price of exactly 0 falls outside every bin and is encoded as 'nan' below.
df['price_category'] = pd.cut(
    df['actual_price'],
    bins=[0, 500, 2000, 10000, float('inf')],
    labels=['budget', 'mid', 'premium', 'luxury']
)

# Category encoding: keep only the top-level segment of the '|'-separated path.
if 'category' in df.columns:
    df['category_main'] = df['category'].str.split('|').str[0]
    le_category = LabelEncoder()
    df['category_encoded'] = le_category.fit_transform(df['category_main'].fillna('Unknown'))
else:
    # Keep the name defined: it is stored with the model artifacts later, and
    # leaving it unset raised NameError there when the column was absent.
    le_category = None
    df['category_encoded'] = 0

# Product name length
if 'product_name' in df.columns:
    df['product_name_length'] = df['product_name'].str.len()
else:
    df['product_name_length'] = 0

# Price category encoding (missing categories are encoded as the string 'nan').
le_price = LabelEncoder()
df['price_category_encoded'] = le_price.fit_transform(df['price_category'].astype(str))

# Final feature list
feature_cols = [
    'rating_count',
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'price_ratio',
    'discount_amount',
    'log_rating_count',
    'is_high_discount',
    'category_encoded',
    'price_category_encoded',
    'product_name_length'
]

print(f"\n✓ Engineered {len(feature_cols)} features:")
for feat in feature_cols:
    print(f"  - {feat}")

# Prepare X and y
# fillna() does not touch infinities, and the scaler / linear models reject
# them, so map ±inf to NaN first and let the median imputation cover both.
X = df[feature_cols].replace([np.inf, -np.inf], np.nan)
y = df['rating'].copy()

X = X.fillna(X.median())

print(f"\n✓ Feature matrix: {X.shape}")
print(f"✓ Target vector: {y.shape}")
print(f"✓ Missing values: {X.isnull().sum().sum()}")

# ============================================================
# 3. TRAIN-TEST SPLIT
# ============================================================
print("\n" + "="*70, "3. TRAIN-TEST SPLIT", "="*70, sep="\n")

# The target is continuous, so it is bucketed into five equal-width bins and
# the split is stratified on the bin label to keep the rating distribution
# similar in both partitions.
rating_bins = pd.cut(y, bins=5)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=rating_bins,
)

n_train, n_test = len(X_train), len(X_test)
print(f"✓ Train: {n_train} samples")
print(f"✓ Test:  {n_test} samples")
print(f"✓ Train mean: {y_train.mean():.3f}")
print(f"✓ Test mean:  {y_test.mean():.3f}")

# Scaling for linear models: statistics are learned from the training
# partition only and then applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled")

# ============================================================
# PHASE 1: BASELINE MODELS
# ============================================================
print("\n" + "="*70)
print("PHASE 1: BASELINE MODELS")
print("="*70)

# One dict per evaluated model; turned into a DataFrame in the comparison step.
results = []


def _regression_metrics(y_true, y_hat):
    """Return ``(MAE, RMSE, R²)`` for predictions ``y_hat`` against ``y_true``."""
    return (
        mean_absolute_error(y_true, y_hat),
        np.sqrt(mean_squared_error(y_true, y_hat)),
        r2_score(y_true, y_hat),
    )


def evaluate_model(phase, name, model, X_fit, X_eval):
    """Fit, score, cross-validate, record and print one estimator.

    Replaces the fit / predict / metrics / CV / append / print sequence that
    was previously copy-pasted for every model. Reads ``y_train`` and
    ``y_test`` from the notebook namespace and appends one row to ``results``.

    Parameters
    ----------
    phase : str
        Value stored in the ``'Phase'`` column (e.g. ``'Baseline'``).
    name : str
        Value stored in the ``'Model'`` column; later used to look the
        estimator up again.
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``. It is fitted in
        place on ``X_fit``.
    X_fit, X_eval : array-like
        Training and test feature matrices (scaled for linear models, raw for
        tree models).

    Returns
    -------
    array-like
        Predictions of the fitted model on ``X_eval``.
    """
    model.fit(X_fit, y_train)
    predictions = model.predict(X_eval)
    test_mae, test_rmse, test_r2 = _regression_metrics(y_test, predictions)

    # cross_val_score works on clones, so the fitted `model` is left untouched.
    # NOTE(review): for the linear models X_fit was standardised with statistics
    # from all training rows, so the folds are not fully independent of the
    # scaler fit (small optimistic bias in CV MAE).
    fold_scores = cross_val_score(model, X_fit, y_train, cv=5, scoring='neg_mean_absolute_error')
    fold_mae = -fold_scores.mean()

    results.append({
        'Phase': phase,
        'Model': name,
        'MAE': test_mae,
        'RMSE': test_rmse,
        'R²': test_r2,
        'CV_MAE': fold_mae
    })

    print(f"  MAE:    {test_mae:.4f}")
    print(f"  RMSE:   {test_rmse:.4f}")
    print(f"  R²:     {test_r2:.4f}")
    print(f"  CV MAE: {fold_mae:.4f}")
    return predictions


# 1.1 Mean Predictor
print("\n[1.1] Mean Predictor...")
baseline_pred = np.full(len(y_test), y_train.mean())
mean_mae, mean_rmse, mean_r2 = _regression_metrics(y_test, baseline_pred)

# Cross-validated MAE for the mean predictor. Previously the test MAE was
# copied into the CV_MAE column, which made that column incomparable across
# rows. The folds are five contiguous, unshuffled chunks of the training set,
# the same scheme an integer `cv=5` gives the regressors below.
y_train_arr = np.asarray(y_train, dtype=float)
fold_errors = []
for holdout_idx in np.array_split(np.arange(len(y_train_arr)), 5):
    fit_mask = np.ones(len(y_train_arr), dtype=bool)
    fit_mask[holdout_idx] = False
    fold_mean = y_train_arr[fit_mask].mean()
    fold_errors.append(np.abs(y_train_arr[holdout_idx] - fold_mean).mean())
mean_cv_mae = float(np.mean(fold_errors))

results.append({
    'Phase': 'Baseline',
    'Model': 'Mean Predictor',
    'MAE': mean_mae,
    'RMSE': mean_rmse,
    'R²': mean_r2,
    'CV_MAE': mean_cv_mae
})

print(f"  MAE:  {mean_mae:.4f}")
print(f"  RMSE: {mean_rmse:.4f}")
print(f"  R²:   {mean_r2:.4f}")

# 1.2 Linear Regression (trained on standardised features)
print("\n[1.2] Linear Regression...")
lr = LinearRegression()
y_pred = evaluate_model('Baseline', 'Linear Regression', lr, X_train_scaled, X_test_scaled)

# 1.3 Ridge Regression (trained on standardised features)
print("\n[1.3] Ridge Regression...")
ridge = Ridge(alpha=1.0)
y_pred = evaluate_model('Baseline', 'Ridge', ridge, X_train_scaled, X_test_scaled)

# ============================================================
# PHASE 2: TREE-BASED MODELS
# ============================================================
print("\n" + "="*70)
print("PHASE 2: TREE-BASED MODELS")
print("="*70)

# Tree models are scale-invariant, so they are trained on the raw features.

# 2.1 Decision Tree
print("\n[2.1] Decision Tree...")
dt = DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=42)
y_pred = evaluate_model('Tree-based', 'Decision Tree', dt, X_train, X_test)

# 2.2 Random Forest
print("\n[2.2] Random Forest...")
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1
)
y_pred = evaluate_model('Tree-based', 'Random Forest', rf, X_train, X_test)

# 2.3 Gradient Boosting
print("\n[2.3] Gradient Boosting...")
gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
y_pred = evaluate_model('Tree-based', 'Gradient Boosting', gb, X_train, X_test)

# ============================================================
# PHASE 3: ADVANCED MODELS
# ============================================================
print("\n" + "="*70)
print("PHASE 3: ADVANCED MODELS")
print("="*70)

# 3.1 XGBoost
print("\n[3.1] XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
# y_pred is kept assigned so it still holds the last model's test predictions,
# as it did before the refactor.
y_pred = evaluate_model('Advanced', 'XGBoost', xgb_model, X_train, X_test)

# ============================================================
# PHASE 4: MODEL COMPARISON & SELECTION
# ============================================================
print("\n" + "="*70)
print("PHASE 4: MODEL COMPARISON")
print("="*70)

results_df = pd.DataFrame(results)

print("\n" + results_df.to_string(index=False))

# Best model
# Rank on cross-validated MAE, which uses training data only. Picking the
# winner by its test-set MAE would reuse the test set for model selection and
# make the test metrics reported below optimistically biased.
best_idx = results_df['CV_MAE'].idxmin()
best_model_name = results_df.loc[best_idx, 'Model']
best_mae = results_df.loc[best_idx, 'MAE']
best_r2 = results_df.loc[best_idx, 'R²']
best_cv_mae = results_df.loc[best_idx, 'CV_MAE']

# Look the baseline up by name instead of assuming it is the first row.
baseline_mae = results_df.loc[results_df['Model'] == 'Mean Predictor', 'MAE'].iloc[0]
improvement = (baseline_mae - best_mae) / baseline_mae * 100

print(f"\n{'='*70}")
print(f"BEST MODEL: {best_model_name}")
print(f"  MAE:  {best_mae:.4f}")
print(f"  R²:   {best_r2:.4f}")
print(f"  CV MAE: {best_cv_mae:.4f} (selection criterion)")
print(f"  Improvement: {improvement:.1f}% vs baseline")
print(f"{'='*70}")

# ============================================================
# 5. FEATURE IMPORTANCE
# ============================================================
print("\n" + "="*70)
print("5. FEATURE IMPORTANCE (Best Model)")
print("="*70)

# Get best model object. The mean predictor has no estimator, hence None.
model_map = {
    'Mean Predictor': None,
    'Linear Regression': lr,
    'Ridge': ridge,
    'Decision Tree': dt,
    'Random Forest': rf,
    'Gradient Boosting': gb,
    'XGBoost': xgb_model
}

best_model = model_map[best_model_name]

if hasattr(best_model, 'feature_importances_'):
    # Tree-based models expose importances directly.
    importances = best_model.feature_importances_
    importance_title = "\nTop 10 Features:"
elif hasattr(best_model, 'coef_'):
    # Linear models were fitted on standardised features, so the absolute
    # coefficient is a comparable per-feature effect size. Previously this
    # section printed only its header when a linear model won.
    importances = np.abs(np.ravel(best_model.coef_))
    importance_title = "\nTop 10 Features (|standardized coefficient|):"
else:
    importances = None

if importances is not None:
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    print(importance_title)
    print(feature_importance.head(10).to_string(index=False))
else:
    # Say so explicitly instead of leaving an empty section.
    print(f"\nNo feature importances available for {best_model_name}")

# ============================================================
# 6. SAVE RESULTS
# ============================================================
print("\n" + "="*70)
print("6. SAVING RESULTS")
print("="*70)

# Output folders are created relative to the notebook's working directory.
os.makedirs('../models', exist_ok=True)
os.makedirs('../results', exist_ok=True)

# Only the linear models were trained on standardised features; the tree-based
# models saw the raw feature matrix. The scaler is always stored, so record
# whether it must be applied before calling predict on the saved model.
requires_scaling = best_model_name in ('Linear Regression', 'Ridge')

if best_model is None:
    # The mean predictor has no estimator object; make the empty artifact visible.
    print("⚠ Best model is the mean predictor: no estimator to save ('model' is None)")

# Save best model together with everything needed to rebuild its inputs.
model_artifacts = {
    'model': best_model,
    'scaler': scaler,
    'requires_scaling': requires_scaling,
    'features': feature_cols,
    'label_encoders': {
        'category': le_category,
        'price': le_price
    },
    'metrics': results_df.loc[best_idx].to_dict()
}

# NOTE: pickle files execute code on load; only load this artifact from a
# trusted location.
with open('../models/rating_prediction_model.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)

print(f"✓ Model saved: ../models/rating_prediction_model.pkl")

# Save results
results_df.to_csv('../results/model_comparison.csv', index=False)
print(f"✓ Results saved: ../results/model_comparison.csv")

print("\n" + "="*70)
print("MODEL TRAINING COMPLETE")
print("="*70)


AMAZON PRODUCT RATING PREDICTION
Strategy: Baseline → Tree-based → Advanced

1. DATA LOADING
✓ Loaded: 1351 rows, 25 columns

Target: rating
  Range: 2.0 - 5.0
  Mean:  4.09
  Std:   0.30

2. FEATURE ENGINEERING

✓ Engineered 11 features:
  - rating_count
  - discounted_price
  - actual_price
  - discount_percentage
  - price_ratio
  - discount_amount
  - log_rating_count
  - is_high_discount
  - category_encoded
  - price_category_encoded
  - product_name_length

✓ Feature matrix: (1351, 11)
✓ Target vector: (1351,)
✓ Missing values: 0

3. TRAIN-TEST SPLIT
✓ Train: 1080 samples
✓ Test:  271 samples
✓ Train mean: 4.092
✓ Test mean:  4.091
✓ Features scaled

PHASE 1: BASELINE MODELS

[1.1] Mean Predictor...
  MAE:  0.2209
  RMSE: 0.3023
  R²:   -0.0000

[1.2] Linear Regression...
  MAE:    0.2093
  RMSE:   0.2791
  R²:     0.1479
  CV MAE: 0.2110

[1.3] Ridge Regression...
  MAE:    0.2091
  RMSE:   0.2790
  R²:     0.1485
  CV MAE: 0.2109

PHASE 2: TREE-BASED MODELS

[2.1] Decision Tre