In [1]:
# 📦 IMPORTS
import pandas as pd
import numpy as np
from catboost_model_trainer import train_category_specific_catboost
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_squared_log_error

print("✅ Imports loaded")


✅ Imports loaded


In [2]:
# 📊 LOAD DATA
print("📊 Loading data...")
category_data = pd.read_csv('../outputs/data_to_train.csv')

print(f"✅ Data loaded: {len(category_data):,} rows, {category_data['categ'].nunique()} categories")
print(f"📅 Date range: {category_data['week'].min()} to {category_data['week'].max()}")
print(f"🎯 Target range: {category_data['target_2w'].min():.1f} to {category_data['target_2w'].max():.1f}")


📊 Loading data...
✅ Data loaded: 7,384 rows, 71 categories
📅 Date range: 2016-08-29 to 2018-08-20
🎯 Target range: 0.0 to 438.0


In [3]:
category_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
demand,7384.0,15.01788,33.415042,0.0,0.0,1.0,11.0,438.0
price,7384.0,169.2422,169.703497,35.327857,85.415501,134.791235,183.16009,1366.261327
active_products,7384.0,455.3239,737.385845,1.0,31.0,95.0,517.0,3029.0
lag_1w,7384.0,14.85699,33.32972,0.0,0.0,1.0,10.0,438.0
lag_2w,7384.0,14.58031,32.950964,0.0,0.0,1.0,10.0,438.0
lag_4w,7384.0,13.96641,32.066413,0.0,0.0,1.0,9.0,438.0
rolling_mean_4w,7384.0,14.42054,31.578954,0.0,0.0,1.5,9.75,251.5
rolling_std_4w,7384.0,4.118196,9.013293,0.0,0.0,1.258306,4.041452,149.810992
simple_trend,7384.0,0.2541644,6.150576,-103.25,-0.25,0.0,0.5,95.75
demand_growth,7384.0,0.08027529,0.872542,-1.0,-0.129903,0.0,0.0,13.0


In [4]:
def safe_rmsle(y_true, y_pred, epsilon=0.001):
    y_true_safe = np.maximum(y_true, epsilon)
    y_pred_safe = np.maximum(y_pred, epsilon)
    return np.sqrt(mean_squared_log_error(y_true_safe, y_pred_safe))

In [5]:
# 🔧 DEFINE FEATURES (CLEAN & SIMPLE)
features_to_train = [
    'price', 'active_products', 'lag_1w', 'lag_2w', 'lag_4w', 'rolling_mean_4w', 
    'rolling_std_4w', 'simple_trend', 'demand_growth', 'price_change', 'price_change_pct', 
    'price_volatility', 'price_vs_market', 'active_products_change', 'month', 'weekofyear',
    'quarter', 'days_to_christmas', 'is_holiday_season', 'is_summer', 'is_back_to_school', 
    'market_share', 'demand_rank', 'relative_growth'
]

print("🚀 TRAINING GLOBAL LightGBM MODEL ON WEEKLY DATA WITH CROSS-VALIDATION")
print("=" * 60)

# Prepare features and target
cat_features = features_to_train + ['categ']
X = category_data[cat_features]
y = category_data['target_2w']

tscv = TimeSeriesSplit(n_splits=3)

print("🚀 TRAINING GLOBAL CATBOOST MODEL ON WEEKLY DATA WITH CROSS-VALIDATION")
print("=" * 60)

r2_scores = []
mae_scores = []
rmsle_scores = []

print("Running TimeSeries Cross-Validation...")
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
    train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

    # Train category-specific model for this fold
    cat_global_model = CatBoostRegressor(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        verbose=False,
        thread_count=-1
    )
    
    # Train model
    cat_global_model.fit(train_X, train_y, cat_features=['categ'])
    y_pred = cat_global_model.predict(test_X)

    # ✅ SAFE RMSLE CALCULATION
    r2 = r2_score(test_y, y_pred)
    mae = mean_absolute_error(test_y, y_pred)
    rmsle = safe_rmsle(test_y, y_pred)
    
    r2_scores.append(r2)
    mae_scores.append(mae)
    rmsle_scores.append(rmsle)
    print(f"  Fold {i+1} -> R² Score: {r2:.3f} | RMSLE: {rmsle:.3f} | MAE: {mae:.2f}")

# Print results
print("-" * 60)
print(f"  ✅ Average R2 Score: {np.mean(r2_scores):.3f}")
print(f"  ✅ Average RMSLE Score: {np.mean(rmsle_scores):.3f}")
print(f"  ✅ Average MAE: {np.mean(mae_scores):.2f}")


🚀 TRAINING GLOBAL LightGBM MODEL ON WEEKLY DATA WITH CROSS-VALIDATION
🚀 TRAINING GLOBAL CATBOOST MODEL ON WEEKLY DATA WITH CROSS-VALIDATION
Running TimeSeries Cross-Validation...
  Fold 1 -> R² Score: 0.820 | RMSLE: 0.658 | MAE: 2.77
  Fold 2 -> R² Score: 0.863 | RMSLE: 0.708 | MAE: 5.97
  Fold 3 -> R² Score: 0.849 | RMSLE: 0.698 | MAE: 6.26
------------------------------------------------------------
  ✅ Average R2 Score: 0.844
  ✅ Average RMSLE Score: 0.688
  ✅ Average MAE: 5.00


In [6]:
def train_catboost_model(data, features, target_col, catboost_params, verbose=False):
    
    tscv = TimeSeriesSplit(n_splits=3)

    r2_scores = []
    mae_scores = []
    rmsle_scores = []

    print("Running TimeSeries Cross-Validation...")
    for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
        train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
        train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

        # Train category-specific model for this fold
        cat_global_model = CatBoostRegressor(
            **catboost_params,
            random_seed=42,
            verbose=False,
            thread_count=-1
        )
        
        # Train model
        cat_global_model.fit(train_X, train_y, cat_features=['categ'])
        y_pred = cat_global_model.predict(test_X)

        # ✅ SAFE RMSLE CALCULATION
        r2 = r2_score(test_y, y_pred)
        mae = mean_absolute_error(test_y, y_pred)
        rmsle = safe_rmsle(test_y, y_pred)
        
        r2_scores.append(r2)
        mae_scores.append(mae)
        rmsle_scores.append(rmsle)
        print(f"  Fold {i+1} -> R² Score: {r2:.3f} | RMSLE: {rmsle:.3f} | MAE: {mae:.2f}")
        
    return r2_scores, mae_scores, rmsle_scores


In [7]:
            

def train_catboost_model_by_category(data, features, target_col, catboost_params, verbose=False):
    
    tscv = TimeSeriesSplit(n_splits=3)

    results = {}

    for category in data['categ'].unique():
        print(f"Training model for category: {category}")
        cat_data = data[data['categ'] == category].copy()
        X = cat_data[features]
        y = cat_data[target_col]
        
        for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
            train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
            train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

        # Train category-specific model for this fold
        model = CatBoostRegressor(
            **catboost_params,
            random_seed=42,
            verbose=False,
            thread_count=-1
        )
        
        # Train model
        model.fit(train_X, train_y, cat_features=['categ'])
        y_pred = model.predict(test_X)

        metrics = {
            'r2': r2_score(test_y, y_pred),
            'mae': mean_absolute_error(test_y, y_pred),
            'rmsle': safe_rmsle(test_y, y_pred)
        }
      
        print(f"  Fold {i+1} -> R² Score: {r2:.3f} | RMSLE: {rmsle:.3f} | MAE: {mae:.2f}")
                # Store results
        results[category] = {
            'model': model,
            'metrics': metrics,
            'predictions': {
                'y_true': test_y.values,
                'y_pred': y_pred
            },
            'data_info': {
                'total_samples': len(cat_data),
                'train_samples': len(train_X),
                'test_samples': len(test_X),
                'sparsity': float((data == 0).mean())
            },
            'feature_importance': dict(zip(features, model.feature_importances_))
        }           
        return results


In [8]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

def safe_rmsle(y_true, y_pred):
    y_true = np.maximum(y_true, 0)
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

def train_catboost_model_by_category(data, features, target_col, catboost_params, verbose=False):
    tscv = TimeSeriesSplit(n_splits=3)
    results = {}

    for category in data['categ'].unique():
        if verbose:
            print(f"📦 Training model for category: {category}")
        cat_data = data[data['categ'] == category].copy()
        X = cat_data[features]
        y = cat_data[target_col]

        fold_metrics = []
        fold_predictions = []
        model = None

        for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
            train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
            train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

            model = CatBoostRegressor(
                **catboost_params,
                random_seed=42,
                verbose=False,
                thread_count=-1
            )

            model.fit(train_X, train_y, cat_features=['categ'])

            y_pred = model.predict(test_X)

            r2 = r2_score(test_y, y_pred)
            mae = mean_absolute_error(test_y, y_pred)
            rmsle = safe_rmsle(test_y, y_pred)

            fold_metrics.append({'r2': r2, 'mae': mae, 'rmsle': rmsle})
            fold_predictions.append((test_y.values, y_pred))

            if verbose:
                print(f"  Fold {i+1} -> R²: {r2:.3f}, RMSLE: {rmsle:.3f}, MAE: {mae:.2f}")

        avg_metrics = {
            'r2': np.mean([m['r2'] for m in fold_metrics]),
            'mae': np.mean([m['mae'] for m in fold_metrics]),
            'rmsle': np.mean([m['rmsle'] for m in fold_metrics]),
        }

        results[category] = {
            'model': model,
            'metrics': avg_metrics,
            'predictions': {
                'y_true': np.concatenate([p[0] for p in fold_predictions]),
                'y_pred': np.concatenate([p[1] for p in fold_predictions])
            },
            'data_info': {
                'total_samples': len(cat_data),
                'sparsity': float((cat_data[features] == 0).mean().mean())
            },
            'feature_importance': dict(zip(features, model.feature_importances_))
        }

    return results



categ_results = train_catboost_model_by_category(
    data=category_data,
    features=features_to_train + ['categ'],
    target_col='target_2w',
    catboost_params={
        'iterations': 200,
        'depth': 6,
        'learning_rate': 0.1,
    },
    verbose=True
)

# ✅ Now categ_results contains all your trained models and metrics!
print(f"✅ Successfully trained {len(categ_results)} models")
print(f"📊 Categories: {list(categ_results.keys())[:5]}...")

📦 Training model for category: agro_industry_and_commerce
  Fold 1 -> R²: -0.149, RMSLE: 0.450, MAE: 0.56
  Fold 2 -> R²: -1.153, RMSLE: 1.166, MAE: 3.12
  Fold 3 -> R²: -0.159, RMSLE: 0.740, MAE: 2.74
📦 Training model for category: air_conditioning
  Fold 1 -> R²: -0.270, RMSLE: 0.513, MAE: 1.41
  Fold 2 -> R²: -0.339, RMSLE: 0.754, MAE: 2.11
  Fold 3 -> R²: -0.266, RMSLE: 0.701, MAE: 2.41
📦 Training model for category: art
  Fold 1 -> R²: -0.661, RMSLE: 0.767, MAE: 1.15
  Fold 2 -> R²: -0.163, RMSLE: 0.673, MAE: 1.38
  Fold 3 -> R²: -1.126, RMSLE: 1.036, MAE: 3.93
📦 Training model for category: arts_and_craftmanship


CatBoostError: catboost/libs/metrics/metric.cpp:6935: All train targets are equal

In [None]:
print(f"\n📊 FEATURE IMPORTANCE ANALYSIS")


In [7]:
# 📊 ANALYZE RESULTS
print("📊 RESULTS ANALYSIS")
print("=" * 50)

if catboost_results:
    # Extract performance metrics
    performance = []
    for cat, result in catboost_results.items():
        performance.append({
            'category': cat,
            'r2': result['metrics']['r2'],
            'rmsle': result['metrics']['rmsle'],
            'mae': result['metrics']['mae']
        })
    
    # Sort by RMSLE (lower = better)
    performance.sort(key=lambda x: x['rmsle'])
    
    # Overall statistics
    all_rmsle = [p['rmsle'] for p in performance]
    all_r2 = [p['r2'] for p in performance]
    
    print(f"📈 OVERALL PERFORMANCE:")
    print(f"   Models trained: {len(performance)}")
    print(f"   Average RMSLE: {np.mean(all_rmsle):.3f} ± {np.std(all_rmsle):.3f}")
    print(f"   Average R²: {np.mean(all_r2):.3f} ± {np.std(all_r2):.3f}")
    print(f"   Best RMSLE: {min(all_rmsle):.3f}")
    print(f"   Worst RMSLE: {max(all_rmsle):.3f}")
    
    print(f"\n🏆 TOP 5 BEST CATEGORIES:")
    for i, p in enumerate(performance[:5]):
        print(f"   {i+1}. {p['category']}: RMSLE={p['rmsle']:.3f}, R²={p['r2']:.3f}")
    
    print(f"\n⚠️  WORST 3 CATEGORIES:")
    for i, p in enumerate(performance[-3:]):
        print(f"   {i+1}. {p['category']}: RMSLE={p['rmsle']:.3f}, R²={p['r2']:.3f}")
    
    # Count good performers
    good_models = sum(1 for p in performance if p['rmsle'] < 1.0)
    print(f"\n📈 PERFORMANCE BREAKDOWN:")
    print(f"   Models with RMSLE < 1.0: {good_models}/{len(performance)} ({good_models/len(performance)*100:.1f}%)")
    
else:
    print("❌ No models were trained")


📊 RESULTS ANALYSIS
📈 OVERALL PERFORMANCE:
   Models trained: 71
   Average RMSLE: 0.842 ± 0.256
   Average R²: -0.818 ± 1.413
   Best RMSLE: 0.071
   Worst RMSLE: 1.220

🏆 TOP 5 BEST CATEGORIES:
   1. security_and_services: RMSLE=0.071, R²=0.000
   2. fashion_childrens_clothes: RMSLE=0.204, R²=-0.544
   3. furniture_mattress_and_upholstery: RMSLE=0.315, R²=-2.815
   4. home_comfort_2: RMSLE=0.403, R²=-0.093
   5. fashion_sport: RMSLE=0.423, R²=-6.503

⚠️  WORST 3 CATEGORIES:
   1. bed_bath_table: RMSLE=1.204, R²=0.149
   2. construction_tools_lights: RMSLE=1.219, R²=-1.532
   3. furniture_decor: RMSLE=1.220, R²=0.336

📈 PERFORMANCE BREAKDOWN:
   Models with RMSLE < 1.0: 48/71 (67.6%)


In [8]:
# 🥊 COMPARE WITH LIGHTGBM
print("🥊 MODEL COMPARISON")
print("=" * 50)

# LightGBM baseline results (from previous analysis)
lgbm_r2 = 0.818
lgbm_rmsle = 0.620
lgbm_mae = 5.14

if catboost_results:
    # CatBoost average results
    catboost_r2 = np.mean([r['metrics']['r2'] for r in catboost_results.values()])
    catboost_rmsle = np.mean([r['metrics']['rmsle'] for r in catboost_results.values()])
    catboost_mae = np.mean([r['metrics']['mae'] for r in catboost_results.values()])
    
    print(f"📊 PERFORMANCE COMPARISON:")
    print(f"   LightGBM (Global):  R²={lgbm_r2:.3f}, RMSLE={lgbm_rmsle:.3f}, MAE={lgbm_mae:.2f}")
    print(f"   CatBoost (Average): R²={catboost_r2:.3f}, RMSLE={catboost_rmsle:.3f}, MAE={catboost_mae:.2f}")
    
    # Determine winner by RMSLE (most important for demand forecasting)
    if catboost_rmsle < lgbm_rmsle:
        winner = "CatBoost"
        improvement = (lgbm_rmsle - catboost_rmsle) / lgbm_rmsle * 100
        print(f"\n🏆 WINNER: {winner} (RMSLE improved by {improvement:.1f}%)")
    else:
        winner = "LightGBM"
        difference = (catboost_rmsle - lgbm_rmsle) / lgbm_rmsle * 100
        print(f"\n🏆 WINNER: {winner} (CatBoost RMSLE {difference:.1f}% higher)")
    
    print(f"\n🎯 RECOMMENDATION:")
    if winner == "CatBoost":
        print(f"   Use CatBoost category-specific models for production")
        print(f"   Focus on categories with RMSLE < 0.8")
    else:
        print(f"   LightGBM global model performs better overall")
        print(f"   Consider using CatBoost only for best-performing categories")

else:
    print("❌ No CatBoost results to compare")


🥊 MODEL COMPARISON
📊 PERFORMANCE COMPARISON:
   LightGBM (Global):  R²=0.818, RMSLE=0.620, MAE=5.14
   CatBoost (Average): R²=-0.818, RMSLE=0.842, MAE=9.62

🏆 WINNER: LightGBM (CatBoost RMSLE 35.8% higher)

🎯 RECOMMENDATION:
   LightGBM global model performs better overall
   Consider using CatBoost only for best-performing categories


In [None]:
# 💾 SAVE RESULTS
print("💾 SAVING RESULTS")
print("=" * 50)

if catboost_results:
    # Create results dataframe
    results_data = []
    for cat, result in catboost_results.items():
        results_data.append({
            'category': cat,
            'r2_score': result['metrics']['r2'],
            'rmsle': result['metrics']['rmsle'],
            'mae': result['metrics']['mae'],
            'train_samples': result['data_info']['train_samples'],
            'test_samples': result['data_info']['test_samples'],
            'sparsity_pct': result['data_info']['sparsity'] * 100
        })
    
    results_df = pd.DataFrame(results_data)
    results_df = results_df.sort_values('rmsle')  # Sort by performance
    
    # Save to file
    output_file = '../outputs/simple_catboost_results.csv'
    results_df.to_csv(output_file, index=False)
    
    print(f"✅ Results saved to: {output_file}")
    print(f"📊 Saved {len(results_df)} category results")
    
    # Show sample
    print(f"\n📋 BEST 3 RESULTS PREVIEW:")
    print(results_df[['category', 'rmsle', 'r2_score', 'mae']].head(3).to_string(index=False))
    
else:
    print("❌ No results to save")


In [9]:
# 🎯 FINAL SUMMARY & NEXT STEPS
print("🎯 FINAL SUMMARY")
print("=" * 50)

if catboost_results:
    best_category = min(catboost_results.items(), key=lambda x: x[1]['metrics']['rmsle'])
    worst_category = max(catboost_results.items(), key=lambda x: x[1]['metrics']['rmsle'])
    
    print(f"📊 TRAINING SUMMARY:")
    print(f"   ✅ Successfully trained {len(catboost_results)} CatBoost models")
    print(f"   🏆 Best category: {best_category[0]} (RMSLE: {best_category[1]['metrics']['rmsle']:.3f})")
    print(f"   ⚠️  Worst category: {worst_category[0]} (RMSLE: {worst_category[1]['metrics']['rmsle']:.3f})")
    
    avg_rmsle = np.mean([r['metrics']['rmsle'] for r in catboost_results.values()])
    print(f"   📈 Average RMSLE: {avg_rmsle:.3f}")
    
    print(f"\n🚀 NEXT STEPS FOR PRODUCTION:")
    print(f"   1. Deploy models for categories with RMSLE < 0.8")
    print(f"   2. Use ensemble approach for critical categories")
    print(f"   3. Monitor model performance weekly")
    print(f"   4. Retrain models monthly with new data")
    
    # Production-ready categories
    production_ready = [cat for cat, result in catboost_results.items() 
                       if result['metrics']['rmsle'] < 0.8]
    
    print(f"\n✅ PRODUCTION-READY CATEGORIES ({len(production_ready)}):")
    for cat in production_ready[:10]:  # Show first 10
        rmsle = catboost_results[cat]['metrics']['rmsle']
        print(f"   • {cat}: RMSLE={rmsle:.3f}")
    
    if len(production_ready) > 10:
        print(f"   ... and {len(production_ready) - 10} more")

else:
    print("❌ No models were trained successfully")

print(f"\n🎉 ANALYSIS COMPLETE (KISS PRINCIPLE APPLIED!)")


🎯 FINAL SUMMARY
📊 TRAINING SUMMARY:
   ✅ Successfully trained 71 CatBoost models
   🏆 Best category: security_and_services (RMSLE: 0.071)
   ⚠️  Worst category: furniture_decor (RMSLE: 1.220)
   📈 Average RMSLE: 0.842

🚀 NEXT STEPS FOR PRODUCTION:
   1. Deploy models for categories with RMSLE < 0.8
   2. Use ensemble approach for critical categories
   3. Monitor model performance weekly
   4. Retrain models monthly with new data

✅ PRODUCTION-READY CATEGORIES (29):
   • agro_industry_and_commerce: RMSLE=0.780
   • air_conditioning: RMSLE=0.729
   • arts_and_craftmanship: RMSLE=0.797
   • audio: RMSLE=0.713
   • books_imported: RMSLE=0.703
   • cds_dvds_musicals: RMSLE=0.423
   • christmas_supplies: RMSLE=0.730
   • costruction_tools_garden: RMSLE=0.716
   • costruction_tools_tools: RMSLE=0.678
   • diapers_and_hygiene: RMSLE=0.533
   ... and 19 more

🎉 ANALYSIS COMPLETE (KISS PRINCIPLE APPLIED!)
