# Enhanced Model Comparison

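This notebook compares the enhanced linear, ridge, and random forest regression models (trained with the advanced feature set) on cross-validation and test-set metrics, then measures the improvement of the best enhanced model over the best basic model.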


In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load enhanced model results
results_dir = '../../models/enhanced/'


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file, so
    only point this at result files written by this project's own notebooks.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# results_dir already ends with '/', so strip it before joining to avoid
# building paths such as '../../models/enhanced//linear_results.pkl'.
_results_base = results_dir.rstrip('/')

try:
    linear_results = _load_pickle(f'{_results_base}/linear_results.pkl')
    ridge_results = _load_pickle(f'{_results_base}/ridge_results.pkl')
    rf_results = _load_pickle(f'{_results_base}/random_forest_results.pkl')
except FileNotFoundError as e:
    print(f"⚠️ Missing results file: {e}")
    print("Please run notebooks 4-2, 4-3, and 4-4 first")
    # Stop here: every later cell needs all three result objects, and letting
    # the notebook continue would only surface as an unrelated NameError.
    raise

print("✓ All enhanced model results loaded")


✓ All enhanced model results loaded


In [2]:
# Performance comparison
enhanced_comparison = pd.DataFrame([linear_results, ridge_results, rf_results])

# Width of every metric column in the printed tables.
METRIC_COL_WIDTH = 12
# The model column is sized from the data (never narrower than the previous
# fixed 25 characters) so long names such as "Enhanced Linear Regression"
# (26 characters) no longer push the metric columns out of alignment.
MODEL_COL_WIDTH = max(
    25,
    int(enhanced_comparison['model_name'].astype(str).str.len().max()) + 1,
)


def _fmt_decimal(value):
    """Format a log-scale error or an R² score with four decimals."""
    return f"{value:.4f}"


def _fmt_dollars(value):
    """Format an original-scale error as whole dollars with thousands separators."""
    return f"${value:,.0f}"


def _fmt_percent(value):
    """Format a percentage with one decimal, keeping the % sign next to the number."""
    return f"{value:.1f}%"


def _metric_columns(prefix):
    """Return (header, result key, formatter) triples for one evaluation split.

    ``prefix`` is the key prefix used in the result dictionaries
    ('cv' or 'test').
    """
    return [
        ('RMSE(log)', f'{prefix}_rmse_log', _fmt_decimal),
        ('MAE(log)', f'{prefix}_mae_log', _fmt_decimal),
        ('R²', f'{prefix}_r2', _fmt_decimal),
        ('RMSE($)', f'{prefix}_rmse_original', _fmt_dollars),
        ('MAE($)', f'{prefix}_mae_original', _fmt_dollars),
    ]


def _print_metric_table(title, frame, columns):
    """Print ``title`` followed by one aligned row per model in ``frame``.

    Each value is formatted first and padded afterwards, so unit symbols
    ($ and %) stay attached to their numbers and every cell has the same
    width as its header. The separator is as long as the header line.
    """
    header_cells = [f"{label:<{METRIC_COL_WIDTH}}" for label, _, _ in columns]
    header = f"{'Model':<{MODEL_COL_WIDTH}} " + " ".join(header_cells)

    print(title)
    print(header)
    print("-" * len(header))

    for _, row in frame.iterrows():
        cells = [f"{fmt(row[key]):<{METRIC_COL_WIDTH}}" for _, key, fmt in columns]
        print(f"{str(row['model_name']):<{MODEL_COL_WIDTH}} " + " ".join(cells))


print("=== ENHANCED MODEL PERFORMANCE COMPARISON ===")

# NOTE(review): the saved output of this cell shows $0 for both CV dollar
# metrics of every model — presumably the upstream notebooks store a
# placeholder for cv_rmse_original / cv_mae_original; verify before relying
# on those two columns.
_print_metric_table(
    "\n1. Cross Validation Performance:",
    enhanced_comparison,
    _metric_columns('cv'),
)

_print_metric_table(
    "\n2. Test Set Performance:",
    enhanced_comparison,
    _metric_columns('test'),
)

_print_metric_table(
    "\n3. Prediction Accuracy:",
    enhanced_comparison,
    [
        ('Within 10%', 'within_10pct', _fmt_percent),
        ('Within 20%', 'within_20pct', _fmt_percent),
    ],
)


=== ENHANCED MODEL PERFORMANCE COMPARISON ===

1. Cross Validation Performance:
Model                     RMSE(log)    MAE(log)     R²           RMSE($)      MAE($)      
-------------------------------------------------------------------------------------
Enhanced Linear Regression 0.1176       0.0854       0.8988       $0           $0          
Enhanced Ridge Regression 0.1178       0.0854       0.8984       $0           $0          
Enhanced Random Forest    0.1107       0.0736       0.9102       $0           $0          

2. Test Set Performance:
Model                     RMSE(log)    MAE(log)     R²           RMSE($)      MAE($)      
-------------------------------------------------------------------------------------
Enhanced Linear Regression 0.1109       0.0784       0.9123       $20,920      $14,257     
Enhanced Ridge Regression 0.1110       0.0782       0.9121       $20,961      $14,200     
Enhanced Random Forest    0.1102       0.0678       0.9134       $21,854      $12,5

In [3]:
# Best model selection: the enhanced model with the highest test-set R².
best_model = enhanced_comparison.loc[enhanced_comparison['test_r2'].idxmax()]

print("\n=== BEST ENHANCED MODEL ===")
print("🏆 {}".format(best_model['model_name']))
print("Test R²: {:.4f}".format(best_model['test_r2']))
print("Test RMSE: ${:,.0f}".format(best_model['test_rmse_original']))
print("Within 10%: {:.1f}%".format(best_model['within_10pct']))

# Load the basic models' results and compare the best of them with the best
# enhanced model. If any basic result file is missing, the comparison is
# skipped with a warning instead of failing the cell.
basic_results_dir = '../../models/basic'

try:
    with open(basic_results_dir + '/linear_results.pkl', 'rb') as handle:
        basic_linear = pickle.load(handle)
    with open(basic_results_dir + '/ridge_results.pkl', 'rb') as handle:
        basic_ridge = pickle.load(handle)
    with open(basic_results_dir + '/random_forest_results.pkl', 'rb') as handle:
        basic_rf = pickle.load(handle)
except FileNotFoundError:
    print("\n⚠️ Basic model results not found for comparison")
else:
    # Same selection rule as for the enhanced models: highest test-set R².
    basic_comparison = pd.DataFrame([basic_linear, basic_ridge, basic_rf])
    best_basic = basic_comparison.loc[basic_comparison['test_r2'].idxmax()]

    print("\n=== IMPROVEMENT OVER BASIC MODELS ===")
    print("Best Basic Model: {}".format(best_basic['model_name']))
    print("  Test R²: {:.4f}".format(best_basic['test_r2']))
    print("  Within 10%: {:.1f}%".format(best_basic['within_10pct']))

    print("\nImprovement:")
    # R² gain is reported relative to the basic model's R² (in percent);
    # the within-10% gain is a plain difference in percentage points.
    r2_gap = best_model['test_r2'] - best_basic['test_r2']
    r2_improvement = (r2_gap / best_basic['test_r2']) * 100
    accuracy_improvement = best_model['within_10pct'] - best_basic['within_10pct']

    print("  R² improvement: {:+.1f}%".format(r2_improvement))
    print("  10% accuracy improvement: {:+.1f} percentage points".format(accuracy_improvement))

print("\n✓ Enhanced model comparison completed")



=== BEST ENHANCED MODEL ===
🏆 Enhanced Random Forest
Test R²: 0.9134
Test RMSE: $21,854
Within 10%: 79.1%

=== IMPROVEMENT OVER BASIC MODELS ===
Best Basic Model: Linear Regression
  Test R²: 0.8903
  Within 10%: 69.5%

Improvement:
  R² improvement: +2.6%
  10% accuracy improvement: +9.6 percentage points

✓ Enhanced model comparison completed
