# Basic Model Comparison

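This notebook compares the performance of the basic machine learning models (Linear Regression, Ridge Regression, and Random Forest) using the result files saved by notebooks 3-1, 3-2, and 3-3.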


In [15]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load basic model results
# Each model's results are read from its own pickle file under `results_dir`.
# Presumably each file holds a dict-like record of metrics (the comparison
# cell builds one DataFrame row per loaded object) — verify against the
# notebooks that write these files.
results_dir = '../../models/basic/'


def _load_results(file_name):
    """Unpickle and return one results file located in `results_dir`.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside `results_dir`.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    """
    # `results_dir` ends with a slash; strip it so the path has no '//'.
    path = f"{results_dir.rstrip('/')}/{file_name}"
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only use it on files produced by this project's own notebooks.
    with open(path, 'rb') as f:
        return pickle.load(f)


try:
    linear_results = _load_results('linear_results.pkl')
    ridge_results = _load_results('ridge_results.pkl')
    rf_results = _load_results('random_forest_results.pkl')

    print("✓ All basic model results loaded")

except FileNotFoundError as e:
    print(f"⚠️ Missing results file: {e}")
    print("Please run notebooks 3-1, 3-2, and 3-3 first")
    # Re-raise so execution stops here: continuing would either fail later
    # with an unrelated NameError or silently reuse stale variables left in
    # the kernel from a previous run.
    raise


✓ All basic model results loaded


In [16]:
# Performance comparison
# One row per model; columns are the keys of each loaded results object.
basic_comparison = pd.DataFrame([linear_results, ridge_results, rf_results])


def _print_metric_table(title, prefix):
    """Print one fixed-width metrics table covering every model.

    Parameters
    ----------
    title : str
        Heading printed above the table.
    prefix : str
        Column-name prefix selecting the metric family; the columns read are
        `{prefix}_rmse_log`, `{prefix}_mae_log`, `{prefix}_r2`,
        `{prefix}_rmse_original` and `{prefix}_mae_original`.
    """
    print(title)
    print(f"{'Model':<20} {'RMSE(log)':<12} {'MAE(log)':<12} {'R²':<12} {'RMSE($)':<12} {'MAE($)':<12}")
    print("-" * 80)

    rmse_log_col = f"{prefix}_rmse_log"
    mae_log_col = f"{prefix}_mae_log"
    r2_col = f"{prefix}_r2"
    rmse_usd_col = f"{prefix}_rmse_original"
    mae_usd_col = f"{prefix}_mae_original"

    for _, row in basic_comparison.iterrows():
        # The '$' occupies one character, so the amount is padded to 11 to
        # keep each dollar column 12 wide like the header.
        print(f"{row['model_name']:<20} "
              f"{row[rmse_log_col]:<12.4f} "
              f"{row[mae_log_col]:<12.4f} "
              f"{row[r2_col]:<12.4f} "
              f"${row[rmse_usd_col]:<11,.0f} "
              f"${row[mae_usd_col]:<11,.0f}")


print("=== BASIC MODEL PERFORMANCE COMPARISON ===")

# NOTE(review): the saved output of this cell shows $0 for the cross-validation
# RMSE($)/MAE($) of every model — presumably the upstream notebooks store 0 or
# a placeholder in cv_rmse_original / cv_mae_original; verify there.
_print_metric_table("\n1. Cross Validation Performance:", "cv")
_print_metric_table("\n2. Test Set Performance:", "test")

print("\n3. Prediction Accuracy:")
print(f"{'Model':<20} {'Within 10%':<12} {'Within 20%':<12}")
print("-" * 45)

for _, row in basic_comparison.iterrows():
    # Attach the '%' to the number before padding; padding the number first
    # leaves the sign detached from its value and misaligns the columns.
    within_10 = f"{row['within_10pct']:.1f}%"
    within_20 = f"{row['within_20pct']:.1f}%"
    print(f"{row['model_name']:<20} {within_10:<12} {within_20:<12}")


=== BASIC MODEL PERFORMANCE COMPARISON ===

1. Cross Validation Performance:
Model                RMSE(log)    MAE(log)     R²           RMSE($)      MAE($)      
--------------------------------------------------------------------------------
Linear Regression    0.1283       0.0920       0.8794       $0           $0          
Ridge Regression     0.1281       0.0917       0.8798       $0           $0          
Random Forest        0.1331       0.0949       0.8698       $0           $0          

2. Test Set Performance:
Model                RMSE(log)    MAE(log)     R²           RMSE($)      MAE($)      
--------------------------------------------------------------------------------
Linear Regression    0.1240       0.0853       0.8903       $23,813      $15,540     
Ridge Regression     0.1242       0.0856       0.8900       $23,859      $15,562     
Random Forest        0.1353       0.0950       0.8694       $26,320      $17,341     

3. Prediction Accuracy:
Model                W

In [17]:
# Best model selection: the winner is the row with the highest test-set R².
# NOTE(review): ranking on a test-set metric means the reported test scores
# are no longer an untouched estimate for the chosen model — consider whether
# cv_r2 should drive the selection instead; confirm intent.
winner_idx = basic_comparison['test_r2'].idxmax()
best_model = basic_comparison.loc[winner_idx]

# Assemble the summary first, then emit it in one call; the printed text is
# one line per entry, exactly as separate print() calls would produce.
summary_lines = [
    "\n=== BEST BASIC MODEL ===",
    f"🏆 {best_model['model_name']}",
    f"Test R²: {best_model['test_r2']:.4f}",
    f"Test RMSE: ${best_model['test_rmse_original']:,.0f}",
    f"Test MAE: ${best_model['test_mae_original']:,.0f}",
    f"Within 10%: {best_model['within_10pct']:.1f}%",
    f"Within 20%: {best_model['within_20pct']:.1f}%",
]
print("\n".join(summary_lines))

print("\n✓ Basic model comparison completed")



=== BEST BASIC MODEL ===
🏆 Linear Regression
Test R²: 0.8903
Test RMSE: $23,813
Test MAE: $15,540
Within 10%: 69.5%
Within 20%: 92.6%

✓ Basic model comparison completed
