# Final Model Selection

This notebook compares all trained models, including the tuned Ridge Basic and Ridge Advanced variants, and selects the best-performing one as the final model.


In [3]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(0, os.path.abspath('../../scripts/'))
import warnings
warnings.filterwarnings('ignore')
from pipelines import load_and_prepare_data

# Load all model results including tuned models
def load_model_results(models_root='../../models'):
    """Load the pickled result objects of every model experiment.

    Looks for a fixed set of result files under the ``basic``, ``advanced``
    and ``tuned`` sub-directories of ``models_root``. Files that do not exist
    are skipped, so the returned dict may hold fewer entries than the full
    set (or be empty).

    Parameters
    ----------
    models_root : str, optional
        Directory that contains the ``basic/``, ``advanced/`` and ``tuned/``
        result folders. Defaults to the notebook-relative ``'../../models'``.

    Returns
    -------
    dict
        Maps a model key (e.g. ``'ridge_basic'``, ``'linear_advanced'``,
        ``'ridge_basic_tuned'``) to the object unpickled from the matching
        file, in basic -> advanced -> tuned order.
    """
    # (sub-directory, file suffix, key suffix, model stems); the file name is
    # stem + file suffix and the result key is stem + key suffix.
    result_sets = [
        ('basic', '_results.pkl', '_basic',
         ['linear', 'ridge', 'random_forest']),
        ('advanced', '_advanced_results.pkl', '_advanced',
         ['linear', 'ridge', 'random_forest']),
        ('tuned', '_tuning_results.pkl', '_tuned',
         ['ridge_basic', 'ridge_advanced']),
    ]

    results = {}
    for subdir, file_suffix, key_suffix, stems in result_sets:
        for stem in stems:
            path = os.path.join(models_root, subdir, stem + file_suffix)
            if not os.path.exists(path):
                # Experiments that have not been run yet are simply left out
                continue
            # NOTE: pickle.load can execute arbitrary code; only point this at
            # result files produced by this project.
            with open(path, 'rb') as f:
                results[stem + key_suffix] = pickle.load(f)

    return results

# Read every persisted result set, then report how many were found and under which keys
results = load_model_results()
print(f"Loaded results for {len(results)} model configurations")
print("Available models:", list(results))


Loaded results for 8 model configurations
Available models: ['linear_basic', 'ridge_basic', 'random_forest_basic', 'linear_advanced', 'ridge_advanced', 'random_forest_advanced', 'ridge_basic_tuned', 'ridge_advanced_tuned']


In [4]:
# Create comprehensive comparison table
def _extract_cv_metrics(model_result):
    """Pull the cross-validation metrics out of one loaded result dict.

    Two layouts are recognised: top-level ``cv_<metric>`` / ``cv_<metric>_std``
    keys, or ``final_cv_<metric>`` / ``final_cv_<metric>_std`` keys nested
    under ``'phase_results'``. The top-level layout wins when both exist.

    Returns a dict with the columns ``RMSE``, ``RMSE_std``, ``MAE``,
    ``MAE_std``, ``R2`` and ``R2_std`` (in that order), or ``None`` when
    neither layout is present.
    """
    if 'cv_rmse' in model_result:
        source, prefix = model_result, 'cv_'
    elif 'phase_results' in model_result:
        source, prefix = model_result['phase_results'], 'final_cv_'
    else:
        return None

    metrics = {}
    for metric in ('rmse', 'mae', 'r2'):
        column = metric.upper()
        metrics[column] = source[prefix + metric]
        metrics[column + '_std'] = source[prefix + metric + '_std']
    return metrics


comparison_data = []
for model_key, model_result in results.items():
    metrics = _extract_cv_metrics(model_result)
    if metrics is None:
        # No recognised metric layout: leave this model out of the table
        continue

    comparison_data.append({
        'Model': model_result['model_name'],
        # Last part of the key ('basic', 'advanced', 'tuned'), title-cased
        'Type': model_key.split('_')[-1].title(),
        **metrics,
    })

# An empty table has no 'RMSE' column, so sorting it would fail with an opaque
# KeyError; stop here with an explicit message instead.
if not comparison_data:
    raise ValueError(
        "No model results with cross-validation metrics are available; "
        "cannot build the comparison table."
    )

# Create DataFrame and sort by RMSE (lowest error first, so row 0 is the best model)
comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('RMSE').reset_index(drop=True)

print("Model Performance Comparison (sorted by RMSE)")
print("=" * 80)
print(f"{'Model':<35} {'RMSE':<12} {'MAE':<12} {'R²':<8}")
print("-" * 80)
for _, row in comparison_df.iterrows():
    print(f"{row['Model']:<35} ${row['RMSE']:>8,.0f}   ${row['MAE']:>8,.0f}   {row['R2']:>6.4f}")

best_row = comparison_df.iloc[0]
print(f"\nBest Model: {best_row['Model']}")
print(f"Best RMSE: ${best_row['RMSE']:,.0f}")
print(f"Best R²: {best_row['R2']:.4f}")

# Show top 3 models (fewer if the table is shorter)
print("\nTop 3 Models:")
for rank, (_, model) in enumerate(comparison_df.head(3).iterrows(), start=1):
    print(f"  {rank}. {model['Model']}: RMSE ${model['RMSE']:,.0f}, R² {model['R2']:.4f}")


Model Performance Comparison (sorted by RMSE)
Model                               RMSE         MAE          R²      
--------------------------------------------------------------------------------
Ridge Regression Basic (Tuned)      $  21,622   $  13,618   0.9212
Ridge Regression (Tuned)            $  22,063   $  13,589   0.9195
Ridge Regression                    $  22,145   $  14,284   0.9158
Ridge Regression (Advanced)         $  22,672   $  14,371   0.9122
Linear Regression (Advanced)        $  26,940   $  18,073   0.8775
Random Forest (Advanced)            $  27,944   $  17,361   0.8734
Random Forest                       $  29,739   $  17,736   0.8563
Linear Regression                   $  29,948   $  20,099   0.8448

Best Model: Ridge Regression Basic (Tuned)
Best RMSE: $21,622
Best R²: 0.9212

Top 3 Models:
  1. Ridge Regression Basic (Tuned): RMSE $21,622, R² 0.9212
  2. Ridge Regression (Tuned): RMSE $22,063, R² 0.9195
  3. Ridge Regression: RMSE $22,145, R² 0.9158


In [5]:
# Detailed analysis of best model
best_model_name = comparison_df.iloc[0]['Model']

# Map the display name back to its key in `results`; the first match wins.
best_model_key = next(
    (key for key, result in results.items() if result['model_name'] == best_model_name),
    None,
)

if best_model_key is not None and 'phase_results' in results[best_model_key]:
    # Results that carry 'phase_results' also record the tuning progression.
    best_result = results[best_model_key]
    phases = best_result['phase_results']

    print(f"Detailed Analysis: {best_model_name}")
    print("=" * 60)

    # Hyperparameter tuning progression
    print("Tuning Progression:")
    print(f"  Random Search: ${phases['random_search_rmse']:,.0f}")
    print(f"  Grid Search:   ${phases['grid_search_rmse']:,.0f}")
    print(f"  Final CV:      ${phases['final_cv_rmse']:,.0f}")

    # Relative RMSE reduction from the random-search phase to the final CV score
    improvement = ((phases['random_search_rmse'] - phases['final_cv_rmse']) / phases['random_search_rmse']) * 100
    # Round first and add 0.0 so a tiny negative value prints as "0.00%", not "-0.00%"
    print(f"  Improvement:   {round(improvement, 2) + 0.0:.2f}%")

    # Best hyperparameters
    if 'best_params' in best_result:
        print("\nOptimal Hyperparameters:")
        for param, value in best_result['best_params'].items():
            # Strip the pipeline-step prefixes to keep the listing readable
            param_short = param.replace('regressor__model__', '').replace('regressor__select__', 'selector_')
            print(f"  {param_short}: {value}")

    # Performance stability: spread of the CV RMSE relative to its mean
    cv_std = phases['final_cv_rmse_std']
    cv_mean = phases['final_cv_rmse']
    cv_coeff = (cv_std / cv_mean) * 100
    print("\nModel Stability:")
    print(f"  CV Standard Deviation: ${cv_std:,.0f}")
    print(f"  Coefficient of Variation: {cv_coeff:.1f}%")

else:
    # No tuning details for this model: fall back to the headline metrics
    print(f"Best model: {best_model_name}")
    print(f"RMSE: ${comparison_df.iloc[0]['RMSE']:,.0f}")
    print(f"MAE: ${comparison_df.iloc[0]['MAE']:,.0f}")
    print(f"R²: {comparison_df.iloc[0]['R2']:.4f}")


Detailed Analysis: Ridge Regression Basic (Tuned)
Tuning Progression:
  Random Search: $21,622
  Grid Search:   $21,622
  Final CV:      $21,622
  Improvement:   -0.00%

Optimal Hyperparameters:
  alpha: 0.868511373751352
  solver: sag
  selector_threshold: mean

Model Stability:
  CV Standard Deviation: $3,281
  Coefficient of Variation: 15.2%


In [10]:
# Load and analyze final selected model
best_model_name = comparison_df.iloc[0]['Model']

# Persisted best estimators, keyed by the result key used in `results`.
tuned_model_files = {
    'ridge_basic_tuned': '../../models/tuned/ridge_basic_best_model.pkl',
    'ridge_advanced_tuned': '../../models/tuned/ridge_advanced_best_model.pkl',
}

# Resolve the file through the result key, not by substring-matching the
# display name: in the recorded comparison output the second tuned model is
# listed as "Ridge Regression (Tuned)" (no "Advanced" in the name), so a
# name-based check could never pick the advanced model file.
best_model_key = next(
    (key for key, result in results.items() if result['model_name'] == best_model_name),
    None,
)
final_model_path = tuned_model_files.get(best_model_key)

if final_model_path and os.path.exists(final_model_path):
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # produced by this project.
    with open(final_model_path, 'rb') as f:
        final_model = pickle.load(f)

    print("Final Selected Model Analysis")
    print("=" * 50)
    print(f"Model Type: {type(final_model).__name__}")
    print(f"Model File: {final_model_path}")

    # The wrapped pipeline, when the model exposes one through `.regressor`
    regressor = getattr(final_model, 'regressor', None)

    # Pipeline structure
    if regressor is not None and hasattr(regressor, 'steps'):
        print("\nPipeline Steps:")
        for i, (name, step) in enumerate(regressor.steps):
            print(f"  {i+1}. {name}: {type(step).__name__}")

        # Target transformation: report the callables actually configured
        # instead of a fixed label (`func` may be None on the wrapper).
        func = getattr(final_model, 'func', None)
        if func is not None:
            inverse_func = getattr(final_model, 'inverse_func', None)
            func_name = getattr(func, '__name__', repr(func))
            inverse_name = getattr(inverse_func, '__name__', repr(inverse_func))
            print(f"  Target Transform: {func_name}/{inverse_name}")

    # Model parameters
    if regressor is not None and hasattr(regressor, 'named_steps'):
        if 'model' in regressor.named_steps:
            model_params = regressor.named_steps['model'].get_params()
            print("\nKey Model Parameters:")
            key_params = ['alpha', 'solver', 'max_iter']
            for param in key_params:
                if param in model_params:
                    print(f"  {param}: {model_params[param]}")

    # Full dataset training: refit the selected model on all cleaned data
    X, y = load_and_prepare_data('../../data/cleaned/domain_cleaned.csv')
    final_model.fit(X, y)
    print("\nFinal model has been refit on the full cleaned dataset.")

    # Save the final model
    os.makedirs('../../models/final/', exist_ok=True)
    with open('../../models/final/final_model.pkl', 'wb') as f:
        pickle.dump(final_model, f)
    print("Final model has been saved to models/final/final_model.pkl")

elif final_model_path is None:
    # The winner is not one of the tuned models, so no saved estimator is registered for it
    print(f"No saved tuned model is registered for '{best_model_name}'; nothing was loaded or refit.")
else:
    print("Final model file not found. Please run hyperparameter tuning first.")


Final Selected Model Analysis
Model Type: TransformedTargetRegressor
Model File: ../../models/tuned/ridge_basic_best_model.pkl

Pipeline Steps:
  1. smart_impute: SmartImputer
  2. outlier_handler: StatisticalOutlierHandler
  3. feature_eng: BasicFeatureBuilder
  4. skew_correct: SkewnessCorrector
  5. dtype_opt: DataTypeOptimizer
  6. preprocess: ColumnTransformer
  7. select: SelectFromModel
  8. model: Ridge
  Target Transform: log1p/expm1

Key Model Parameters:
  alpha: 0.868511373751352
  solver: sag
  max_iter: None

Final model has been refit on the full cleaned dataset.
Final model has been saved to models/final/final_model.pkl


In [12]:

# Report the cross-validated metrics of the top-ranked model
best_row = comparison_df.iloc[0]
best_rmse, best_mae, best_r2 = (best_row[col] for col in ('RMSE', 'MAE', 'R2'))

print(f"\nFinal Model Selection Summary")
print("=" * 50)
print(f"Selected Model: {best_model_name}")
print(f"Expected RMSE: ${best_rmse:,.0f}")
print(f"Expected MAE:  ${best_mae:,.0f}")
print(f"Expected R²:   {best_r2:.4f}")

# Assemble the record that is persisted next to the final model
performance_metrics = {
    'rmse': float(best_rmse),
    'mae': float(best_mae),
    'r2': float(best_r2),
}
selection_summary = {
    'selected_model': best_model_name,
    # Falls back to a placeholder when no model file was resolved
    'model_file_path': final_model_path or 'Not available',
    'selection_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'performance_metrics': performance_metrics,
    'all_models_comparison': comparison_df.to_dict('records'),
}

# Write the summary, creating the output directory on first use
os.makedirs('../../models/final/', exist_ok=True)
with open('../../models/final/model_selection_summary.pkl', 'wb') as summary_file:
    pickle.dump(selection_summary, summary_file)

print(f"\nModel selection completed")
print(f"Summary saved to models/final/model_selection_summary.pkl")



Final Model Selection Summary
Selected Model: Ridge Regression Basic (Tuned)
Expected RMSE: $21,622
Expected MAE:  $13,618
Expected R²:   0.9212

Model selection completed
Summary saved to models/final/model_selection_summary.pkl
