# Ridge Basic Hyperparameter Tuning

Ridge regression hyperparameter optimization for improved house price prediction using basic feature engineering pipeline.


In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.insert(0, os.path.abspath('../../scripts/'))
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Import basic pipeline
from pipelines import (
    load_and_prepare_data,
    make_pipeline, 
    get_ridge_config,
    evaluate_pipeline_cv
)

# Load data
X, y = load_and_prepare_data('../../data/cleaned/domain_cleaned.csv')

print(f"Dataset: {X.shape}")
print(f"Features: {X.shape[1]}")
print(f"Target range: ${y.min():,.0f} - ${y.max():,.0f}")


Dataset: (1161, 80)
Features: 80
Target range: $52,500 - $755,000


In [2]:
# Baseline performance (from basic model comparison)
baseline_r2 = 0.9158
baseline_rmse = 22145

print(f"Baseline Performance (Ridge Basic):")
print(f"R²: {baseline_r2:.4f}")
print(f"RMSE: ${baseline_rmse:,.0f}")
print(f"\nTuning Goal: R² > 0.920, RMSE < $22,000")


Baseline Performance (Ridge Basic):
R²: 0.9158
RMSE: $22,145

Tuning Goal: R² > 0.920, RMSE < $22,000


In [3]:
# Phase 1: Random Search (Coarse tuning)
print("Phase 1: Random Search Hyperparameter Tuning")
print("=" * 50)

# Create base pipeline with basic feature engineering
base_pipeline = make_pipeline(
    Ridge(),
    **get_ridge_config()
)

# Define parameter search space
param_dist = {
    'regressor__model__alpha': np.logspace(-3, 3, 50),  # 0.001 to 1000
    'regressor__model__solver': ['auto', 'cholesky', 'lsqr', 'sag'],
    'regressor__model__max_iter': [1000, 2000, 5000],
    'regressor__select__threshold': ['mean', 'median', '0.1*mean', '2*mean']  # Lasso threshold
}

# Random search
random_search = RandomizedSearchCV(
    base_pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=0
)

print("Running Random Search (100 iterations)...")
random_search.fit(X, y)

print(f"\nBest Random Search Score: {-random_search.best_score_:,.0f}")
print(f"Best Parameters:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")


Phase 1: Random Search Hyperparameter Tuning
Running Random Search (100 iterations)...

Best Random Search Score: 21,622
Best Parameters:
  regressor__select__threshold: mean
  regressor__model__solver: sag
  regressor__model__max_iter: 5000
  regressor__model__alpha: 0.868511373751352


In [7]:
# Phase 2: Grid Search (Fine tuning around best parameters)
print("\nPhase 2: Grid Search Fine Tuning")
print("=" * 50)

# Extract best parameters from random search
best_alpha = random_search.best_params_['regressor__model__alpha']
best_solver = random_search.best_params_['regressor__model__solver']
best_threshold = random_search.best_params_['regressor__select__threshold']

print(f"Random Search Best Alpha: {best_alpha:.6f}")
print(f"Creating fine-tuned grid around best alpha...")

# Create finer alpha grid around best value
alpha_range = np.logspace(
    np.log10(best_alpha * 0.3),  # Narrower range for finer search
    np.log10(best_alpha * 3.0), 
    20  # More points for finer granularity
)

# Add some threshold variations to avoid identical results
threshold_options = ['mean', 'median']
if best_threshold not in threshold_options:
    threshold_options.append(best_threshold)
if '0.5*mean' not in threshold_options:
    threshold_options.append('0.5*mean')

param_grid = {
    'regressor__model__alpha': alpha_range,
    'regressor__model__solver': [best_solver],  # Keep best solver
    'regressor__select__threshold': threshold_options
}

print(f"Grid Search Space:")
print(f"  Alpha range: {alpha_range[0]:.6f} to {alpha_range[-1]:.6f} ({len(alpha_range)} points)")
print(f"  Solver: {best_solver}")
print(f"  Thresholds: {threshold_options}")
print(f"  Total combinations: {len(alpha_range) * len(threshold_options)}")

# Grid search
grid_search = GridSearchCV(
    base_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=0
)

print(f"\nRunning Grid Search...")
grid_search.fit(X, y)

print(f"\nGrid Search Results:")
print(f"Best Score: {-grid_search.best_score_:,.0f}")
print(f"Best Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Compare with Random Search
random_score = -random_search.best_score_
grid_score = -grid_search.best_score_
improvement = ((random_score - grid_score) / random_score) * 100

print(f"\nPhase Comparison:")
print(f"Random Search: ${random_score:,.0f}")
print(f"Grid Search:   ${grid_score:,.0f}")
if improvement > 0:
    print(f"Grid Search improved by {improvement:.2f}%")
elif improvement < 0:
    print(f"Grid Search is {abs(improvement):.2f}% worse")
else:
    print("No improvement from Grid Search")



Phase 2: Grid Search Fine Tuning
Random Search Best Alpha: 0.868511
Creating fine-tuned grid around best alpha...
Grid Search Space:
  Alpha range: 0.260553 to 2.605534 (20 points)
  Solver: sag
  Thresholds: ['mean', 'median', '0.5*mean']
  Total combinations: 60

Running Grid Search...



Grid Search Results:
Best Score: 21,622
Best Parameters:
  regressor__model__alpha: 0.8754121186125722
  regressor__model__solver: sag
  regressor__select__threshold: mean

Phase Comparison:
Random Search: $21,622
Grid Search:   $21,622
Grid Search is 0.00% worse


In [8]:
# Phase 3: Final Model Evaluation
print("\nPhase 3: Final Model Evaluation")
print("=" * 50)

from sklearn.model_selection import cross_validate

# Create final optimized pipeline with best parameters
final_pipeline = make_pipeline(
    Ridge(**{k.split('__model__')[1]: v for k, v in grid_search.best_params_.items() if '__model__' in k}),
    **get_ridge_config()
)

# Update selector threshold if needed
if 'regressor__select__threshold' in grid_search.best_params_:
    final_config = get_ridge_config()
    final_config['selector_kwargs'] = {'threshold': grid_search.best_params_['regressor__select__threshold']}
    final_pipeline = make_pipeline(
        Ridge(**{k.split('__model__')[1]: v for k, v in grid_search.best_params_.items() if '__model__' in k}),
        **final_config
    )

print("Evaluating optimized Ridge Basic model with 5-fold cross-validation...")
final_scores = cross_validate(
    final_pipeline, X, y,
    cv=5,
    scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
    n_jobs=-1,
    return_train_score=False
)

final_rmse = -final_scores['test_neg_root_mean_squared_error'].mean()
final_mae = -final_scores['test_neg_mean_absolute_error'].mean()
final_r2 = final_scores['test_r2'].mean()

print(f"\nFinal Optimized Ridge Basic Results:")
print(f"RMSE: ${final_rmse:,.0f} (±{final_scores['test_neg_root_mean_squared_error'].std():,.0f})")
print(f"MAE:  ${final_mae:,.0f} (±{final_scores['test_neg_mean_absolute_error'].std():,.0f})")
print(f"R²:   {final_r2:.4f} (±{final_scores['test_r2'].std():.4f})")

# Performance comparison across phases
print(f"\nPhase Performance Comparison:")
print(f"Random Search RMSE: ${-random_search.best_score_:,.0f}")
print(f"Grid Search RMSE:   ${-grid_search.best_score_:,.0f}")
print(f"Final CV RMSE:      ${final_rmse:,.0f}")

# Baseline comparison
baseline_rmse_improvement = ((baseline_rmse - final_rmse) / baseline_rmse) * 100
baseline_r2_improvement = ((final_r2 - baseline_r2) / baseline_r2) * 100

print(f"\nImprovement vs Baseline:")
print(f"RMSE: {baseline_rmse_improvement:+.2f}%")
print(f"R²:   {baseline_r2_improvement:+.2f}%")



Phase 3: Final Model Evaluation
Evaluating optimized Ridge Basic model with 5-fold cross-validation...

Final Optimized Ridge Basic Results:
RMSE: $21,622 (±3,283)
MAE:  $13,618 (±890)
R²:   0.9212 (±0.0261)

Phase Performance Comparison:
Random Search RMSE: $21,622
Grid Search RMSE:   $21,622
Final CV RMSE:      $21,622

Improvement vs Baseline:
RMSE: +2.36%
R²:   +0.59%


In [None]:
# Save comprehensive tuning results
tuning_results = {
    'model_name': 'Ridge Regression Basic (Tuned)',
    'best_params': grid_search.best_params_,
    'phase_results': {
        'random_search_rmse': -random_search.best_score_,
        'grid_search_rmse': -grid_search.best_score_,
        'final_cv_rmse': final_rmse,
        'final_cv_mae': final_mae,
        'final_cv_r2': final_r2,
        'final_cv_rmse_std': final_scores['test_neg_root_mean_squared_error'].std(),
        'final_cv_mae_std': final_scores['test_neg_mean_absolute_error'].std(),
        'final_cv_r2_std': final_scores['test_r2'].std()
    },
    'final_cv_rmse': final_rmse,
    'final_cv_rmse_std': final_scores['test_neg_root_mean_squared_error'].std(),
    'final_cv_mae': final_mae,
    'final_cv_mae_std': final_scores['test_neg_mean_absolute_error'].std(),
    'final_cv_r2': final_r2,
    'final_cv_r2_std': final_scores['test_r2'].std(),
    'baseline_comparison': {
        'rmse_improvement_pct': baseline_rmse_improvement,
        'r2_improvement_pct': baseline_r2_improvement
    },
    'cv_results': {
        'ROOT_MEAN_SQUARED_ERROR': {
            'mean': final_rmse,
            'std': final_scores['test_neg_root_mean_squared_error'].std()
        },
        'MEAN_ABSOLUTE_ERROR': {
            'mean': final_mae,
            'std': final_scores['test_neg_mean_absolute_error'].std()
        },
        'R2': {
            'mean': final_r2,
            'std': final_scores['test_r2'].std()
        }
    },
    'final_cv_raw_scores': final_scores
}

import pickle
os.makedirs('../../models/tuned/', exist_ok=True)
with open('../../models/tuned/ridge_basic_tuning_results.pkl', 'wb') as f:
    pickle.dump(tuning_results, f)

# Save best model (use final_pipeline for most accurate model)
with open('../../models/tuned/ridge_basic_best_model.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

print("\nRidge Basic 3-Phase Hyperparameter Tuning completed")
print(f"Phase 1: Random Search → RMSE: ${-random_search.best_score_:,.0f}")
print(f"Phase 2: Grid Search → RMSE: ${-grid_search.best_score_:,.0f}")
print(f"Phase 3: Final Evaluation → RMSE: ${final_rmse:,.0f}")
print(f"Results saved to models/tuned/ridge_basic_tuning_results.pkl")
print(f"Best model saved to models/tuned/ridge_basic_best_model.pkl")



Ridge Basic 3-Phase Hyperparameter Tuning completed
Phase 1: Random Search → RMSE: $21,622
Phase 2: Grid Search → RMSE: $21,622
Phase 3: Final Evaluation → RMSE: $21,622
Results saved to models/tuned/ridge_basic_tuning_results.pkl
Best model saved to models/tuned/ridge_basic_best_model.pkl
