# Advanced Random Forest Regression

This notebook trains a random forest ensemble model with advanced feature engineering to improve house price prediction. The model is built with the project's enhanced ML pipeline and evaluated with 5-fold cross-validation.


In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.insert(0, os.path.abspath('../../scripts/'))
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Import advanced pipeline
from pipelines import (
    load_and_prepare_data,
    make_pipeline, 
    get_advanced_rf_config,
    evaluate_pipeline_cv
)

# Read the cleaned dataset into features (X) and target (y)
DATA_PATH = '../../data/cleaned/domain_cleaned.csv'
X, y = load_and_prepare_data(DATA_PATH)

# Quick sanity summary: overall shape, column count, and the span of the target
feature_count = X.shape[1]
lowest_price, highest_price = y.min(), y.max()
print(f"Dataset: {X.shape}")
print(f"Features: {feature_count}")
print(f"Target range: ${lowest_price:,.0f} - ${highest_price:,.0f}")


Dataset: (1161, 80)
Features: 80
Target range: $52,500 - $755,000


In [2]:
# Create advanced pipeline for Random Forest.
# The tree count lives in one constant so the estimator and the summary
# printed below cannot drift apart (the message used to hard-code "100").
N_ESTIMATORS = 100

pipeline = make_pipeline(
    RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=42),
    **get_advanced_rf_config()
)

# List the steps of the inner pipeline exposed as `pipeline.regressor`.
# NOTE(review): the log1p/expm1 target transformation reported below is
# presumably applied by the wrapper returned from `make_pipeline` — confirm
# in scripts/pipelines.
print("Advanced Pipeline created with components:")
for position, (name, step) in enumerate(pipeline.regressor.steps, start=1):
    print(f"  {position}. {name}: {type(step).__name__}")
print("  Target transformation: log1p/expm1")
print(f"  Ensemble: {N_ESTIMATORS} trees, no feature scaling needed")
print("  Feature engineering: Advanced (location, nonlinear, remodeling features)")


Advanced Pipeline created with components:
  1. smart_impute: SmartImputer
  2. feature_eng: AdvancedFeatureBuilder
  3. dtype_opt: DataTypeOptimizer
  4. preprocess: ColumnTransformer
  5. model: RandomForestRegressor
  Target transformation: log1p/expm1
  Ensemble: 100 trees, no feature scaling needed
  Feature engineering: Advanced (location, nonlinear, remodeling features)


In [3]:
# Score the pipeline with 5-fold cross-validation; verbose=True is passed
# through to the evaluation helper.
cv_settings = {'cv': 5, 'random_state': 42, 'verbose': True}
results = evaluate_pipeline_cv(pipeline, X, y, **cv_settings)

# Per-metric entries of the result mapping; each is read via 'mean' and 'std'
rmse_stats = results['ROOT_MEAN_SQUARED_ERROR']
mae_stats = results['MEAN_ABSOLUTE_ERROR']
r2_stats = results['R2']

# Headline numbers (fold means)
cv_rmse = rmse_stats['mean']
cv_mae = mae_stats['mean']
cv_r2 = r2_stats['mean']

print("\nSummary:")
print(f"RMSE: ${cv_rmse:,.0f} (±{rmse_stats['std']:,.0f})")
print(f"MAE:  ${cv_mae:,.0f} (±{mae_stats['std']:,.0f})")
print(f"R²:   {cv_r2:.4f} (±{r2_stats['std']:.4f})")


Cross-Validation Results (5-fold):
----------------------------------------
ROOT_MEAN_SQUARED_ERROR: 27944.4218 (+/- 5837.1991)
MEAN_ABSOLUTE_ERROR : 17360.8807 (+/- 935.9956)
R2                  : 0.8734 (+/- 0.0244)
----------------------------------------

Summary:
RMSE: $27,944 (±5,837)
MAE:  $17,361 (±936)
R²:   0.8734 (±0.0244)


In [4]:
# Save results (CV-based only for model comparison).
# `os` is already imported in the first cell, so it is not re-imported here;
# `pickle` is only needed by this cell and is imported before first use.
import pickle

# Single source of truth for the output location: the directory that gets
# created, the file that gets written, and the path reported to the reader
# are all derived from these two values, so they cannot drift apart.
PROJECT_ROOT = '../..'  # notebook location relative to the project root
RESULTS_RELPATH = 'models/advanced/random_forest_advanced_results.pkl'
results_path = f"{PROJECT_ROOT}/{RESULTS_RELPATH}"

# Flat summary (mean and std per metric) plus the full CV result mapping.
rf_advanced_results = {
    'model_name': 'Random Forest (Advanced)',
    'cv_rmse': cv_rmse,
    'cv_rmse_std': results['ROOT_MEAN_SQUARED_ERROR']['std'],
    'cv_mae': cv_mae,
    'cv_mae_std': results['MEAN_ABSOLUTE_ERROR']['std'],
    'cv_r2': cv_r2,
    'cv_r2_std': results['R2']['std'],
    'cv_scores': results
}

# Create the target directory on first run; a no-op when it already exists.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'wb') as f:
    pickle.dump(rf_advanced_results, f)

print("✓ Advanced Random Forest Regression completed")
# Reported relative to the project root, matching the original message.
print(f"✓ Results saved to {RESULTS_RELPATH}")


✓ Advanced Random Forest Regression completed
✓ Results saved to models/advanced/random_forest_advanced_results.pkl
