# Linear Regression

A baseline linear regression model for house price prediction, built with the project's advanced ML pipeline and evaluated using 5-fold cross-validation.


In [1]:
import os
import pickle
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Make the project's scripts directory importable before loading local modules
sys.path.insert(0, os.path.abspath('../../scripts/'))
warnings.filterwarnings('ignore')

# Import advanced pipeline
from pipelines import (
    load_and_prepare_data,
    make_pipeline,
    get_linear_config,
    evaluate_pipeline_cv
)

# Load the cleaned dataset as a feature matrix (X) and a target series (y)
data_path = '../../data/cleaned/domain_cleaned.csv'
X, y = load_and_prepare_data(data_path)

# Quick sanity summary: overall shape, feature count, and target price range
dataset_shape = X.shape
print(f"Dataset: {dataset_shape}")
print(f"Features: {dataset_shape[1]}")
price_low, price_high = y.min(), y.max()
print(f"Target range: ${price_low:,.0f} - ${price_high:,.0f}")


Dataset: (1161, 80)
Features: 80
Target range: $52,500 - $755,000


In [2]:
# Build the Linear Regression pipeline from the shared linear-model configuration
base_estimator = LinearRegression()
linear_config = get_linear_config()
pipeline = make_pipeline(base_estimator, **linear_config)

# List the inner steps in execution order.
# NOTE(review): `pipeline.regressor` suggests a target-transforming wrapper
# around the step pipeline -- confirm against the `pipelines` module.
print("Pipeline created with components:")
for position, (step_name, component) in enumerate(pipeline.regressor.steps, start=1):
    component_type = type(component).__name__
    print(f"  {position}. {step_name}: {component_type}")
print(f"  Target transformation: log1p/expm1")


Pipeline created with components:
  1. smart_impute: SmartImputer
  2. outlier_handler: StatisticalOutlierHandler
  3. feature_eng: FeatureBuilder
  4. skew_correct: SkewnessCorrector
  5. dtype_opt: DataTypeOptimizer
  6. preprocess: ColumnTransformer
  7. select: SelectKBest
  8. model: LinearRegression
  Target transformation: log1p/expm1


In [3]:
# Evaluate the pipeline with 5-fold cross-validation using a fixed random_state
results = evaluate_pipeline_cv(pipeline, X, y, cv=5, random_state=42, verbose=True)

# Per-metric summaries; each entry exposes a 'mean' and a 'std' across folds
rmse_stats = results['ROOT_MEAN_SQUARED_ERROR']
mae_stats = results['MEAN_ABSOLUTE_ERROR']
r2_stats = results['R2']

# Headline numbers reused later when saving results
cv_rmse = rmse_stats['mean']
cv_mae = mae_stats['mean']
cv_r2 = r2_stats['mean']

print(f"\nSummary:")
print(f"RMSE: ${cv_rmse:,.0f} (±{rmse_stats['std']:,.0f})")
print(f"MAE:  ${cv_mae:,.0f} (±{mae_stats['std']:,.0f})")
print(f"R²:   {cv_r2:.4f} (±{r2_stats['std']:.4f})")


Cross-Validation Results (5-fold):
----------------------------------------
ROOT_MEAN_SQUARED_ERROR: 27075.1824 (+/- 1715.5422)
MEAN_ABSOLUTE_ERROR : 18082.4446 (+/- 746.7383)
R2                  : 0.8761 (+/- 0.0280)
----------------------------------------

Summary:
RMSE: $27,075 (±1,716)
MAE:  $18,082 (±747)
R²:   0.8761 (±0.0280)


In [4]:
# Save results (CV-based only for model comparison).
# The output location is defined once so the directory that gets created and
# the file that gets written cannot drift apart.
RESULTS_DIR = '../../models/basic/'
RESULTS_PATH = os.path.join(RESULTS_DIR, 'linear_results.pkl')

# Summary record: headline mean/std per metric plus the full CV results dict
linear_results = {
    'model_name': 'Linear Regression',
    'cv_rmse': cv_rmse,
    'cv_rmse_std': results['ROOT_MEAN_SQUARED_ERROR']['std'],
    'cv_mae': cv_mae,
    'cv_mae_std': results['MEAN_ABSOLUTE_ERROR']['std'],
    'cv_r2': cv_r2,
    'cv_r2_std': results['R2']['std'],
    'cv_scores': results
}

# `os` and `pickle` come from the import cell at the top of the notebook.
# Create the target directory if needed, then serialize the record.
# NOTE: pickle files should only ever be loaded back from trusted locations.
os.makedirs(RESULTS_DIR, exist_ok=True)
with open(RESULTS_PATH, 'wb') as f:
    pickle.dump(linear_results, f)

print("✓ Linear Regression completed")
print("✓ Results saved to models/basic/linear_results.pkl")


✓ Linear Regression completed
✓ Results saved to models/basic/linear_results.pkl
