# Linear Regression

Basic linear regression model for house price prediction.


In [33]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load preprocessed data
X = pd.read_pickle('../../data/processed/df_linear_ready.pkl').drop(['SalePrice'], axis=1)
y = pd.read_pickle('../../data/processed/df_linear_ready.pkl')['SalePrice']

print(f"Dataset: {X.shape}")
print(f"Features: {X.shape[1]}")


Dataset: (1408, 24)
Features: 24


In [34]:
# Prepare data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Target range: ${np.expm1(y).min():,.0f} - ${np.expm1(y).max():,.0f}")


Training set: (1126, 24)
Test set: (282, 24)
Target range: $52,000 - $451,950


In [35]:
# Train model and cross validation
model = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')

print("Cross Validation (log scale):")
print(f"RMSE: {cv_rmse.mean():.4f} (±{cv_rmse.std():.4f})")
print(f"MAE:  {cv_mae.mean():.4f} (±{cv_mae.std():.4f})")
print(f"R²:   {cv_r2.mean():.4f} (±{cv_r2.std():.4f})")

# Convert to original scale
cv_rmse_orig = np.expm1(cv_rmse.mean())
cv_mae_orig = np.expm1(cv_mae.mean())
print(f"\nOriginal scale:")
print(f"RMSE: ${cv_rmse_orig:,.0f}")
print(f"MAE:  ${cv_mae_orig:,.0f}")


Cross Validation (log scale):
RMSE: 0.1283 (±0.0085)
MAE:  0.0920 (±0.0062)
R²:   0.8794 (±0.0151)

Original scale:
RMSE: $0
MAE:  $0


In [36]:
# Test set performance
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

y_test_orig = np.expm1(y_test)
y_pred_orig = np.expm1(y_pred)
test_rmse_orig = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
test_mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)

# Prediction accuracy
errors = np.abs(y_pred_orig - y_test_orig) / y_test_orig
within_10pct = (errors <= 0.10).mean() * 100
within_20pct = (errors <= 0.20).mean() * 100

print("Test Set Performance:")
print(f"RMSE (log): {test_rmse:.4f}")
print(f"MAE (log):  {test_mae:.4f}")
print(f"R²:         {test_r2:.4f}")
print(f"RMSE:       ${test_rmse_orig:,.0f}")
print(f"MAE:        ${test_mae_orig:,.0f}")
print(f"Within 10%: {within_10pct:.1f}%")
print(f"Within 20%: {within_20pct:.1f}%")


Test Set Performance:
RMSE (log): 0.1240
MAE (log):  0.0853
R²:         0.8903
RMSE:       $23,813
MAE:        $15,540
Within 10%: 69.5%
Within 20%: 92.6%


In [37]:
# Save results
linear_results = {
    'model_name': 'Linear Regression',
    'cv_rmse_log': cv_rmse.mean(),
    'cv_mae_log': cv_mae.mean(),
    'cv_r2': cv_r2.mean(),
    'cv_rmse_original': cv_rmse_orig,
    'cv_mae_original': cv_mae_orig,
    'test_rmse_log': test_rmse,
    'test_mae_log': test_mae,
    'test_r2': test_r2,
    'test_rmse_original': test_rmse_orig,
    'test_mae_original': test_mae_orig,
    'within_10pct': within_10pct,
    'within_20pct': within_20pct
}

import os
os.makedirs('../../models/basic/', exist_ok=True)
import pickle
with open('../../models/basic/linear_results.pkl', 'wb') as f:
    pickle.dump(linear_results, f)

print("✓ Linear Regression completed")


✓ Linear Regression completed
