# Enhanced Linear Regression

Using advanced features to improve prediction accuracy.


In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load enhanced data
X = pd.read_pickle('../../data/enhanced/df_enhanced_linear.pkl')
y = pd.read_pickle('../../data/enhanced/y_enhanced.pkl')

print(f"Enhanced dataset: {X.shape}")
print(f"Features: {X.shape[1]}")


Enhanced dataset: (1408, 36)
Features: 36


In [2]:
# Prepare data
y_log = np.log1p(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")


Training set: (1126, 36)
Test set: (282, 36)


In [3]:
# Train model and cross validation
model = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse = np.sqrt(-cross_val_score(model, X_train_scaled, y_train_log, cv=kf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(model, X_train_scaled, y_train_log, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(model, X_train_scaled, y_train_log, cv=kf, scoring='r2')

print("Cross Validation (log scale):")
print(f"RMSE: {cv_rmse.mean():.4f} (±{cv_rmse.std():.4f})")
print(f"MAE:  {cv_mae.mean():.4f} (±{cv_mae.std():.4f})")
print(f"R²:   {cv_r2.mean():.4f} (±{cv_r2.std():.4f})")

# Convert to original scale
cv_rmse_orig = np.expm1(cv_rmse.mean())
cv_mae_orig = np.expm1(cv_mae.mean())
print(f"\nOriginal scale:")
print(f"RMSE: ${cv_rmse_orig:,.0f}")
print(f"MAE:  ${cv_mae_orig:,.0f}")


Cross Validation (log scale):
RMSE: 0.1176 (±0.0069)
MAE:  0.0854 (±0.0059)
R²:   0.8988 (±0.0118)

Original scale:
RMSE: $0
MAE:  $0


In [4]:
# Test set performance
model.fit(X_train_scaled, y_train_log)
y_pred_log = model.predict(X_test_scaled)

test_rmse_log = np.sqrt(mean_squared_error(y_test_log, y_pred_log))
test_mae_log = mean_absolute_error(y_test_log, y_pred_log)
test_r2 = r2_score(y_test_log, y_pred_log)

y_test_orig = np.expm1(y_test_log)
y_pred_orig = np.expm1(y_pred_log)
test_rmse_orig = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
test_mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)

# Prediction accuracy
errors = np.abs(y_pred_orig - y_test_orig) / y_test_orig
within_10pct = (errors <= 0.10).mean() * 100
within_20pct = (errors <= 0.20).mean() * 100

print("Test Set Performance:")
print(f"RMSE (log): {test_rmse_log:.4f}")
print(f"MAE (log):  {test_mae_log:.4f}")
print(f"R²:         {test_r2:.4f}")
print(f"RMSE:       ${test_rmse_orig:,.0f}")
print(f"MAE:        ${test_mae_orig:,.0f}")
print(f"Within 10%: {within_10pct:.1f}%")
print(f"Within 20%: {within_20pct:.1f}%")


Test Set Performance:
RMSE (log): 0.1109
MAE (log):  0.0784
R²:         0.9123
RMSE:       $20,920
MAE:        $14,257
Within 10%: 72.0%
Within 20%: 94.0%


In [None]:
# Save results
enhanced_linear_results = {
    'model_name': 'Enhanced Linear Regression',
    'cv_rmse_log': cv_rmse.mean(),
    'cv_mae_log': cv_mae.mean(),
    'cv_r2': cv_r2.mean(),
    'cv_rmse_original': cv_rmse_orig,
    'cv_mae_original': cv_mae_orig,
    'test_rmse_log': test_rmse_log,
    'test_mae_log': test_mae_log,
    'test_r2': test_r2,
    'test_rmse_original': test_rmse_orig,
    'test_mae_original': test_mae_orig,
    'within_10pct': within_10pct,
    'within_20pct': within_20pct
}

import os
os.makedirs('../../models/enhanced/', exist_ok=True)
import pickle
with open('../../models/enhanced/linear_results.pkl', 'wb') as f:
    pickle.dump(enhanced_linear_results, f)

print("✓ Enhanced Linear Regression completed")


✓ Enhanced Linear Regression completed
