# House price prediction (Kaggle House Prices)
Goal: estimate a house sale price given its features.

Instructions:
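- Download the Kaggle dataset and place `train.csv` and `test.csv` under `ml_immo/data/house_prices/` (the code below reads them through the relative path `../../data/house_prices/`; adjust that path if your layout differs).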
- Run the cells below to train and evaluate models (RMSE, R²).


In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Files (adjust paths if different)
train_csv = '../../data/house_prices/train.csv'
test_csv = '../../data/house_prices/test.csv'

# Load the training table; stop with a setup hint when the file is missing.
try:
    df = pd.read_csv(train_csv)
except FileNotFoundError:
    raise SystemExit('Put train.csv under ml_immo/data/house_prices/')

target = 'SalePrice'
y = df[target]
# Besides the target, drop 'Id' when it exists so it is not scaled and fed to
# the models as a numeric feature.
# NOTE(review): assumes 'Id' is only a row identifier, as in the Kaggle
# House Prices schema - confirm if another dataset is used.
X = df.drop(columns=[target, 'Id'], errors='ignore')

# Split the features by dtype family instead of by exact dtype names: with its
# default remainder='drop', ColumnTransformer discards every column that is not
# listed, so an int32 / category / string column would otherwise vanish silently.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric columns: median imputation, then standardisation.
# Other columns: most-frequent imputation, then one-hot encoding; categories
# unseen during fit are ignored at transform time (handle_unknown='ignore').
preprocess = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numeric_cols),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('oh', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)
])

# Untuned baselines, keyed by the label shown in the results table.
models = {
    'Ridge': Ridge(alpha=10.0),
    'Lasso': Lasso(alpha=0.0005, max_iter=20000),
    'GBR'  : GradientBoostingRegressor(random_state=42)
}

# Shuffled 5-fold split with a fixed seed so every model sees the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Both metrics are collected from a single cross-validation pass per model
# rather than re-fitting every fold once per metric.
scoring = {'rmse': 'neg_root_mean_squared_error', 'r2': 'r2'}
rows = []
for name, model in models.items():
    pipe = Pipeline([('prep', preprocess), ('model', model)])
    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv)
    rmse = -scores['test_rmse'].mean()  # the scorer reports negated RMSE
    r2 = scores['test_r2'].mean()
    rows.append((name, rmse, r2))

# One row per model, lowest mean RMSE first.
results_df = pd.DataFrame(rows, columns=['model', 'rmse', 'r2']).sort_values('rmse')
results_df


In [None]:
# Switch: model log1p(target) instead of the raw target (stabilizes variance).
USE_LOG_TARGET = True

# Target handed to the models in the cells below; a copy of y when the switch is off.
y_model = np.log1p(y) if USE_LOG_TARGET else y.copy()

# Shuffled 5-fold splitter with a fixed seed, kept identical across experiments.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

print(f"Using log target: {USE_LOG_TARGET}")


In [None]:
# Quick grid search over a small hyper-parameter grid for each baseline model.
from sklearn.model_selection import GridSearchCV

# label -> (unfitted estimator, grid keyed by the estimator's own parameter names)
candidates = {
    'Ridge': (Ridge(), {'alpha': [0.1, 1.0, 10.0, 100.0]}),
    'Lasso': (Lasso(max_iter=50000), {'alpha': [5e-5, 1e-4, 5e-4, 1e-3]}),
    'GBR': (
        GradientBoostingRegressor(random_state=42),
        {
            'n_estimators': [300, 600],
            'learning_rate': [0.05, 0.1],
            'max_depth': [2, 3],
        },
    ),
}

rows = []
for name, (est, param_grid) in candidates.items():
    pipe = Pipeline([('prep', preprocess), ('model', est)])
    # The estimator is the pipeline's 'model' step, so every grid key is
    # addressed as 'model__<param>'.
    prefixed_grid = {'model__' + k: v for k, v in param_grid.items()}
    grid = GridSearchCV(
        pipe,
        param_grid=prefixed_grid,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X, y_model)
    # best_score_ is the negated RMSE; flip the sign back. The value is on the
    # scale of y_model (log scale when USE_LOG_TARGET is enabled).
    rmse = -grid.best_score_
    rows.append((name, rmse, grid.best_params_))

# One row per candidate, best (lowest RMSE) first.
results_grid = pd.DataFrame(rows, columns=['model', 'rmse', 'best_params'])
results_grid = results_grid.sort_values('rmse')
results_grid


In [None]:
# Refit the grid-search winner on the full training data and preview its
# predictions for test.csv (no test metrics: no test labels are loaded here).
from sklearn.base import clone

best_row = results_grid.iloc[0]  # results_grid is sorted by ascending RMSE
best_name = best_row['model']
print(f"Best model from grid: {best_name}")

# Rebuild the winning pipeline AND apply the tuned hyper-parameters; without
# set_params the estimator would be refit with its untuned settings from
# `candidates`. The best_params keys already carry the 'model__' prefix.
# clone() yields an unfitted copy, so the shared `preprocess` and `candidates`
# objects are neither fitted nor reconfigured as a side effect.
est, _ = candidates[best_name]
best_pipe = clone(Pipeline([('prep', preprocess), ('model', est)]))
best_pipe.set_params(**best_row['best_params'])

# If we had hold-out labels, we would evaluate here. For Kaggle, we usually
# generate a submission, so only fit and predict on the provided test CSV.
preview = None
try:
    # Only the file read is guarded, so a FileNotFoundError raised anywhere
    # else is not mistaken for a missing test.csv.
    df_test = pd.read_csv(test_csv)
except FileNotFoundError:
    print("test.csv not found – skipping prediction preview.")
else:
    X_test = df_test.copy()
    best_pipe.fit(X, y_model)
    y_pred = best_pipe.predict(X_test)
    if USE_LOG_TARGET:
        y_pred = np.expm1(y_pred)  # invert log1p to get back to the price scale
    # Label rows with the file's own Id column when present; the positional
    # index (0..n-1) is only a fallback.
    ids = df_test['Id'] if 'Id' in df_test.columns else df_test.index
    preview = pd.DataFrame({'Id': ids, 'SalePrice_pred': y_pred}).head()

# Kept as the cell's last top-level expression so the notebook renders it;
# an expression nested inside try/except is not echoed.
preview
