# House price prediction (Kaggle House Prices)
Goal: estimate a house sale price given its features.

Instructions:
- Download the Kaggle dataset and place `train.csv` and `test.csv` under `ml_immo/data/house_prices/`.
- Run the cells below to train the models and compare them using 5-fold cross-validated RMSE and R².


In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Files (adjust paths if different); resolved against the working directory.
train_csv = '../../data/house_prices/train.csv'
test_csv = '../../data/house_prices/test.csv'

# Load the training set; stop with a readable hint when the file is missing.
try:
    df = pd.read_csv(train_csv)
except FileNotFoundError:
    raise SystemExit('Put train.csv under ml_immo/data/house_prices/')

# Separate the regression target from the feature matrix.
target = 'SalePrice'
y = df[target]
X = df.drop(columns=[target])
# In the Kaggle House Prices file `Id` is a row identifier, not a property of
# the house, so it must not be scaled and fed to the models as a feature.
# errors='ignore' keeps this working for files that have no such column.
X = X.drop(columns=['Id'], errors='ignore')

# Split columns by kind rather than by exact dtype name: anything numeric is
# scaled, everything else is one-hot encoded. Hard-coding 'int64'/'float64'/
# 'object' would leave columns of any other dtype (int32, nullable ints,
# dedicated string dtype, category, bool) in neither list, and the
# ColumnTransformer would then silently drop them.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric features: fill gaps with the column median, then standardise.
_numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown='ignore' lets levels that were not seen while
# fitting (e.g. in a validation fold) be encoded instead of raising.
_categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oh', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ('num', _numeric_steps, numeric_cols),
    ('cat', _categorical_steps, categorical_cols),
])

# Candidate regressors, keyed by the label used in the results table.
models = dict(
    Ridge=Ridge(alpha=10.0),
    Lasso=Lasso(alpha=0.0005, max_iter=20000),
    GBR=GradientBoostingRegressor(random_state=42),
)

# One seeded splitter shared by all models so every candidate is scored on
# exactly the same five folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Both metrics are computed from a single cross-validation run per model;
# calling cross_val_score once per metric would fit every fold twice.
scoring = {'rmse': 'neg_root_mean_squared_error', 'r2': 'r2'}

rows = []
for name, model in models.items():
    # Preprocessing lives inside the pipeline so imputation, scaling and
    # encoding are re-fitted on the training part of each fold only.
    pipe = Pipeline([('prep', preprocess), ('model', model)])
    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv)
    # The scorer returns negated RMSE (higher is better); flip the sign back.
    rmse = (-scores['test_rmse']).mean()
    r2 = scores['test_r2'].mean()
    rows.append((name, rmse, r2))

# Best (lowest mean RMSE) model first; the bare name displays the table
# when this is the last expression of a notebook cell.
results_df = pd.DataFrame(rows, columns=['model', 'rmse', 'r2']).sort_values('rmse')
results_df
