In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import pickle
import json

In [None]:
# 1. Load dataset
# Read the CSV (path is relative to the notebook's working directory)
# and preview the first five rows as the cell output.
csv_path = "AmesHousing.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
# 2. Select features and target
# Raw input columns fed to the pipeline; the categorical ones are encoded
# in the preprocessing cells further down.
features = [
    'LotFrontage',
    'GrLivArea',
    'GarageArea',
    'Neighborhood',
    'HouseStyle',
    'ExterQual',
    'MasVnrType',
    'YearBuilt',
    'YrSold',
    'OverallQual',
]
target = 'SalePrice'

# Keep only rows whose target value is present.
has_target = df[target].notna()
df = df[has_target]

# Feature matrix and target vector share the filtered frame's index.
X = df.loc[:, features]
y = df[target]

In [None]:
# 3. Train-test split (before preprocessing to avoid leakage)
# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Inspect dtypes and non-null counts of the selected feature columns.
feature_frame = df.loc[:, features]
feature_frame.info()

In [None]:
# 4. Determine transformation parameters from training set
# Everything here is derived from X_train only (or is a fixed constant) so
# that the same values can be re-applied to the test set without leakage.

# Columns that get one-hot encoded.
categorical_cols = ['Neighborhood', 'HouseStyle', 'MasVnrType']

# Ordinal encoding for ExterQual: grades listed from lowest (1) to highest (5).
exterqual_grades = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
exterqual_mapping = dict(zip(exterqual_grades, range(1, 6)))
# Grade substituted when ExterQual is missing or not in the mapping.
exterqual_na = 'TA'

# Imputation values for columns with missing entries.
lotfrontage_mean = X_train.loc[:, 'LotFrontage'].mean()
# NOTE(review): if the CSV encodes "no veneer" as the literal string None,
# recent pandas versions may parse it as NaN on load, in which case this mode
# fill would relabel those rows with another veneer type - confirm.
masvnrtype_mode = X_train.loc[:, 'MasVnrType'].mode().loc[0]

In [None]:
# 5. Preprocess training set
# The chain below builds a new frame (assign copies its input):
#   - HouseAge is appended as a new column (YrSold - YearBuilt)
#   - OverallQual is clipped to the 1..10 range
#   - LotFrontage / MasVnrType gaps are filled with the training mean / mode
#   - categorical columns are one-hot encoded
#   - ExterQual is mapped to its ordinal code, unmapped values get the
#     code of `exterqual_na`
X_train = (
    X_train
    .assign(
        HouseAge=lambda frame: frame['YrSold'] - frame['YearBuilt'],
        OverallQual=lambda frame: frame['OverallQual'].clip(lower=1, upper=10),
        LotFrontage=lambda frame: frame['LotFrontage'].fillna(lotfrontage_mean),
        MasVnrType=lambda frame: frame['MasVnrType'].fillna(masvnrtype_mode),
    )
    .pipe(pd.get_dummies, columns=categorical_cols)
    .assign(
        ExterQual=lambda frame: frame['ExterQual']
        .map(exterqual_mapping)
        .fillna(exterqual_mapping[exterqual_na]),
    )
)

# Save the final column order for test set alignment
final_columns = list(X_train.columns)

In [None]:
# 6. Preprocess test set using same parameters
# Same steps as the training set, re-using the imputation values and the
# ordinal mapping computed from the training data.
X_test = (
    X_test
    .assign(
        HouseAge=lambda frame: frame['YrSold'] - frame['YearBuilt'],
        OverallQual=lambda frame: frame['OverallQual'].clip(lower=1, upper=10),
        LotFrontage=lambda frame: frame['LotFrontage'].fillna(lotfrontage_mean),
        MasVnrType=lambda frame: frame['MasVnrType'].fillna(masvnrtype_mode),
    )
    .pipe(pd.get_dummies, columns=categorical_cols)
    .assign(
        ExterQual=lambda frame: frame['ExterQual']
        .map(exterqual_mapping)
        .fillna(exterqual_mapping[exterqual_na]),
    )
    # Align test columns to training columns: dummy columns absent from the
    # test split are added as 0, columns not seen in training are dropped.
    .reindex(columns=final_columns, fill_value=0)
)

In [None]:
# 7. Train model
# Random forest with 100 trees; the seed is fixed for reproducible fits.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
model.fit(X=X_train, y=y_train)

In [None]:
# 8. Evaluate model
# Predict on the held-out test set, then report R² and mean absolute error.
y_pred = model.predict(X_test)
r2, mae = (metric(y_test, y_pred) for metric in (r2_score, mean_absolute_error))
print(
    f"R² Score: {r2:.2f}",
    f"Mean Absolute Error: ${mae:,.0f}",
    sep="\n",
)

In [None]:
# 9. Save the model and the preprocessing parameters it depends on
# The estimator was fitted on transformed features, so the pickle alone is not
# enough to score raw rows later: the imputation values, the ordinal mapping
# and the exact dummy-column order must be reproduced at inference time.
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

# Plain-Python casts keep the payload JSON-serializable regardless of the
# numpy/pandas scalar types the values were computed as.
preprocessing_params = {
    "lotfrontage_mean": float(lotfrontage_mean),
    "masvnrtype_mode": str(masvnrtype_mode),
    "categorical_cols": list(categorical_cols),
    "exterqual_mapping": dict(exterqual_mapping),
    "exterqual_na": exterqual_na,
    "final_columns": [str(col) for col in final_columns],
}
with open("preprocessing.json", "w", encoding="utf-8") as f:
    json.dump(preprocessing_params, f, indent=2)