In [27]:
# %load ./src/main.py
"""
Module for training a RandomForestRegressor model on the Ames dataset.

This module loads the Ames housing dataset, one-hot encodes the categorical
features, imputes and standardizes the numerical features, and trains a
RandomForestRegressor model. It performs 10-fold cross-validation to evaluate
model performance using RMSE, plots the feature importances of the model
fitted on the full dataset, and saves the fitted pipeline with joblib.

Usage:
    python main.py

Dependencies:
    - pandas
    - numpy
    - matplotlib
    - scikit-learn
    - joblib
"""

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the raw Ames housing data.
df = pd.read_csv("./raw_data/ames_unprocessed_data.csv")
print(df.shape)

TARGET_COL = "SalePrice"
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# `display` only exists inside IPython/Jupyter sessions. Fall back to `print`
# so the script also works when started with plain `python main.py`.
try:
    show = display  # noqa: F821
except NameError:
    show = print

# CREATE LISTS WITH NUMERICAL AND CATEGORICAL COLUMNS
# The names are taken from X (target already dropped), so neither list can
# reference a column that the transformer will not receive.
categorical_columns = X.select_dtypes(include=["object", "category"]).columns.tolist()
print(categorical_columns)

feature_columns = X.select_dtypes(include=["number"]).columns.tolist()
print(feature_columns)


# CREATE PREPROCESSING PIPELINES
# One-hot-encode categorical features. handle_unknown="ignore" encodes a
# category that was not seen during fit as all zeros instead of raising, which
# matters for rare levels that end up only in a validation fold or in new data.
cat_pipeline = Pipeline(
    [("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))]
)

# Impute and scale numerical features. The strategy is spelled out because the
# previous `fill_value=0` argument was ignored: fill_value only takes effect
# with strategy="constant", so missing values were (and still are) replaced by
# the column mean.
num_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)


# CREATE COLUMN TRANSFORMER

# The column transformer preprocesses the numerical and categorical features
# differently; columns in neither list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, feature_columns),
        ("cat", cat_pipeline, categorical_columns),
    ],
    remainder="passthrough",
)

# This fit is for inspection only.
X_trans = preprocessor.fit_transform(X)


# Get column names after preprocessing
column_names = preprocessor.get_feature_names_out()

# Create a DataFrame with the transformed data and new column names. Reusing
# X's index keeps the rows aligned with y when the target is inserted below.
X_trans_df = pd.DataFrame(X_trans, columns=column_names, index=X.index)

# Optional: inspect the first few rows
show(X_trans_df.head())

X_trans_df.insert(loc=0, column=TARGET_COL, value=y)
show(X_trans_df.head())

(1460, 21)
['MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle', 'PavedDrive']
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'Remodeled', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'Fireplaces', 'GarageArea']


Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__Remodeled,num__GrLivArea,num__BsmtFullBath,num__BsmtHalfBath,...,cat__HouseStyle_1.5Unf,cat__HouseStyle_1Story,cat__HouseStyle_2.5Fin,cat__HouseStyle_2.5Unf,cat__HouseStyle_2Story,cat__HouseStyle_SFoyer,cat__HouseStyle_SLvl,cat__PavedDrive_N,cat__PavedDrive_P,cat__PavedDrive_Y
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,-0.95446,0.370333,1.10781,-0.241061,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.95446,-0.482512,-0.819964,3.948809,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,1.047712,0.515013,1.10781,-0.241061,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,1.047712,0.383659,1.10781,-0.241061,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,-0.95446,1.299326,1.10781,-0.241061,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,SalePrice,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__Remodeled,num__GrLivArea,num__BsmtFullBath,...,cat__HouseStyle_1.5Unf,cat__HouseStyle_1Story,cat__HouseStyle_2.5Fin,cat__HouseStyle_2.5Unf,cat__HouseStyle_2Story,cat__HouseStyle_SFoyer,cat__HouseStyle_SLvl,cat__PavedDrive_N,cat__PavedDrive_P,cat__PavedDrive_Y
0,208500,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,-0.95446,0.370333,1.10781,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,181500,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.95446,-0.482512,-0.819964,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,223500,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,1.047712,0.515013,1.10781,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,140000,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,1.047712,0.383659,1.10781,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,250000,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,-0.95446,1.299326,1.10781,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# test the column transformer
# num cols have mean=0 and std=1
# x = preprocessor.fit_transform(df)
# df_trans = pd.DataFrame(x, columns=preprocessor.get_feature_names_out())
# display(df_trans.describe())
# print(df_trans.shape)

# Full modelling pipeline: preprocessing followed by a shallow random forest.
# A fixed random_state makes the reported RMSE, the feature importances and the
# saved model reproducible between runs.
steps = [
    ("preprocessor", preprocessor),
    ("clf", RandomForestRegressor(max_depth=5, random_state=42)),
]

rf_pipeline = Pipeline(steps)

# Cross-validate the model. The scorer returns negated MSE values (higher is
# better), so the sign is flipped before taking the square root.
cross_val_scores = cross_val_score(rf_pipeline, X, y, cv=10, scoring="neg_mean_squared_error")

# Print the 10-fold RMSE (mean of the per-fold RMSE values)
print("10-fold RMSE: ", np.mean(np.sqrt(-cross_val_scores)))

# Fit once on the full dataset; the importances and the saved model below come
# from this fit, not from the cross-validation folds.
rf_pipeline.fit(X, y)
# Get feature names after preprocessing
feature_names = rf_pipeline.named_steps["preprocessor"].get_feature_names_out()
# print(feature_names)
feature_importance = rf_pipeline.named_steps["clf"].feature_importances_


# Use a dedicated name so the loaded dataset in `df` is not overwritten.
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": feature_importance}
).sort_values(by="importance", ascending=False)

# Plot
plt.figure(figsize=(10, 6))
plt.barh(importance_df["feature"], importance_df["importance"], color="skyblue")
# The labels describe what is actually plotted: importances of the single
# full-data fit, not a cross-validated mean.
plt.xlabel("Feature Importance")
plt.title("Random Forest Regressor Feature Importances (Full-Data Fit)")
plt.gca().invert_yaxis()  # Highest at top
plt.tight_layout()
# plt.show()
plt.savefig("feature_importances.png")  # Save the figure as a PNG image

# Save the fitted pipeline, creating the target directory first so the dump
# does not fail on a fresh checkout.
model_path = Path("./saved_models/model_pipeline.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_pipeline, model_path)

# Later: Load it back
# NOTE: joblib files are pickles; only load files from a trusted source.
loaded_pipeline = joblib.load(model_path)

# Use it for prediction
# y_pred = loaded_pipeline.predict(X_new)