# **Modeling and Evaluation Notebook**

## Objectives

- Answer Business Requirement 2: train regression models to predict house sale prices
- Fit and evaluate a regression model to predict the sale price of a house

## Inputs

- outputs/datasets/feature_engineered/Train_FE.csv
- outputs/datasets/feature_engineered/Test_FE.csv
- Instructions on which variables to use for data cleaning and feature engineering. They are found in each respective notebook.

## Outputs

- Train set (features and target)
- Test set (features and target)
- ML pipeline to predict sale price
- Feature importance plot



---

# Change working directory

In [None]:
import os

# Step one level up from the current directory so that the relative
# "outputs/..." paths used below resolve from the project root.
# Note: every run of this cell moves up one more level.
dir_path = os.getcwd()
project_root = os.path.dirname(dir_path)
os.chdir(project_root)
print("Working dir:", os.getcwd())

## Load Feature Engineered Datasets

In [None]:
import pandas as pd

# Locations of the feature-engineered splits
train_path = "outputs/datasets/feature_engineered/Train_FE.csv"
test_path = "outputs/datasets/feature_engineered/Test_FE.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Separate the predictors from the "SalePrice" target column
X_train, y_train = train_df.drop(columns="SalePrice"), train_df["SalePrice"]
X_test, y_test = test_df.drop(columns="SalePrice"), test_df["SalePrice"]

print("* Train set:", X_train.shape, y_train.shape)
print("* Test set:", X_test.shape, y_test.shape)

---

## Import Libraries and Suppress Warnings

In [None]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    ExtraTreesRegressor,
)
from sklearn.linear_model import LinearRegression

### Load feature engineered datasets

In [None]:

# Reload the feature-engineered splits so this section can be run on its own.
# Each CSV is parsed once and then split into features and target.
train_df = pd.read_csv("outputs/datasets/feature_engineered/Train_FE.csv")
test_df = pd.read_csv("outputs/datasets/feature_engineered/Test_FE.csv")

X_train = train_df.drop("SalePrice", axis=1)
y_train = train_df["SalePrice"]

X_test = test_df.drop("SalePrice", axis=1)
y_test = test_df["SalePrice"]

Custom Class for Hyperparameter Optimization from Code Institute:

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate estimator and tabulate the CV scores.

    ``models`` maps a label to an estimator and ``params`` maps the same label
    to that estimator's parameter grid. Each estimator is wrapped with
    ``PipelineRgr`` before it is searched.
    """

    def __init__(self, models, params):
        """Store the candidate estimators and their parameter grids."""
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by the labels used in `models`
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every candidate and keep the fitted searches.

        Args:
            X: Training features.
            y: Training target.
            cv: Cross-validation setting forwarded to GridSearchCV.
            n_jobs: Parallelism setting forwarded to GridSearchCV.
            verbose: Verbosity forwarded to GridSearchCV.
            scoring: Scoring forwarded to GridSearchCV.
            refit: Accepted for interface compatibility only; it is not
                forwarded, so GridSearchCV's own default applies.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineRgr(self.models[key])

            params = self.params[key]
            gs = GridSearchCV(
                model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring
            )
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by="mean_score"):
        """Summarise the per-split test scores of every fitted search.

        Args:
            sort_by: Column used to sort the summary in descending order.

        Returns:
            A tuple ``(summary, grid_searches)``. ``summary`` is a DataFrame
            with one row per evaluated parameter combination; its columns are
            estimator, min/mean/max/std score, then the parameter columns.
            ``grid_searches`` is the dict of fitted searches.
        """
        # Local import: numpy is used here but not imported elsewhere in the notebook.
        import numpy as np

        def row(key, scores, params):
            d = {
                "estimator": key,
                "min_score": min(scores),
                "max_score": max(scores),
                "mean_score": np.mean(scores),
                "std_score": np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            cv_results = gs.cv_results_
            params = cv_results["params"]
            # Prefer the fold count recorded by the fitted search; fall back to
            # `cv`, which only works as a count when it was given as an int.
            n_splits = getattr(gs, "n_splits_", gs.cv)
            scores = [
                cv_results[f"split{i}_test_score"].reshape(len(params), 1)
                for i in range(n_splits)
            ]

            # One row per parameter combination, one column per split
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ["estimator", "min_score", "mean_score", "max_score", "std_score"]
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches

In [None]:
def PipelineRgr(model):
    """Return a pipeline that scales, selects features with `model`, then fits `model`."""
    steps = [
        ("scaler", StandardScaler()),
        # The same estimator drives the feature selection and the final fit
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

---

## Grid Search CV - SKlearn

Use standard hyperparameters to find the most suitable algorithm.

In [None]:
# Candidate regressors for the quick comparison, one per algorithm.
# random_state is fixed to 0 wherever the estimator accepts it.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

# Parameter grids keyed by the same labels as `models_quick_search`.
# The `model__` prefix addresses the "model" step of the pipeline built by
# PipelineRgr. An empty dict means the estimator is evaluated as constructed.
params_quick_search = {
    "LinearRegression": {},
    "DecisionTreeRegressor": {
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
        "model__max_leaf_nodes": [None, 50],
    },
    "RandomForestRegressor": {
        "model__n_estimators": [100, 50, 140],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
        "model__max_leaf_nodes": [None, 50],
    },
    "ExtraTreesRegressor": {
        "model__n_estimators": [100, 50, 150],
        "model__max_depth": [None, 3, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
    },
    "AdaBoostRegressor": {
        "model__n_estimators": [50, 25, 80, 150],
        "model__learning_rate": [1, 0.1, 2],
        "model__loss": ["linear", "square", "exponential"],
    },
    "GradientBoostingRegressor": {
        "model__n_estimators": [100, 50, 140],
        "model__learning_rate": [0.1, 0.01, 0.001],
        "model__max_depth": [3, 15, None],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
        "model__max_leaf_nodes": [None, 50],
    },
    "XGBRegressor": {
        "model__n_estimators": [30, 80, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.01, 0.1, 0.001],
        "model__gamma": [0, 0.1],
    },
}

Quick GridSearch CV - Regression

In [None]:
# Grid-search every quick-search candidate with 5-fold CV, scored by R²
search = HyperparameterOptimizationSearch(
    params=params_quick_search,
    models=models_quick_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring="r2")

In [None]:
# Tabulate the per-combination CV scores (best mean score first) and keep
# the fitted searches for later lookup by estimator label.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary

The result suggests that GradientBoostingRegressor gives the best result. Therefore, it will be explored in more detail.

#### Do an extensive search on the most suitable algorithm, i.e. GradientBoostingRegressor, to find the best hyperparameter configuration.

In [None]:
# Candidate for the extensive search: gradient boosting, the algorithm named
# in this section's heading and refitted as the final pipeline later in the
# notebook. (The cell previously searched ExtraTreesRegressor, which matched
# neither.)
models_search = {
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
}

# Finer grid than the quick search. The `model__` prefix addresses the
# "model" step of the pipeline built by PipelineRgr.
params_search = {
    "GradientBoostingRegressor": {
        "model__n_estimators": [75, 100, 140],
        "model__learning_rate": [0.1, 0.05, 0.01],
        "model__max_depth": [3, 5, 15],
        "model__min_samples_split": [2, 25, 50],
        "model__min_samples_leaf": [1, 10, 50],
        "model__max_leaf_nodes": [None, 50],
        "model__subsample": [0.8, 1.0],
    },
}

Extensive GridSearch CV - Regression

In [None]:
# Run the extensive grid search with 5-fold CV, scored by R²
search = HyperparameterOptimizationSearch(
    params=params_search,
    models=models_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring="r2")

Check results

In [None]:
# Score table of the extensive search (best mean score first) and the
# fitted searches keyed by estimator label.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary

Get best model name programmatically

In [None]:
# The summary is sorted best-first, so the first row names the winner
best_model = grid_search_summary["estimator"].iloc[0]
best_model

Parameters for best model

In [None]:
# Hyperparameter combination selected by the winning estimator's grid search
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best regressor pipeline

In [None]:
# Best pipeline found by the winning estimator's grid search
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

## Assess Feature Importance

In [None]:
# Preview the first three rows of the training features
X_train.head(3)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Boolean mask of the columns kept by the feature-selection step, and the
# importances the fitted model assigns to those columns.
selected_mask = best_regressor_pipeline["feat_selection"].get_support()
importances = best_regressor_pipeline["model"].feature_importances_

df_feature_importance = pd.DataFrame(
    {"Feature": X_train.columns[selected_mask], "Importance": importances}
)
df_feature_importance = df_feature_importance.sort_values(
    by="Importance", ascending=False
)

# Selected feature names, most important first
best_features = df_feature_importance["Feature"].to_list()

print(
    f"* These are the {len(best_features)} most important features in descending order. "
    f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}"
)

df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.show()

## Evaluate Pipeline on Train and Test Sets

Measure how close the model's predictions are to the actual sale prices using mean squared error (MSE), mean absolute error (MAE) and R².

In [None]:
# MSE
from sklearn.metrics import mean_squared_error

# Predictions of the tuned pipeline on the training features
y_pred_train = best_regressor_pipeline.predict(X_train)
print("Mean squared error")
# Compare against the predictions computed above (`predict` was never defined)
mean_squared_error(y_train, y_pred_train)

In [None]:
# MAE
from sklearn.metrics import mean_absolute_error

# Predictions of the tuned pipeline on the training features
y_pred_train = best_regressor_pipeline.predict(X_train)
print("Mean absolute error")
# Compare against the predictions computed above (`predict` was never defined)
mean_absolute_error(y_train, y_pred_train)

In [None]:
# R²
from sklearn.metrics import r2_score

# Predictions of the tuned pipeline on the training features
y_pred_train = best_regressor_pipeline.predict(X_train)
# Compare against the predictions computed above (`predict` was never defined)
r2_score(y_train, y_pred_train)

The R² score of 0.87+ on the training set is a very good score and indicates that the model fits the training data well.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of training-set predictions against the true sale prices
ax = sns.scatterplot(x=y_pred_train, y=y_train, alpha=0.5)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Predicted vs Actual - Training Set")
plt.show()

## Rewrite Pipeline with the best parameters

In [None]:
# Feature names kept by the feature-selection step, most important first
best_features

ML Pipeline for Modelling and Hyperparameter Optimization

In [None]:
def PipelineRgr(model):
    """Return a scale-then-fit pipeline for `model`.

    This redefinition omits the "feat_selection" step of the earlier version.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("model", model),
    ]
    return Pipeline(steps)

Update Train and Test Sets with best features

In [None]:
# Keep only the selected feature columns in both splits
X_train, X_test = (
    frame.filter(items=best_features) for frame in (X_train, X_test)
)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))
X_train.head(5)

In [None]:
# Label of the estimator that ranked first in the extensive search
best_model

In [None]:
# Hyperparameters selected by the extensive search
best_parameters

Now we only apply the best parameters:

In [None]:
# Single candidate for the final fit on the reduced feature set
models_search = {
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
}

# Every list holds exactly one value, so the grid search evaluates a single
# parameter combination.
# NOTE(review): these values are hard-coded rather than read from
# `best_parameters` - confirm they match the result of the extensive search.
params_search = {
    "GradientBoostingRegressor": {
        "model__learning_rate": [0.05],
        "model__max_depth": [5],
        "model__max_leaf_nodes": [50],
        "model__min_samples_leaf": [10],
        "model__min_samples_split": [25],
        "model__n_estimators": [75],
        "model__subsample": [0.8],
    }
}

In [None]:
# Fit the single final configuration with 5-fold CV, scored by R²
search = HyperparameterOptimizationSearch(
    params=params_search,
    models=models_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring="r2")

Fitting 5 folds for each of 1 candidates, totalling 5 fits

In [None]:
# Score table and fitted searches of the final single-configuration run
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary

Mean Score of 0.80 is still very good.

In [None]:
# Sanity-check the shapes of all four sets and preview the training features
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

In [None]:
# Re-derive the winner from the summary of the search that was just run.
# `best_model` may still hold a label from an earlier search that is not a
# key of the current `grid_search_pipelines`, which would raise a KeyError.
best_model = grid_search_summary.iloc[0, 0]
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

---

# Push files to Repo

The following files will be created and pushed to the repo:

- Train Set
- Test Set
- Modeling Pipeline
- Feature importance Plot

In [None]:
import joblib
import os

# Versioned output folder for everything this notebook exports
version = "v1"
file_path = f"outputs/ml_pipeline/predict_sale_price/{version}"

# exist_ok=True makes re-runs a no-op when the folder is already there, while
# genuine failures (e.g. missing permissions) still raise instead of being
# printed and ignored.
os.makedirs(file_path, exist_ok=True)

## Train Set: Features And Target

In [None]:
# Shape and first rows of the training features about to be saved
print(X_train.shape)
X_train.head()

In [None]:
# Write the training features to the output folder; index=False leaves the row index out of the CSV
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
# First values of the training target
y_train.head()

In [None]:
# Write the training target to the output folder; index=False leaves the row index out of the CSV
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set: Features And Target

In [None]:
# Shape and first rows of the test features about to be saved
print(X_test.shape)
X_test.head()

In [None]:
# Write the test features to the output folder; index=False leaves the row index out of the CSV
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
# First values of the test target
y_test.head()

In [None]:
# Write the test target to the output folder; index=False leaves the row index out of the CSV
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## Pipeline

In [None]:
# Display the pipeline that is about to be serialised
best_regressor_pipeline

In [None]:
# Serialise the fitted pipeline into the versioned output folder
joblib.dump(best_regressor_pipeline, f"{file_path}/best_regressor_pipeline.pkl")

## Feature Importance Plot

In [None]:
# Bar chart of the importances in `df_feature_importance`, shown inline
df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.show()

In [None]:
# Redraw the chart and save it to the output folder; savefig writes the
# current figure, i.e. the one created by the plot call just above.
df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.savefig(f"{file_path}/feature_importance.png", bbox_inches="tight")

In [None]:
import os

# Report the output folder and list what it now contains
print("Files saved to:", file_path)
print("Saved files:", os.listdir(file_path))

This completes the notebook.