# **Modeling and Evaluation**

## Objectives

- Answer Business Requirement 2: train regression models to predict house sale prices
- Compare baseline algorithms with cross-validation
- Tune the best candidate model with GridSearchCV
- Evaluate final model performance (learning curves, residuals)
- Inspect feature importances (permutation and tree-based)
- Generate predictions for Lydia’s four inherited houses


## Inputs
- outputs/datasets/engineered/TrainSetEngineered.csv
- outputs/datasets/engineered/TestSetEngineered.csv
- outputs/datasets/collection/InheritedHouses.csv

## Outputs
- Model comparison table (CV RMSE, R², MAE)
- Final tuned pipeline saved to `outputs/ml_pipeline/predict_price/predict_price_pipeline_v1.pkl`
- Feature importance plots under `docs/plots`
- Learning and residual plots
- Predicted sale prices for inherited homes


---

## Change Working Directory

We need to change the working directory from its current folder to its parent folder

- We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running from.
current_dir = os.getcwd()
# Bare expression as the last line of the cell, so the notebook displays the path.
current_dir

We want to make the parent of the current directory the new current directory.

- os.path.dirname() gets the parent directory
- os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the recorded directory becomes the working directory,
# which is what the relative "outputs/..." and "docs/..." paths used later are resolved against.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory after the change so the stored value is up to date.
current_dir = os.getcwd()
# Bare expression as the last line of the cell, so the notebook displays the new path.
current_dir

---

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    learning_curve,
    validation_curve,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance

# regressors to compare
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    ExtraTreesRegressor,
)
from xgboost import XGBRegressor

# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set_style("whitegrid")

## Load Engineered Data

In [None]:
# Locations of the engineered train/test splits, relative to the working directory.
train_csv = "outputs/datasets/engineered/TrainSetEngineered.csv"
test_csv = "outputs/datasets/engineered/TestSetEngineered.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Quick sanity check: report (rows, columns) of each split.
for label, frame in (("Train shape:", train), ("Test shape: ", test)):
    print(label, frame.shape)

## Split Features and Target

In [None]:
# Name of the column to predict; every other column is used as a feature.
target = "SalePrice"

# Separate each split into its feature matrix (X) and target vector (y).
X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

## Baseline Model Comparison with Cross-Validation

Evaluate multiple models with 5-fold CV on training set

In [None]:
from sklearn.model_selection import cross_validate

# Candidate regressors; every stochastic one is seeded so the comparison is reproducible.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    # `use_label_encoder` is a classifier-only option that XGBoost deprecated and then
    # removed; it has no meaning for a regressor, so it is not passed here.
    "XGB": XGBRegressor(random_state=42, eval_metric="rmse"),
}

# Short metric name -> scikit-learn scorer. The error scorers are negated by
# scikit-learn ("greater is better"), so their sign is flipped back below.
scoring = {
    "rmse": "neg_root_mean_squared_error",
    "r2": "r2",
    "mae": "neg_mean_absolute_error",
}

results = []
for name, model in models.items():
    pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])  # scale for LR
    # A single 5-fold CV run scores all three metrics on the same fitted folds,
    # instead of refitting every fold once per metric.
    cv_scores = cross_validate(
        pipe, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1
    )
    rmse_scores = -cv_scores["test_rmse"]
    r2_scores = cv_scores["test_r2"]
    mae_scores = -cv_scores["test_mae"]
    results.append(
        {
            "Model": name,
            "RMSE_Mean": rmse_scores.mean(),
            "R2_Mean": r2_scores.mean(),
            "MAE_Mean": mae_scores.mean(),
        }
    )

# Best (lowest mean CV RMSE) model first; the tuning step picks row 0.
df_results = pd.DataFrame(results).sort_values("RMSE_Mean")
print(df_results)

## Select Best Candidate & Hyperparameter Tuning

In [None]:

# Candidate hyperparameters, written with tree ensembles in mind.
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5, 10],
}

# The comparison table is sorted by mean CV RMSE, so row 0 is the best candidate.
best_name = df_results.iloc[0]["Model"]
best_model = models[best_name]

# The winner is only known at run time and not every candidate accepts every
# setting above (e.g. LinearRegression accepts none of them). Keep only the
# entries this estimator supports so GridSearchCV does not fail with
# "Invalid parameter". An empty grid simply evaluates the default settings once.
supported_params = best_model.get_params()
param_grid = {
    key: values
    for key, values in param_grid.items()
    if key.partition("__")[2] in supported_params
}

# NOTE(review): the CV comparison wrapped each model with a StandardScaler, while
# this pipeline holds the model only - confirm that the difference is intended.
pipe = Pipeline([("model", best_model)])
grid = GridSearchCV(
    pipe, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=1
)
grid.fit(X_train, y_train)

print("Best params for", best_name, ":", grid.best_params_)

## Evaluate Final Model

In [None]:
def evaluate_performance(pipe, Xtr, ytr, Xte, yte):
    """Print RMSE, R^2 and MAE of ``pipe`` for the train and test sets.

    Args:
        pipe: Fitted estimator or pipeline; only its ``predict`` method is used.
        Xtr: Training features passed to ``pipe.predict``.
        ytr: True training targets.
        Xte: Test features passed to ``pipe.predict``.
        yte: True test targets.

    Returns:
        None. One line per split is written to standard output.
    """
    # Both splits are predicted up front, before anything is printed.
    splits = {
        "Train": (ytr, pipe.predict(Xtr)),
        "Test": (yte, pipe.predict(Xte)),
    }
    for name, (actual, predicted) in splits.items():
        # RMSE is the square root of the mean squared error.
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        r2 = r2_score(actual, predicted)
        mae = mean_absolute_error(actual, predicted)
        print(f"{name} RMSE: {rmse:.2f}, R2: {r2:.3f}, MAE: {mae:.2f}")


# Use the best estimator found by the grid search as the final pipeline.
final_pipe = grid.best_estimator_
# Print train and test metrics; comparing the two shows how well the model generalises.
evaluate_performance(final_pipe, X_train, y_train, X_test, y_test)

## Learning Curve

In [None]:
# Refit the final pipeline on growing fractions (10% .. 100%, 5 steps) of the
# training data and score every fit with 5-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    final_pipe,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated RMSE; flip the sign to get plain RMSE values.
train_rmse, test_rmse = -train_scores, -test_scores

plt.figure(figsize=(8, 5))
# One line per curve, averaged over the CV folds (axis 1).
for fold_rmse, curve_label in ((train_rmse, "Train RMSE"), (test_rmse, "CV RMSE")):
    plt.plot(train_sizes, fold_rmse.mean(axis=1), "o-", label=curve_label)
plt.xlabel("Training Examples")
plt.ylabel("RMSE")
plt.title("Learning Curve")
plt.legend()

# Create the plot folder if needed, then save before showing the figure.
plots_dir = Path("docs/plots")
plots_dir.mkdir(parents=True, exist_ok=True)
plt.savefig("docs/plots/learning_curve.png", bbox_inches="tight")
plt.show()

## Residual Plot (Test)

In [None]:
# Residual = actual price minus predicted price on the held-out test set.
residuals = y_test - final_pipe.predict(X_test)
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True)
plt.title("Residual Distribution (Test)")
plt.xlabel("Residual")
# Create the plot folder here as well, so this cell does not depend on another
# cell having created it first.
Path("docs/plots").mkdir(parents=True, exist_ok=True)
plt.savefig("docs/plots/residuals.png", bbox_inches="tight")
plt.show()

## Feature Importances

Compare tree-based importances and permutation importances

In [None]:
feat_names = X_train.columns
final_model = final_pipe.named_steps["model"]
# Create the plot folder here so this cell does not depend on an earlier cell.
Path("docs/plots").mkdir(parents=True, exist_ok=True)

# Tree-based: only available for estimators that expose `feature_importances_`.
if hasattr(final_model, "feature_importances_"):
    tree_imp = final_model.feature_importances_
    df_tree = pd.Series(tree_imp, index=feat_names).sort_values(ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x=df_tree.head(20).values, y=df_tree.head(20).index)
    plt.title("Top 20 Tree-based Feature Importances")
    plt.savefig("docs/plots/feature_importances_tree.png", bbox_inches="tight")
    plt.show()

# Permutation: model-agnostic, so it is computed for every estimator and is not
# tied to the `feature_importances_` check above. Scored on the test set.
perm = permutation_importance(
    final_pipe, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
df_perm = pd.Series(perm.importances_mean, index=feat_names).sort_values(
    ascending=False
)
plt.figure(figsize=(8, 6))
sns.barplot(x=df_perm.head(20).values, y=df_perm.head(20).index)
plt.title("Top 20 Permutation Importances")
plt.savefig("docs/plots/feature_importances_perm.png", bbox_inches="tight")
plt.show()

## Predict Inherited Houses

In [None]:
inherited_path = Path("outputs/datasets/collection/InheritedHouses.csv")
inherited = pd.read_csv(inherited_path)

# Ensure same columns and preprocessing: the pipeline was fitted on the engineered
# training features, so fail early with a clear message if any of them is absent.
missing_cols = [col for col in X_train.columns if col not in inherited.columns]
if missing_cols:
    raise ValueError(
        f"{inherited_path} is missing features the model was trained on: "
        f"{missing_cols}. Apply the same feature engineering as the train set first."
    )

# Select the training features in training order; any extra columns are ignored.
preds_inh = final_pipe.predict(inherited[X_train.columns])
print("Predicted sale prices for inherited houses:")
for i, p in enumerate(preds_inh, 1):
    print(f"  House {i}: ${p:,.0f}")
# Round the total to whole dollars, matching the per-house lines above.
print(f"Total estimated value: ${preds_inh.sum():,.0f}")

## Save Final Pipeline

In [None]:
import joblib

# Destination of the serialized pipeline (versioned file name).
output_path = Path("outputs/ml_pipeline/predict_price/predict_price_pipeline_v1.pkl")

# Create the target folder (and any missing parents) before writing the file.
output_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(final_pipe, output_path)
print(f"Saved final pipeline to: {output_path}")

---

## Summary and Next Steps

**Summary**:
- The tuned best model (the one stored in `best_name`) met the Test RMSE and R² requirements.
- Learning curves suggest overfitting if the CV RMSE (`test_rmse`) stays above the train RMSE (`train_rmse`); otherwise they suggest a good fit.
- Key predictors include top features from permutation importances.

**Next:** integrate `predict_price_pipeline_v1.pkl` into Streamlit app for deployment.