In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Building from yesterday

In [None]:
# Download the Ames housing table straight from the course repository (needs network access)
url = "https://raw.githubusercontent.com/pedroteche-ih/DAFT_MEX_202209/main/data/tb_ames_housing.csv"
tb_housing = pd.read_csv(url)
# Quick overview: column names, non-null counts and dtypes
tb_housing.info()


## NA Treatment

In [None]:
# Ten columns with the largest number of missing values
tb_housing.isna().sum().sort_values(ascending=False).head(10)


In [None]:
# Fill FireplaceQu for houses with no fireplace
tb_housing["FireplaceQu"] = tb_housing["FireplaceQu"].fillna("NF")
# Dropping columns with over 50% missing values
# (thresh is the minimum number of non-missing values a column needs to be kept)
tb_housing = tb_housing.dropna(axis=1, thresh=tb_housing.shape[0] * 0.5)


## Variable Transformation

In [None]:
def var_trans(data):
    """Return a copy of ``data`` with the engineered modelling columns added.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``SalePrice``, ``GrLivArea``, ``LotArea``,
        ``HeatingQC`` and ``MSZoning``.

    Returns
    -------
    pd.DataFrame
        A new frame holding every input column plus:

        * ``log_SalePrice`` / ``log_GrLivArea`` - natural logs of the raw columns
        * ``lot_ocuppation`` - ``GrLivArea + LotArea``
        * ``grp_HeatingQC`` - ``HeatingQC`` collapsed into "Good" / "Bad"
        * ``grp_MSZoning`` - ``MSZoning`` collapsed into "RL" / "RM" / "Other"

        The input frame is left untouched, so the function can be called
        repeatedly on the same raw table.
    """
    # Work on a copy so the caller's frame is never modified.
    data = data.copy()

    data["log_SalePrice"] = np.log(data["SalePrice"])
    data["log_GrLivArea"] = np.log(data["GrLivArea"])
    # NOTE(review): the name suggests a ratio (living area / lot area) but the
    # code adds the two areas - confirm which one is intended.
    data["lot_ocuppation"] = data["GrLivArea"] + data["LotArea"]

    heatingqc_group = {
        "Ex": "Good",
        "Gd": "Good",
        "TA": "Bad",
        "Fa": "Bad",
        "Po": "Bad",
    }
    # Codes missing from the mapping become NaN.
    data["grp_HeatingQC"] = data["HeatingQC"].map(heatingqc_group)

    # Keep the two named zones and lump every other value (including NaN) into "Other".
    mszoning_group = {"RL": "RL", "RM": "RM"}
    data["grp_MSZoning"] = data["MSZoning"].map(mszoning_group).fillna("Other")

    return data


In [None]:
# Add the engineered columns (logs, grouped categories) to the housing table
tb_housing_trans = var_trans(tb_housing)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
def transformer_fit(train_data, num_vars=None, cat_vars=None):
    """Fit a StandardScaler and a OneHotEncoder on the training features.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training features only (no target column).
    num_vars : list of str, optional
        Columns to standardise. Defaults to every numeric column of
        ``train_data``.
    cat_vars : list of str, optional
        Columns to one-hot encode. Defaults to every non-numeric column of
        ``train_data``.

    Returns
    -------
    tuple
        ``(scaler, ohe)`` - the fitted StandardScaler and OneHotEncoder. Each
        remembers the columns it was fitted on in ``feature_names_in_``.

    Notes
    -----
    Both column groups are expected to be non-empty.
    """
    # Derive the column groups from the frame itself, so that every feature
    # passed in reaches the model instead of a fixed list of five columns.
    if num_vars is None:
        num_vars = train_data.select_dtypes(include="number").columns.tolist()
    if cat_vars is None:
        cat_vars = train_data.select_dtypes(exclude="number").columns.tolist()

    # Fitting Scaler
    scaler = StandardScaler()
    scaler.fit(train_data[num_vars])

    # Fitting OneHotEncoder
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
    # removed; try the new spelling first and fall back for older releases.
    try:
        ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    except TypeError:
        ohe = OneHotEncoder(drop="first", sparse=False, handle_unknown="ignore")
    ohe.fit(train_data[cat_vars])

    return scaler, ohe


def transformer_apply(train_data, scaler, ohe):
    """Apply fitted transformers and return a single model-ready frame.

    Parameters
    ----------
    train_data : pd.DataFrame
        Features to transform (train or test split). Must contain every
        column the transformers were fitted on.
    scaler : StandardScaler
        Fitted scaler returned by ``transformer_fit``.
    ohe : OneHotEncoder
        Fitted encoder returned by ``transformer_fit``.

    Returns
    -------
    pd.DataFrame
        Scaled numeric columns (prefixed with ``sca_``) followed by the dummy
        columns, indexed like ``train_data`` so rows stay aligned with the
        target series.
    """
    # Use exactly the columns each transformer saw during fit.
    num_vars = list(scaler.feature_names_in_)
    cat_vars = list(ohe.feature_names_in_)

    # Applying scaler
    scaled_data = pd.DataFrame(
        scaler.transform(train_data[num_vars]),
        columns=["sca_" + name for name in num_vars],
        index=train_data.index,
    )

    # Applying OHE
    dummy_data = pd.DataFrame(
        ohe.transform(train_data[cat_vars]),
        columns=ohe.get_feature_names_out(),
        index=train_data.index,
    )

    return pd.concat([scaled_data, dummy_data], axis=1)


In [None]:
# Rebuild the engineered table and keep only the columns used by the model
tb_housing_trans = var_trans(tb_housing)
x_var = [
    "log_GrLivArea",
    "OverallQual",
    "lot_ocuppation",
    "grp_HeatingQC",
    "grp_MSZoning",
]
y_var = "log_SalePrice"
model_var = x_var + [y_var]
# Rows with a missing value in any modelling column are discarded
tb_housing_model = tb_housing_trans[model_var].dropna()

X = tb_housing_model[x_var]
y = tb_housing_model[y_var]
# Fixed seed: without it the split, and every metric computed below,
# changes on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Transformers are fitted on the training split only and then applied to both splits
scaler, ohe = transformer_fit(X_train)
X_train_trans = transformer_apply(X_train, scaler, ohe)
X_test_trans = transformer_apply(X_test, scaler, ohe)


In [None]:
# (rows, columns) of the untransformed training features
X_train.shape


# Non-Linear Techniques - Regression

## Baseline Model - Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Baseline: ordinary linear regression on the transformed features (target is log_SalePrice)
lm_fit = LinearRegression()
lm_fit.fit(X_train_trans, y_train)


In [None]:
# Results table: real and predicted log-prices side by side with the raw test features
y_pred = lm_fit.predict(X_test_trans)
tb_test = pd.concat(
    [pd.DataFrame({"y_real": y_test, "y_pred": y_pred}), X_test],
    axis=1,
)
# Undo the log transform so errors can be read in sale-price units
tb_test["SalePrice"] = np.exp(tb_test["y_real"])
tb_test["pred_lm_SalePrice"] = np.exp(tb_test["y_pred"])


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [None]:
# Baseline test RMSE in sale-price units (both columns were exponentiated back from the log scale)
np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_lm_SalePrice"]))


# kNN

**Hyper-parameters:**

1. `n_neighbors` - number of k nearest neighbors used to build model.
1. `weights` - `"uniform"` or `"distance"`, how neighbor values are weighed.

In [None]:
from sklearn.neighbors import KNeighborsRegressor


In [None]:
# 1-NN: each prediction copies the target of the single closest training point
knn_fit = KNeighborsRegressor(n_neighbors=1)
knn_fit.fit(X_train_trans, y_train)


In [None]:
# Store the 1-NN test predictions, converted from log-price back to price
y_pred = knn_fit.predict(X_test_trans)
tb_test["pred_1nn_SalePrice"] = np.exp(y_pred)


In [None]:
# Test RMSE of the 1-NN model, rounded to 2 decimals
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_1nn_SalePrice"])), 2
)


## Optimizing KNN

In [None]:
# Sweep the neighbourhood size with distance-weighted neighbours and report
# the test RMSE (in sale-price units) for each value of k.
# NOTE(review): k is compared on the test split itself.
n_k = [1, 5, 10, 25, 50]
for k in n_k:
    knn_fit = KNeighborsRegressor(n_neighbors=k, weights="distance")
    knn_fit.fit(X_train_trans, y_train)
    y_pred = knn_fit.predict(X_test_trans)
    error = np.sqrt(mean_squared_error(tb_test["SalePrice"], np.exp(y_pred)))
    error = np.round(error, 2)
    print(f"{k}-NN RMSE: {error}")


# Decision Trees

**Hyperparameters:**
1. `max_depth`: maximum depth of tree
1. `min_samples_leaf`: minimum number of training points at each leaf
1. `min_samples_split`: minimum number of training points a node must contain before it can be split

In [None]:
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Unconstrained tree: no limit on depth or leaf size is passed
dt_fit = DecisionTreeRegressor()
dt_fit.fit(X_train_trans, y_train)


In [None]:
# Store the tree's test predictions, converted from log-price back to price
y_pred = dt_fit.predict(X_test_trans)
tb_test["pred_dt_SalePrice"] = np.exp(y_pred)


In [None]:
# Test RMSE of the unconstrained tree, rounded to 2 decimals
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_dt_SalePrice"])), 2
)


## Checking for Overfitting

In [None]:
# Depth actually reached by the fitted tree
dt_fit.get_depth()


In [None]:
# RMSE on the training split (sale-price units); compare it with the test RMSE above
y_train_pred = dt_fit.predict(X_train_trans)
np.sqrt(mean_squared_error(np.exp(y_train), np.exp(y_train_pred)))


## Correcting Overfit Decision Trees

A decision tree's complexity is directly related to its risk of overfitting: more complex trees have a higher probability of overfitting than simpler trees.

Let's limit our tree's complexity using our 3 hyperparameters:

1. The deeper a tree (**larger `max_depth`**), the **more complex** the tree;
1. The smaller the leaves (**smaller `min_samples_leaf`**), the **more complex** the tree;
1. The smaller the branches (**smaller `min_samples_split`**), the **more complex** the tree;

The easiest way to reduce a tree's complexity is by training trees of smaller `max_depth`:

In [None]:
# Refit with the depth capped at 5 and recompute the test RMSE
dt_fit = DecisionTreeRegressor(max_depth=5)
dt_fit.fit(X_train_trans, y_train)
y_pred = dt_fit.predict(X_test_trans)
# Overwrites the predictions stored for the unconstrained tree
tb_test["pred_dt_SalePrice"] = np.exp(y_pred)
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_dt_SalePrice"])), 2
)


We can use **nested loops** to test different hyperparameters:

In [None]:
# Exhaustive sweep over the three complexity hyperparameters:
# 39 depths x 10 leaf sizes x 20 split sizes = 7800 fitted trees.
# NOTE(review): combinations are ranked on the test split itself, and the
# depth grid starts at 10, so shallower trees are never tried.
max_depth_list = range(10, 201, 5)
min_samples_leaf_list = range(1, 50, 5)
min_samples_split_list = range(2, 100, 5)
errors = []
for depth in max_depth_list:
    for leaf in min_samples_leaf_list:
        for split in min_samples_split_list:
            dt_fit = DecisionTreeRegressor(
                max_depth=depth, min_samples_leaf=leaf, min_samples_split=split
            )
            dt_fit.fit(X_train_trans, y_train)
            y_pred = dt_fit.predict(X_test_trans)
            # Score the predictions directly: writing them into tb_test on
            # every iteration would leave the results table holding the
            # predictions of whichever combination happened to run last.
            error = np.round(
                np.sqrt(mean_squared_error(tb_test["SalePrice"], np.exp(y_pred))), 2
            )
            errors.append((depth, leaf, split, error))


In [None]:
# One row per hyperparameter combination, lowest test RMSE first
pd.DataFrame(errors, columns=["depth", "leaf", "split", "error"]).sort_values("error")


## Visualizing the Tree


In [None]:
from sklearn.tree import plot_tree, export_graphviz

In [None]:
# Refit a constrained tree and draw it down to depth 3 only, to keep the figure readable
dt_fit = DecisionTreeRegressor(max_depth=6, min_samples_leaf=13, min_samples_split=28)
dt_fit.fit(X_train_trans, y_train)
plt.figure(figsize=(35, 10))
# Pass a plain list: some scikit-learn releases reject a pandas Index for feature_names
plot_tree(dt_fit, feature_names=list(X_train_trans.columns), max_depth=3);

In [None]:
# Write the fitted tree to a Graphviz .dot file in the working directory.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here.
export_graphviz(dt_fit, 'teste.dot', feature_names=list(X_train_trans.columns))

# Ensemble Methods

Alongside deep-learning methods, ensemble algorithms are currently the best tools available for predictive modelling. They have the precision associated with decision trees, while adopting several strategies to mitigate the overfitting risk.

Let's see the two main methods of ensemble regressors: bagging and boosting.

## Bagging

**RandomForestRegressor Hyperparameters**:

1. `n_estimators`: number of weak learners (the more weak learners, the more complex the model);
1. `max_depth`: complexity of each weak learner (the more complex each weak learner, the more complex the model); 

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Forest of 1000 depth-1 trees
rf_fit = RandomForestRegressor(n_estimators=1000, max_depth=1)  # RandomStumps
rf_fit.fit(X_train_trans, y_train)


In [None]:
# Predict with the random forest fitted above (not the earlier decision tree),
# then convert from log-price back to price
y_pred = rf_fit.predict(X_test_trans)
tb_test["pred_rf_SalePrice"] = np.exp(y_pred)


In [None]:
# Test RMSE computed from the pred_rf_SalePrice column, rounded to 2 decimals
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_rf_SalePrice"])), 2
)


## Optimizing w/ GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Search grid: 20 values of n_estimators x 10 values of max_depth = 200 combinations
hyper_parameters = {"n_estimators": range(10, 1001, 50), "max_depth": range(1, 20, 2)}
# Unfitted estimator that the grid search will clone for each combination
rf_fit = RandomForestRegressor()


In [None]:
# Cross-validated grid search on the training split only; n_jobs=-1 runs fits in parallel.
# This is the slowest cell so far: every grid combination is fitted once per CV fold.
cv_rf_fit = GridSearchCV(rf_fit, hyper_parameters, n_jobs=-1)
cv_rf_fit.fit(X_train_trans, y_train)


In [None]:
# Forest refitted with the best hyperparameter combination found by the search
cv_rf_fit.best_estimator_


In [None]:
# Test RMSE of the tuned forest (sale-price units); overwrites the earlier pred_rf_SalePrice column
y_pred = cv_rf_fit.predict(X_test_trans)
tb_test["pred_rf_SalePrice"] = np.exp(y_pred)
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_rf_SalePrice"])), 2
)


## Boosting

### Cat Boosting!

In [None]:
import catboost as cat


In [None]:
# Up to 20000 boosting iterations of depth-3 trees; od_type/od_wait configure
# CatBoost's overfitting detector, which monitors the eval_set.
# NOTE(review): the test split is passed as eval_set, so early stopping sees
# the same data that is later used to report the RMSE.
cat_fit = cat.CatBoostRegressor(iterations=20000, depth=3, od_type="Iter", od_wait=500)
cat_fit.fit(X_train_trans, y_train, eval_set=(X_test_trans, y_test))


In [None]:
# Test RMSE of the CatBoost model (sale-price units), rounded to 2 decimals
y_pred = cat_fit.predict(X_test_trans)
tb_test["pred_cat_SalePrice"] = np.exp(y_pred)
np.round(
    np.sqrt(mean_squared_error(tb_test["SalePrice"], tb_test["pred_cat_SalePrice"])), 2
)


# Including more Variables

In [None]:
# Numeric columns whose absolute correlation with SalePrice exceeds 0.2,
# strongest first. The correlation is computed on numeric columns only and the
# filter is applied by label before sorting, so the mask can never be
# misaligned with the column names.
(
    tb_housing_trans.select_dtypes(include="number")
    .corr()["SalePrice"]
    .abs()
    .loc[lambda corr: corr > 0.2]
    .sort_values(ascending=False)
    .index
)


In [None]:
# Same preparation as before, with a wider list of candidate features
tb_housing_trans = var_trans(tb_housing)
x_var = [
    "log_GrLivArea",
    "OverallQual",
    "LowQualFinSF",
    "GrLivArea",
    "BsmtFullBath",
    "BsmtHalfBath",
    "FullBath",
    "HalfBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageYrBlt",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "lot_ocuppation",
    "grp_HeatingQC",
    "grp_MSZoning",
]
y_var = "log_SalePrice"
model_var = x_var + [y_var]
# Rows with a missing value in any modelling column are discarded
tb_housing_model = tb_housing_trans[model_var].dropna()

X = tb_housing_model[x_var]
y = tb_housing_model[y_var]
# Fixed seed: without it the split, and every metric computed below,
# changes on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Transformers are fitted on the training split only and then applied to both splits
scaler, ohe = transformer_fit(X_train)
X_train_trans = transformer_apply(X_train, scaler, ohe)
X_test_trans = transformer_apply(X_test, scaler, ohe)


In [None]:
# Linear baseline refitted on the new split; test RMSE in sale-price units
lm_fit = LinearRegression()
lm_fit.fit(X_train_trans, y_train)
y_pred = lm_fit.predict(X_test_trans)
np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))

In [None]:
# Deeper CatBoost model (depth 8) with a longer overfitting-detector wait; training log silenced.
# NOTE(review): the test split is again used as eval_set, so early stopping
# sees the same data the RMSE below is computed on.
cat_fit = cat.CatBoostRegressor(iterations=20000, depth=8, od_type="Iter", od_wait=1500, verbose = False)
cat_fit.fit(X_train_trans, y_train, eval_set=(X_test_trans, y_test))
y_pred = cat_fit.predict(X_test_trans)
np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))