## Modelo Baseline

In [0]:
import sys
import os

sys.path.append(os.path.abspath("../"))

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from src import config
import mlflow

In [0]:
# Pull the modeling table from Spark into a pandas DataFrame.
prices_df = spark.sql("select * from lp_processed_modeling").toPandas()

# Object-typed columns are re-cast to the pandas "category" dtype in a single
# pass; the XGBoost estimators built on this frame use enable_categorical=True.
string_columns = prices_df.select_dtypes(include='object').columns
prices_df = prices_df.astype({name: "category" for name in string_columns})

# Second table, loaded as-is (its name suggests dummy-encoded features —
# NOTE(review): confirm against the upstream processing job).
prices_enc = spark.sql("select * from lp_processed_modeling_dummy").toPandas()

In [0]:
# Seed for the hold-out splits below. Without it every kernel restart produced
# a different train/test partition, so the grid-search results and the metrics
# computed on X_test / y_test later in the notebook could not be reproduced.
SPLIT_SEED = 42

# Categorical-dtype frame: separate features from the target, then split.
X, y = prices_df.drop(config.MODEL_CONFIG["TARGET"], axis=1), prices_df[config.MODEL_CONFIG["TARGET"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=config.MODEL_CONFIG["TEST_SIZE"], random_state=SPLIT_SEED
)

# Encoded frame: same procedure, same seed.
X_enc, y_enc = prices_enc.drop(config.MODEL_CONFIG["TARGET"], axis=1), prices_enc[config.MODEL_CONFIG["TARGET"]]
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
    X_enc, y_enc, test_size=config.MODEL_CONFIG["TEST_SIZE"], random_state=SPLIT_SEED
)

In [0]:
def test_model_without_encoding(prices_df, random_state=None):
    """Fit a default XGBoost regressor on the categorical-dtype frame and log the run to MLflow.

    The frame is split into train/test, the model is trained with XGBoost's
    native categorical support, and MAE / MAPE / RMSE on the test split are
    logged together with the model parameters, a prediction-vs-actual plus
    residual figure, and a feature-importance figure.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Modeling table containing the column named by
        ``config.MODEL_CONFIG["TARGET"]``; every other column is used as a feature.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    None
        Everything is recorded in the MLflow run named ``"xgb_categorical"``.
    """
    target = config.MODEL_CONFIG["TARGET"]
    X, y = prices_df.drop(target, axis=1), prices_df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=config.MODEL_CONFIG["TEST_SIZE"], random_state=random_state
    )

    with mlflow.start_run(run_name="xgb_categorical"):
        model = xgb.XGBRegressor(
            tree_method="hist", enable_categorical=True, max_cat_to_onehot=20, device="cpu"
        )

        # The eval set is the training data itself, so the per-iteration output
        # reports the training error only.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train)])

        predictions = model.predict(X_test)

        mae = mean_absolute_error(y_test, predictions)
        mape = mean_absolute_percentage_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        mlflow.log_metrics({
            "MAE": mae,
            "MAPE": mape,
            "RMSE": rmse
        })

        mlflow.log_params(model.get_params())

        fig1, axs = plt.subplots(1, 2, figsize=(16, 6))
        scatter_ax, resid_ax = axs

        # Left panel: predictions against actual values, with the y = x reference line.
        scatter_ax.axline((1, 1), slope=1, color='r', ls='--')
        scatter_ax.scatter(x=y_test, y=predictions)
        # Scale to percent *before* rounding so the title never shows float
        # artefacts such as 12.340000000000002%.
        scatter_ax.set_title(
            f"Scatter plot predictions evaluation\nMAE: {round(mae, 2)} | MAPE: {round(mape * 100, 2)}%"
        )
        scatter_ax.set_xlabel("Actual Price")
        scatter_ax.set_ylabel("Predicted price")
        scatter_ax.grid()

        # Right panel: residual histogram, its KDE, and the KDE of a normal
        # sample with the same standard deviation for visual comparison.
        residuals = y_test - predictions
        resid_ax.hist(residuals, density=True)
        resid_ax.set_title("Model Resid")
        resid_ax.set_xlabel("Residual")
        resid_ax.set_ylabel("Frequency")
        # Target the residual axes explicitly instead of relying on pyplot's
        # implicit "current axes".
        sns.kdeplot(residuals, label='Residual', ax=resid_ax)
        s = np.random.normal(0, np.std(residuals), 10000)
        sns.kdeplot(s, label='Normal', ax=resid_ax)

        resid_ax.grid()
        resid_ax.legend()
        mlflow.log_figure(fig1, "metrics_residual.png")
        plt.show()

        fig2, axs2 = plt.subplots()
        xgb.plot_importance(model, ax=axs2)
        mlflow.log_figure(fig2, "feature_importance.png")
        # Render the importance figure here rather than leaving it open until
        # some later, unrelated plt.show() call.
        plt.show()

In [0]:
def test_model_with_encoding(prices_enc, random_state=None):
    """Fit a default XGBoost regressor on the encoded frame and log the run to MLflow.

    The frame is split into train/test, the model is trained, and MAE / MAPE /
    RMSE on the test split are logged together with the model parameters, a
    prediction-vs-actual plus residual figure, and a feature-importance figure.

    Parameters
    ----------
    prices_enc : pandas.DataFrame
        Modeling table containing the column named by
        ``config.MODEL_CONFIG["TARGET"]``; every other column is used as a feature.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    None
        Everything is recorded in the MLflow run named ``"xgb_encoded"``.
    """
    target = config.MODEL_CONFIG["TARGET"]
    X, y = prices_enc.drop(target, axis=1), prices_enc[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=config.MODEL_CONFIG["TEST_SIZE"], random_state=random_state
    )

    with mlflow.start_run(run_name="xgb_encoded"):
        # NOTE(review): "logloss" is a classification metric and is probably not
        # meaningful for a price target; kept so logged params stay comparable
        # with earlier runs — consider replacing it with "rmse".
        model = xgb.XGBRegressor(
            tree_method="hist", device="cpu", eval_metric=["logloss", "mae"]
        )

        # The eval set is the training data itself, so the per-iteration output
        # reports the training error only.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train)])
        predictions = model.predict(X_test)

        mae = mean_absolute_error(y_test, predictions)
        mape = mean_absolute_percentage_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        mlflow.log_metrics({
            "MAE": mae,
            "MAPE": mape,
            "RMSE": rmse
        })

        mlflow.log_params(model.get_params())

        fig1, axs = plt.subplots(1, 2, figsize=(16, 6))
        scatter_ax, resid_ax = axs

        # Left panel: predictions against actual values, with the y = x reference line.
        # The labels must be this function's own y_test: the notebook-level
        # y_test_enc comes from a different random split, so its rows do not
        # correspond to `predictions`.
        scatter_ax.axline((1, 1), slope=1, color='r', ls='--')
        scatter_ax.scatter(x=y_test, y=predictions)
        # Scale to percent *before* rounding so the title never shows float
        # artefacts such as 12.340000000000002%.
        scatter_ax.set_title(
            f"Scatter plot predictions evaluation\nMAE: {round(mae, 2)} | MAPE: {round(mape * 100, 2)}%\nEncoded"
        )
        scatter_ax.set_xlabel("Actual Price")
        scatter_ax.set_ylabel("Predicted price")
        scatter_ax.grid()

        # Right panel: residual histogram, its KDE, and the KDE of a normal
        # sample with the same standard deviation for visual comparison.
        residuals_enc = y_test - predictions
        resid_ax.hist(residuals_enc, density=True)
        resid_ax.set_title("Model Resid\nEncoded")
        resid_ax.set_xlabel("Residual")
        resid_ax.set_ylabel("Frequency")
        # Target the residual axes explicitly instead of relying on pyplot's
        # implicit "current axes".
        sns.kdeplot(residuals_enc, label='Residual', ax=resid_ax)
        s = np.random.normal(0, np.std(residuals_enc), 10000)
        sns.kdeplot(s, label='Normal', ax=resid_ax)

        resid_ax.grid()
        resid_ax.legend()

        mlflow.log_figure(fig1, "metrics_residual_encoded.png")
        plt.show()

        fig2, axs2 = plt.subplots()
        xgb.plot_importance(model, ax=axs2)
        mlflow.log_figure(fig2, "feature_importance_encoded.png")
        # Render the importance figure here rather than leaving it open until
        # some later, unrelated plt.show() call.
        plt.show()

In [0]:
# Execute both baseline experiments. Each call opens its own MLflow run
# ("xgb_categorical" and "xgb_encoded") and logs metrics, params and figures.
test_model_without_encoding(prices_df=prices_df)
test_model_with_encoding(prices_enc=prices_enc)

### 3.3 Perform Hyperparameter Optimization

In [0]:
# Candidate values per hyperparameter for the exhaustive search.
# The full Cartesian product is 5*5*4*3*4*5*4*4 = 96,000 combinations.
# `None` entries are passed to the estimator unchanged; presumably XGBoost then
# falls back to its own default for that parameter — TODO confirm.
param_grid = dict(
    min_child_weight=[10, 12, 15, 20, None],
    gamma=[0.05, 0.1, 0.25, 0.5, None],
    subsample=[0.6, 0.8, 1.0, None],
    colsample_bytree=[0.8, 1.0, None],
    max_depth=[7, 10, 13, None],
    learning_rate=[0.01, 0.05, 0.1, 0.2, None],
    n_estimators=[40, 100, 200, None],
    reg_lambda=[1, 5, 10, None],
)

In [0]:
# Exhaustive 4-fold cross-validated search over `param_grid`, scored by
# negative MSE and parallelised over all available cores. The base estimator
# uses the same categorical-aware configuration as the baseline model.
# Every grid combination is fitted once per fold, so this cell can be very slow.
gridsearcher = GridSearchCV(
    XGBRegressor(
        device="cpu",
        tree_method="hist",
        enable_categorical=True,
        max_cat_to_onehot=20,
    ),
    param_grid,
    cv=4,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

In [0]:
# Report the winning combination and keep the full cross-validation table
# as a DataFrame for inspection in the following cell.
print('\n Best hyperparameters:', gridsearcher.best_params_, sep='\n')
results = pd.DataFrame(data=gridsearcher.cv_results_)

In [0]:
# Preview the first five rows of the cross-validation results table.
results.head(n=5)

In [0]:
# Show the estimator refitted with the best parameter combination.
print('\n Best estimator:', gridsearcher.best_estimator_, sep='\n')

In [0]:
# Final model with hand-picked hyperparameters.
# NOTE(review): these values are hard-coded rather than read from
# gridsearcher.best_params_, and they do not include reg_lambda although the
# grid above searches over it — confirm they are still the intended ones.
model = XGBRegressor(
    colsample_bytree=1.0,
    gamma=0.1,
    learning_rate=0.05,
    max_depth=10,
    min_child_weight=10,
    n_estimators=200,
    subsample=1.0,
    # Same categorical-aware setup as the grid-search estimator: X_train comes
    # from the frame whose text columns were cast to "category", which XGBoost
    # only accepts when enable_categorical=True.
    tree_method="hist",
    enable_categorical=True,
    max_cat_to_onehot=20,
    device="cpu",
    # NOTE(review): "logloss" is a classification metric and is probably not
    # meaningful for a price target; kept because the learning-curve cell
    # further down reads it from evals_result().
    eval_metric=["mae", "logloss"],
)

# X_val / y_val are not defined anywhere in the visible notebook, so the
# original call raised NameError. The held-out test split is used as the second
# eval set instead, matching the "Testing" curves plotted from evals_result().
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)]
)

In [0]:
# Score the fitted model on the held-out test split (X_test / y_test).
predictions = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions)
# Last expression of the cell: the estimator's own score on the test split.
model.score(X=X_test, y=y_test)

In [0]:
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, resid_ax = axs

# Left panel: predictions against actual values, with the y = x reference line.
scatter_ax.axline((1, 1), slope=1, color='r', ls='--')
scatter_ax.scatter(x=y_test, y=predictions)
# Scale to percent *before* rounding so the title never shows float artefacts
# such as 12.340000000000002%.
scatter_ax.set_title(
    f"Scatter plot predictions evaluation\nMAE: {round(mae, 2)} | MAPE: {round(mape * 100, 2)}%"
)
scatter_ax.set_xlabel("Actual Price")
scatter_ax.set_ylabel("Predicted price")
scatter_ax.grid()

# Right panel: residual histogram, its KDE, and the KDE of a normal sample with
# the same standard deviation for visual comparison.
residuals = y_test - predictions
resid_ax.hist(residuals, density=True)
resid_ax.set_title("Model Resid")
resid_ax.set_xlabel("Residual")
resid_ax.set_ylabel("Frequency")
# Target the residual axes explicitly instead of relying on pyplot's implicit
# "current axes".
sns.kdeplot(residuals, label='Residual', ax=resid_ax)
s = np.random.normal(0, np.std(residuals), 10000)
sns.kdeplot(s, label='Normal', ax=resid_ax)

resid_ax.grid()
resid_ax.legend()
plt.show()

In [0]:
# Learning curves: per-iteration metrics recorded during fit(), one entry per
# eval_set element ('validation_0' is read as the training curve and
# 'validation_1' as the testing curve).
# NOTE(review): "logloss" is a classification metric and is probably not
# meaningful for a price target — interpret the left panel with care.
learning_results = model.evals_result()

fig, axs = plt.subplots(1, 2, figsize=(14, 6))
for ax, metric, metric_label in zip(axs, ("logloss", "mae"), ("Logloss", "MAE")):
    ax.plot(learning_results['validation_0'][metric], label=f'Training {metric_label}')
    ax.plot(learning_results['validation_1'][metric], label=f'Testing {metric_label}')
    ax.set_xlabel("Iterations")
    # Each panel gets a y-label and title so the figure can be read on its own.
    ax.set_ylabel(metric_label)
    ax.set_title(f"{metric_label} per boosting iteration")
    ax.legend()

plt.show()