# Part #03: Model Validation

## Step #01: Feature Selection REVISITED!

### Set up the workspace

In [None]:
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
sns.set()
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold, learning_curve, RandomizedSearchCV, validation_curve
from xgboost import XGBRegressor

In [None]:
# Read the results from before:
with open("./assets/preprocessing_results.pkl", mode="rb") as file_bin:
    resulst = pickle.load(file_bin)

In [None]:
# Unpack train and test sets:
feature_names, X_train, X_test, y_train, y_test = resulst.values()

### Perform backward feature elimination to exclude less important features

In [None]:
# Wrap the raw train and test matrices in DataFrames labelled with the feature names:
X_train_df, X_test_df = (
    pd.DataFrame(data=split, columns=feature_names) for split in (X_train, X_test)
)

In [None]:
# Instantiate a candidate model (no arguments, so library defaults apply):
xgb_reg = XGBRegressor()

In [None]:
# Define the scoring functions.
# R-squared: higher is better.
r2 = make_scorer(r2_score, greater_is_better=True)


def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of `y_pred` against `y_true`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE: lower is better, so the scorer reports the negated value.
# The `squared=False` keyword of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and dropped in later releases, so the root is taken
# explicitly here; this works on old and new versions alike.
# NOTE(review): identical to `squared=False` for a single-output target;
# confirm y is 1-D.
rmse = make_scorer(_root_mean_squared_error, greater_is_better=False)

In [None]:
%%time
# Run the recipe script (presumably the one that defines compare_features),
# then compare the features with the default model, the R-squared scorer and cv=10:
%run "./recipes/compare_features.py";
results = compare_features(xgb_reg, X_train_df, y_train, r2, cv=10);

In [None]:
# Take the first `n_features` entries of the comparison index as irrelevant,
# but never discard "current_performance" even if it is among them:
n_features = 6
irrelevant_features = list(
    filter(lambda name: name != "current_performance", results.index[:n_features])
)
irrelevant_features

In [None]:
# Remove the irrelevant columns from the training frame:
X_train_df = X_train_df.drop(columns=irrelevant_features)

In [None]:
%%time
# Repeat the comparison on the features that remain after the drop
# (same model, scorer and cv=10 as before):
%run "./recipes/compare_features.py";
results = compare_features(xgb_reg, X_train_df, y_train, r2, cv=10);

In [None]:
# Remove the same columns from the test frame so it matches the training frame:
X_test_df = X_test_df.drop(columns=irrelevant_features)

In [None]:
# Turn both reduced frames back into float32 numpy arrays:
X_train, X_test = (
    frame.to_numpy(dtype=np.float32) for frame in (X_train_df, X_test_df)
)

In [None]:
# Keep the names of the surviving features for later use:
feature_names = X_train_df.columns.to_list()

## Step #02: Plotting the Learning Curve

In [None]:
%%time
# Perform CV on different sample sizes:
train_sizes, train_scores, test_scores = learning_curve(
    xgb_reg,
    X_train,
    y_train,
    train_sizes=np.linspace(start=0.2, stop=1.0, num=5),
    cv=10,
    scoring=r2,
    random_state=42
)

In [None]:
# Run the plotting recipe, then draw the learning curve from the CV scores above:
%run "./recipes/plot_learning_curve.py"
plot_learning_curve(train_sizes, train_scores, test_scores);

## Step #03: Plotting the Validation Curve(s)

In [None]:
# Start from a fresh default model for the n_estimators validation curve:
xgb_reg = XGBRegressor()

In [None]:
# Candidate n_estimators values: six evenly spaced integers from 50 to 300.
n_estimators = np.linspace(start=50, stop=300, num=6).astype(int).tolist()

In [None]:
%%time
# Perform 5-fold CV for each candidate n_estimators value (R-squared scorer):
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=n_estimators,
    cv=5,
    scoring=r2
)

In [None]:
# Run the plotting recipe, then draw the validation curve for n_estimators:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(n_estimators, train_scores, test_scores)

In [None]:
# Keep the n_estimators candidate with the highest mean validation score:
best_n_estimators = n_estimators[np.argmax(np.mean(test_scores, axis=1))]
best_n_estimators

In [None]:
# Start from a fresh default model for the learning_rate validation curve:
xgb_reg = XGBRegressor()

In [None]:
# Candidate learning_rate values: five evenly spaced floats from 0.1 to 0.5.
learning_rate = list(np.linspace(start=0.1, stop=0.5, num=5))

In [None]:
%%time
# Perform 5-fold CV for each candidate learning_rate value (R-squared scorer):
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="learning_rate",
    param_range=learning_rate,
    cv=5,
    scoring=r2
)

In [None]:
# Run the plotting recipe, then draw the validation curve for learning_rate:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(learning_rate, train_scores, test_scores)

In [None]:
# Keep the learning_rate candidate with the highest mean validation score:
best_learning_rate = learning_rate[np.argmax(np.mean(test_scores, axis=1))]
best_learning_rate

In [None]:
# Start from a fresh default model for the max_depth validation curve:
xgb_reg = XGBRegressor()

In [None]:
# Candidate max_depth values: five evenly spaced integers from 2 to 10.
max_depth = np.linspace(start=2, stop=10, num=5).astype(int).tolist()

In [None]:
%%time
# Perform 5-fold CV for each candidate max_depth value (R-squared scorer):
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="max_depth",
    param_range=max_depth,
    cv=5,
    scoring=r2
)

In [None]:
# Run the plotting recipe, then draw the validation curve for max_depth:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(max_depth, train_scores, test_scores)

In [None]:
# Keep the max_depth candidate with the highest mean validation score:
best_max_depth = max_depth[np.argmax(np.mean(test_scores, axis=1))]
best_max_depth

In [None]:
# Start from a fresh default model for the gamma validation curve:
xgb_reg = XGBRegressor()

In [None]:
# Candidate gamma values: ten evenly spaced floats from 0 to 5.
gamma = list(np.linspace(start=0, stop=5, num=10))

In [None]:
%%time
# Perform 5-fold CV for each candidate gamma value (R-squared scorer):
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="gamma",
    param_range=gamma,
    cv=5,
    scoring=r2
)

In [None]:
# Run the plotting recipe, then draw the validation curve for gamma:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(gamma, train_scores, test_scores)

In [None]:
# Keep the gamma candidate with the highest mean validation score:
best_gamma = gamma[np.argmax(np.mean(test_scores, axis=1))]
best_gamma

In [None]:
# Instantiate a default model; it is passed to compare_models below alongside the tuned one:
xgb_reg = XGBRegressor()

In [None]:
# Instantiate a model from the per-parameter winners of the validation curves.
# NOTE(review): each value was picked while the other parameters stayed at their
# defaults, so the combination is not guaranteed to be jointly optimal.
best_xgb_reg = XGBRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    gamma=best_gamma);

In [None]:
%%time
# Run the recipe script, then compare the default model with the hand-tuned one
# (R-squared scorer, cv=10):
%run "./recipes/compare_models.py";
results = compare_models(
    X_train,
    y_train,
    r2,
    cv=10,
    xgb_reg=xgb_reg,
    best_xgb_reg=best_xgb_reg
);

## Step #04: Tuning the Model Hyperparameters

### Perform a randomized search over a pre-defined hyperparameter distribution

In [None]:
# Instantiate a fresh default model to serve as the estimator of the randomized search:
xgb_reg = XGBRegressor()

In [None]:
# Search space for the randomized search: reuse the candidate lists that were
# built for the validation curves.
param_distributions = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    gamma=gamma,
)

In [None]:
# Configure a randomized search over param_distributions:
# 100 sampled combinations, 3-fold CV, R-squared scorer, fixed seed.
# Despite its name, `rand_cv_results` is the search object itself; it is fitted in the next cell.
rand_cv_results = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_distributions,
    n_iter=100,
    scoring=r2,
    cv=3,
    random_state=42
)

In [None]:
%%time
# Fit the RandomizedSearchCV object on the training data:
rand_cv_results.fit(X_train, y_train);

In [None]:
# Store and print the best parameter combination found by the randomized search:
rand_cv_best_params = rand_cv_results.best_params_
print(rand_cv_best_params)

In [None]:
# Build a model from the randomized-search winner and fit it on the full training set:
best_rand__xgb_reg = XGBRegressor(**rand_cv_best_params).fit(X_train, y_train)

In [None]:
# Score the randomized-search model on the held-out test set.
# NOTE(review): the test set is scored again after the Optuna search; using these
# numbers to choose between the two would bias the final estimate.
score = best_rand__xgb_reg.score(X_test, y_test)
print(score)

### Perform an optimized search over a pre-defined hyperparameter distribution

In [None]:
# Set the Optuna log level to WARNING only:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Define the objective function:
def objective(trial):
    """Return the mean 3-fold CV R-squared of an XGBRegressor sampled from `trial`.

    Nine hyperparameters are drawn from the trial, a regressor is built from
    them and cross-validated on the module-level `X_train` / `y_train` with
    the `r2` scorer. The mean of the fold scores is returned.
    """
    # Draw one candidate configuration; the ranges are fixed here.
    sampled = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        gamma=trial.suggest_float('gamma', 0.0, 1.0),
        subsample=trial.suggest_float('subsample', 0.1, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
    )

    # Cross-validate the candidate and summarise the folds with their mean.
    candidate = XGBRegressor(**sampled)
    fold_scores = cross_val_score(candidate, X_train, y_train, scoring=r2, cv=3)
    return fold_scores.mean()

In [None]:
# Create a study that maximizes the objective.
# The sampler is seeded so repeated runs propose the same trials, in line with
# the fixed random_state used for the randomized search. TPESampler is Optuna's
# default sampler, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
);

In [None]:
%%time
# Run the study for 50 trials of the objective:
study.optimize(objective, n_trials=50)

In [None]:
# Store and print the best parameters found by the Optuna study:
optim_cv_best_params = study.best_params
print(optim_cv_best_params)

In [None]:
# Build a model from the Optuna winner and fit it on the full training set:
optimized_xgb_reg = XGBRegressor(**optim_cv_best_params).fit(X_train, y_train)

In [None]:
# Score the Optuna-tuned model on the held-out test set.
# NOTE(review): the same test set was already scored for the randomized-search model;
# choosing between the two from these numbers would bias the final estimate.
score = optimized_xgb_reg.score(X_test, y_test)
print(score)

## Step #05: Final Model Evaluation

In [None]:
# Instantiate the best model so far.
# Both searches report a mean 3-fold CV R-squared on the training data, so the
# winner is picked from those scores instead of hard-coding one search's
# parameters. Ties go to the Optuna result.
best_model_so_far = XGBRegressor(
    **(
        optim_cv_best_params
        if study.best_value >= rand_cv_results.best_score_
        else rand_cv_best_params
    )
)

In [None]:
# Fit the selected model on the full training set:
best_model_so_far.fit(X_train, y_train)

In [None]:
# Evaluate the model on the test set with the r2 scorer and print it as a percentage:
best_r2_score = r2(best_model_so_far, X_test, y_test)
print(f"R-squared = {best_r2_score:.2%}")

In [None]:
# Evaluate the model on the test set with the rmse scorer.
# NOTE: `rmse` is built with greater_is_better=False, so scikit-learn sign-flips
# the metric; the value stored here is the negated RMSE.
best_rmse_score = rmse(best_model_so_far, X_test, y_test)