# Part #04: Model Validation

## Step #01: Feature Selection REVISITED!

### Set up the workspace

In [None]:
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
sns.set()
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold, learning_curve, RandomizedSearchCV, validation_curve
from xgboost import XGBRegressor

In [None]:
# Read the results from before:
with open("./assets/preprocessing_results.pkl", mode="rb") as file_bin:
    resulst = pickle.load(file_bin)

In [None]:
# Unpack train and test sets.
# NOTE(review): this positional unpack relies on the pickled dict yielding its
# values in exactly this order (feature_names, X_train, X_test, y_train, y_test)
# -- presumably the order in which the earlier notebook inserted them; verify
# against the code that wrote "preprocessing_results.pkl".
feature_names, X_train, X_test, y_train, y_test = resulst.values()

### Perform backward feature elimination to exclude less important features

In [None]:
# Recreate the df for training & test sets:
X_train_df = pd.DataFrame(data=X_train, columns=feature_names)
X_test_df = pd.DataFrame(data=X_test, columns=feature_names)

In [None]:
# Instantiate a candidate model:
xgb_reg = XGBRegressor()

In [None]:
# Define the scoring functions.
# R-squared: higher is better, so the scorer returns the metric unchanged.
r2 = make_scorer(r2_score, greater_is_better=True)


def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE: lower is better, so `greater_is_better=False` makes the scorer return
# the NEGATED error (flip the sign again before reporting it).
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = make_scorer(_root_mean_squared_error, greater_is_better=False)

In [None]:
%%time
# Compare different features:
%run "./recipes/compare_features.py";
results = compare_features(xgb_reg, X_train_df, y_train, r2, cv=10);

In [None]:
# Determine irrelevant features:
n_features = 6
irrelevant_features = [f for f in results.index[:n_features] if f != "current_performance"]
irrelevant_features

In [None]:
# Drop irrelevant features:
X_train_df = X_train_df.drop(irrelevant_features, axis=1)

In [None]:
%%time
# Compare remaining features:
%run "./recipes/compare_features.py";
results = compare_features(xgb_reg, X_train_df, y_train, r2, cv=10);

In [None]:
# Drop irrelevant features from test set:
X_test_df = X_test_df.drop(irrelevant_features, axis=1)

In [None]:
# Convert train and test set into numpy arrays:
X_train = X_train_df.to_numpy(dtype=np.float32)
X_test = X_test_df.to_numpy(dtype=np.float32)

In [None]:
# Save feature names for later:
feature_names = X_train_df.columns.tolist()

## Step #02: Plotting the Learning Curve

In [None]:
%%time
# Perform CV on different sample sizes.
# `shuffle=True` is what makes `random_state` take effect: without it the seed
# is ignored and the training prefixes are taken in the stored row order.
train_sizes, train_scores, test_scores = learning_curve(
    xgb_reg,
    X_train,
    y_train,
    train_sizes=np.linspace(start=0.2, stop=1.0, num=5),
    cv=10,
    scoring=r2,
    shuffle=True,
    random_state=42
)

In [None]:
# Plot the learning curve:
%run "./recipes/plot_learning_curve.py"
plot_learning_curve(train_sizes, train_scores, test_scores);

## Step #03: Plotting the Validation Curve(s)

### Tune `max_depth`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(random_state=42);

In [None]:
# Candidate tree depths to sweep: every integer from 1 through 10.
max_depth = list(range(1, 11))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="max_depth",
    param_range=max_depth,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(max_depth, train_scores, test_scores)

In [None]:
best_max_depth = max_depth[test_scores.mean(axis=1).argmax()]
best_max_depth

### Tune `learning_rate`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    random_state=42
);

In [None]:
# Candidate learning rates to sweep: five evenly spaced values in [0.01, 0.1].
learning_rate = list(np.linspace(start=0.01, stop=0.1, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="learning_rate",
    param_range=learning_rate,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(learning_rate, train_scores, test_scores)

In [None]:
best_learning_rate = learning_rate[test_scores.mean(axis=1).argmax()]
best_learning_rate

### Tune `n_estimators`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    random_state=42
);

In [None]:
# Candidate numbers of boosting rounds: five evenly spaced integers from 100 to 1000.
n_estimators = np.linspace(start=100, stop=1000, num=5).astype(int).tolist()

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=n_estimators,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(n_estimators, train_scores, test_scores)

In [None]:
best_n_estimators = n_estimators[test_scores.mean(axis=1).argmax()]
best_n_estimators

### Tune `min_child_weight`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    random_state=42
);

In [None]:
# Candidate minimum child weights to sweep: five evenly spaced values in [1, 10].
min_child_weight = list(np.linspace(start=1, stop=10, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="min_child_weight",
    param_range=min_child_weight,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(min_child_weight, train_scores, test_scores)

In [None]:
best_min_child_weight = min_child_weight[test_scores.mean(axis=1).argmax()]
best_min_child_weight

### Tune `gamma`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    random_state=42
);

In [None]:
# Candidate gamma values to sweep: five evenly spaced values in [0, 1].
gamma = list(np.linspace(start=0.0, stop=1.0, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="gamma",
    param_range=gamma,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(gamma, train_scores, test_scores)

In [None]:
best_gamma = gamma[test_scores.mean(axis=1).argmax()]
best_gamma

### Tune `subsample`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    random_state=42
);

In [None]:
# Candidate row-subsampling ratios to sweep: five evenly spaced values in [0.1, 1.0].
subsample = list(np.linspace(start=0.1, stop=1.0, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="subsample",
    param_range=subsample,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(subsample, train_scores, test_scores)

In [None]:
best_subsample = subsample[test_scores.mean(axis=1).argmax()]
best_subsample

### Tune `colsample_bytree`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    random_state=42
);

In [None]:
# Candidate column-subsampling ratios to sweep: five evenly spaced values in [0.1, 1.0].
colsample_bytree = list(np.linspace(start=0.1, stop=1.0, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="colsample_bytree",
    param_range=colsample_bytree,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(colsample_bytree, train_scores, test_scores)

In [None]:
best_colsample_bytree = colsample_bytree[test_scores.mean(axis=1).argmax()]
best_colsample_bytree

### Tune `reg_alpha`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    random_state=42
);

In [None]:
# Candidate L1 regularization strengths to sweep: five evenly spaced values in [0, 1].
reg_alpha = list(np.linspace(start=0.0, stop=1.0, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="reg_alpha",
    param_range=reg_alpha,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(reg_alpha, train_scores, test_scores)

In [None]:
best_reg_alpha = reg_alpha[test_scores.mean(axis=1).argmax()]
best_reg_alpha

### Tune `reg_lambda`

In [None]:
# Instantiate the model:
xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    random_state=42
);

In [None]:
# Candidate L2 regularization strengths to sweep: five evenly spaced values in [0, 1].
reg_lambda = list(np.linspace(start=0.0, stop=1.0, num=5))

In [None]:
%%time
# Perform CV using different hyperparameter values:
train_scores, test_scores = validation_curve(
    xgb_reg,
    X_train,
    y_train,
    param_name="reg_lambda",
    param_range=reg_lambda,
    cv=3,
    scoring=r2
)

In [None]:
# Plot the validation curve:
%run "./recipes//plot_validation_curve.py"
plot_validation_curve(reg_lambda, train_scores, test_scores)

In [None]:
best_reg_lambda = reg_lambda[test_scores.mean(axis=1).argmax()]
best_reg_lambda

### Evaluate the model using the best-found hyperparameters (flawed but intuitive approach)

In [None]:
# Instantiate the xgb_reg baseline model:
xgb_reg = XGBRegressor(random_state=42);

In [None]:
# Instantiate the better xgb_reg model:
better_xgb_reg = XGBRegressor(
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    random_state=42
);

In [None]:
%%time
# Compare different models:
%run "./recipes/compare_models.py";
results = compare_models(
    X_train,
    y_train,
    r2,
    cv=5,
    xgb_reg=xgb_reg,
    better_xgb_reg=better_xgb_reg
);

## Step #04: Tuning the Model Hyperparameters

In [None]:
# Set optuna log level to WARNING only:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Define the objective function:
def objective(trial):
    """Score one Optuna trial by the mean 3-fold CV R-squared of an XGBRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one candidate value per hyperparameter.

    Returns
    -------
    float
        Mean of the cross-validated `r2` scores obtained on
        (X_train, y_train) with the sampled hyperparameters.
    """
    # Sample one candidate value per hyperparameter:
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    # Cross-validate a fresh, seeded model. The local names deliberately avoid
    # `xgb_reg` and `r2_score`, which already refer to the notebook-level
    # baseline model and to the imported sklearn metric respectively.
    candidate = XGBRegressor(random_state=42, **params)
    cv_scores = cross_val_score(candidate, X_train, y_train, scoring=r2, cv=3)

    # The trial's value is the average score over the folds:
    return cv_scores.mean()

In [None]:
# Create a study.
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of trials, in line with the fixed `random_state=42` used for the models:
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

In [None]:
%%time
# Run the study:
study.optimize(objective, n_trials=100)

In [None]:
# Print the best parameters:
opt_cv_best_params = study.best_params
print(opt_cv_best_params)

In [None]:
# Train the model with the best parameters:
hpefully_best_model = XGBRegressor(**opt_cv_best_params)
hpefully_best_model.fit(X_train, y_train);

In [None]:
%%time
# Compare different models:
%run "./recipes/compare_models.py";
results = compare_models(
    X_train,
    y_train,
    r2,
    cv=5,
    xgb_reg=xgb_reg,
    better_xgb_reg=better_xgb_reg,
    hpefully_best_model
);

## Step #05: Final Model Evaluation

In [None]:
# Instantiate the best model so far.
# `random_state=42` matches the seed used inside `objective`, so the model
# evaluated on the test set is the same configuration that Optuna scored:
best_model_so_far = XGBRegressor(random_state=42, **opt_cv_best_params)

In [None]:
# Fit the model:
best_model_so_far.fit(X_train, y_train)

In [None]:
# Evaluate the model using r2:
best_r2_score = r2(best_model_so_far, X_test, y_test)
print(f"R-squared = {best_r2_score:.2%}")

In [None]:
# Evaluate the model using rmse.
# The `rmse` scorer is built with `greater_is_better=False`, so it returns the
# NEGATED error; flip the sign back to report a positive RMSE.
best_rmse_score = -rmse(best_model_so_far, X_test, y_test)
# NOTE(review): the percent format only reads correctly if the target is a
# fraction -- confirm the target's units.
print(f"RMSE = {best_rmse_score:.2%}")

## Step #06: Saving Results

In [None]:
# Create a dictionary to hold the results:
feature_elimination_results = {
    "feature_names": feature_names,
    "X_train": X_train.astype(np.float32),
    "X_test": X_test.astype(np.float32),
    "y_train": y_train.astype(np.float32),
    "y_test": y_test.astype(np.float32)
}

In [None]:
# Pickle the results:
with open("./assets/feature_elimination_results.pkl", mode="wb") as file_bin:
    pickle.dump(feature_elimination_results, file_bin)

In [None]:
# Save the model:
best_model_so_far.save_model("./assets/best_model_so_far.json")

In [None]:
# Load the saved model:
saved_model = XGBRegressor()
saved_model.load_model("./assets/best_model_so_far.json");