# XGBRegressor, 5-fold cross-validation and Optuna hyperparameter tuning

## Import Libraries

In [None]:
! pip install optuna

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn import model_selection

import optuna
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

# Mount Google drive
from google.colab import drive
drive.mount('/content/drive')


## Util functions

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

# Create function to evaluate model on a few different levels
def show_scores(X_train, X_valid, y_train, y_valid, model):
    """Score a fitted model on the training and validation splits.

    Usage > show_scores(X_train, X_valid, y_train, y_valid, model)

    Parameters
    ----------
    X_train, X_valid
        Feature sets passed to ``model.predict``.
    y_train, y_valid
        True target values matching ``X_train`` / ``X_valid``.
    model
        Already-fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Keys "Training R^2", "Valid R^2", "Train RMSE" and "*Valid RMSE".
    """
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, because
    # newer scikit-learn releases no longer accept that argument. For a
    # single-column target the two are the same number.
    scores = {
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, val_preds),
        "Train RMSE": np.sqrt(mean_squared_error(y_train, train_preds)),
        "*Valid RMSE": np.sqrt(mean_squared_error(y_valid, val_preds)),
    }

    return scores

## Exploratory Data Analysis and Feature Engineering

In [None]:
df = pd.read_csv("drive/MyDrive/input/azure/bulldozers/TrainAndValid.csv", low_memory=False)


In [None]:
df.info()

In [None]:
df.isna().sum() / len(df)*100

In [None]:
df.head().T

## Feature Engineer data

In [None]:
# Import data again but this time parse dates
# which converts the specified field to a datetime object
df = pd.read_csv("drive/MyDrive/input/azure/bulldozers/TrainAndValid.csv",
                 low_memory=False,
                 parse_dates=["saledate"])

In [None]:
df.info()

In [None]:
# Sort DataFrame in date order
df.sort_values(by=["saledate"], inplace=True, ascending=True)
df.saledate.head(20)

In [None]:
# Derive calendar features from the parsed `saledate` column.
sale_dt = df.saledate.dt
for new_column, dt_attribute in (
    ("saleYear", "year"),
    ("saleMonth", "month"),
    ("saleDay", "day"),
    ("saleDayOfWeek", "dayofweek"),
    ("saleDayOfYear", "dayofyear"),
):
    df[new_column] = getattr(sale_dt, dt_attribute)

In [None]:
df.head().T

In [None]:
# Now we've enriched our DataFrame with date time features, we can remove 'saledate'
df.drop("saledate", axis=1, inplace=True)

In [None]:
df.head().T

In [None]:
# Print the name of every column that pandas classifies as string-typed
string_columns = [
    name for name in df.columns if pd.api.types.is_string_dtype(df[name])
]
for name in string_columns:
    print(name)

In [None]:
df.isna().sum() / len(df)*100

In [None]:
df.head().T

In [None]:
df.info()

In [None]:
# Convert every string-typed column into an ordered pandas categorical
string_columns = [
    name for name in df.columns if pd.api.types.is_string_dtype(df[name])
]
for name in string_columns:
    as_category = df[name].astype("category")
    df[name] = as_category.cat.as_ordered()

In [None]:
df.info()

In [None]:
df.UsageBand.cat.categories

In [None]:
df.UsageBand.cat.codes

In [None]:
df.isna().sum() / len(df)*100

### Fill missing numerical values

In [None]:
# Print the name of every numeric column
numeric_columns = [
    name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])
]
for name in numeric_columns:
    print(name)

In [None]:
# Report the numeric columns that contain at least one null value
for name in df.columns:
    series = df[name]
    if pd.api.types.is_numeric_dtype(series) and series.isna().any():
        print(name)

auctioneerID
MachineHoursCurrentMeter


In [None]:
# Impute gaps in numeric columns with the column median.
# The median is used instead of the mean because it is not pulled around by outliers.
numeric_with_gaps = [
    name
    for name in df.columns
    if pd.api.types.is_numeric_dtype(df[name]) and df[name].isna().any()
]
for name in numeric_with_gaps:
    column = df[name]
    # Keep a boolean flag recording which rows were originally missing
    df[name + "_is_missing"] = column.isna()
    # Then overwrite the gaps with the median of the observed values
    df[name] = column.fillna(column.median())

In [None]:
df.info()

In [None]:
# Sanity check: prints the numeric columns that still hold nulls
# (no output means every numeric column is fully populated)
still_missing = [
    name
    for name in df.columns
    if pd.api.types.is_numeric_dtype(df[name]) and df[name].isna().any()
]
for name in still_missing:
    print(name)

In [None]:
# Check to see how many examples were missing
df.auctioneerID_is_missing.value_counts()

False    392562
True      20136
Name: auctioneerID_is_missing, dtype: int64

### Filling and turning categorical variables into numbers

In [None]:
# List the columns whose dtype is not numeric
for name in df.columns:
    if pd.api.types.is_numeric_dtype(df[name]):
        continue
    print(name)

In [None]:
df.info()

In [None]:
# Replace every non-numeric column by integer category codes and record
# which rows were missing.
non_numeric_columns = [
    name for name in df.columns if not pd.api.types.is_numeric_dtype(df[name])
]
for name in non_numeric_columns:
    column = df[name]
    # Boolean flag: True where the original value was missing
    df[name + "_is_missing"] = column.isna()
    # pandas encodes a missing category as -1, so the +1 shift maps missing to 0
    df[name] = pd.Categorical(column).codes + 1

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 412698 entries, 205615 to 409203
Columns: 103 entries, SalesID to Steering_Controls_is_missing
dtypes: bool(46), float64(3), int16(4), int64(10), int8(40)
memory usage: 81.1 MB


In [None]:
df.isna().sum()

SalesID                         0
SalePrice                       0
MachineID                       0
ModelID                         0
datasource                      0
                               ..
Backhoe_Mounting_is_missing     0
Blade_Type_is_missing           0
Travel_Controls_is_missing      0
Differential_Type_is_missing    0
Steering_Controls_is_missing    0
Length: 103, dtype: int64

In [None]:
df.head().T

### Add a fold number to the data

In [None]:
# Add a new col and initiate with value -1
df["kfold"] = -1

In [None]:
df.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'fiModelDesc',
       ...
       'Stick_Length_is_missing', 'Thumb_is_missing',
       'Pattern_Changer_is_missing', 'Grouser_Type_is_missing',
       'Backhoe_Mounting_is_missing', 'Blade_Type_is_missing',
       'Travel_Controls_is_missing', 'Differential_Type_is_missing',
       'Steering_Controls_is_missing', 'kfold'],
      dtype='object', length=104)

In [None]:
df.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Stick_Length_is_missing,Thumb_is_missing,Pattern_Changer_is_missing,Grouser_Type_is_missing,Backhoe_Mounting_is_missing,Blade_Type_is_missing,Travel_Controls_is_missing,Differential_Type_is_missing,Steering_Controls_is_missing,kfold
205615,1646770,9500.0,1126363,8434,132,18.0,1974,0.0,0,4593,...,True,True,True,True,False,False,False,True,True,-1
274835,1821514,14000.0,1194089,10150,132,99.0,1980,0.0,0,1820,...,True,True,True,True,True,True,True,False,False,-1
141296,1505138,50000.0,1473654,4139,132,99.0,1978,0.0,0,2348,...,True,True,True,True,False,False,False,True,True,-1
212552,1671174,16000.0,1327630,8591,132,99.0,1980,0.0,0,1819,...,True,True,True,True,True,True,True,False,False,-1
62755,1329056,22000.0,1336053,4089,132,99.0,1984,0.0,0,2119,...,True,True,True,True,False,False,False,True,True,-1


In [None]:
# Assign every row to one of 5 shuffled folds (fixed seed for reproducibility).
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)
# KFold.split yields *positional* row indices. df's index labels are no longer
# in row order after the sort by saledate, so the fold must be written with
# position-based .iloc rather than label-based .loc.
kfold_position = df.columns.get_loc("kfold")
for fold, (train_indices, valid_indices) in enumerate(kf.split(X=df)):
    print(fold, train_indices, valid_indices)
    df.iloc[valid_indices, kfold_position] = fold

0 [     0      2      5 ... 412693 412694 412696] [     1      3      4 ... 412688 412695 412697]
1 [     0      1      2 ... 412693 412695 412697] [     6      9     11 ... 412682 412694 412696]
2 [     0      1      2 ... 412695 412696 412697] [    22     29     30 ... 412680 412685 412686]
3 [     0      1      3 ... 412695 412696 412697] [     2     10     12 ... 412679 412684 412689]
4 [     1      2      3 ... 412695 412696 412697] [     0      5      7 ... 412691 412692 412693]


In [None]:
df.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Stick_Length_is_missing,Thumb_is_missing,Pattern_Changer_is_missing,Grouser_Type_is_missing,Backhoe_Mounting_is_missing,Blade_Type_is_missing,Travel_Controls_is_missing,Differential_Type_is_missing,Steering_Controls_is_missing,kfold
205615,1646770,9500.0,1126363,8434,132,18.0,1974,0.0,0,4593,...,True,True,True,True,False,False,False,True,True,0
274835,1821514,14000.0,1194089,10150,132,99.0,1980,0.0,0,1820,...,True,True,True,True,True,True,True,False,False,4
141296,1505138,50000.0,1473654,4139,132,99.0,1978,0.0,0,2348,...,True,True,True,True,False,False,False,True,True,1
212552,1671174,16000.0,1327630,8591,132,99.0,1980,0.0,0,1819,...,True,True,True,True,True,True,True,False,False,0
62755,1329056,22000.0,1336053,4089,132,99.0,1984,0.0,0,2119,...,True,True,True,True,False,False,False,True,True,0


In [None]:
df.kfold.value_counts()

0    82540
1    82540
2    82540
4    82539
3    82539
Name: kfold, dtype: int64

In [None]:
df.to_csv("drive/MyDrive/input/azure/bulldozers/TranskFoldTrainAndValid.csv", index=False)

## XGBRegressor

In [None]:
df = pd.read_csv("drive/MyDrive/input/azure/bulldozers/TranskFoldTrainAndValid.csv")
df.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Stick_Length_is_missing,Thumb_is_missing,Pattern_Changer_is_missing,Grouser_Type_is_missing,Backhoe_Mounting_is_missing,Blade_Type_is_missing,Travel_Controls_is_missing,Differential_Type_is_missing,Steering_Controls_is_missing,kfold
0,1646770,9500.0,1126363,8434,132,18.0,1974,0.0,0,4593,...,True,True,True,True,False,False,False,True,True,0
1,1821514,14000.0,1194089,10150,132,99.0,1980,0.0,0,1820,...,True,True,True,True,True,True,True,False,False,4
2,1505138,50000.0,1473654,4139,132,99.0,1978,0.0,0,2348,...,True,True,True,True,False,False,False,True,True,1
3,1671174,16000.0,1327630,8591,132,99.0,1980,0.0,0,1819,...,True,True,True,True,True,True,True,False,False,0
4,1329056,22000.0,1336053,4089,132,99.0,1984,0.0,0,2119,...,True,True,True,True,False,False,False,True,True,0


In [None]:
df.shape

(412698, 104)

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

# Create function to evaluate model on a few different levels
def show_scores(X_train, X_valid, y_train, y_valid, model):
    """Score a fitted model on the training and validation splits.

    Usage > show_scores(X_train, X_valid, y_train, y_valid, model)

    Parameters
    ----------
    X_train, X_valid
        Feature sets passed to ``model.predict``.
    y_train, y_valid
        True target values matching ``X_train`` / ``X_valid``.
    model
        Already-fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Keys "Training R^2", "Valid R^2", "Train RMSE" and "*Valid RMSE".
    """
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, because
    # newer scikit-learn releases no longer accept that argument. For a
    # single-column target the two are the same number.
    scores = {
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, val_preds),
        "Train RMSE": np.sqrt(mean_squared_error(y_train, train_preds)),
        "*Valid RMSE": np.sqrt(mean_squared_error(y_valid, val_preds)),
    }

    return scores

### Using KFolds for cross validation

In [None]:
# Baseline XGBRegressor keyword arguments (no tuned hyperparameters).
xgb_params = dict(
    random_state=1,
    n_estimators=20000,
    objective='reg:squarederror',
    n_jobs=-1,
    # GPU-related settings -- NOTE(review): marked as test-only; confirm before reuse
    gpu_id=0,
    predictor='gpu_predictor',
    tree_method='gpu_hist',
)

In [None]:
# 5-fold cross-validation of the baseline XGBRegressor configuration.
folds = 5
total_mean_rmse = 0
total_mean_mae = 0

# Columns that must never reach the model as features: the target itself
# (training on it is target leakage) and the fold label.
non_feature_cols = ["SalePrice", "kfold"]

for fold in range(folds):
    # Rows labelled `fold` form the validation split; the rest is training data.
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_df.SalePrice
    yvalid = valid_df.SalePrice

    xtrain = train_df.drop(columns=non_feature_cols)
    xvalid = valid_df.drop(columns=non_feature_cols)

    model = XGBRegressor(**xgb_params)

    # fit (with early stopping on the validation split), then predict and score
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)

    # RMSE computed as sqrt(MSE) so it does not depend on the `squared`
    # keyword, which newer scikit-learn releases no longer accept.
    fold_rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    fold_mae = mean_absolute_error(yvalid, preds_valid)

    print(fold, fold_rmse)
    print(fold, fold_mae)

    # accumulate the per-fold share of the mean RMSE and MAE
    total_mean_rmse += fold_rmse / folds
    total_mean_mae += fold_mae / folds

print(f"Total Mean RMSE: {total_mean_rmse}")
print(f"Total Mean MAE: {total_mean_mae}")

[0]	validation_0-rmse:34896.8
Will train until validation_0-rmse hasn't improved in 300 rounds.
[1000]	validation_0-rmse:236.045
Stopping. Best iteration:
[962]	validation_0-rmse:235.707

0 235.70671494320177
0 53.79279831653592
[0]	validation_0-rmse:35217.2
Will train until validation_0-rmse hasn't improved in 300 rounds.
[1000]	validation_0-rmse:297.242
Stopping. Best iteration:
[863]	validation_0-rmse:295.762

1 295.7624479695491
1 63.91091602603662
[0]	validation_0-rmse:34930.1
Will train until validation_0-rmse hasn't improved in 300 rounds.
[1000]	validation_0-rmse:264.809
Stopping. Best iteration:
[1040]	validation_0-rmse:264.369

2 264.3690471122155
2 53.90674039906788
[0]	validation_0-rmse:35066
Will train until validation_0-rmse hasn't improved in 300 rounds.
[1000]	validation_0-rmse:251.863
Stopping. Best iteration:
[1074]	validation_0-rmse:251.57

3 251.57003156796125
3 57.16067150121836
[0]	validation_0-rmse:34917.4
Will train until validation_0-rmse hasn't improved in 300

In [None]:
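# BEST RMSE SO FAR!
# Total Mean RMSE: 258.5559559618499
# Total Mean MAE: 57.00299664849331
# NOTE: in the recorded run the frame passed to model.fit() still contained
# the SalePrice (target) and kfold columns, so these scores are inflated by
# target leakage and do not measure real predictive accuracy.

# In Azure MAE = 5701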

### Hyperparameter tuning using Optuna

In [None]:
df = pd.read_csv("drive/MyDrive/input/azure/bulldozers/TranskFoldTrainAndValid.csv")
df.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Stick_Length_is_missing,Thumb_is_missing,Pattern_Changer_is_missing,Grouser_Type_is_missing,Backhoe_Mounting_is_missing,Blade_Type_is_missing,Travel_Controls_is_missing,Differential_Type_is_missing,Steering_Controls_is_missing,kfold
0,1646770,9500.0,1126363,8434,132,18.0,1974,0.0,0,4593,...,True,True,True,True,False,False,False,True,True,0
1,1821514,14000.0,1194089,10150,132,99.0,1980,0.0,0,1820,...,True,True,True,True,True,True,True,False,False,4
2,1505138,50000.0,1473654,4139,132,99.0,1978,0.0,0,2348,...,True,True,True,True,False,False,False,True,True,1
3,1671174,16000.0,1327630,8591,132,99.0,1980,0.0,0,1819,...,True,True,True,True,True,True,True,False,False,0
4,1329056,22000.0,1336053,4089,132,99.0,1984,0.0,0,2119,...,True,True,True,True,False,False,False,True,True,0


In [None]:
def run(trial):
    """Optuna objective: fit an XGBRegressor on one split and return its RMSE.

    Reads the module-level ``df`` (which must contain ``SalePrice`` and
    ``kfold`` columns).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample learning_rate, reg_lambda, reg_alpha,
        subsample, colsample_bytree and max_depth.

    Returns
    -------
    float
        RMSE of the predictions on the fold-0 validation rows.
    """
    # Tuning uses a single split: fold 0 is held out for validation.
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_df.SalePrice
    yvalid = valid_df.SalePrice

    # The target and the fold label must not be used as features; leaving
    # SalePrice in would let the model read the answer directly.
    non_feature_cols = ["SalePrice", "kfold"]
    xtrain = train_df.drop(columns=non_feature_cols)
    xvalid = valid_df.drop(columns=non_feature_cols)

    model = XGBRegressor(
        random_state=1,
        objective='reg:squarederror',
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=20000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)

    # RMSE computed as sqrt(MSE) so it does not depend on the `squared`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    return rmse

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=5)

In [None]:
study.best_params

{'learning_rate': 0.04551062055450038,
 'reg_lambda': 2.3260078384980094e-08,
 'reg_alpha': 0.7896809973685475,
 'subsample': 0.6443398422324955,
 'colsample_bytree': 0.9374729485362446,
 'max_depth': 5}

In [None]:
# {'learning_rate': 0.01035909195820482,
#  'reg_lambda': 3.805761687567834e-05,
#  'reg_alpha': 0.009358510799224654,
#  'subsample': 0.5272639150608613,
#  'colsample_bytree': 0.8510195107224439,
#  'max_depth': 2}

### Use best Hyperparameters

In [None]:
# Baseline XGBRegressor keyword arguments (no tuned hyperparameters).
xgb_params = dict(
    random_state=1,
    n_estimators=20000,
    objective='reg:squarederror',
    n_jobs=-1,
    # GPU-related settings -- NOTE(review): marked as test-only; confirm before reuse
    gpu_id=0,
    predictor='gpu_predictor',
    tree_method='gpu_hist',
)

# Same baseline settings plus the hyperparameter values found by the Optuna study.
xgb_params_best = {
    **xgb_params,
    'learning_rate': 0.04551062055450038,
    'reg_lambda': 2.3260078384980094e-08,
    'reg_alpha': 0.7896809973685475,
    'subsample': 0.6443398422324955,
    'colsample_bytree': 0.9374729485362446,
    'max_depth': 5,
}

In [None]:
# 5-fold cross-validation using the Optuna-tuned hyperparameters.
folds = 5
total_mean_rmse = 0

# Columns that must never reach the model as features: the target itself
# (training on it is target leakage) and the fold label.
non_feature_cols = ["SalePrice", "kfold"]

for fold in range(folds):
    # Rows labelled `fold` form the validation split; the rest is training data.
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_df.SalePrice
    yvalid = valid_df.SalePrice

    xtrain = train_df.drop(columns=non_feature_cols)
    xvalid = valid_df.drop(columns=non_feature_cols)

    # Pass xgb_params instead to compare against the untuned baseline.
    model = XGBRegressor(**xgb_params_best)

    # fit (with early stopping on the validation split), then predict and score
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)

    # RMSE computed as sqrt(MSE) so it does not depend on the `squared`
    # keyword, which newer scikit-learn releases no longer accept.
    fold_rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, fold_rmse)

    # accumulate the per-fold share of the mean RMSE
    total_mean_rmse += fold_rmse / folds

print(f"Total Mean RMSE: {total_mean_rmse}")

In [None]:
# Total Mean RMSE: 260.38886869600435 - Not better than 258