In [2]:
# Load libraries.
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import gc

import xgboost as xgb

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

In [3]:
# Import datasets.
path = '../input/santander-value-prediction-challenge/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [4]:
X = train.drop(["ID", "target"], axis=1)
y = np.log1p(train["target"].values)

test = test.drop(["ID"], axis=1)

del train
gc.collect()

14

In [5]:
# Removes features containing constant values
feat_to_remove = []
for feat in X.columns:
    if len(X[feat].unique()) == 1:
        feat_to_remove.append(feat)
        
X.drop(feat_to_remove, axis=1, inplace=True)
test.drop(feat_to_remove, axis=1, inplace=True)

print(f'Removed {len(feat_to_remove)} Constant Columns\n')

Removed 256 Constant Columns



### GridSearch with Cross-Validation
To get the best XGB model possible, it is important to try a range of different parameters over the training data. Below is a grid search function that searches every combination between the three parameters provided to it, recording which is the best combination. The metric used for determining the best model is the RMSE.

In [6]:
def to_dMatrix(X, y=None):
    if y is None:
        return xgb.DMatrix(X)
    else:
        return xgb.DMatrix(X, y)

In [7]:
def create_eval_set(X_val, y_val):
    return [(xgb.DMatrix(X_val, y_val), 'eval')]

In [34]:
def GridSearch(hyperParams, train_df, target):
    # GridSearch function can only support the exploration of 3 params in hyperParams
    
    best_score = np.inf
    best_params = None

    n_folds = 5
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    const_params = {
        'objective': 'reg:squarederror',
        'eta':0.01,
        'eval_metric':'rmse',
        'tree_method': 'gpu_hist'
    }
    
#     Create iterable parameters
    grid = ParameterGrid(hyperParams)
    for hyper_params in grid:
        
#         Create dictionary of params 
        param1, param2, param3 = hyper_params.items()
        current_params = {param1[0]: param1[1], param2[0]: param2[1], param3[0]: param3[1]}
        
        print('##########################################################################\n')
        print(f'Parameters being tested: {current_params}')

#         Append param dictionary to model parameters
        const_params.update(current_params)
        
        kfold_gs_score = 0
        for train_index, test_index in folds.split(train_df):
#             Seperate train/test data
            train_X, train_y = train_df.loc[train_index], target[train_index]
            test_X, test_y = train_df.loc[test_index], target[test_index]
            
#             Preparing the training data
            eval_set_list = create_eval_set(test_X, test_y)
            dMatTrain = to_dMatrix(train_X, train_y)
            dMatTest = to_dMatrix(test_X)           
            
#             Training the model
            start = time.time()
            gs = xgb.train(
                            params=const_params, 
                            dtrain=dMatTrain, 
                            num_boost_round=5000, 
                            evals=eval_set_list,
                            early_stopping_rounds=100, 
                            verbose_eval=1000
                            )    
            end = time.time()
            print(f'Execution time: {np.round((end - start),2)}s\n')
            print('------------------------------------')
            
            pred_y = np.expm1(gs.predict(dMatTest))
        
#             Sum kfold model score
            kfold_gs_score += np.sqrt(metrics.mean_squared_error(np.expm1(test_y), pred_y))
    
#         Produce average model score
        gs_score = kfold_gs_score / n_folds
        print(f'Parameter score: {gs_score}\n')
        
#         Only keep the best parameters
        if best_score > gs_score:
            best_score = gs_score
            best_params = current_params
    print(f'The best parameters found were: {best_params}')
    return best_params

In [35]:
%%time
grid_params = {
    'subsample': np.arange(0.5,1,0.1).tolist(),
    'max_depth': np.arange(3,12,1).tolist(),
    'min_child_weight': np.arange(3,6,1).tolist()
}
print(f'Number of training features: {X.shape[1]} | Number of training rows: {X.shape[0]}')
best_params = GridSearch(grid_params, X, y)

Number of training features: 4735 | Number of training rows: 4459
##########################################################################

Parameters being tested: {'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.5}
[0]	eval-rmse:13.9511
Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	eval-rmse:1.4714
Stopping. Best iteration:
[1506]	eval-rmse:1.46148

Execution time: 10.13s

------------------------------------
[0]	eval-rmse:13.9247
Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	eval-rmse:1.52323


KeyboardInterrupt: 

#### Final Model
Now that we have found our best parameters, we can train our final model and submit to the competition.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Eval_set train/test preformance data
dX_train = xgb.DMatrix(X_train, y_train)
dy_test = xgb.DMatrix(X_test, y_test)

# Training data
dtrain = xgb.DMatrix(X, y)

# del X_train, X_test, y_train, y_test
gc.collect()

64

In [14]:
dtest = xgb.DMatrix(test)
del test

NameError: name 'test' is not defined

In [18]:
# original algorithm
best_params = {'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.8999999999999999}
params = {
    'objective': 'reg:squarederror',
    'eta':0.01,
    'eval_metric':'rmse',
    'tree_method': 'gpu_hist'
}

# params.update(best_params)

eval_set = [(dX_train, 'train'), (dy_test, 'eval')]
bst = xgb.train(
                params=params, 
                dtrain=dtrain, 
                num_boost_round=5000, 
                evals=eval_set,
                early_stopping_rounds=100, 
                verbose_eval=1000
                )

[0]	train-rmse:13.9626	eval-rmse:13.9538
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	train-rmse:1.12053	eval-rmse:1.10625
[2000]	train-rmse:0.993619	eval-rmse:0.985007
[3000]	train-rmse:0.908328	eval-rmse:0.900145
[4000]	train-rmse:0.84454	eval-rmse:0.835424
[4999]	train-rmse:0.794802	eval-rmse:0.78424


In [26]:
y_pred = np.expm1(bst.predict(dtest))

In [None]:
sub = pd.read_csv(path + 'sample_submission.csv')
sub['target'] = y_pred
sub.to_csv('submission.csv', index=False)