In [2]:
# Load libraries.
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import scipy
import os
import time
import gc

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import xgboost as xgb

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

In [3]:
# Import datasets.
path = '../input/santander-value-prediction-challenge/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [None]:
# This function was created by https://www.kaggle.com/rinnqd. \
# Reference: https://www.kaggle.com/rinnqd/reduce-memory-usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [4]:
X = train.drop(["ID", "target"], axis=1)
y = train["target"].values

test = test.drop(["ID"], axis=1)

del train
gc.collect()

14

In [5]:
# Removes features containing constant values
y = train['target'].copy()
train = train.drop(['target','ID'], axis=1)

feat_to_remove = []
for feat in train.columns:
    if len(train[feat].unique()) == 1:
        feat_to_remove.append(feat)
        
train.drop(feat_to_remove, axis=1, inplace=True)

test.drop(feat_to_remove, axis=1, inplace=True)

print(f'Removed {len(feat_to_remove)} Constant Columns\n')

Removed 256 Constant Columns



In [35]:
## 3 month period
train_3_period_smooth = train.rolling(3,axis=1).mean()
test_3_period_smooth = test.rolling(3,axis=1).mean()
# train_3_period_smooth.fillna(0, axis=1, inplace=True)

# ## 6 month period
# train_6_period_smooth = train.rolling(6,axis=1).mean()
# test_6_period_smooth = test.rolling(6,axis=1).mean()
# # train_6_period_smooth.fillna(0, axis=1, inplace=True)

# ## 12 month period
# train_12_period_smooth = train.rolling(12,axis=1).mean()
# test_12_period_smooth = test.rolling(12,axis=1).mean()
# # train_12_period_smooth.fillna(0, axis=1, inplace=True)


# ## 7-day period
# train_7_period_smooth = train.rolling(7,axis=1).mean()
# test_7_period_smooth = test.rolling(7,axis=1).mean()
# # train_7_period_smooth.fillna(0, axis=1, inplace=True)

# ## 30-day period
# train_30_period_smooth = train.rolling(30,axis=1).mean()
# test_30_period_smooth = test.rolling(30,axis=1).mean()
# # train_30_period_smooth.fillna(0, axis=1, inplace=True)

Number of training features: 4735 | Number of training rows: 4459
##########################################################################

Parameters being tested: {'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.5}
[0]	eval-rmse:13.9511
Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	eval-rmse:1.4714
Stopping. Best iteration:
[1506]	eval-rmse:1.46148

Execution time: 10.13s

------------------------------------
[0]	eval-rmse:13.9247
Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	eval-rmse:1.52323


KeyboardInterrupt: 

In [None]:

## Preparing mean smoothing training data
X_3mms = train_3_period_smooth
y_3mms = np.log1p(y)
# X_6mms = train_6_period_smooth
# y_6mms = np.log1p(y)
# X_12mms = train_12_period_smooth
# y_12mms = np.log1p(y)

# X_7dms = train_7_period_smooth
# y_7dms = np.log1p(y)
# X_30dms = train_30_period_smooth
# y_30dms = np.log1p(y)

## Preparing sum truncation training data
# X_3m_sum = train_3_period_sum
# y_3m_sum = np.log1p(y)
# X_6m_sum = train_6_period_sum
# y_6m_sum = np.log1p(y)
# X_12m_sum = train_12_period_sum
# y_12m_sum = np.log1p(y)

# X_7d_sum = train_7_period_sum
# y_7d_sum = np.log1p(y)
# X_30d_sum = train_30_period_sum
# y_30d_sum = np.log1p(y)

## Preparing test data
test = test.drop(["ID"], axis=1)


del train, train_no_zero, train_3_period_smooth, train_6_period_smooth, train_12_period_smooth, train_7_period_smooth,
train_30_period_smooth, train_3_period_sum, train_6_period_sum, train_12_period_sum, train_7_period_sum, 
train_30_period_sum
gc.collect()

In [6]:
def to_dMatrix(X, y=None):
    if y is None:
        return xgb.DMatrix(X)
    else:
        return xgb.DMatrix(X, y)

In [7]:
def create_eval_set(X_val, y_val):
    return [(xgb.DMatrix(X_val, y_val), 'eval')]

In [None]:
def XGBCV(X_data, y_data, model_params, folds_n):
    folds = KFold(n_splits=folds_n, shuffle=True, random_state=42)
    kfold_gs_score=0
    for train_index, test_index in folds.split(X_data):
    #    Seperate train/test data
        train_X, train_y = X_data.loc[train_index], y_data[train_index]
        test_X, test_y = X_data.loc[test_index], y_data[test_index]

    #    Preparing the training data
        eval_set_list = create_eval_set(test_X, test_y)
        dMatTrain = to_dMatrix(train_X, train_y)
        dMatTest = to_dMatrix(test_X)          

    #    Training the model
        gs = xgb.train(
                        params=model_params, 
                        dtrain=dMatTrain, 
                        num_boost_round=5000, 
                        evals=eval_set_list,
                        early_stopping_rounds=100, 
                        verbose_eval=False
                        )

        pred_y = np.expm1(gs.predict(dMatTest))

    #    Sum kfold model score
        kfold_gs_score += np.sqrt(metrics.mean_squared_error(np.expm1(test_y), pred_y))

    # Produce average model score
    gs_score = kfold_gs_score / folds_n
    return gs_score

In [None]:
## Preparing the data
best_params = {'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.8999999999999999}
params = {
    'objective': 'reg:squarederror',
    'eta':0.01,
    'eval_metric':'rmse',
    'tree_method': 'gpu_hist'
}
params.update(best_params)

In [None]:
# A collection of all model scores
scores = {}
idx = []

# Mean smoothing features
scores['ms3'] = XGBCV(X_3mms, y_3mms, params, 5)
# scores['ms6'] = XGBCV(X_6mms, y_6mms, params, 5)
# scores['ms12'] = XGBCV(X_12mms, y_12mms, params, 5)

# scores['ms7'] = XGBCV(X_7dms, y_7dms, params, 5)
# scores['ms30'] = XGBCV(X_30dms, y_30dms, params, 5)

# del x_log, y_log
gc.collect()

#### Final Model
Now that we have found our best parameters, we can train our final model and submit to the competition.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X_3mms, y_3mms, test_size=0.25, random_state=42)

# Eval_set train/test preformance data
dX_train = xgb.DMatrix(X_train, y_train)
dy_test = xgb.DMatrix(X_test, y_test)
 
# Training data
dtrain = xgb.DMatrix(X, y)

# del X_train, X_test, y_train, y_test
gc.collect()

64

In [14]:
dtest = xgb.DMatrix(test)
del test

NameError: name 'test' is not defined

In [18]:
# original algorithm
best_params = {'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.8999999999999999}
params = {
    'objective': 'reg:squarederror',
    'eta':0.01,
    'eval_metric':'rmse',
    'tree_method': 'gpu_hist'
}

# params.update(best_params)

eval_set = [(dX_train, 'train'), (dy_test, 'eval')]
bst = xgb.train(
                params=params, 
                dtrain=dtrain, 
                num_boost_round=5000, 
                evals=eval_set,
                early_stopping_rounds=100, 
                verbose_eval=1000
                )

[0]	train-rmse:13.9626	eval-rmse:13.9538
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[1000]	train-rmse:1.12053	eval-rmse:1.10625
[2000]	train-rmse:0.993619	eval-rmse:0.985007
[3000]	train-rmse:0.908328	eval-rmse:0.900145
[4000]	train-rmse:0.84454	eval-rmse:0.835424
[4999]	train-rmse:0.794802	eval-rmse:0.78424


In [26]:
y_pred = np.expm1(bst.predict(dtest))

In [None]:
sub = pd.read_csv(path + 'sample_submission.csv')
sub['target'] = y_pred
sub.to_csv('submission.csv', index=False)