In [1]:
# Load libraries.
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Import datasets.
path = '../input/santander-value-prediction-challenge/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [3]:
X = train.drop(["ID", "target"], axis=1)
y = train["target"].values

test = test.drop(["ID"], axis=1)

del train

In [4]:
# Removes features containing constant values
feat_to_remove = []
for feat in X.columns:
    if len(X[feat].unique()) == 1:
        feat_to_remove.append(feat)
        
X.drop(feat_to_remove, axis=1, inplace=True)
test.drop(feat_to_remove, axis=1, inplace=True)

print(f'Removed {len(feat_to_remove)} Constant Columns\n')

Removed 256 Constant Columns



In [5]:

# Preparing model evaluation data.
# predictors = X.columns

X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.25, random_state=42)

X_train = scipy.sparse.csc_matrix(X_train)
X_test = scipy.sparse.csc_matrix(X_test)


# Eval_set train/test preformance data
dX_train = xgb.DMatrix(X_train, y_train)
dy_test = xgb.DMatrix(X_test, y_test)

# Training data
dtrain = xgb.DMatrix(X.values, y)

# del X_train, X_test, y_train, y_test

In [6]:
# Preparing customer value prediction data.
test = scipy.sparse.csc_matrix(test)
dtest = xgb.DMatrix(test)
# del test

In [7]:
%%time
params = {
    'objective': 'reg:squarederror',
    'eta':0.01,
    'eval_metric':'rmse',
    'tree_method': 'gpu_hist'
}

eval_set = [(dX_train, 'train'), (dy_test, 'eval')]
bst = xgb.train(params, dtrain, 2000, eval_set, early_stopping_rounds=100, verbose_eval=100)

# metrics.mean_squared_error(y_test, y_pred)

[0]	train-rmse:13.9531	eval-rmse:13.9444
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[100]	train-rmse:6.21002	eval-rmse:6.18791
[200]	train-rmse:3.46127	eval-rmse:3.4338
[300]	train-rmse:2.52339	eval-rmse:2.4901
[400]	train-rmse:2.08347	eval-rmse:2.04632
[500]	train-rmse:1.96259	eval-rmse:1.92888
[600]	train-rmse:1.85682	eval-rmse:1.8263
[700]	train-rmse:1.74725	eval-rmse:1.72063
[800]	train-rmse:1.71853	eval-rmse:1.69176
Stopping. Best iteration:
[798]	train-rmse:1.71411	eval-rmse:1.68772

CPU times: user 4.86 s, sys: 12.2 s, total: 17 s
Wall time: 17.3 s


In [8]:
y_pred = np.exp(bst.predict(dtest))

In [9]:
sub = pd.read_csv(path + 'sample_submission.csv')
sub['target'] = y_pred
sub.to_csv('submission.csv', index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
