In [None]:
# Load libraries.
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import gc

import xgboost as xgb

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

In [None]:
# Read the competition's train and test CSV files from the input directory.
path = '../input/santander-value-prediction-challenge/'
train, test = (pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv'))

In [None]:
# The model is fitted on the log1p-transformed target.
y = np.log1p(train["target"].values)
# Predictors are every training column except the ID and the target.
X = train.drop(columns=["ID", "target"])

test = test.drop(columns=["ID"])

# The raw training frame is not referenced again, so release its memory.
del train
gc.collect()

## EDA
First, we want to get an idea of what we are working with. Let's have a look at the shape, head, tail and the data types of each column.

In [None]:
# Report the dimensions of the predictor frames.
n_train_rows, n_train_cols = X.shape
n_test_rows, n_test_cols = test.shape
print(f'The train dataset has {n_train_rows} rows and {n_train_cols} columns')
print(f'The test dataset has {n_test_rows} rows and {n_test_cols} columns')

In [None]:
# Preview the first 10 rows of the training predictors.
X.head(10)

In [None]:
# Preview the first 10 rows of the test predictors.
test.head(10)

In [None]:
# Collect the distinct column dtypes present in each frame.
train_dtype = X.dtypes.unique().tolist()
test_dtype = test.dtypes.unique().tolist()

# The number of distinct dtypes is not fixed (the ID and target columns were
# dropped earlier), so join whatever is present instead of indexing a fixed
# number of list positions, which raises IndexError when fewer dtypes exist.
print(f"The train dataset contains {', '.join(map(str, train_dtype))} data types")
print(f"The test dataset contains {', '.join(map(str, test_dtype))} data types")

Only the ID column is of type 'object', so we have no categorical variables to encode.

Just from the initial look, it seems we have a very sparse dataset for both the train and test data. All columns have been anonymised so it will be tricky to get an intuition on what variables we want to use. We also see that there are more features than rows so manual feature selection will not be feasible. 
The target variable seems to consist of very large values.

### Missing Values

In [None]:
# y is a NumPy array (the log1p of the target values), which has no .isna()
# method; pd.isna works on arrays and lets us count the missing entries.
y_na = int(pd.isna(y).sum())
# For the frames, count the columns that contain at least one missing value.
X_na = X.isna().any().sum()
test_na = test.isna().any().sum()
print(f'{y_na} contain missing values in the target data')
print(f'{X_na} out of {len(X.columns)} columns contain missing values in the predictor data')
print(f'{test_na} out of {len(test.columns)} columns contain missing values in the test data')

From this, we can conclude that there are no missing values, meaning no data imputation will be needed.

Next, we will explore the features more in-depth.

### Target

In [None]:
# y holds the log1p-transformed target, so these extremes are on the log scale.
y_min, y_max = np.min(y), np.max(y)
print(f'Target minimum & maximum values: {y_min} & {y_max}')

In [None]:
# Target distribution on the raw scale (left) and on the log scale (right).
plt.style.use('ggplot')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 3))

# Raw scale: y already holds log1p(target), so invert it with expm1.
ax1.title.set_text('Tagret distribution')
ax1.set_xlabel('Target')
ax1.set_ylabel('Number of occuraces')
_ = ax1.hist(np.expm1(y), bins=100)

# Log scale: plot y as-is; applying np.log again would transform it twice.
ax2.title.set_text('log Tagret distribution')
ax2.set_xlabel('Target')
ax2.set_ylabel('Number of occuraces')
_ = ax2.hist(y, bins=100)

After a log transformation, we see a flat distribution occurring with larger peaks between 12 and 17.

## Feature Engineering

## Feature Selection
Before we do any feature selection, all constant features should be removed as they offer nothing to the model.

In [None]:
# Drop every feature that takes a single distinct value across the training rows.
# dropna=False makes NaN count as a value, as Series.unique() does.
distinct_counts = X.nunique(dropna=False)
feat_to_remove = distinct_counts.index[distinct_counts == 1].tolist()

# Remove the same columns from both frames so their feature sets stay aligned.
for frame in (X, test):
    frame.drop(feat_to_remove, axis=1, inplace=True)

print(f'Removed {len(feat_to_remove)} Constant Columns\n')

### Prediction Features
As our training data is anonymised, we must use statistical methodology to examine which features might produce good results.

We will start by examining the correlations between the features.

#### Correlation

In [None]:
# corr_df = X.drop('ID', axis=1).corr()

In [None]:
# corr_df = corr_df.where((corr_df != 1) & (corr_df > 0.75) | (corr_df < -0.75))

In [None]:
# dic = {}
# for col in corr_df.columns:
#     for row in corr_df.index:
#         if corr_df[col].loc[row] != 'nan':
#             if col in dic:
#                 dic[col].append(row)
#             else:
#                 dic[col] = [row]

# dic

## GridSearch with Cross-Validation
To get the best XGB model possible, it is important to try a range of different parameters over the training data. Below is a grid search function that searches every combination between the three parameters provided to it, recording which is the best combination. The metric used for determining the best model is the RMSE.

In [None]:
def to_dMatrix(X, y=None):
    """Wrap data in an ``xgb.DMatrix``.

    Args:
        X: Feature data handed to ``xgb.DMatrix`` as its first argument.
        y: Optional second argument for ``xgb.DMatrix``; left out of the
            call entirely when it is ``None``.

    Returns:
        The ``xgb.DMatrix`` built from ``X`` (and ``y`` when provided).
    """
    dmatrix_args = (X,) if y is None else (X, y)
    return xgb.DMatrix(*dmatrix_args)

In [None]:
def create_eval_set(X_val, y_val):
    """Build the single-entry evaluation list passed to ``xgb.train``.

    Args:
        X_val: Validation data, first argument to ``xgb.DMatrix``.
        y_val: Validation targets, second argument to ``xgb.DMatrix``.

    Returns:
        A one-element list holding the ``(DMatrix, 'eval')`` pair.
    """
    validation_matrix = xgb.DMatrix(X_val, y_val)
    return [(validation_matrix, 'eval')]

In [None]:
def GridSearch(hyperParams, train_df, target, n_folds=5,
               num_boost_round=5000, early_stopping_rounds=100):
    """Exhaustive hyper-parameter search for an XGBoost regressor using K-fold CV.

    Every combination produced by ``ParameterGrid(hyperParams)`` is trained
    once per fold with early stopping on the held-out fold. A combination's
    score is the mean over folds of the RMSE between ``np.expm1`` of the
    held-out targets and ``np.expm1`` of the predictions; the lowest wins.

    Args:
        hyperParams: Grid specification accepted by ``ParameterGrid``. Any
            number of parameters may be searched.
        train_df: pandas DataFrame of features. Rows are selected by position,
            so its index does not have to be a default RangeIndex.
        target: Array-like of targets aligned positionally with ``train_df``.
            ``np.expm1`` is applied before scoring, so it is presumably on
            the log1p scale.
        n_folds: Number of cross-validation folds.
        num_boost_round: Upper bound on boosting rounds per fit.
        early_stopping_rounds: Rounds without improvement on the held-out
            fold before a fit stops.

    Returns:
        dict of the best-scoring hyper-parameters, or ``None`` when the grid
        yields no combinations.
    """
    # Settings shared by every candidate; never mutated so that one
    # candidate's values cannot leak into the next.
    base_params = {
        'objective': 'reg:squarederror',
        'eta': 0.01,
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist'
    }

    # KFold yields positional indices; a plain array guarantees positional
    # indexing of the target even if a pandas Series is passed in.
    target = np.asarray(target)
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    best_score = np.inf
    best_params = None

    for hyper_params in ParameterGrid(hyperParams):
        # Copy the candidate so the returned best_params is an independent dict.
        current_params = dict(hyper_params)

        print('##########################################################################\n')
        print(f'Parameters being tested: {current_params}')

        # Fresh parameter dict per candidate: constants overlaid with the candidate.
        model_params = {**base_params, **current_params}

        fold_scores = []
        for train_index, test_index in folds.split(train_df):
            # Positional selection (.iloc) matches the positional KFold indices.
            train_X, train_y = train_df.iloc[train_index], target[train_index]
            test_X, test_y = train_df.iloc[test_index], target[test_index]

            # Preparing the training data
            eval_set_list = create_eval_set(test_X, test_y)
            dMatTrain = to_dMatrix(train_X, train_y)
            dMatTest = to_dMatrix(test_X)

            # Training the model
            start = time.time()
            gs = xgb.train(
                params=model_params,
                dtrain=dMatTrain,
                num_boost_round=num_boost_round,
                evals=eval_set_list,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=1000
            )
            end = time.time()
            print(f'Execution time: {np.round((end - start),2)}s\n')
            print('------------------------------------')

            # Score after undoing the log1p transform on both sides.
            pred_y = np.expm1(gs.predict(dMatTest))
            fold_scores.append(
                np.sqrt(metrics.mean_squared_error(np.expm1(test_y), pred_y))
            )

        # Average model score across the folds
        gs_score = sum(fold_scores) / len(fold_scores)
        print(f'Parameter score: {gs_score}\n')

        # Only keep the best parameters
        if best_score > gs_score:
            best_score = gs_score
            best_params = current_params

    print(f'The best parameters found were: {best_params}')
    return best_params

In [None]:
%%time
# np.arange with a float step accumulates rounding error (for example
# 0.8999999999999999 instead of 0.9), so round the subsample grid to one
# decimal place before handing it to the search.
grid_params = {
    'subsample': np.round(np.arange(0.5, 1, 0.1), 1).tolist(),
    'max_depth': np.arange(3,12,1).tolist(),
    'min_child_weight': np.arange(3,6,1).tolist()
}
print(f'Number of training features: {X.shape[1]} | Number of training rows: {X.shape[0]}')
best_params = GridSearch(grid_params, X, y)

#### Final Model
Now that we have found our best parameters, we can train our final model and submit to the competition.

In [None]:
# Hold out a quarter of the rows to monitor performance while training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# DMatrix pair used as the train/eval watchlist
dX_train = xgb.DMatrix(X_train, label=y_train)
dy_test = xgb.DMatrix(X_test, label=y_test)

# DMatrix holding every training row
dtrain = xgb.DMatrix(X, label=y)

# The pandas/NumPy splits are not needed once the DMatrix objects exist.
del X_train, X_test, y_train, y_test
gc.collect()

In [None]:
# Convert the test features to a DMatrix for prediction.
dtest = xgb.DMatrix(test)
# NOTE: deleting `test` means this cell cannot be re-run without reloading the data.
del test

In [None]:
# original algorithm
# best_params = {'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.8999999999999999}
params = {
    'objective': 'reg:squarederror',
    'eta':0.01,
    'eval_metric':'rmse',
    'tree_method': 'gpu_hist'
}

params.update(best_params)

# dtrain contains every row, including the ones in dy_test, so early stopping
# against dy_test while fitting on dtrain would watch data the model has
# already seen. Determine the number of rounds on the train/hold-out split first.
eval_set = [(dX_train, 'train'), (dy_test, 'eval')]
probe = xgb.train(
                params=params,
                dtrain=dX_train,
                num_boost_round=5000,
                evals=eval_set,
                early_stopping_rounds=100,
                verbose_eval=1000
                )

# Refit on all rows for the number of rounds that scored best on the hold-out
# (best_iteration is zero-based, hence the + 1).
bst = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=probe.best_iteration + 1
                )

# Predictions are on the log1p scale; expm1 maps them back to the target scale.
y_pred = np.expm1(bst.predict(dtest))

In [None]:
# Fill the sample submission's target column with the predictions and save it.
sub = pd.read_csv(path + 'sample_submission.csv').assign(target=y_pred)
sub.to_csv('submission.csv', index=False)