In [52]:
import pandas as pd
import pickle
import numpy as np
# from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
import  xgboost as xgb
# import os
# import sys
import gc
# import catboost
from sklearn.model_selection import StratifiedKFold

# file = open('cat_cols.txt', 'rb')
# cat_cols = pickle.load(file)
# file.close()
class CONFIG:
    """
    Configuration of training

    params: dict = Parameters of Tree Booster

    n_folds: int = number of splits for Stratified K-Folds

    n_rounds: int = number of boosting iterations

    early_stopping: int = stop if there is little to no improvement
    """
    # Parameters of Tree Booster
    params = {
        'eta': 0.03,
        'gamma': 0,
        'max_depth': 6,
        'min_child_weight': 1,
        'max_delta_step': 1, #suggested by xgb documentation for imbalanced dataset
        'max_leaves': 100,
        'objective': 'binary:logistic',
        'disable_default_eval_metric': 1,
        'tree_method': 'hist'
    }
    # number of folds
    n_folds = 5
    n_rounds = 3000
    early_stopping = 1500
    verbose_eval = 150
    def output(self):
        print('CONFIGURATION')
        print('=' * 60)
        print('Tree Booster parameters: ')
        for k, v in self.params.items():
            print(f'{k}: {v}')
        print(f'Number of boosting rounds: {self.n_rounds}')
        print(f'Early stopping rounds: {self.early_stopping}')
        print('=' * 60)
        print(f'Stratified K-Fold number of splits: {self.n_folds}')
        print('END')
def one_hot(train, test):
    c_enc = train.columns.to_series().groupby(train.dtypes).groups[np.dtype('object')].tolist()
    for col in c_enc:
        dummies = pd.get_dummies(train[col], prefix=col, drop_first=False)
        train = pd.concat([train, dummies], axis=1)
        train.drop(columns=col, inplace=True)
        dummies = pd.get_dummies(test[col], prefix=col, drop_first=False)
        test = pd.concat([test, dummies], axis=1)
        test.drop(columns=col, inplace=True)
    gc.collect()
    return train, test

def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)
def xgb_amex_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    return 'amex_metric', amex_metric(y_true, predt)
def dt_converter(dtype):
    if dtype == 'float64':
        return 'float16'
    elif dtype == 'int64':
        return 'int16'
    else:
        return 'object'

def mem_usage_gb(df):
    return round((df.memory_usage(deep=True).sum()/1073741824), 2)

In [56]:
def load_dataframes():
    file = open('dtype_for_agg_catb.txt', 'rb')
    dtypes = pickle.load(file)
    file.close()
    train = pd.read_csv('prep_catboost_train.csv', dtype=dtypes)
    train.drop(columns=['customer_ID'], inplace=True)
    test = pd.read_csv('prep_catboost_test.csv', dtype=dtypes)
    test.drop(columns=['customer_ID'], inplace=True)
    train, test = one_hot(train, test)
    idx = train.columns.get_loc('D_64_last_-1')
    test.insert(loc=idx, column='D_64_last_-1', value=[0] * len(test))
    test['D_64_last_-1'] = test['D_64_last_-1'].astype('uint8')
    print('encoded cols in train, test')
    display(train.dtypes.loc[train.dtypes == 'uint8'], test.dtypes.loc[test.dtypes == 'uint8'])
    print('train dataset mem usage:', mem_usage_gb(train), 'GB')
    print('test dataset mem usage:', mem_usage_gb(test), 'GB')
    labels = pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
    labels.drop(columns=['customer_ID'], inplace=True)
    labels = np.ravel(labels)
    print('datasets loaded, total mem usage: ', mem_usage_gb(train) + mem_usage_gb(test), 'GB')
    gc.collect()
    return train, test, labels
def get_predictions(train_data, test_data, train_labels):
    total_predictions = np.zeros(test_data.shape[0])
    CONFIG().output()
    print('Start training...')
    results = dict()
    skf = StratifiedKFold(n_splits=CONFIG.n_folds)
    n = 1
    for train_index, test_index in skf.split(train_data, train_labels):
        print('=' * 60)
        print(f'Fold number: {n}')
        print('=' * 60)
        n += 1
        d_train = xgb.DMatrix(train_data.iloc[train_index], train_labels[train_index])
        d_test = xgb.DMatrix(train_data.iloc[test_index], train_labels[test_index])
        model = xgb.train(CONFIG.params,
                          dtrain= d_train,
                          num_boost_round= CONFIG.n_rounds,
                          evals= [(d_train, 'train'), (d_test, 'eval')],
                          custom_metric= xgb_amex_metric,
                          early_stopping_rounds= CONFIG.early_stopping,
                          evals_result= results,
                          verbose_eval= CONFIG.verbose_eval,
                          )
        d_out = xgb.DMatrix(test_data)
        fold_out_predictions = model.predict(d_out)
        # adjust predictions according to n_folds
        total_predictions += fold_out_predictions / CONFIG.n_folds
    del d_train, d_test
    gc.collect()
    sample = pd.read_csv('sample_submission.csv')
    output = pd.DataFrame({'customer_ID': sample.customer_ID, 'prediction': total_predictions})
    return output, results

first\
eta = 0.04
n_rounds = 1200
early_stopping = 600
second\
eta = 0.03
n_rounds = 3000
early stopping = 1500

In [54]:
train_d, test_d, train_l = load_dataframes()

encoded cols in train, test


D_63_last_CL    uint8
D_63_last_CO    uint8
D_63_last_CR    uint8
D_63_last_XL    uint8
D_63_last_XM    uint8
D_63_last_XZ    uint8
D_64_last_-1    uint8
D_64_last_O     uint8
D_64_last_R     uint8
D_64_last_U     uint8
dtype: object

D_63_last_CL    uint8
D_63_last_CO    uint8
D_63_last_CR    uint8
D_63_last_XL    uint8
D_63_last_XM    uint8
D_63_last_XZ    uint8
D_64_last_-1    uint8
D_64_last_O     uint8
D_64_last_R     uint8
D_64_last_U     uint8
dtype: object

train dataset mem usage: 0.79 GB
test dataset mem usage: 1.59 GB
datasets loaded, total mem usage:  2.38 GB


In [57]:
ofile, result = get_predictions(train_d, test_d, train_l)
result

CONFIGURATION
Tree Booster parameters: 
eta: 0.03
gamma: 0
max_depth: 6
min_child_weight: 1
max_delta_step: 1
max_leaves: 100
objective: binary:logistic
disable_default_eval_metric: 1
tree_method: hist
Number of boosting rounds: 3000
Early stopping rounds: 1500
Stratified K-Fold number of splits: 5
END
Start training...
Fold number: 1
[0]	train-amex_metric:0.65128	eval-amex_metric:0.65007
[150]	train-amex_metric:0.78518	eval-amex_metric:0.76906
[300]	train-amex_metric:0.80956	eval-amex_metric:0.78172
[450]	train-amex_metric:0.82619	eval-amex_metric:0.78652
[600]	train-amex_metric:0.83977	eval-amex_metric:0.78787
[750]	train-amex_metric:0.84849	eval-amex_metric:0.78894
[900]	train-amex_metric:0.85820	eval-amex_metric:0.79001
[1050]	train-amex_metric:0.86823	eval-amex_metric:0.79019
[1200]	train-amex_metric:0.87853	eval-amex_metric:0.79083
[1350]	train-amex_metric:0.88777	eval-amex_metric:0.79010
[1499]	train-amex_metric:0.89718	eval-amex_metric:0.79074
Fold number: 2
[0]	train-amex_metr

{'train': OrderedDict([('amex_metric',
               [0.650206,
                0.673163,
                0.673298,
                0.678995,
                0.685455,
                0.69084,
                0.692834,
                0.701101,
                0.707076,
                0.710197,
                0.711131,
                0.715692,
                0.718826,
                0.720326,
                0.723528,
                0.725362,
                0.72674,
                0.729463,
                0.730481,
                0.730495,
                0.732491,
                0.735258,
                0.73617,
                0.73694,
                0.738431,
                0.739201,
                0.740388,
                0.741656,
                0.741872,
                0.742515,
                0.7431,
                0.74485,
                0.74562,
                0.746097,
                0.746983,
                0.747269,
                0.748336,
       

In [58]:
ofile.to_csv('sub_k_folds.csv', index=False)