In [9]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import os
import sys
import gc
import catboost
from sklearn.model_selection import StratifiedKFold

# Load the pickled preprocessing metadata produced elsewhere:
#   cat_cols   - object stored in 'cat_cols.txt'
#   dtype_dict - object stored in 'dtype_for_agg_catb.txt' (passed to pd.read_csv as `dtype` below)
# Context managers guarantee the handles are closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('cat_cols.txt', 'rb') as file:
    cat_cols = pickle.load(file)
with open('dtype_for_agg_catb.txt', 'rb') as file:
    dtype_dict = pickle.load(file)

def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows are ranked by descending prediction. Each row gets weight
    ``20 - 19 * label`` (20 for label 0, 1 for label 1 — assumes binary 0/1 labels).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Lists and pandas Series are accepted; they are
        converted to ndarrays so the positional indexing below is always valid.
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (g + d)`` where ``g`` is the normalized weighted Gini and ``d`` is
        the share of positives found within the first 4% of cumulative weight.

    Notes
    -----
    If ``y_true`` contains no positives the divisions by ``n_pos`` are undefined;
    that case is not handled here.
    """
    # Coerce once up front: a pandas Series with a non-default index would otherwise
    # be indexed by label (not position) in `y_true[indices]`, and lists lack `.sum()`.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted Gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted Gini coefficient (closed form for a perfect ranking)
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted Gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def amex_metric(y_true, y_pred):
    """Score predictions as the mean of a Gini ratio and the top-4% capture rate.

    Rows with label 0 carry weight 20, all other rows weight 1. The Gini obtained
    when ranking by prediction is divided by the Gini obtained when ranking by the
    true label, and that ratio is averaged with the share of positives found
    before the cumulative weight exceeds ``int(0.04 * total_weight)``.
    """

    def _ranked_pairs(column):
        # Stack into (label, prediction) rows and order by `column`, largest first.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        # Area between the weighted Lorentz curve and the cumulative weight share.
        ranked = _ranked_pairs(column)
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * row_weight)

    # Capture rate: positives seen within the first 4% of total weight.
    by_prediction = _ranked_pairs(1)
    weights = np.where(by_prediction[:, 0] == 0, 20, 1)
    weight_cutoff = int(0.04 * np.sum(weights))
    captured = by_prediction[np.cumsum(weights) <= weight_cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_prediction[:, 0])

    # Gini of the model ranking, normalized by the ranking given by the labels.
    gini_by_prediction = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_by_prediction / gini_by_label + top_four)

def dt_converter(dtype):
    """Map a dtype to its compact counterpart.

    'float64' becomes 'float16', 'int64' becomes 'int16', and anything else
    becomes 'object'. Matching uses ``==`` so any value that compares equal to
    those names is accepted.
    """
    downcast_rules = (
        ('float64', 'float16'),
        ('int64', 'int16'),
    )
    for wide_name, narrow_name in downcast_rules:
        if dtype == wide_name:
            return narrow_name
    return 'object'

def mem_usage_gb(df):
    """Return the deep memory footprint of ``df`` in GiB (2**30 bytes), rounded to 2 decimals."""
    bytes_per_gib = 1073741824
    total_bytes = df.memory_usage(deep=True).sum()
    return round(total_bytes / bytes_per_gib, 2)

In [10]:
# Read the prepared training table, applying the per-column dtypes loaded from pickle.
train_data = pd.read_csv('prep_catboost_train.csv', dtype=dtype_dict)
# Feature names are every column except 'customer_ID'. Dropping the label from the
# column Index (rather than from the frame) avoids building a second full-size
# DataFrame just to read its header.
features = train_data.columns.drop('customer_ID').to_list()
display(train_data.head())
print('DataFrame memory usage:', mem_usage_gb(train_data), 'GB')

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933594,0.0242,0.868652,0.960449,0.93457,0.010704,0.024445,0.001082,0.091492,...,1,13,O,1,0,,0,13,6.0,1
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.899902,0.022095,0.861328,0.929199,0.880371,0.21521,0.199097,0.002224,0.567383,...,1,13,O,1,0,,0,13,6.0,1
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878418,0.028839,0.797852,0.904297,0.880859,0.004181,0.002758,0.000802,0.009705,...,1,13,R,1,0,,0,13,6.0,1
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.599121,0.020081,0.567383,0.623535,0.621582,0.048859,0.088501,0.00066,0.268555,...,1,13,O,1,0,,0,13,3.0,3
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891602,0.042328,0.805176,0.94043,0.87207,0.004642,0.002882,3e-05,0.008682,...,1,13,O,1,13,1.0,1,13,6.0,1


DataFrame memory usage: 0.88 GB


In [13]:
# Read the label file, discard the 'customer_ID' column and flatten what remains
# into a 1-D array. 'target' is stored as int8.
# NOTE(review): presumably rows are in the same order as train_data; verify.
train_labels = np.ravel(
    pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
    .drop(columns=['customer_ID'])
)

In [14]:
# Preview the stratified 5-fold partition: each iteration yields the row positions
# assigned to the training part and to the held-out part of one fold. These
# positional indices are what would be used to slice features and labels per fold.
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X=train_data, y=train_labels):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [ 91365  91370  91372 ... 458910 458911 458912] TEST: [    0     1     2 ... 91967 91968 91970]
TRAIN: [     0      1      2 ... 458910 458911 458912] TEST: [ 91365  91370  91372 ... 183738 183739 183741]
TRAIN: [     0      1      2 ... 458910 458911 458912] TEST: [183102 183107 183110 ... 275769 275770 275772]
TRAIN: [     0      1      2 ... 458910 458911 458912] TEST: [274193 274194 274195 ... 367328 367330 367331]
TRAIN: [     0      1      2 ... 367328 367330 367331] TEST: [366545 366546 366547 ... 458910 458911 458912]


In [6]:

import xgboost as xgb

# Repeated hold-out evaluation of an XGBoost classifier with both metric implementations.
# NOTE(review): X and y are not defined in the visible cells — presumably the feature
# matrix and label vector prepared elsewhere; confirm they exist on a fresh kernel.
model = XGBClassifier(objective='binary:logistic', random_state=42, n_estimators=1200)
# 'tree_method': 'gpu_hist',
params = {
    'objective': 'binary:logistic',
    'random_state': 42,
    'n_estimators': 1200
}
n_tests = 5
metric_scores = []
metric_scores2 = []
for i in range(n_tests):
    # Use a different (but reproducible) split seed per repetition. A constant seed
    # would recreate the identical split every time, so with the fixed model seed
    # the n_tests repetitions could not provide independent estimates.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42 + i
    )
    model.fit(X_train, y_train)
    # Alternative via the native API, left disabled:
    # d_train = xgb.DMatrix(X_train, y_train)
    # model = xgb.train(params, d_train)
    predictions = model.predict_proba(X_test)[:, 1]
    # The metric functions index positionally, so hand them a plain ndarray even
    # if y_test is a pandas object carrying the original row labels.
    y_eval = np.asarray(y_test)
    metric_scores.append(amex_metric_numpy(y_eval, predictions))
    metric_scores2.append(amex_metric(y_eval, predictions))
# NOTE(review): the printed labels say "Accuracy" but the values are the two
# metric implementations above (amex_metric_numpy, then amex_metric).
print('#' * 50)
print('Accuracy_0: ', *metric_scores)
print('#' * 50)
print('')
print('#' * 50)
print('Accuracy_1: ', *metric_scores2)
print('#' * 50)


'float16'