In [1]:
import pandas
import pandas as pd
import pickle
import numpy as np
# from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
import  xgboost as xgb
# import os
# import sys
import gc
# import catboost
from sklearn.model_selection import StratifiedKFold

# file = open('cat_cols.txt', 'rb')
# cat_cols = pickle.load(file)
# file.close()
class CONFIG:
    """
    Training configuration shared by the notebook.

    params: dict = Parameters of Tree Booster

    n_folds: int = number of splits for Stratified K-Folds

    n_rounds: int = number of boosting iterations

    early_stopping: int = stop if there is little to no improvement
                          for this many rounds

    verbose_eval: int = value handed to xgb.train as verbose_eval
    """

    # Parameters of Tree Booster
    params = dict(
        eta=0.0175,
        gamma=0,
        max_depth=6,
        min_child_weight=1,
        max_delta_step=1,  # suggested by xgb documentation for imbalanced dataset
        max_leaves=100,
        objective='binary:logistic',
        disable_default_eval_metric=1,
        # NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build and
        # may be deprecated in newer releases - confirm against the installed version.
        tree_method='gpu_hist',
    )

    # Cross-validation and boosting schedule
    n_folds = 5
    n_rounds = 6000
    early_stopping = 3000
    verbose_eval = 300

    def output(self):
        """Print the configuration to stdout, framed by 60-character rules."""
        rule = '=' * 60
        report = [rule, 'CONFIGURATION', rule, 'Tree Booster parameters: ']
        report.extend(f'{name}: {value}' for name, value in self.params.items())
        report.append(f'Number of boosting rounds: {self.n_rounds}')
        report.append(f'Early stopping rounds: {self.early_stopping}')
        report.append(f'Stratified K-Fold number of splits: {self.n_folds}')
        report.extend([rule, 'END', rule])
        for line in report:
            print(line)
def one_hot(train, test):
    """
    Simple one-hot encoding of the object-dtype columns using pandas.get_dummies()

    The columns to encode are chosen from train only (every column whose dtype is
    object); the same columns are then encoded in test. Encoded columns are removed
    and their dummy columns are appended at the end, named '<column>_<value>'.
    Dummies are always produced as uint8 so the result does not depend on the
    pandas version default.

    !!! Each dataset is encoded on its own values. If some values do not appear
    in both datasets in same column the resulting frames will have different columns.
    Check that test and train have same columns !!!

    If train has no object columns, both inputs are returned unchanged.

    :param train: train dataset <- pandas.DataFrame
    :param test: test dataset <- pandas.DataFrame
    :return: same datasets with one hot encoded categoricals -> tuple[pandas.DataFrame, pandas.DataFrame]
    :raises KeyError: if a column selected from train is missing in test
    """
    # select_dtypes returns an empty selection instead of raising when nothing matches
    c_enc = train.select_dtypes(include='object').columns.tolist()
    if not c_enc:
        gc.collect()
        return train, test
    # One call per frame: keeps the untouched columns first and appends the dummies
    # in c_enc order, without re-copying the whole frame once per encoded column.
    train = pd.get_dummies(train, columns=c_enc, drop_first=False, dtype=np.uint8)
    test = pd.get_dummies(test, columns=c_enc, drop_first=False, dtype=np.uint8)
    gc.collect()
    return train, test

def amex_metric(y_true, y_pred):
    """
    Amex Kaggle Competition Metric

    Mean of two components, both computed with weight 20 for rows whose label
    is 0 and weight 1 for all other rows:
      * the share of positive labels found among the highest-scored rows whose
        cumulative weight stays within 4% of the total weight;
      * the weighted Gini of the prediction ordering divided by the weighted
        Gini of the ordering by the true labels.

    :param y_true: true labels
    :param y_pred: predicted values
    :return: metric score -> float
    """

    def _ranked(sort_col):
        # Rows are (label, prediction); order them by the chosen column, descending.
        pairs = np.array([y_true, y_pred]).T
        return pairs[pairs[:, sort_col].argsort()[::-1]]

    def _weighted_gini(sort_col):
        target = _ranked(sort_col)[:, 0]
        weight = np.where(target == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        weighted_target = target * weight
        total_pos = np.sum(weighted_target)
        lorentz = np.cumsum(weighted_target) / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Component 1: positives captured inside the top 4% of cumulative weight.
    by_pred = _ranked(1)
    row_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    inside_cut = np.cumsum(row_weight) <= int(0.04 * np.sum(row_weight))
    top_four = np.sum(by_pred[inside_cut][:, 0]) / np.sum(by_pred[:, 0])

    # Component 2: Gini of the model ordering normalised by the label ordering.
    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_true + top_four)
def xgb_amex_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
    """
    Adapter that exposes the Amex Metric as a custom metric for XGBoost
    :param predt: predicted values <- np.ndarray
    :param dtrain: matrix the true labels are read from <- xgb.DMatrix
    :return: name of the metric, score -> tuple[str, float]
    """
    score = amex_metric(dtrain.get_label(), predt)
    return 'amex_metric', score

def mem_usage_gb(df: pandas.DataFrame, deep: bool):
    """
    Total memory footprint of a DataFrame, expressed in GB (1 GB = 1024**3 bytes)
    :param df: dataset <- pandas.DataFrame
    :param deep: forwarded to pandas.DataFrame.memory_usage(deep=) <- bool
    :return: memory usage in GB rounded to two decimals -> float
    """
    bytes_per_gb = 1024 ** 3
    total_bytes = df.memory_usage(deep=deep).sum()
    return round(total_bytes / bytes_per_gb, 2)

In [2]:
def load_dataframes(message: str):
    """
    Load the preprocessed train/test features and the train labels

    Steps, all on files in the current working directory:
      * read the pickled dtype mapping and both feature CSV files;
      * drop 'customer_ID' and one-hot encode via one_hot();
      * add an all-zero uint8 'D_64_last_-1' column to test, at the position the
        column has in train, when train has it and test does not;
      * print memory usage of both frames and the class balance of the labels.

    :param message: text printed before loading starts <- str
    :return: train features, test features, train labels
             -> tuple[pandas.DataFrame, pandas.DataFrame, np.ndarray]
    """
    print(message)
    # NOTE(review): pickle.load can execute arbitrary code - only load a dtype
    # file that was produced locally by a trusted preprocessing step.
    with open('dtype_for_agg_catb.txt', 'rb') as file:
        dtypes = pickle.load(file)
    train = pd.read_csv('prep_catboost_train.csv', dtype=dtypes)
    train.drop(columns=['customer_ID'], inplace=True)
    test = pd.read_csv('prep_catboost_test.csv', dtype=dtypes)
    test.drop(columns=['customer_ID'], inplace=True)
    train, test = one_hot(train, test)
    # This dummy column is produced for train only, so it is recreated in test
    # as all zeros. The guard keeps the patch from raising when the column is
    # already present in test (or absent from train).
    patch_col = 'D_64_last_-1'
    if patch_col in train.columns and patch_col not in test.columns:
        idx = train.columns.get_loc(patch_col)
        test.insert(loc=idx, column=patch_col, value=np.zeros(len(test), dtype='uint8'))
    print('encoded cols in train, test')
    display(train.dtypes.loc[train.dtypes == 'uint8'], test.dtypes.loc[test.dtypes == 'uint8'])
    deep = True
    # deep=True memory inspection is expensive, so measure each frame once and reuse
    train_gb = mem_usage_gb(train, deep)
    test_gb = mem_usage_gb(test, deep)
    print('train dataset mem usage:', train_gb, 'GB')
    print('test dataset mem usage:', test_gb, 'GB')
    # NOTE(review): labels are matched to train rows by position only (customer_ID
    # is dropped from both) - presumably both files share the same row order; verify.
    labels = pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
    labels.drop(columns=['customer_ID'], inplace=True)
    labels = np.ravel(labels)
    unique, counts = np.unique(labels, return_counts=True)
    for k, v in zip(unique, counts):
        print(f'Number of {k} in train_labels: {v}')
        # scale to percent BEFORE rounding; truncating the fraction first always gave 0%
        print(f'percentage of {k} in train_labels: {int(round(v / len(labels) * 100))}%')
    print('datasets loaded, total mem usage: ', train_gb + test_gb, 'GB')
    gc.collect()
    return train, test, labels
def _print_label_share(labels, name):
    """Print the count and the integer percentage of every distinct value in labels."""
    unique, counts = np.unique(labels, return_counts=True)
    for k, v in zip(unique, counts):
        print(f'Number of {k} in {name}: {v}')
        # scale to percent BEFORE rounding; truncating the fraction first always gave 0%
        print(f'percentage of {k} in {name}: {int(round(v / len(labels) * 100))}%')


def get_predictions(train_data, test_data, train_labels):
    """
    Train one XGBoost model per Stratified K-Fold split and average test predictions

    For every fold the model is trained on the fold's train part, evaluated on the
    held-out part with the Amex metric (maximised, used for early stopping), and
    then used to predict test_data with the trees up to its best iteration. Each
    fold contributes 1 / CONFIG.n_folds of the final prediction.

    :param train_data: train features <- pandas.DataFrame
    :param test_data: test features, same columns as train_data <- pandas.DataFrame
    :param train_labels: train targets, positionally aligned with train_data <- np.ndarray
    :return: submission frame with 'customer_ID' and 'prediction' columns, and the
             evaluation history dict filled by xgb.train
             -> tuple[pandas.DataFrame, dict]
    """
    total_predictions = np.zeros(test_data.shape[0])
    CONFIG().output()
    print('Start training...')
    # NOTE(review): the same dict is handed to xgb.train in every fold, so it
    # presumably ends up holding only the last fold's history - confirm if
    # per-fold curves are needed.
    results = dict()
    skf = StratifiedKFold(n_splits=CONFIG.n_folds)
    for n, (train_index, test_index) in enumerate(skf.split(train_data, train_labels), start=1):
        print('=' * 60)
        print(f'Fold number: {n}')
        print('=' * 60)
        # Get counts of 0 and 1 in K-Fold labels
        _print_label_share(train_labels[train_index], 'train_labels')
        _print_label_share(train_labels[test_index], 'test(eval)_labels')
        print('=' * 60)
        d_train = xgb.DMatrix(train_data.iloc[train_index], train_labels[train_index])
        d_test = xgb.DMatrix(train_data.iloc[test_index], train_labels[test_index])
        model = xgb.train(CONFIG.params,
                          dtrain=d_train,
                          num_boost_round=CONFIG.n_rounds,
                          evals=[(d_train, 'train'), (d_test, 'eval')],
                          custom_metric=xgb_amex_metric,
                          # Higher Amex metric is better. Without this flag a custom
                          # metric with an unknown name is minimised, so early stopping
                          # would track the worst iteration instead of the best one.
                          maximize=True,
                          early_stopping_rounds=CONFIG.early_stopping,
                          evals_result=results,
                          verbose_eval=CONFIG.verbose_eval,
                          )
        d_out = xgb.DMatrix(test_data)
        # xgb.train returns the model from the last round; restrict prediction to
        # the trees up to the best round found by early stopping.
        fold_out_predictions = model.predict(
            d_out, iteration_range=(0, model.best_iteration + 1)
        )
        # adjust predictions according to n_folds
        total_predictions += fold_out_predictions / CONFIG.n_folds
        print('Fold model successfully trained. Predictions saved')
        # release this fold's matrices and model before the next fold builds its own
        del d_train, d_test, d_out, model
        gc.collect()
    # NOTE(review): predictions are attached to customer_ID by position -
    # presumably test_data rows follow the sample_submission order; verify.
    sample = pd.read_csv('sample_submission.csv')
    output = pd.DataFrame({'customer_ID': sample.customer_ID, 'prediction': total_predictions})
    return output, results

first\
eta = 0.04\
n_rounds = 1200\
early_stopping = 600

second\
eta = 0.03\
n_rounds = 3000\
early_stopping = 1500

In [3]:
# Load the encoded train/test features and the train labels; the string argument
# is only printed by load_dataframes before loading starts.
train_d, test_d, train_l = load_dataframes('Ivan Piiashev trying ML')

Ivan Piiashev trying ML


FileNotFoundError: [Errno 2] No such file or directory: 'dtype_for_agg_catb.txt'

In [None]:
# Run the K-Fold training; ofile is the submission frame, result the evaluation history.
ofile, result = get_predictions(train_d, test_d, train_l)

In [None]:
# Write the submission frame to CSV without the DataFrame index column.
ofile.to_csv('sub2_k_folds.csv', index=False)