In [14]:
import pandas
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import  xgboost as xgb
import os
import gc
import catboost as ctb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold




class PREPROCESS:
    """Namespace for the preprocessing constants used by the notebook."""

    # Raw column names that are treated as categorical features.
    categorical = [
        'B_30', 'B_38',
        'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
        'D_63', 'D_64', 'D_66', 'D_68',
    ]

    # Boolean switches naming the two encoding modes.
    # NOTE(review): presumably passed as `prep_flag` to preprocess_dataset — confirm at the call site.
    catboost_prep = True
    xgb_lgb_prep = False




class CONFIG:
    """
    Bundle of booster parameters and training-loop arguments for one model.

    :param name: label printed as the first line of the config <- str
    :param params: booster parameters, forwarded as-is to the library's train() <- dict
    :param num_boost_round: number of boosting rounds <- int
    :param early_stopping: early stopping rounds, omitted from the report when None
    :param verbose_eval: evaluation logging period, omitted from the report when None
    :param n_folds: number of Stratified K-Fold splits, omitted from the report when None
    """
    def __init__(self, name: str, params: dict, num_boost_round: int, early_stopping = None, verbose_eval = None, n_folds = None):
        self.name = name
        self.params = params
        self.num_boost_round = num_boost_round
        self.early_stopping = early_stopping
        self.verbose_eval = verbose_eval
        self.n_folds = n_folds

    def __str__(self):
        """
        Human-readable report of the configuration
        :return: multi-line text ending with 'END OF CONFIG' (no trailing newline) -> str
        """
        lines = [self.name, 'Booster parameters:']
        lines.extend(f'{k}: {v}' for k, v in self.params.items())
        lines.append('.train() args:')
        lines.append(f'Number of boosting rounds: {self.num_boost_round}')
        # optional settings are only reported when they were provided
        if self.early_stopping is not None:
            lines.append(f'Early stopping rounds: {self.early_stopping}')
        if self.verbose_eval is not None:
            lines.append(f'Verbose eval: {self.verbose_eval}')
        if self.n_folds is not None:
            lines.append(f'Stratified K-Fold number of splits: {self.n_folds}')
        lines.append('END OF CONFIG')
        return '\n'.join(lines)

    def save(self, path: str):
        """
        Write the textual report to a file (overwrites an existing file)
        :param path: destination file path <- str
        """
        # context manager guarantees the handle is closed even if formatting raises
        with open(path, 'w') as f:
            print(self, file=f)




def one_hot(train, test):
    """
    Simple encoding using pandas.get_dummies()

    The columns to encode are the object-dtype columns of `train`; the same
    columns are then encoded in `test`. Each encoded column is removed and its
    dummy columns are appended at the end, in the order of the encoded columns.
    If there are no object columns, copies of the inputs are returned unchanged.

    !!! Dummy columns are derived from the values present in each dataset
    separately. If some value appears in only one of the datasets, the resulting
    column sets differ. Check that test and train have same columns !!!

    :param train: train dataset <- pandas.DataFrame
    :param test: test dataset <- pandas.DataFrame
    :return: new datasets with one hot encoded categoricals -> tuple[pandas.DataFrame, pandas.DataFrame]
    """
    # select_dtypes returns an empty selection (instead of raising) when no object column exists
    c_enc = train.select_dtypes(include=['object']).columns.tolist()

    def _encode(df):
        # build all dummy blocks first and concatenate once instead of growing the frame per column
        dummies = [pd.get_dummies(df[col], prefix=col, drop_first=False) for col in c_enc]
        return pd.concat([df.drop(columns=c_enc)] + dummies, axis=1)

    train = _encode(train)
    test = _encode(test)
    gc.collect()
    return train, test

def amex_metric(y_true, y_pred):
    """
    Amex Kaggle Competition Metric

    Average of two terms, both computed with weight 20 for rows whose true
    label is 0 and weight 1 otherwise:
      * share of positives found in the top 4% (by cumulative weight) of rows
        ranked by prediction;
      * weighted gini of the prediction ranking divided by the weighted gini
        of the ranking by true label.

    :param y_true: true labels
    :param y_pred: predicted values
    :return: metric score -> float
    """
    def _ranked_by(column):
        # rows are (true, pred) pairs ordered by the chosen column, descending
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ranked = _ranked_by(column)
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_weight)

    # top-4% capture rate, ranking by prediction
    by_pred = _ranked_by(1)
    row_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weights))
    captured = by_pred[np.cumsum(row_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # gini of the model ranking normalised by gini of the true-label ranking
    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_true + top_four)
def lgb_amex_metric(preds: np.ndarray, train_data: lgb.Dataset):
    """
    Custom Metric for LightGBM
    using Amex Metric
    :param preds: predicted values <- np.ndarray
    :param train_data: dataset to get labels (true values) from <- lgb.Dataset
    :return: name of the metric, score, higher-is-better flag -> tuple[str, float, bool]
    """
    score = amex_metric(train_data.get_label(), preds)
    return 'amex_metric', score, True
def xgb_amex_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
    """
    Amex Metric wrapped as an XGBoost custom metric
    :param predt: predicted values <- np.ndarray
    :param dtrain: matrix the true labels are read from <- xgb.DMatrix
    :return: name of the metric, score -> tuple[str, float]
    """
    metric_name = 'amex_metric'
    true_labels = dtrain.get_label()
    score = amex_metric(true_labels, predt)
    return metric_name, score
def file_mem_usage_gb(file_name: str):
    """
    Size of a file on disk, in GB (1 GB = 1024**3 bytes)
    :param file_name: pathlike[str]
    :return: size in GB rounded to 2 decimals -> float
    """
    bytes_per_gb = 1024 ** 3
    size_in_bytes = os.stat(file_name).st_size
    return round(size_in_bytes / bytes_per_gb, 2)
def mem_usage_gb(df: pandas.DataFrame, deep: bool):
    """
    In-memory size of a DataFrame, in GB (1 GB = 1024**3 bytes)
    :param df: dataset <- pandas.DataFrame
    :param deep: forwarded to pandas.DataFrame.memory_usage(deep=) <- bool
    :return: memory usage in GB rounded to 2 decimals -> float
    """
    bytes_per_gb = 1024 ** 3
    total_bytes = df.memory_usage(deep=deep).sum()
    return round(total_bytes / bytes_per_gb, 2)

In [15]:
# XGBoost: booster parameters + training-loop arguments
config_xgb = CONFIG(
    name='CONFIG_XGB',
    params=dict(
        eta=0.04,
        gamma=0,
        max_depth=6,
        min_child_weight=1,
        max_delta_step=2,  # suggested by xgb documentation for imbalanced dataset
        max_leaves=127,
        objective='binary:logistic',
        disable_default_eval_metric=1,
        tree_method='gpu_hist',
    ),
    num_boost_round=1500,
    early_stopping=1000,
    verbose_eval=250,
    n_folds=5,
)

# LightGBM: booster parameters + training-loop arguments
# NOTE(review): LightGBM documents early stopping as unavailable in dart mode —
# confirm whether early_stopping has any effect for this config.
config_lgb = CONFIG(
    name='CONFIG_LGBM',
    params=dict(
        boosting_type='dart',
        objective='binary',
        learning_rate=0.04,
        num_leaves=127,
    ),
    num_boost_round=1200,
    early_stopping=900,
    verbose_eval=200,
    n_folds=5,
)

CONFIG_XGB
Booster parameters:
eta: 0.04
gamma: 0
max_depth: 6
min_child_weight: 1
max_delta_step: 2
max_leaves: 127
objective: binary:logistic
disable_default_eval_metric: 1
tree_method: gpu_hist
.train() args:
Number of boosting rounds: 1500
Early stopping rounds: 1000
Verbose eval: 25
Stratified K-Fold number of splits: 8
END OF CONFIG
CONFIG_LGBM
Booster parameters:
boosting_type: dart
objective: binary
learning_rate: 0.04
num_leaves: 127
.train() args:
Number of boosting rounds: 1100
END OF CONFIG


In [5]:
def read_datasets():
    """
    Load the train / test parquet files and the train labels from the working
    directory, print the memory footprint of both frames and show a pie chart
    of the label distribution.
    :return: train dataset, test dataset, flattened labels -> tuple[pandas.DataFrame, pandas.DataFrame, np.ndarray]
    """
    # feature tables
    train = pd.read_parquet('train_data.parquet')
    test = pd.read_parquet('test_data.parquet')

    # report how much RAM each table occupies
    train_usage = mem_usage_gb(df=train, deep=True)
    print('train dataset mem usage:', train_usage, 'GB')
    test_usage = mem_usage_gb(df=test, deep=True)
    print('test dataset mem usage:', test_usage, 'GB')

    # target: everything in the labels file except the customer id, flattened to 1-D
    target_frame = pd.read_csv('train_labels.csv').drop(columns=['customer_ID'])
    labels = np.ravel(target_frame)

    # class balance overview (counts are also printed by get_label_counts)
    counts, names = get_label_counts(labels)
    plt.pie(counts, labels=names)
    plt.title('Counts of unique labels')
    plt.show()

    return train, test, labels




def _aggregate_by_customer(df: pd.DataFrame, numerical: list, categorical: list):
    """Collapse `df` to one row per customer_ID; returns (aggregated frame, names of the aggregated categorical columns)."""
    grouped = df.groupby("customer_ID")
    # numerical columns: mean, std, min, max, last entry
    num_agg = grouped[numerical].agg(['mean', 'std', 'min', 'max', 'last'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    # categorical columns: count, last entry, number of unique entries
    cat_agg = grouped[categorical].agg(['count', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    agg_cat_cols = cat_agg.columns.tolist()
    # both parts share the customer_ID index; it is dropped from the result
    combined = pd.concat([num_agg, cat_agg], axis=1).reset_index(drop=True)
    del num_agg, cat_agg
    gc.collect()
    return combined, agg_cat_cols


def _columns_needing_str(df: pd.DataFrame, cols: list):
    """Return the columns of `cols` that contain NaN or are float16 and therefore must be converted to strings."""
    subset = df[cols]
    flagged = (subset.isna().sum() != 0) | (subset.dtypes == 'float16')
    return flagged[flagged].index.tolist()


def _stringify_categoricals(df: pd.DataFrame, cols: list):
    """Convert `cols` of `df` to strings, writing 'no_data' in place of missing values."""
    for col in cols:
        # missing values must be replaced BEFORE str(): str(NaN) == 'nan' would hide them
        df[col] = df[col].astype('object').map(lambda v: 'no_data' if pd.isna(v) else str(v))
    return df


def preprocess_dataset(train: pd.DataFrame, test: pd.DataFrame, prep_flag: bool):
    """
    Aggregate both datasets to one row per customer and encode the categoricals.

    Numerical columns (everything except PREPROCESS.categorical, 'customer_ID'
    and 'S_2') are aggregated by mean, std, min, max and last entry; categorical
    columns by count, last entry and number of unique entries. The customer_ID
    column is not part of the result.
    NOTE(review): 'last' takes the last row of each group in the incoming row
    order — assumes rows are ordered chronologically per customer; confirm upstream.

    :param train: train dataset <- pandas.DataFrame
    :param test: test dataset <- pandas.DataFrame
    :param prep_flag: True  -> CatBoost preparation: aggregated categorical columns that
                               hold NaN or are float16 (in either dataset) become strings,
                               missing values become 'no_data';
                      False -> LGB / XGB preparation: object columns are one-hot encoded and
                               test is aligned to the columns of train <- bool
    :return: preprocessed train and test -> tuple[pandas.DataFrame, pandas.DataFrame]
    """
    # get numerical features (column names only, no copy of the data)
    excluded = set(PREPROCESS.categorical) | {'customer_ID', 'S_2'}
    numerical = [col for col in train.columns if col not in excluded]

    # rows reduced from 5,5 mil to 458k (train); test is processed the same way
    train, agg_cat_cols = _aggregate_by_customer(train, numerical, PREPROCESS.categorical)
    test, _ = _aggregate_by_customer(test, numerical, PREPROCESS.categorical)

    # different encoding for different gradient boosting libraries
    if prep_flag:
        # prep for catboost.
        # catboost accepts categorical features in raw form as string or integers
        # no NaN's, float's can be present in categorical features
        print('preprocessing for Catboost')
        # use one shared column list so train and test end up with matching dtypes
        flagged = set(_columns_needing_str(train, agg_cat_cols)) | set(_columns_needing_str(test, agg_cat_cols))
        cols_to_encode = [col for col in agg_cat_cols if col in flagged]
        train = _stringify_categoricals(train, cols_to_encode)
        test = _stringify_categoricals(test, cols_to_encode)
    else:
        # else one-hot encode manually
        # unique values in categoricals of train and test can differ, so after encoding
        # test is aligned to train: columns missing in test are added as 0, columns
        # unknown to train are dropped, and the column order follows train
        print('Preprocessing for LGB / XGB')
        train, test = one_hot(train, test)
        test = test.reindex(columns=train.columns, fill_value=0)
    return train, test




def get_label_counts(labels: np.ndarray):
    """
    Count occurrences of every unique label and print a summary line per label
    :param labels: label values <- np.ndarray
    :return: counts per unique label (sorted by label value) and matching
             display names of the form '<label>, <percentage>%' -> tuple[np.ndarray, list[str]]
    """
    unique, counts = np.unique(labels, return_counts=True)
    total = len(labels)
    names = []
    for k, v in zip(unique, counts):
        # scale to percent BEFORE rounding: round(fraction, 2) * 100 produces
        # float artefacts such as 7.000000000000001
        percentage = 100.0 * int(v) / total
        names.append(f'{k}, {round(percentage, 0)}%')
        print(f'Count of {k}: ({v} / {total}), percentage = {round(percentage, 2)}%')
    return counts, names




def train_and_predict_xgb(train_data: pd.DataFrame, test_data: pd.DataFrame, train_labels: np.ndarray, config: CONFIG):
    """
    Train one XGBoost model per Stratified K-Fold split, plot the Amex metric
    per boosting round, and average the fold models' test predictions.

    :param train_data: preprocessed train features <- pandas.DataFrame
    :param test_data: preprocessed test features <- pandas.DataFrame
    :param train_labels: true labels aligned with the rows of train_data <- np.ndarray
    :param config: booster parameters and training arguments <- CONFIG
    :return: frame with columns 'customer_ID' (read from sample_submission.csv)
             and 'prediction' (mean of fold predictions) -> pandas.DataFrame
    """
    total_predictions = np.zeros(test_data.shape[0])
    oof_predictions = np.zeros(train_data.shape[0])
    print(config)
    skf = StratifiedKFold(n_splits=config.n_folds)
    # the test matrix does not depend on the fold, build it once
    d_test = xgb.DMatrix(test_data)
    for n, (train_index, valid_index) in enumerate(skf.split(train_data, train_labels), start=1):
        results = dict()
        print('=' * 60)
        print(f'Fold number: {n}')
        print('=' * 60)
        # Get counts of 0 and 1 in K-Fold labels
        # to see that folds are indeed stratified (percentages of classes are preserved)
        print('label counts in train')
        get_label_counts(train_labels[train_index])
        print('label counts in test(eval)')
        get_label_counts(train_labels[valid_index])

        d_train = xgb.DMatrix(train_data.iloc[train_index], train_labels[train_index])
        d_valid = xgb.DMatrix(train_data.iloc[valid_index], train_labels[valid_index])

        model = xgb.train(config.params,
                          dtrain= d_train,
                          num_boost_round= config.num_boost_round,
                          evals= [(d_train, 'train'), (d_valid, 'valid')],
                          custom_metric= xgb_amex_metric,
                          # the Amex metric is higher-is-better; xgboost cannot infer the
                          # direction of a custom metric, so state it for early stopping
                          maximize= True,
                          early_stopping_rounds= config.early_stopping,
                          evals_result= results,
                          verbose_eval= config.verbose_eval,
                          )

        #=======================
        # plot metrics
        #=======================
        epochs = len(results['valid']['amex_metric'])
        x_axis = range(0, epochs)
        fig, ax = plt.subplots()
        ax.plot(x_axis, results['train']['amex_metric'], label='Train')
        ax.plot(x_axis, results['valid']['amex_metric'], label='Valid')
        ax.legend()
        plt.ylabel('AMEX_METRIC')
        plt.title('XGBoost AMEX')
        plt.show()

        # get fold predictions
        # NOTE(review): whether predict() uses the best or the last iteration after early
        # stopping depends on the installed xgboost version — confirm for this environment
        valid_predictions = model.predict(d_valid)
        oof_predictions[valid_index] = valid_predictions
        # get test predictions
        fold_test_predictions = model.predict(d_test)
        # adjust predictions according to n_folds
        total_predictions += fold_test_predictions / config.n_folds
        print('Fold model successfully trained. Predictions saved')
        print(f'fold {n} score: {amex_metric(y_true=train_labels[valid_index], y_pred=valid_predictions)}')
        # release the fold matrices before the next fold allocates new ones
        del d_train, d_valid
    del d_test
    gc.collect()
    print(f'Total out of folds score: {amex_metric(y_true=train_labels, y_pred=oof_predictions)}')
    # NOTE(review): assumes the rows of test_data are in the same customer order
    # as sample_submission.csv — confirm
    sample = pd.read_csv('sample_submission.csv')
    output = pd.DataFrame({'customer_ID': sample.customer_ID, 'prediction': total_predictions})
    return output




def train_and_predict_lgbm(train_data: pd.DataFrame, test_data: pd.DataFrame, train_labels: np.ndarray, config: CONFIG):
    """
    Train one LightGBM model per Stratified K-Fold split, plot the Amex metric
    per boosting round, and average the fold models' test predictions.

    Early stopping, evaluation recording and logging are configured through
    LightGBM callbacks (the equivalent keyword arguments of lgb.train were
    removed in LightGBM 4.x).

    :param train_data: preprocessed train features <- pandas.DataFrame
    :param test_data: preprocessed test features <- pandas.DataFrame
    :param train_labels: true labels aligned with the rows of train_data <- np.ndarray
    :param config: booster parameters and training arguments <- CONFIG
    :return: frame with columns 'customer_ID' (read from sample_submission.csv)
             and 'prediction' (mean of fold predictions) -> pandas.DataFrame
    """
    total_predictions = np.zeros(test_data.shape[0])
    oof_predictions = np.zeros(train_data.shape[0])
    print(config)
    skf = StratifiedKFold(n_splits=config.n_folds)
    for n, (train_index, valid_index) in enumerate(skf.split(train_data, train_labels), start=1):
        results = dict()
        print('=' * 60)
        print(f'Fold number: {n}')
        print('=' * 60)
        # Get counts of 0 and 1 in K-Fold labels
        # to see that folds are indeed stratified (percentages of classes are preserved)
        print('label counts in train')
        get_label_counts(train_labels[train_index])
        print('label counts in test(eval)')
        get_label_counts(train_labels[valid_index])

        valid_features = train_data.iloc[valid_index]
        d_train = lgb.Dataset(data=train_data.iloc[train_index], label=train_labels[train_index])
        d_valid = lgb.Dataset(data=valid_features, label=train_labels[valid_index])

        # training-loop behaviour is expressed as callbacks
        callbacks = [lgb.record_evaluation(results)]
        if config.early_stopping is not None and config.early_stopping > 0:
            callbacks.append(lgb.early_stopping(stopping_rounds=config.early_stopping))
        if config.verbose_eval:
            # True means "log every round", an integer is the logging period
            period = 1 if config.verbose_eval is True else int(config.verbose_eval)
            callbacks.append(lgb.log_evaluation(period=period))

        model = lgb.train(config.params,
                          train_set = d_train,
                          num_boost_round= config.num_boost_round,
                          valid_sets= [d_train, d_valid],
                          valid_names=['train', 'valid'],
                          feval=lgb_amex_metric,
                          callbacks=callbacks,
                          )

        #=======================
        # plot metrics
        #=======================
        # several metrics are recorded; name the one to plot instead of letting LightGBM pick one
        lgb.plot_metric(results, metric='amex_metric')
        plt.show()

        # get fold predictions
        # Booster.predict needs the raw features; it rejects lgb.Dataset objects
        valid_predictions = model.predict(valid_features)
        oof_predictions[valid_index] = valid_predictions
        # get test predictions
        fold_test_predictions = model.predict(test_data)
        # adjust predictions according to n_folds
        total_predictions += fold_test_predictions / config.n_folds
        print('Fold model successfully trained. Predictions saved')
        print(f'fold {n} score: {amex_metric(y_true=train_labels[valid_index], y_pred=valid_predictions)}')
        # release the fold datasets before the next fold allocates new ones
        del d_train, d_valid
    gc.collect()
    print(f'Total out of folds score: {amex_metric(y_true=train_labels, y_pred=oof_predictions)}')
    # NOTE(review): assumes the rows of test_data are in the same customer order
    # as sample_submission.csv — confirm
    sample = pd.read_csv('sample_submission.csv')
    output = pd.DataFrame({'customer_ID': sample.customer_ID, 'prediction': total_predictions})
    return output

IndentationError: expected an indented block (1292343145.py, line 58)