# Description

https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

# Environment

## Library Imports

In [78]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [79]:
import gc
from numba import jit
import time 

In [80]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [81]:
from xgboost import XGBClassifier

## Local Imports

In [82]:
from phuc import data_process, file
from phuc import visualization as vs

## Reload data path

In [83]:
from phuc.file import standard_template

standard_template.save_data_path()

Loaded file data_path.pkl
Directory  /home/phuc/Desktop/Work/Data Sience/kaggle_competition/SafeDirverPrediction  already exists
Saved file data_path.pkl


##  Setup Pandas

In [84]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Set up data path

In [85]:
CURR_DIR = os.getcwd().split('/src')[0]

In [86]:
data_path = file.load_pickle(CURR_DIR + '/data_path.pkl')

Loaded file data_path.pkl


In [87]:
for key in data_path: print(key)

PROJECT_DIR
OUTPUTS
TRAIN_CSV_PKL_PATH
TEST_CSV_PKL_PATH
METADATA_PKL_PATH
WORKING_DIR
SAMPLE_SUBMISSION_CSV_PATH
TEST_CSV_PATH
TRAIN_CSV_PATH
EXTERNAL_DIR


# Code

## local function

In [88]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini


def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[
        target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / \
        (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * \
        (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(
            columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(
            columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

## Load Raw Data

In [89]:
train_csv = file.load_pickle(data_path['TRAIN_CSV_PKL_PATH'])
test_csv = file.load_pickle(data_path['TEST_CSV_PKL_PATH'])

target = train_csv['target']
del train_csv['target']

Loaded file train_csv.pkl
Loaded file test_csv.pkl


## Feature

In [90]:
train_features = [
    "ps_car_13",  # : 1571.65 / shadow  609.23
    "ps_reg_03",  # : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  # : 1387.87 / shadow   84.72
    "ps_ind_03",  # : 1219.47 / shadow  230.55
    "ps_ind_15",  # :  922.18 / shadow  242.00
    "ps_reg_02",  # :  920.65 / shadow  267.50
    "ps_car_14",  # :  798.48 / shadow  549.58
    "ps_car_12",  # :  731.93 / shadow  293.62
    "ps_car_01_cat",  # :  698.07 / shadow  178.72
    "ps_car_07_cat",  # :  694.53 / shadow   36.35
    "ps_ind_17_bin",  # :  620.77 / shadow   23.15
    "ps_car_03_cat",  # :  611.73 / shadow   50.67
    "ps_reg_01",  # :  598.60 / shadow  178.57
    "ps_car_15",  # :  593.35 / shadow  226.43
    "ps_ind_01",  # :  547.32 / shadow  154.58
    "ps_ind_16_bin",  # :  475.37 / shadow   34.17
    "ps_ind_07_bin",  # :  435.28 / shadow   28.92
    "ps_car_06_cat",  # :  398.02 / shadow  212.43
    "ps_car_04_cat",  # :  376.87 / shadow   76.98
    "ps_ind_06_bin",  # :  370.97 / shadow   36.13
    "ps_car_09_cat",  # :  214.12 / shadow   81.38
    "ps_car_02_cat",  # :  203.03 / shadow   26.67
    "ps_ind_02_cat",  # :  189.47 / shadow   65.68
    "ps_car_11",  # :  173.28 / shadow   76.45
    "ps_car_05_cat",  # :  172.75 / shadow   62.92
    "ps_calc_09",  # :  169.13 / shadow  129.72
    "ps_calc_05",  # :  148.83 / shadow  120.68
    "ps_ind_08_bin",  # :  140.73 / shadow   27.63
    "ps_car_08_cat",  # :  120.87 / shadow   28.82
    "ps_ind_09_bin",  # :  113.92 / shadow   27.05
    "ps_ind_04_cat",  # :  107.27 / shadow   37.43
    "ps_ind_18_bin",  # :   77.42 / shadow   25.97
    "ps_ind_12_bin",  # :   39.67 / shadow   15.52
    "ps_ind_14",  # :   37.37 / shadow   16.65
    "ps_car_11_cat"  # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]

# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]

## Encoding

In [91]:
start = time.time()
for ix, (feature_1, feature_2) in enumerate(combs):
    # comb_feature name
    comb_feature = feature_1 + "_plus_" + feature_2
    # Print Time Process
    print('current feature %60s %4d in %5.1f'
          % (comb_feature, ix + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')

    # create comb_feature in train and test set
    train_csv[comb_feature] = train_csv[feature_1].apply(lambda x: str(
        x)) + "_" + train_csv[feature_2].apply(lambda x: str(x))

    test_csv[comb_feature] = test_csv[feature_1].apply(lambda x: str(
        x)) + "_" + test_csv[feature_2].apply(lambda x: str(x))

    # Label Encode
    le = LabelEncoder()
    # fit
    le.fit(list(train_csv[comb_feature].values) +
           list(test_csv[comb_feature].values))
    # transform
    train_csv[comb_feature] = le.transform(list(train_csv[comb_feature].values))
    test_csv[comb_feature] = le.transform(list(test_csv[comb_feature].values))
    
    train_features.append(comb_feature)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

## Transformed Data

In [92]:
train_csv = train_csv[train_features]
test_csv = test_csv[train_features]

## categorical columns

In [93]:
cat_columns = [column for column in train_csv.columns if "_cat" in column]
for column in cat_columns:
    train_csv[column + "_avg"], \
        test_csv[column + "_avg"] = target_encode(trn_series=train_csv[column],
                                                  tst_series=test_csv[column],
                                                  target=target,
                                                  min_samples_leaf=200,
                                                  smoothing=10,
                                                  noise_level=0)

## Cross-Validation

In [94]:
n_splits = 5
n_estimators = 200
np.random.seed(0)
increase = True


In [95]:
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(train_csv.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(test_csv))
predictions = np.zeros(len(test_csv))

In [None]:
for fold_, (train_idx, val_idx) in enumerate(skf.split(target, target)):
    train_data, train_tg = train_csv.iloc[train_idx], target.iloc[train_idx]
    val_data, val_tg = train_csv.iloc[val_idx], target.iloc[val_idx]

    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=2)
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(train_tg == 1)
        # Add positive examples
        train_data = pd.concat([train_data, train_data.loc[pos]], axis=0)
        train_tg = pd.concat([train_tg, train_tg.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(train_data))
        np.random.shuffle(idx)
        train_data = train_data.iloc[idx]
        train_data = train_data.iloc[idx]
        
    clf.fit(train_data, train_tg, 
            eval_set=[(train_data, train_tg), (val_data, val_tg)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]

    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_data, ntree_limit=best_round)[:, 1]
    # Update submission
    predictions += clf.predict_proba(test_csv, ntree_limit=best_round)[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tg, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))

Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: non-precise type pyobject
[1] During: typing of argument at <ipython-input-88-0a91aada45e7> (7)

File "<ipython-input-88-0a91aada45e7>", line 7:
def eval_gini(y_true, y_prob):
    <source elided>
    """
    y_true = np.asarray(y_true)
    ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: cannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>

File "<ipython-input-88-0a91aada45e7>", line 13:
def eval_gini(y_true, y_prob):
    <source elided>
    n = len(y_true)
    for i in range(n-1, -1, -1):
    ^

  @jit

File "<ipython-input-88-0a91aada45e7>", line 7:
def eval_gini(y_true, y_prob):
    <source elided>
    """
    y_true = np.asarray(y_true)
    ^

  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has be

Fold  1 : 0.024455 @ 200 / best score is 0.041039 @   4
Fold  2 : 0.100083 @ 200 / best score is 0.112094 @  13
Fold  3 : 0.005135 @ 200 / best score is 0.012675 @  24


# Conduct Result

In [None]:
print("Full OOF score : %.6f" % eval_gini(
    target, oof))  # Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))

importances = sorted([(train_csv.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for column, imp in importances[::-1]:
    print("%-34s : %10.4f" % (column, imp))

# Sumission

In [None]:
test_csv["target"] = predictions

test_csv[["target"]].to_csv(data_path['OUTPUTS']+"/submission.csv", index=True, float_format="%.9f")