In [1]:
import numpy as np
import pandas as pd
import random
import xgboost as xgb
import matplotlib.pyplot as plt
from xgbtune import tune_xgb_model
from sklearn.model_selection import train_test_split
%matplotlib inline

In [28]:
# Load the raw train and test tables (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../../data/train.csv", "../../data/test.csv")
)

In [3]:
# Preview the first five raw rows (label and id columns still present).
train.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [31]:
# Keep the label aside, then remove the label and the row identifier from the
# feature frame.
target = train["target"]
train = train.drop(columns=['target', 'id'])
# -1 is treated as the missing-value marker: turn it into NaN so the
# missing-value checks below can see it. (Only `train` is converted here.)
train = train.replace(-1, np.nan)
test = test.drop(columns=['id'])

In [5]:
# Show the sorted distinct label values, then preview the feature frame.
display(np.unique(target.to_numpy()))
train.head(n=5)

array([0, 1])

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,2.0,5,1.0,0.0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
1,1,1.0,7,0.0,0.0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
2,5,4.0,9,1.0,0.0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
3,0,1.0,2,0.0,0.0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,0,2.0,0,1.0,0.0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [4]:
# Class balance: absolute counts first, then each class's share of all rows.
total = target.value_counts()
print(f"res 0: {total[0]}, res 1: {total[1]}")
print(
    f"{total[0] / target.shape[0]}",
    f"{total[1] / target.shape[0]}",
    sep="\n",
)

res 0: 573518, res 1: 21694
0.963552482140817
0.036447517859182946


In [32]:
# Partition the columns by their name suffix in a single pass:
#   *bin -> bin_features, *cat -> cat_features, anything else -> con_features.
# Column order within each group follows the frame's column order.
bin_features, cat_features, con_features = [], [], []
for col in train.columns:
    if col.endswith('bin'):
        bin_features.append(col)
    elif col.endswith('cat'):
        cat_features.append(col)
    else:
        con_features.append(col)

In [33]:
# Fraction of NaN entries per column, displayed one feature group at a time
# (binary, categorical, remaining).
for feature_group in (bin_features, cat_features, con_features):
    nan_counts = np.isnan(train[feature_group]).sum()
    display(nan_counts / train.shape[0])

ps_ind_06_bin     0.0
ps_ind_07_bin     0.0
ps_ind_08_bin     0.0
ps_ind_09_bin     0.0
ps_ind_10_bin     0.0
ps_ind_11_bin     0.0
ps_ind_12_bin     0.0
ps_ind_13_bin     0.0
ps_ind_16_bin     0.0
ps_ind_17_bin     0.0
ps_ind_18_bin     0.0
ps_calc_15_bin    0.0
ps_calc_16_bin    0.0
ps_calc_17_bin    0.0
ps_calc_18_bin    0.0
ps_calc_19_bin    0.0
ps_calc_20_bin    0.0
dtype: float64

ps_ind_02_cat    0.000363
ps_ind_04_cat    0.000139
ps_ind_05_cat    0.009760
ps_car_01_cat    0.000180
ps_car_02_cat    0.000008
ps_car_03_cat    0.690898
ps_car_04_cat    0.000000
ps_car_05_cat    0.447825
ps_car_06_cat    0.000000
ps_car_07_cat    0.019302
ps_car_08_cat    0.000000
ps_car_09_cat    0.000956
ps_car_10_cat    0.000000
ps_car_11_cat    0.000000
dtype: float64

ps_ind_01     0.000000
ps_ind_03     0.000000
ps_ind_14     0.000000
ps_ind_15     0.000000
ps_reg_01     0.000000
ps_reg_02     0.000000
ps_reg_03     0.181065
ps_car_11     0.000008
ps_car_12     0.000002
ps_car_13     0.000000
ps_car_14     0.071605
ps_car_15     0.000000
ps_calc_01    0.000000
ps_calc_02    0.000000
ps_calc_03    0.000000
ps_calc_04    0.000000
ps_calc_05    0.000000
ps_calc_06    0.000000
ps_calc_07    0.000000
ps_calc_08    0.000000
ps_calc_09    0.000000
ps_calc_10    0.000000
ps_calc_11    0.000000
ps_calc_12    0.000000
ps_calc_13    0.000000
ps_calc_14    0.000000
dtype: float64

In [34]:
# Impute missing values with the per-column mean of the TRAINING data.
# The means are computed once and reused for the test frame so that both
# frames go through identical preprocessing and no test statistics leak in.
train_means = train.mean()
train.fillna(train_means, inplace=True)

# `test` still carries the raw -1 missing-value marker at this point. Convert
# and impute it exactly like `train`; otherwise the model is trained on
# mean-filled values but sees -1 as a real value at prediction time.
# fillna with a Series fills column-by-column, matching on column label.
test = test.replace(-1, np.nan).fillna(train_means)

# Sanity check: count of remaining NaN values per training column.
np.isnan(train).sum()

ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_03_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0
ps_calc_12        0
ps_calc_13        0


In [8]:
# Keep a handle on the current column labels of `train`; the noise-column
# filter in a later cell iterates over this variable.
column = train.columns

In [35]:
# Columns flagged as noise by the author; they are left out of the feature
# list used for modelling.
noise_columns = [
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_ind_14',
    'ps_car_10_cat',
]
# Keep every snapshot column that is not flagged, preserving column order.
new_features = [name for name in column if name not in noise_columns]

In [36]:
# Pairwise interaction features: each new column is the element-wise product
# of two existing columns, applied identically to train and test so both
# frames stay column-compatible.
# Note: the column name joins the two source names with "-", but the value is
# their product (names kept as-is so downstream references keep working).
interaction_pairs = [
    ('ps_ind_03', 'ps_ind_02_cat'),
    ('ps_car_13', 'ps_ind_03'),
]
for left, right in interaction_pairs:
    name = f"{left}-{right}"
    train[name] = train[left] * train[right]
    test[name] = test[left] * test[right]
    # Register the feature only once: re-running this cell must not append
    # duplicate names, which would select duplicated model columns.
    if name not in new_features:
        new_features.append(name)

In [37]:
# Restrict both frames to the selected features, in the same column order.
X = train.loc[:, new_features]
test = test.loc[:, new_features]

In [38]:
# Hold out 30% of the rows for validation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
split_parts = train_test_split(
    X,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)
X_train, X_valid, y_train, y_valid = split_parts

In [39]:
# Booster configuration handed to xgb.train (see the XGBoost parameter docs).
params = {
    # Binary classification; predictions are probabilities.
    "objective": "binary:logistic",
    # Histogram-based tree construction, executed on a CUDA device.
    "tree_method": "hist",
    "device": "cuda",
    # L2 / L1 regularisation on leaf weights.
    "lambda": 4.645511,
    "alpha": 0.654147,
    # Column subsampling per tree and row subsampling per boosting round.
    "colsample_bytree": 0.920,
    "subsample": 0.66,
    # Shrinkage and tree-shape constraints.
    "learning_rate": 0.015,
    "max_depth": 8,
    "min_child_weight": 194,
    # Metric reported for any evaluation sets passed to training.
    "eval_metric": "logloss",
}

In [41]:
# Wrap the splits in DMatrix objects. The DataFrames are passed directly
# (instead of `.values`) so the booster records the feature names and stays
# consistent with the DataFrame-based DMatrix objects built at prediction
# time. `enable_categorical=True` is dropped: the original passed plain NumPy
# arrays, which carry no categorical dtype, so the flag had no effect.
train_df = xgb.DMatrix(X_train, label=y_train)
val_df = xgb.DMatrix(X_valid, label=y_valid)

# Train for a fixed number of boosting rounds. The hold-out matrix is attached
# as a watchlist so the configured `eval_metric` is actually reported (every
# 100 rounds). No early stopping is requested, so this only monitors training
# and does not change the fitted model.
xbg_model = xgb.train(
    params,
    train_df,
    num_boost_round=1000,
    evals=[(val_df, "valid")],
    verbose_eval=100,
)

In [17]:
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred):
    """Return the unnormalised Gini coefficient of `pred` against `actual`.

    Rows are ranked by descending prediction (ties keep their original
    order). The cumulative share of `actual` along that ranking is summed,
    offset by ``(n + 1) / 2`` and divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Observed outcomes.
    pred : sequence of numbers
        Predicted scores, same length as `actual`.

    Returns
    -------
    float
        The Gini value; NaN when the values in `actual` sum to zero.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    n_rows = len(actual)
    assert n_rows == len(pred)

    # One row per observation: [actual, prediction, original position].
    table = np.column_stack([actual, pred, np.arange(n_rows)]).astype(float)

    # np.lexsort treats the LAST key as primary: sort by descending
    # prediction, breaking ties by original position.
    ranking = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[ranking][:, 0]

    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses
    gini_sum -= (n_rows + 1) / 2
    return gini_sum / n_rows

def gini_normalized(a, p):
    """Return the Gini of predictions `p` scaled by the Gini of a perfect ranking.

    The denominator is ``gini(a, a)``, i.e. the value obtained when the
    observed outcomes `a` are used as their own predictions.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

In [47]:
# Score the hold-out split: predict on it and report the normalised Gini.
val_matrix = xgb.DMatrix(data=X_valid)
val_pred = xbg_model.predict(data=val_matrix)
print(f"Gine score: {gini_normalized(y_valid, val_pred)}")

Gine score: 0.29100373446543565


In [48]:
# Predict on the prepared test features.
X_test = xgb.DMatrix(data=test)
y_hat = xbg_model.predict(data=X_test)

In [49]:
# Build the submission frame. The `id` column must carry the identifiers from
# test.csv: they were dropped from `test` earlier, and a positional
# range(0, n) does not reproduce them when the ids are not consecutive (the
# train ids shown above are not).
# Only column-wise operations were applied to `test`, so the file's row order
# still matches the order of the predictions.
test_ids = pd.read_csv("../../data/test.csv", usecols=["id"])["id"]
assert len(test_ids) == len(y_hat), "prediction count must match test rows"
y_hat = pd.DataFrame({'id': test_ids.to_numpy(), 'target': y_hat})

In [59]:
# Write the submission file; the DataFrame's positional index is not exported.
y_hat.to_csv(path_or_buf="submission.csv", index=False)