In [2]:
import pandas as pd
import numpy as np
import os
os.listdir('./data')

['train.csv', 'test.csv']

In [3]:
train = pd.read_csv('./data/train.csv')
train_label = train['target']
train_id = train.id

del train['target'], train['id']

test = pd.read_csv('./data/test.csv')
test_id = test.id

del test['id']

# FE
## missing
- 결측치의 개수가 데이터 내에 새로운 군집 정보를 제공할 수 있다고 생각
- 새로운 운전자의 경우 결측값이 더 많이 존재할 수 있다
- 또 특정 지점에서 DB의 문제로 열 정보가 사라졌을 떄, 간접적인 정보로 표현 가능

In [8]:
# -1 : NA
train['missing'] = (train == -1).sum(axis = 1)
test['missing'] = (test == -1).sum(axis = 1)

## sum of binary value
- 변수 간의 상호 작용으로 얻을 수 있는 고차원 정보를 추출

In [9]:
bin_features = [c for c in train.columns if 'bin' in c]
train['bin_sum'] = train[bin_features].sum(axis = 1)
test['bin_sum'] = test[bin_features].sum(axis = 1)

## target encoding
- 선별한 일부 변수를 대상으로 수행(bar plot을 통해서 변수별로 target값의 차이가 있어 보이는 변수들을 활용)
- 단일 변수의 고유값별 타겟 변수의 평균값을 파생 변수로 활용
- 타겟 변수의 값을 직접적으로 사용하는 변수

In [11]:
train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin', 'missing', 'bin_s

In [13]:
features = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin','ps_ind_09_bin', 
            'ps_ind_12_bin', 'ps_ind_16_bin','ps_ind_17_bin', 'ps_ind_18_bin',
            'ps_ind_04_cat','ps_ind_05_cat','ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
            'ps_car_06_cat', 'ps_car_07_cat', 'ps_ind_01','ps_ind_03','ps_ind_15','ps_car_11']

# LightGBM Model

In [16]:
num_boost_round = 10000
params = {'objective' : 'binary',
         'boosting_type' : 'gbdt',
         'learning_rate' : 0.1,
         'num_leaves' : 15,
         'max_bin' : 256,
         'feature_fraction' : 0.6,
         'verbosity' : 0,
         'drop_rate' : 0.1,
         'is_unbalance' : False,
         'max_drop' : 50,
         'min_child_samples' : 10,
         'min_child_weight' : 150,
         'min_split_gain' : 0,
         'subsmaple' : 0.9,
         'seed' : 2018,
         'n_jobs' : -1}

In [17]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [23]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true


def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', Gini(labels, preds), True

In [33]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits = NFOLDS, shuffle = True, random_state=218)
kf = kfold.split(train, train_label)

cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))
best_trees = []
fold_scores = []

In [25]:
for i, (train_idx, valid_idx) in enumerate(kf) :
    X_train, y_train = train.iloc[train_idx, :], train_label[train_idx]
    X_valid, y_valid = train.iloc[valid_idx, :], train_label[valid_idx]
    
    for feature in features :
        map_dict = pd.DataFrame([X_train[feature], y_train]).T.groupby(feature).agg('mean')
        map_dict = map_dict.to_dict()['target']
        
        X_train[feature + '_target_enc'] = X_train[feature].apply(lambda x : map_dict.get(x,0))
        X_valid[feature + '_target_enc'] = X_valid[feature].apply(lambda x : map_dict.get(x,0))
        test[feature + '_target_enc'] = test[feature].apply(lambda x : map_dict.get(x,0))
        
    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)
    
    bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                   feval = evalerror, verbose_eval=100, early_stopping_rounds=100)
    
    best_trees.append(bst.best_iteration)
    
    cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
    cv_train[valid_idx] += bst.predict(X_valid)
    
    score = Gini(y_valid, cv_train[valid_idx])
    print(score)
    fold_scores.append(score)
    
cv_pred /= NFOLDS

        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152193	valid_0's gini: 0.277277
[200]	valid_0's binary_logloss: 0.152162	valid_0's gini: 0.27859
Early stopping, best iteration is:
[151]	valid_0's binary_logloss: 0.152125	valid_0's gini: 0.279835
0.2798346884236375
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151813	valid_0's gini: 0.281267
[200]	valid_0's binary_logloss: 0.15187	valid_0's gini: 0.279366
Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.151798	valid_0's gini: 0.282089
0.28208945536561003
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151668	valid_0's gini: 0.287244
[200]	valid_0's binary_logloss: 0.151759	valid_0's gini: 0.285428
Early stopping, best iteration is:
[109]	valid_0's binary_logloss: 0.151659	valid_0's gini: 0.287601
0.28760084772746447


In [32]:
print('cv score : ', Gini(train_label, cv_train))
print(fold_scores)
print(best_trees, np.mean(best_trees))

cv score :  0.17956604017228467
[0.2798346884236375, 0.28208945536561003, 0.28760084772746447]
[95, 151, 114, 109] 117.25
