In [1]:
import lightgbm as lgb
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Show which files are present in the ./data directory.
import os
os.listdir('./data')

['fea0.pk', 'train.csv', 'test.csv']

In [3]:
# Directory holding the input CSV files; the trailing slash matters because
# file names are appended by plain string concatenation.
path = './data/'

train, test = (pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv'))

# Keep the label and the id columns around as standalone Series.
train_label, train_id = train['target'], train['id']
test_id = test['id']

# Evaluation function (normalized Gini)

In [4]:
def Gini(y_true, y_pred) :
    """Normalized Gini coefficient of `y_pred` with respect to `y_true`.

    The labels are ordered once by their own value and once by the predicted
    score (both descending). For each ordering the cumulative share of the
    label sum is compared with a uniform ramp, and the summed gap of the
    prediction ordering is divided by the summed gap of the label ordering.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground-truth values. Lists, tuples, NumPy arrays and pandas Series
        are all accepted.
    y_pred : array-like, shape (n_samples,)
        Predicted scores, same shape as `y_true`.

    Returns
    -------
    float
        `G_pred / G_true`; 1.0 when the prediction ordering reproduces the
        label ordering.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Coerce up front so that inputs without a `.shape` attribute (plain
    # lists / tuples) work as well as arrays and Series.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the labels, column 1 the predictions.
    arr = np.array([y_true, y_pred]).transpose()
    # Ascending argsort reversed -> labels in descending order of the sort key.
    # Tied keys end up in whatever order argsort leaves them.
    true_order = arr[arr[:,0].argsort()][::-1, 0]
    pred_order = arr[arr[:,1].argsort()][::-1, 0]

    # Cumulative share of the label total captured after each sample.
    L_true = np.cumsum(true_order) * 1 / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1 / np.sum(pred_order)
    # Uniform ramp from 1/n to 1 used as the reference line.
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    return G_pred / G_true

def evalerror(preds, dtrain) :
    """Custom evaluation callback that scores `preds` with `Gini`.

    `dtrain` must expose `get_label()`; its labels are compared against
    `preds`. Returns the tuple `('gini', score, True)`, where the trailing
    `True` marks the metric as higher-is-better.
    """
    gini_score = Gini(dtrain.get_label(), preds)
    return 'gini', gini_score, True

# feature engineering

In [5]:
# Columns that are not model inputs: the row identifier and the label.
drop_feature = ['id', 'target']

# Feature frame without those columns, and the label as a plain array.
X = train.drop(labels = drop_feature, axis = 1)
y = train['target'].values

In [6]:
# Separate categorical variables from numeric variables by column name.
feature_names = list(X.columns)

# Categorical: name contains 'cat' but not 'count'.
cat_features = [name for name in feature_names if 'cat' in name and not 'count' in name]
# Numeric: name contains neither 'cat' nor 'calc'.
num_features = [name for name in feature_names if not ('cat' in name or 'calc' in name)]

## missing values -> new feature

In [7]:
# Count the -1 entries in every row and keep the total as a float 'missing' column.
for frame in (train, test) :
    frame['missing'] = (frame == -1).sum(axis = 1).astype(float)

# Register the new column only once, so that re-running this cell does not
# list 'missing' twice (which would duplicate it in the feature matrix).
if 'missing' not in num_features :
    num_features.append('missing')

## Categorical Variables -> Label Encoding -> OneHot

In [8]:
# Label encoding: each encoder is fitted on the train column only and then
# applied to the same column of both frames.
for col in cat_features :
    encoder = LabelEncoder().fit(train[col])

    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

# One-hot encoding of the integer codes, again fitted on train only.
enc = OneHotEncoder().fit(train[cat_features])

X_cat, X_t_cat = (enc.transform(frame[cat_features]) for frame in (train, test))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## ind + ind -> 'new_ind' (string)

In [9]:
# Every column whose name contains 'ind' contributes one '<value>_' piece to
# the combined string key stored in 'new_ind'.
ind_features = [name for name in feature_names if 'ind' in name]

for frame in (train, test) :
    for position, col in enumerate(ind_features) :
        piece = frame[col].astype(str) + '_'
        if position == 0 : # first column creates the key
            frame['new_ind'] = piece
        else : # later columns are appended to it
            frame['new_ind'] += piece

## cat features, 'new_ind' -> count

In [10]:
# Frequency-encode every categorical column plus the combined 'new_ind' key.
# The counts are taken over train and test together.
cat_count_features = []
for c in cat_features + ['new_ind'] :
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    count_col = '%s_count'%c

    train[count_col] = train[c].apply(lambda x : d.get(x,0))
    # Look the counts up from test's own values. Reading them from train[c]
    # would hand each test row the count of whichever train row shares its
    # index label, and NaN where no such train row exists.
    test[count_col] = test[c].apply(lambda x : d.get(x,0))

    cat_count_features.append(count_col)

In [11]:
# Dense part (numeric + count columns) followed by the sparse one-hot part.
dense_columns = num_features + cat_count_features

train_list = [train[dense_columns].values, X_cat]
test_list = [test[dense_columns].values, X_t_cat]

In [12]:
# Join the blocks column-wise and convert each result to CSR.
X, X_test = (ssp.hstack(blocks).tocsr() for blocks in (train_list, test_list))

# modeling

In [13]:
# Upper bound on the number of boosting rounds passed to training.
num_boost_round = 10000

# Model parameters handed to the trainer.
# NOTE(review): 'drop_rate' and 'max_drop' look like DART-specific options
# while 'boosting_type' is 'gbdt', and 'subsample' is set without a bagging
# frequency - confirm against the LightGBM parameter docs whether these
# entries have any effect.
params = dict(
    objective = 'binary',
    boosting_type = 'gbdt',
    learning_rate = 0.1,
    num_leaves = 15,
    max_bin = 256,
    feature_fraction = 0.6,
    verbosity = 0,
    drop_rate = 0.1,
    is_unbalance = False,
    max_drop = 50,
    min_child_samples = 10,
    min_child_weight = 150,
    min_split_gain = 0,
    subsample = 0.9,
    n_jobs = -1,
)

In [15]:
NFolds = 5
N_SEEDS = 16 # number of differently seeded CV passes that get averaged

# The splitter takes its fold count from NFolds so that it cannot drift away
# from the `cv_pred /= NFolds` averaging below.
kfold = StratifiedKFold(n_splits = NFolds, shuffle = True, random_state=218)

x_score = []                                  # one out-of-fold Gini per seed
final_cv_train = np.zeros(len(train_label))   # out-of-fold predictions summed over seeds
final_cv_pred = np.zeros(len(test_id))        # test predictions summed over seeds

# NFolds-fold CV repeated N_SEEDS times; each pass only changes params['seed'].
for s in range(N_SEEDS) :
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    kf = kfold.split(X, train_label)
    best_trees = []
    fold_scores = []

    for i, (train_fold, validate) in enumerate(kf) :
        X_train, X_valid = X[train_fold, :], X[validate, :]
        # The split yields row positions, so slice the labels positionally.
        label_train, label_valid = train_label.iloc[train_fold], train_label.iloc[validate]

        dtrain = lgb.Dataset(X_train, label_train) #trainset
        dvalid = lgb.Dataset(X_valid, label_valid, reference=dtrain) #validset

        bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                       feval = evalerror, verbose_eval = 200, early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)

        # Pin both predictions to the early-stopped iteration so the
        # out-of-fold and the test predictions come from the same trees.
        cv_pred += bst.predict(X_test, num_iteration = bst.best_iteration)
        cv_train[validate] += bst.predict(X_valid, num_iteration = bst.best_iteration)

        score = Gini(label_valid, cv_train[validate])
        print(score)

        fold_scores.append(score)

    # Every fold model predicted the full test set: average over the folds.
    cv_pred /= NFolds
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    seed_score = Gini(train_label, cv_train)

    print('cv score :')
    print(seed_score)
    # Gini of the out-of-fold predictions averaged over the seeds run so far.
    print('current score : ', Gini(train_label, final_cv_train / (s+1)), s+1)

    print(fold_scores)
    print(best_trees, np.mean(best_trees))

    x_score.append(seed_score)

print(x_score)
# Submission file: test predictions averaged over all seeds.
pd.DataFrame({'id' : test_id, 'target' : final_cv_pred / N_SEEDS}).to_csv(path + 'lgbm_pred_avg.csv', index = False)

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151285	valid_0's gini: 0.296137
Early stopping, best iteration is:
[235]	valid_0's binary_logloss: 0.151265	valid_0's gini: 0.296665
0.29666456778791145
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.152144	valid_0's gini: 0.27349
Early stopping, best iteration is:
[145]	valid_0's binary_logloss: 0.152127	valid_0's gini: 0.272928
0.2729277334928809
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.152048	valid_0's gini: 0.28106
Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.152014	valid_0's gini: 0.281321
0.28132063784139355
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151786	valid_0's gini: 0.282396
Early stopping, best iteration is:
[174]	valid_0's binary_logloss: 0.151771	valid_0's gini: 0.282331
0.2823311851029831
Trai

0.2856485735256103
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151493	valid_0's gini: 0.294389
Early stopping, best iteration is:
[120]	valid_0's binary_logloss: 0.151437	valid_0's gini: 0.295662
0.295661760524777
cv score :
0.2873242056245988
current score :  0.2893071481821934 6
[0.29736152889458917, 0.2745523847571541, 0.2838133329685625, 0.2856485735256103, 0.295661760524777]
[344, 158, 159, 166, 120] 189.4
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151356	valid_0's gini: 0.295657
Early stopping, best iteration is:
[225]	valid_0's binary_logloss: 0.151331	valid_0's gini: 0.29669
0.29669030213213143
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.15216	valid_0's gini: 0.273503
Early stopping, best iteration is:
[154]	valid_0's binary_logloss: 0.152126	valid_0's gini: 0.274139
0.27413868226848387
Training until validation scores d

0.2747119023286961
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.15199	valid_0's gini: 0.281738
Early stopping, best iteration is:
[171]	valid_0's binary_logloss: 0.15198	valid_0's gini: 0.281989
0.28198900344628036
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151734	valid_0's gini: 0.283925
Early stopping, best iteration is:
[191]	valid_0's binary_logloss: 0.151721	valid_0's gini: 0.284323
0.28432270878146754
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.151483	valid_0's gini: 0.293925
Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.151449	valid_0's gini: 0.294785
0.29478518472750487
cv score :
0.2861239009597362
current score :  0.2893238816399851 12
[0.2953432514663995, 0.2747119023286961, 0.28198900344628036, 0.28432270878146754, 0.29478518472750487]
[171, 166, 171, 191, 125] 164.8
Training until validation sc