### このカーネルを見た：https://www.kaggle.com/hmendonca/lightgbm-predictions-explained-with-shap-0-796

In [21]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

# LightGBM
import lightgbm as lgb

# sklearn tools for model training and assesment
from contextlib import contextmanager
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

import seaborn as sns
import warnings

df = pd.read_csv("data.csv")
pd.set_option('display.max_columns', 100)
df = df.drop(['employees_created'], axis=1)
df = df.drop(['employee_id'], axis=1)
df["age"] = df["age"].fillna(df["age"].mean())
df["influx_route_midium"] = df["influx_route_midium"].fillna('null')
df["influx_route_small"] = df["influx_route_small"].fillna('null')
df.head()

Unnamed: 0,influx_route_big,influx_route_midium,influx_route_small,device,age,gender,entry_date_diffs,interview_date_diffs,open_count,mail_click,owned_visit,advertisement,career_change_desired_date_1,career_change_desired_date_2,career_change_desired_date_3,career_change_desired_date_4,now_status_1,now_status_2,now_status_3,now_status_4,now_status_5,contact_result_1,contact_result_2,contact_result_4,contact_result_5,objective_variable
0,SEO,,,SP,24.0,1,4,0,5,3,8,0,1,0,0,0,1,0,0,0,0,0,1,1,1,0
1,アフィリエイト,レントラックス,,SP,19.0,0,2,1,0,0,32,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
2,リスティング,Yahoo検索,フリーター軸,SP,21.0,0,3,3,8,2,24,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1
3,SEO,,,PC,24.0,0,5,5,0,0,8,0,1,0,0,0,1,0,0,0,0,0,0,3,0,1
4,アフィリエイト,アクセストレード,,SP,21.0,1,1,1,0,0,3,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1


In [3]:
df.shape

(12677, 26)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12677 entries, 0 to 12676
Data columns (total 26 columns):
influx_route_big                12677 non-null object
influx_route_midium             12677 non-null object
influx_route_small              12677 non-null object
device                          12677 non-null object
age                             12677 non-null float64
gender                          12677 non-null int64
entry_date_diffs                12677 non-null int64
interview_date_diffs            12677 non-null int64
open_count                      12677 non-null int64
mail_click                      12677 non-null int64
owned_visit                     12677 non-null int64
advertisement                   12677 non-null int64
career_change_desired_date_1    12677 non-null int64
career_change_desired_date_2    12677 non-null int64
career_change_desired_date_3    12677 non-null int64
career_change_desired_date_4    12677 non-null int64
now_status_1                    12677

#memo
K-foldは、単純に9：1に分けるだけ
StratifiedKFoldは、ラベルの比率も合わせながら9：1に分ける

In [98]:
def kfold_lightgbm(train_df, train_target, test_df, num_folds, stratified = False, debug=False):
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits = num_folds, shuffle =True, random_state=47)
        
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feat_importance = pd.DataFrame()
    scores = []
    models = []
    
    # データセットの準備(2次元配列にする)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[valid_idx]
        
    
    # 学習パラメーターの設定
    params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          #'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : {'auc', 'binary_logloss'},
         }
    
    #探索範囲の作成
    gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }
    
# mdl 
    clf = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])
    
#     #グリッドの作成
#     grid = GridSearchCV(mdl, gridParams,
#                         verbose=0,
#                         cv=4,
#                         n_jobs=1, 
#                         categorical_feature=['influx_route_big','influx_route_midium','influx_route_small','device'])
 
#     #　最適パラメーターの探索
#     grid.fit(train_x, train_y)
#     # 最適パラメーターの表示
#     print(grid.best_params_)
#     print(grid.best_score_)
    
#     # 最適パラメーターのセット
#     params['colsample_bytree'] = grid.best_params_['colsample_bytree']
#     params['learning_rate'] = grid.best_params_['learning_rate']
#     # params['max_bin'] = grid.best_params_['max_bin']
#     params['num_leaves'] = grid.best_params_['num_leaves']
#     params['reg_alpha'] = grid.best_params_['reg_alpha']
#     params['reg_lambda'] = grid.best_params_['reg_lambda']
#     params['subsample'] = grid.best_params_['subsample']
    print('Start training...')

    # 学習
#     gbm = lgb.train(params,
#                     train_x,
#                     100000,
#                     valid_sets=train_y,
#                     early_stopping_rounds=50,
#                     verbose_eval=4)
    clf.fit(params, train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], verbose= 100000, early_stopping_rounds= 100)
    
    
#     clf.fit(params, train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)]
#             , verbose= 100000, early_stopping_rounds= 100
# #             , categorical_feature=['influx_route_big','influx_route_midium','influx_route_small','device']
# #             , categorical_feature=[0,1,2,3]
#            )

    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test_df, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    ## feature importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = test_df.columns.values
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["shap_values"] = abs(shap.TreeExplainer(clf).shap_values(valid_x)[:,:test_df.shape[1]]).mean(axis=0).T
    fold_importance_df["fold"] = n_fold + 1
    feat_importance = pd.concat([feat_importance, fold_importance_df], axis=0)

    scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
    print('Fold %2d AUC : %.6f' % (n_fold + 1, scores[n_fold]))
    
    models.append(clf)
    del clf, train_x, train_y, valid_x, valid_y, fold_importance_df
    gc.collect()
    
    return feat_importance, models, scores

# test_df要らない説

In [99]:
df_train = df.copy()
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

#変数を配列へ変換
from sklearn.cross_validation import train_test_split
# df_train['influx_route_big'] = df_train['influx_route_big'].astype("category")
# df_train['influx_route_midium'] = df_train['influx_route_midium'].astype("category")
# df_train['influx_route_small'] = df_train['influx_route_small'].astype("category")
# df_train['device'] = df_train['device'].astype("category")

# Create a label encoder object
le = preprocessing.LabelEncoder()

# le.fit(df_train['influx_route_big'])
# le.fit(df_train['influx_route_midium'])
# le.fit(df_train['influx_route_small'])
# le.fit(df_train['device'])

In [100]:
# Transform both training and testing data
df_train['influx_route_big']= le.fit_transform(df_train['influx_route_big'])
df_train['influx_route_midium']= le.fit_transform(df_train['influx_route_midium'])
df_train['influx_route_small']= le.fit_transform(df_train['influx_route_small'])
df_train['device']= le.fit_transform(df_train['device'])

In [101]:
X = df_train.drop("objective_variable" , axis=1)
y = df_train["objective_variable"]

# X = df_train.drop("objective_variable" , axis=1).values
# y = df_train["objective_variable"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval  = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [102]:
%%time
debug = False

feat_importance, models, scores = kfold_lightgbm(X_train, y_train, X_test, num_folds=5, stratified=True, debug=debug)

Starting LightGBM. Train shape: (8873, 25), test shape: (3804, 25)
Start training...


ValueError: Unknown label type: 'continuous-multioutput'

In [108]:
y_train

6599     1
9681     1
3835     0
12592    1
5777     0
12569    0
6852     1
3683     0
905      0
7572     1
10689    1
7713     0
10721    1
7208     0
11686    1
5067     0
253      1
6287     0
11922    0
9855     0
1161     0
658      1
5707     0
2533     0
3210     0
6587     1
839      1
9097     1
3789     1
10305    0
        ..
408      0
12430    0
5338     0
1231     1
6162     0
11981    0
9459     0
7073     0
6756     0
8447     0
1840     0
5287     0
8686     1
11335    1
6479     0
12508    1
3778     1
2866     0
12269    1
12442    0
6748     1
191      0
3353     0
10105    1
838      0
12373    1
11495    1
6666     0
10283    0
1388     1
Name: objective_variable, Length: 8873, dtype: int64

In [106]:
y_test.shape

(3804,)