In [1]:
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# from sklearn.model_selection import KFold # 交差検証法に関する関数

import warnings
warnings.simplefilter('ignore')


## データの読み込み

In [2]:
# データの読み込み
df_org = pd.read_csv('./data/input_01.csv')


In [3]:
# データ量が多いのでコーディング中はサンプリング
df = df_org.sample(100000, replace=True)
# df = df_org

## データの準備

In [4]:
# 数値エンコーディング（GBDT以外の場合は必須）
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df = df.apply(encoder.fit_transform)

In [5]:
y = df[['state']]
x_cols = ['category', 'main_category', 'currency', 'country', 'period', 'n_words', 'log_goal']
x = df[x_cols]

In [6]:
# y.shape

In [7]:
x.head()

Unnamed: 0,category,main_category,currency,country,period,n_words,log_goal
59145,58,7,13,22,29,8,11774
41702,8,13,4,8,29,0,5006
40628,138,13,1,3,30,0,15243
24487,129,6,13,22,31,1,2352
9969,95,12,13,22,29,7,13929


## 交差検証

In [8]:
# !pip install lightgbm

In [21]:
from sklearn.model_selection import KFold
n_split = 5 # グループ数を設定（今回は5分割）
seed = 1234
# cross_valid_mae = 0
split_num = 1
kf = KFold(n_splits=n_split, shuffle=True, random_state=seed)

In [24]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import lightgbm as lgb

# scores_train = []
scores_valid = []
itr = 1

for train_idx, valid_idx in kf.split(x, y):
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]
    
    # lightgbmのデータ構造に変換
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)
    
    # ハイパーパラメータの設定
    # metricsはaccuracy_scoreにすべきかも
    params = {'num_leaves': 64, 'objective': 'binary', 'seed': seed, 
              'verbose': 0, 'metrics': 'binary_logloss'}
    num_round = 1000

    # 学習の実行
    # カテゴリ変数を指定した場合の挙動はKaggle本 p240参照
    # categorical_features = ['category', 'main_category', 'currency', 'country']
    model = lgb.train(params, lgb_train, num_round, 
                    valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_valid],
                     early_stopping_rounds=20, verbose_eval=False)
    
    # # 訓練データに対する予測、正答率
    # y_pred = clf.predict(x_train)
    # acc_train = accuracy_score(y_true=y_train, y_pred=y_pred)
    # print(f"{itr:02}train_訓練データの正答率: {100 * acc_train:.2f} %")
    # scores_train.append(acc_train)  
    
    # 検証データに対する予測
    y_pred = model.predict(x_valid)
    # accurasy_scoreするには0で丸め込みが必要（https://shiokoji11235.com/two_interface_of_lightgbm）
    y_pred = y_pred.round(0)
    # score_valid = log_loss(y_true=y_valid, y_pred=y_pred)
    score_valid = accuracy_score(y_valid, y_pred)
    # score_valid = roc_auc_score(y_valid, y_pred)
    # print(f"{itr:02}valid_logloss: {score_valid:.4f} %")    
    print(f"{itr:02}valid_accuracy_score: {score_valid:.4f} %")    
    # print(f"{itr:02}valid_auc_score: {score_valid:.4f} %")    
    scores_valid.append(score_valid)
    
    itr += 1

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
01valid_accuracy_score: 0.7023 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
02valid_accuracy_score: 0.6993 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
03valid_accuracy_score: 0.6969 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
04valid_accuracy_score: 0.6948 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
05valid_accuracy_score: 0.7016 %


In [23]:
# 各foldのスコアの平均
# print(np.mean(scores_train))
print(np.mean(scores_valid))

0.69899


In [None]:
# print(x_train.shape)
# print(x_valid.shape)
# print(y_train.shape)
# print(y_valid.shape)

## テストデータに対する予測
本来はtrain/validではなくtrain/valid/testに分割し、testに対する予測を見るべき