In [1]:
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# from sklearn.model_selection import KFold # 交差検証法に関する関数

import warnings
warnings.simplefilter('ignore')


## データの読み込み

In [2]:
# データの読み込み
# df_org = pd.read_csv('./data/input_01.csv')
# df_org = pd.read_csv('./data/input_02.csv')
df_org = pd.read_csv('./data/input_03.csv')

In [3]:
# データ量が多いのでコーディング中はサンプリング
# df = df_org.sample(100000, replace=True)
df = df_org

## データの準備

In [4]:
# 数値エンコーディング（GBDT以外の場合は必須）
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df = df.apply(encoder.fit_transform)

In [5]:
y = df[['state']]
# x_cols = ['category', 'main_category', 'currency', 'country', 'period', 'n_words', 'log_goal']
# x_cols = ['class', 'currency', 'country', 'period', 'n_words', 'log_goal']
# x_cols = ['class', 'main_category', 'currency', 'country', 'period', 'n_words', 'log_goal']
x_cols = ['class', 'currency', 'country', 'period', 'n_words', 'log_goal', 'launched_year', 'launched_month', 'launched_day']
x = df[x_cols]

In [6]:
# y.shape

In [7]:
x.head()

Unnamed: 0,class,currency,country,period,n_words,log_goal,launched_year,launched_month,launched_day
0,97,5,9,58,5,10270,6,7,10
1,42,13,22,59,7,35206,8,8,1
2,42,13,22,44,2,37850,4,0,11
3,78,13,22,29,6,19451,3,2,16
4,55,13,22,34,2,38482,7,1,25


## モデル学習・推論

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,period,launched_year,launched_month,launched_day,class,n_words,log_goal
0,0,154087,108,12,5,689,0,0,0,9,0,0,58,6,7,10,97,5,10270
1,1,154088,93,6,13,5532,8330,0,15,22,6867,34063,59,8,8,1,42,7,35206
2,2,154089,93,6,13,5994,1094,0,3,22,11452,10300,44,4,0,11,42,2,37850
3,3,154090,90,10,13,2405,1,0,1,22,51,53,29,3,2,16,78,6,19451
4,4,154091,123,7,13,6124,49937,1,224,22,82385,89274,34,7,1,25,55,2,38482


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import lightgbm as lgb

# ランダムシード値
RANDOM_STATE = 10

num_round = 1000

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=TEST_SIZE,random_state=RANDOM_STATE)

# 学習データと評価データを作成
x_train, x_valid, y_train, y_valid = train_test_split(x_train,y_train,test_size=TEST_SIZE,random_state=RANDOM_STATE)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

In [14]:
# LightGBMのパラメータ設定
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 50,
    'verbose': 0
}

In [16]:
# LightGBM学習
model = lgb.train(params, lgb_train, num_round, 
                  valid_names=['train', 'valid'], valid_sets=lgb_eval,
                  early_stopping_rounds=20, verbose_eval=False)

# LightGBM推論
y_pred = model.predict(x_test, num_iteration=model.best_iteration)

# 評価
y_pred = y_pred.round(0)
score_valid = accuracy_score(y_test, y_pred)
print(f"test_accuracy_score: {score_valid:.4f} %")    

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
test_accuracy_score: 0.6980 %


## 交差検証

In [87]:
# !pip install lightgbm

In [17]:
from sklearn.model_selection import KFold
n_split = 5 # グループ数を設定（今回は5分割）
seed = 1234
# cross_valid_mae = 0
split_num = 1
kf = KFold(n_splits=n_split, shuffle=True, random_state=seed)

In [18]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import lightgbm as lgb

# scores_train = []
scores_valid = []
itr = 1

for train_idx, valid_idx in kf.split(x, y):
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]
    
    # lightgbmのデータ構造に変換
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)
    
    # ハイパーパラメータの設定
    # metricsはaccuracy_scoreにすべきかも
    params = {'num_leaves': 64, 'objective': 'binary', 'seed': seed, 
              'verbose': 0, 'metrics': 'binary_logloss'}
    num_round = 1000

    # 学習の実行
    # カテゴリ変数を指定した場合の挙動はKaggle本 p240参照
    # categorical_features = ['category', 'main_category', 'currency', 'country']
    model = lgb.train(params, lgb_train, num_round, 
                    valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_valid],
                     early_stopping_rounds=20, verbose_eval=False)
    
    # # 訓練データに対する予測、正答率
    # y_pred = clf.predict(x_train)
    # acc_train = accuracy_score(y_true=y_train, y_pred=y_pred)
    # print(f"{itr:02}train_訓練データの正答率: {100 * acc_train:.2f} %")
    # scores_train.append(acc_train)  
    
    # 検証データに対する予測
    y_pred = model.predict(x_valid)
    # accurasy_scoreするには0で丸め込みが必要（https://shiokoji11235.com/two_interface_of_lightgbm）
    y_pred = y_pred.round(0)
    # score_valid = log_loss(y_true=y_valid, y_pred=y_pred)
    score_valid = accuracy_score(y_valid, y_pred)
    # score_valid = roc_auc_score(y_valid, y_pred)
    # print(f"{itr:02}valid_logloss: {score_valid:.4f} %")    
    print(f"{itr:02}valid_accuracy_score: {score_valid:.4f} %")    
    # print(f"{itr:02}valid_auc_score: {score_valid:.4f} %")    
    scores_valid.append(score_valid)
    
    itr += 1

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
01valid_accuracy_score: 0.6979 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
02valid_accuracy_score: 0.6958 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
03valid_accuracy_score: 0.7014 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
04valid_accuracy_score: 0.6969 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
05valid_accuracy_score: 0.6981 %


In [19]:
# 各foldのスコアの平均
# print(np.mean(scores_train))
print(np.mean(scores_valid))

0.6980055777493028


### input_01.csv
- 検証データに対する正答率：0.69899


### input_02.csv（class, main_category）
- 検証データに対する正答率：0.70896


### input_02.csv（class）
- 検証データに対する正答率：0.70765


### input_03.csv（class, yar/month/day）
- 検証データに対する正答率：0.7159


## テストデータに対する予測
本来はtrain/validではなくtrain/valid/testに分割し、testに対する予測を見るべき