In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

## データの読み込み

In [2]:
# データの読み込み
# df_org = pd.read_csv('./data/input_01.csv')
# df_org = pd.read_csv('./data/input_02.csv')
# df_org = pd.read_csv('./data/input_03.csv')
df_org = pd.read_csv('./data/input_04.csv')

In [3]:
# データ量が多いのでコーディング中はサンプリング
# df = df_org.sample(100000, replace=True)
df = df_org

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,...,launched_month,launched_day,class,name_vowel_ratio,n_words,log_goal,Goal_1000,Goal_500,Goal_10,median_goal_Last_Week
0,0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,...,8,11,Publishing**Poetry,0.346154,6,7.335601,1.0,3.0,153.0,
1,1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,...,9,2,Film & Video**Narrative Film,0.236842,8,10.308953,30.0,60.0,3000.0,5850.0
2,2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,...,1,12,Film & Video**Narrative Film,0.333333,3,10.714418,45.0,90.0,4500.0,
3,3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,...,3,17,Music**Music,0.340909,7,8.517193,5.0,10.0,500.0,
4,5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,...,2,26,Food**Restaurants,0.263158,3,10.819778,50.0,100.0,5000.0,


## データの準備

In [5]:
# 数値エンコーディング（GBDT以外の場合は必須）
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df = df.apply(encoder.fit_transform)

In [6]:
y = df[['state']]
x_cols = ['class', 'country', 'period', 'n_words', 'log_goal', 'launched_year', 'launched_month', 'launched_day', 
         'name_vowel_ratio', 'Goal_1000', 'Goal_500', 'Goal_10', 'median_goal_Last_Week']
# x_cols = ['class', 'currency', 'country', 'period', 'n_words', 'log_goal', 'launched_year', 'launched_month', 'launched_day', 
#          'name_vowel_ratio', 'Goal_1000', 'Goal_500', 'Goal_10', 'median_goal_Last_Week']
# x_cols = ['category', 'main_category', 'currency', 'country', 'period', 'n_words', 'log_goal']
# x_cols = ['class', 'currency', 'country', 'period', 'n_words', 'log_goal']
# x_cols = ['class', 'main_category', 'currency', 'country', 'period', 'n_words', 'log_goal']
# x_cols = ['class', 'currency', 'country', 'period', 'n_words', 'log_goal', 'launched_year', 'launched_month', 'launched_day']
# x_cols = ['class', 'currency', 'country', 'period', 'log_goal', 'launched_year', 'launched_month', 'launched_day']
x = df[x_cols]

In [7]:
y.head()

Unnamed: 0,state
0,0
1,0
2,0
3,0
4,1


In [8]:
x.head()

Unnamed: 0,class,country,period,n_words,log_goal,launched_year,launched_month,launched_day,name_vowel_ratio,Goal_1000,Goal_500,Goal_10,median_goal_Last_Week
0,97,9,58,5,10270,6,7,10,398,1,3,153,80
1,42,22,59,7,35206,8,8,1,225,30,60,2936,49
2,42,22,44,2,37850,4,0,11,378,45,90,4146,80
3,78,22,29,6,19451,3,2,16,388,5,10,500,80
4,55,22,34,2,38482,7,1,25,261,50,100,4506,80


## モデル学習（単体）

In [9]:
from sklearn.model_selection import train_test_split

# 訓練データとテストデータを8:2に分割
# ※評価を揃えるため df 以外の引数は変更しないこと
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1234)

In [10]:
import lightgbm as lgb
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score

# パラメータ
RANDOM_STATE = 10
num_round = 1000
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_valid, y_train, y_valid = train_test_split(x_train,y_train,test_size=TEST_SIZE,random_state=RANDOM_STATE)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

In [11]:
# LightGBMのパラメータ設定
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 50,
    'verbose': 0
}

In [12]:
# 訓練データに対する予測
# LightGBM学習
model = lgb.train(params, lgb_train, num_round, 
                  valid_names=['train', 'valid'], valid_sets=lgb_eval,
                  early_stopping_rounds=20, verbose_eval=False)

# LightGBM推論
y_pred = model.predict(x_valid, num_iteration=model.best_iteration)

# 評価
y_pred = y_pred.round(0)
# acc_train = accuracy_score(y_test, y_pred)
acc_train = accuracy_score(y_true=y_valid, y_pred=y_pred)

print(f"訓練データの正答率: {100 * acc_train:.3f} %")    

You can set `force_col_wise=true` to remove the overhead.
訓練データの正答率: 69.618 %


In [13]:
# テストデータに対する予測
# LightGBM推論
y_pred = model.predict(x_test, num_iteration=model.best_iteration)
y_pred = y_pred.round(0)

# 評価
acc_train = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"テストデータの正答率: {100 * acc_train:.3f} %")  

# print(y_test)
# print(y_pred)

テストデータの正答率: 69.612 %
        state
166106      0
317562      1
58733       0
121183      0
81342       1
...       ...
172079      0
225554      1
151801      1
10079       0
133990      0

[66335 rows x 1 columns]
[1. 1. 0. ... 1. 0. 0.]


## 交差検証

In [14]:
# !pip install lightgbm

In [19]:
# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
n_split = 5 # グループ数を設定（今回は5分割）
seed = 1234
# cross_valid_mae = 0
# split_num = 1
# kf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
kf = KFold(n_splits=n_split, shuffle=True, random_state=seed)

In [20]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import lightgbm as lgb

# scores_train = []
scores_valid = []
models = []
itr = 1

for train_idx, valid_idx in kf.split(x, y):
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]
    
    # lightgbmのデータ構造に変換
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)
    
    # ハイパーパラメータの設定
    # metricsはaccuracy_scoreにすべきかも
    params = {'num_leaves': 64, 'objective': 'binary', 'seed': seed, 
              'verbose': 0, 'metrics': 'binary_logloss'}
    num_round = 1000

    # 学習の実行
    # カテゴリ変数を指定した場合の挙動はKaggle本 p240参照
    # categorical_features = ['category', 'main_category', 'currency', 'country']
    model = lgb.train(params, lgb_train, num_round, 
                    valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_valid],
                     early_stopping_rounds=20, verbose_eval=False)
    
    # # 訓練データに対する予測、正答率
    # y_pred = clf.predict(x_train)
    # acc_train = accuracy_score(y_true=y_train, y_pred=y_pred)
    # print(f"{itr:02}train_訓練データの正答率: {100 * acc_train:.2f} %")
    # scores_train.append(acc_train)  
    
    # 検証データに対する予測
    # accurasy_scoreするには0で丸め込みが必要（https://shiokoji11235.com/two_interface_of_lightgbm）
    y_pred = model.predict(x_valid)
    y_pred = y_pred.round(0)

    score_valid = accuracy_score(y_valid, y_pred)
    print(f"{itr:02}valid_accuracy_score: {score_valid:.4f} %")    

    # score_valid = log_loss(y_true=y_valid, y_pred=y_pred)
    # print(f"{itr:02}valid_logloss: {score_valid:.4f} %")    
    # score_valid = roc_auc_score(y_valid, y_pred)
    # print(f"{itr:02}valid_auc_score: {score_valid:.4f} %")    

    scores_valid.append(score_valid)    
    models.append(model)
    
    itr += 1

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
01valid_accuracy_score: 0.6971 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
02valid_accuracy_score: 0.6953 %
You can set `force_col_wise=true` to remove the overhead.
03valid_accuracy_score: 0.7014 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
04valid_accuracy_score: 0.6987 %
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
05valid_accuracy_score: 0.7003 %


In [21]:
# 各foldのスコアの平均
# print(np.mean(scores_train))
print(np.mean(scores_valid))

0.6985603376799578


## 推論

In [22]:
y_preds = np.zeros(len(x_test))

for model in models:
    y_pred = model.predict(x_test)
    y_pred = y_pred.round(0)
    y_preds += y_pred / len(models)
    
# print(y_test)
# print('***')
# print(y_preds)
# print('***')
# print(y_pred)

y_preds = y_preds.round(0)
score_test = accuracy_score(y_test, y_preds)
print(f"test_accuracy_score: {score_test:.4f} %")    

test_accuracy_score: 0.7178 %


### input_01.csv
- 検証データに対する正答率：0.69899


### input_02.csv（class, main_category）
- 検証データに対する正答率：0.70896


### input_02.csv（class）
- 検証データに対する正答率：0.70765


### input_03.csv（class, yar/month/day）
- 検証データに対する正答率：0.7159


### input_04.csv
- ：71.78%
x_cols = ['class', 'country', 'period', 'n_words', 'log_goal', 'launched_year', 'launched_month', 'launched_day', 
         'name_vowel_ratio', 'Goal_1000', 'Goal_500', 'Goal_10', 'median_goal_Last_Week']

## テストデータに対する予測
本来はtrain/validではなくtrain/valid/testに分割し、testに対する予測を見るべき