In [4]:
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

In [5]:
def split_date(df, test_size):
    """Split ``df`` into an earlier (train) and a later (test) part.

    The frame is sorted by its '日付' column and the unique index labels are
    taken in that order. The first ``round(n_labels * (1 - test_size))``
    labels form the train part and the remaining labels form the test part,
    so every row sharing an index label ends up on the same side.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a '日付' column.
        NOTE(review): assumes that column sorts chronologically - confirm
        its dtype/format in the encoded CSV.
    test_size : float
        Fraction of the unique index labels assigned to the test part.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    ordered_ids = df.sort_values('日付').index.unique()
    # Single cut-off shared by both slices so they can never overlap or leave a gap.
    cutoff = round(len(ordered_ids) * (1 - test_size))
    train = df.loc[ordered_ids[:cutoff]]
    test = df.loc[ordered_ids[cutoff:]]
    return train, test

In [6]:
# Load the pre-encoded dataset from disk (path is relative to the notebook's working directory).
ENCODED_DATA_PATH = 'encoded/encoded_data.csv'
data = pd.read_csv(ENCODED_DATA_PATH)

In [7]:
# Convert the finishing position ('着順') into a binary target:
# 1 when the value is below 4 (top-three finish), 0 otherwise (including NaN).
#
# The assignment overwrites the raw column, so running this cell a second time
# would see only 0/1 values - all of which are < 4 - and turn every label into 1.
# Skip the conversion when the column is already binary so the cell stays re-runnable.
if not data['着順'].isin([0, 1]).all():
    data['着順'] = (data['着順'] < 4).astype('int64')

In [8]:
# Split into features and target.
# The earliest 70% of index labels (ordered by date) are used for training, the rest for testing.
train, test = split_date(data, 0.3)

# Target column plus the columns excluded from the feature matrix.
# NOTE(review): presumably these are removed because they describe the race
# outcome or market and would leak the label - confirm.
excluded_columns = ['着順','オッズ','人気','上がり','走破時間','通過順']

y_train, y_test = train['着順'], test['着順']
X_train = train.drop(columns=excluded_columns)
X_test = test.drop(columns=excluded_columns)

In [9]:
# Build LightGBM Dataset objects for both splits.
# NOTE(review): neither Dataset is referenced again in the cells visible here;
# the scikit-learn style classifier below is fitted on the DataFrames directly.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

# Hyper-parameters for the classifier; 'balanced' re-weights the two classes,
# and the fixed random_state keeps training reproducible.
params = dict(
    num_leaves=32,
    min_data_in_leaf=190,
    class_weight='balanced',
    random_state=100,
)

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_train, y_train)

# predict_proba yields one column per class; column 1 is the probability of label 1.
y_pred_train, y_pred = (
    lgb_clf.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

[LightGBM] [Info] Number of positive: 7040, number of negative: 25571
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8915
[LightGBM] [Info] Number of data points in the train set: 32611, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [10]:
# Model evaluation: ROC AUC on the test split, followed by the four
# confusion-matrix cells at a fixed probability cut-off of 0.5.
# (Training-set AUC, left disabled: uncomment to compare against the test AUC.)
#print(roc_auc_score(y_train,y_pred_train))
print(roc_auc_score(y_test, y_pred))

total_cases = len(y_test)  # number of rows in the test split

# Boolean masks for the actual label and for the thresholded prediction.
actual_pos, actual_neg = (y_test == 1), (y_test == 0)
pred_pos, pred_neg = (y_pred >= 0.5), (y_pred < 0.5)

TP = actual_pos & pred_pos  # True positives
FP = actual_neg & pred_pos  # False positives
TN = actual_neg & pred_neg  # True negatives
FN = actual_pos & pred_neg  # False negatives

TP_count, FP_count, TN_count, FN_count = (
    int(mask.sum()) for mask in (TP, FP, TN, FN)
)

# Each value is that cell's share of ALL test rows, in percent
# (not a per-class rate, despite the "accuracy"/"misclassification_rate" names).
accuracy_TP, misclassification_rate_FP, accuracy_TN, misclassification_rate_FN = (
    count / total_cases * 100
    for count in (TP_count, FP_count, TN_count, FN_count)
)

0.7738997469532165


In [11]:
# Report the size of the test split, then one line per confusion-matrix cell
# with its raw count and its share of all test rows.
print("Total cases:", total_cases)

confusion_rows = (
    ("True positives:", TP_count, accuracy_TP),
    ("False positives:", FP_count, misclassification_rate_FP),
    ("True negatives:", TN_count, accuracy_TN),
    ("False negatives:", FN_count, misclassification_rate_FN),
)
for label, count, share in confusion_rows:
    print(label, count, "(", "{:.2f}".format(share), "%)")

# True Positives (TP): actually 1 and predicted 1
# False Positives (FP): actually 0 but predicted 1
# True Negatives (TN): actually 0 and predicted 0
# False Negatives (FN): actually 1 but predicted 0

Total cases: 13976
True positives: 1791 ( 12.81 %)
False positives: 2305 ( 16.49 %)
True negatives: 8559 ( 61.24 %)
False negatives: 1321 ( 9.45 %)


In [12]:
# Save the trained booster in LightGBM's text model format.
# The target directory is created first so that a fresh checkout without a
# 'model/' folder does not fail when the file is written.
model_path = Path('model/model.txt')
model_path.parent.mkdir(parents=True, exist_ok=True)
lgb_clf.booster_.save_model(str(model_path))

# Per-feature importance scores of the fitted classifier
# (with LightGBM's default importance_type, 'split', these are split counts).
importance = lgb_clf.feature_importances_

# Feature names, in the same column order as the training matrix.
feature_names = X_train.columns

In [14]:
# Raise the pandas row-display limit to 150 rows (the previous inline note said 100, which did not match the value).
pd.set_option('display.max_rows', 150) # show up to 150 rows

In [17]:
# Positions of the features ordered from highest to lowest importance
# (ascending argsort, then reversed).
indices = np.argsort(importance)[::-1]

# Print a ranked table: rank, feature name left-aligned in 30 columns, importance.
for rank, idx in enumerate(indices[:X_train.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 30, feature_names[idx], importance[idx]))

 1) 騎手の勝率                          161.000000
 2) 着順1                            129.000000
 3) オッズ1                           122.000000
 4) 日付1                            111.000000
 5) 日付差                            99.000000
 6) 上がり1                           76.000000
 7) 馬                              64.000000
 8) race_id                        61.000000
 9) オッズ2                           57.000000
10) 走破時間1                          56.000000
11) 開催                             51.000000
12) 馬番                             51.000000
13) オッズ3                           51.000000
14) 日付2                            50.000000
15) レース名                           48.000000
16) 距離差                            47.000000
17) 走破時間5                          47.000000
18) 騎手                             46.000000
19) 体重                             45.000000
20) 馬番1                            45.000000
21) クラス                            44.000000
22) 日付差1                           43.000000
23) 上が