## 勾配ブースティング木

In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Load the results table that preprocessing_last() consumes below.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# 'results_all.pickle' that comes from a trusted source.
results_all = pd.read_pickle('results_all.pickle')

In [None]:
def preprocessing_last(results):
    """Return a cleaned copy of ``results`` with a ``rank`` target column.

    The '馬名', '騎手', 'horse_id' and 'jockey_id' columns are removed, a
    ``rank`` column is derived from '着順' (values below 4 are kept as-is,
    everything else becomes 4), and remaining missing values are replaced
    with 0. The input frame is not modified.

    Raises:
        KeyError: if one of the dropped columns or '着順' is missing.
    """
    def _to_rank(order):
        # Anything that is not strictly below 4 collapses into class 4.
        return order if order < 4 else 4

    unused_columns = ['馬名', '騎手', 'horse_id', 'jockey_id']
    # drop() returns a new frame, so the caller's data stays untouched.
    cleaned = results.drop(columns=unused_columns)
    cleaned['rank'] = cleaned['着順'].map(_to_rank)
    return cleaned.fillna(0)

In [None]:
def split_data(df,test_size):
    """Split ``df`` into (train, test) by index label, ordered by 'date'.

    The unique index labels are taken in ascending 'date' order; the first
    ``round(n * (1 - test_size))`` labels select the training rows and the
    remaining labels select the test rows. All rows sharing an index label
    end up on the same side of the split.

    Args:
        df: frame with a 'date' column.
        test_size: fraction of the unique index labels assigned to test.

    Returns:
        Tuple ``(train, test)`` of frames selected with ``df.loc``.
    """
    ordered_ids = df.sort_values('date').index.unique()
    n_train = round(len(ordered_ids) * (1 - test_size))
    train_ids, test_ids = ordered_ids[:n_train], ordered_ids[n_train:]
    return df.loc[train_ids], df.loc[test_ids]

In [None]:
# One-hot encode the preprocessed table, then hold out the last 30% of the
# date-ordered index labels as the test set.
sample = pd.get_dummies(preprocessing_last(results_all))
train, test = split_data(sample, 0.3)

# Row counts per target class in the training split, looked up by label.
rank_counts = train['rank'].value_counts()
rank_1, rank_2, rank_3 = (rank_counts[label] for label in (1, 2, 3))

# Sampler configured to keep classes 1-3 at their full size and to reduce
# class 4 to the size of class 1.
# NOTE(review): the sampler is only constructed here; the visible code never
# applies it to the training data.
rus = RandomUnderSampler(
    sampling_strategy={1: rank_1, 2: rank_2, 3: rank_3, 4: rank_1},
    random_state=71,
)

# The raw finishing order, the split key and the target are not features.
non_feature_columns = ['着順', 'date', 'rank']
X_train, y_train = train.drop(non_feature_columns, axis=1), train['rank']
X_test, y_test = test.drop(non_feature_columns, axis=1), test['rank']

In [None]:
import lightgbm as lgb

# Hyperparameters for the gradient-boosted tree classifier.
params = dict(
    num_leaves=4,
    n_estimators=80,
    class_weight="balanced",
    random_state=100,
)

# Fit on plain arrays (.values), so the model is trained without the
# DataFrame's column names. fit() returns the fitted estimator itself.
lgb_clf = lgb.LGBMClassifier(**params).fit(X_train.values, y_train.values)

In [None]:
# Mean accuracy on the training split and on the held-out split.
train_accuracy = lgb_clf.score(X_train, y_train)
test_accuracy = lgb_clf.score(X_test, y_test)
print(train_accuracy, test_accuracy)

In [None]:
# Pair every feature column with the importance reported by the fitted model.
importances = pd.DataFrame({
    "features": X_train.columns,
    "importance": lgb_clf.feature_importances_,
})
# Show the 20 highest-importance features (kept as the trailing expression so
# the notebook renders it).
importances.sort_values("importance", ascending=False).head(20)

In [None]:
import pickle

# Persist the fitted classifier to disk. The context manager closes the file
# handle as soon as the dump finishes, so the bytes are flushed before any
# later cell reads 'lightgbm.pickle' back.
with open('lightgbm.pickle', 'wb') as model_file:
    pickle.dump(lgb_clf, model_file)

In [None]:
# Reload the saved classifier and report its accuracy on the held-out split.
# NOTE: unpickling can execute arbitrary code -- only load a
# 'lightgbm.pickle' that comes from a trusted source.
with open('lightgbm.pickle', 'rb') as model_file:
    # The handle is closed on leaving the block instead of being leaked.
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)