## notebook only

In [1]:
# kedro
# `sys` is needed for the sys.path tweak below; no visible cell of this notebook
# imports it, so import it here to keep Restart & Run All working.
import sys

from kedro.framework.context import load_context

# Kedro project root, relative to the directory this notebook lives in.
proj_path = '../../../'
context = load_context(proj_path)
# Load through the context created above: a bare `catalog` name is never
# defined in this notebook, so relying on it depends on hidden kernel state.
model_lightgbm = context.catalog.load("model_lightgbm")
parameters = context.params

# import from src
# Make the project's `models` package importable for the imports that follow.
sys.path.append("./../../../src/forecast_keiba/")
from models.predict_lightgbm import scrape_race_info
from models.predict_lightgbm import scrape_id
from models.predict_lightgbm import make_horse_table
from models.predict_lightgbm import scrape_race_span
from models.predict_lightgbm import scrape_race_predict
from models.predict_lightgbm import scrape_horse_results
from models.predict_lightgbm import HorseResults
from models.predict_lightgbm import scrape_peds
from models.predict_lightgbm import process_categorical
from models.predict_lightgbm import add_blood_data
from models.predict_lightgbm import scrape_jockey_results
from models.predict_lightgbm import preprocessing_predict
from models.predict_lightgbm import preprocess_race_predict
from models.predict_lightgbm import compare_df

2020-09-21 15:56:52,286 - kedro.io.data_catalog - INFO - Loading data from `model_lightgbm` (PickleDataSet)...


## import
メイン関数実行のため

In [2]:
import pandas as pd
import time
from tqdm.notebook import tqdm
import datetime
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from selenium.webdriver import Chrome, ChromeOptions

## メイン関数

In [3]:
def _concat_keyed(frames_by_key, **concat_kwargs):
    """Stack a ``{key: DataFrame}`` mapping into one DataFrame indexed by key.

    Every row of each frame is relabelled with that frame's dictionary key
    (the frames are relabelled in place), then the frames are concatenated
    in dictionary order. Extra keyword arguments are forwarded to
    ``pd.concat``.
    """
    for key, frame in frames_by_key.items():
        frame.index = [key] * len(frame)
    return pd.concat(list(frames_by_key.values()), **concat_kwargs)


def predict_lightgbm(model_lightgbm, parameters):
    """Scrape everything needed for one race and build the model input table.

    Args:
        model_lightgbm: Not used by this function; kept so the signature
            stays compatible with existing callers.
        parameters: Mapping that must contain ``'predict_race_id'``. The whole
            mapping is also forwarded to ``scrape_race_predict``.

    Returns:
        A ``(predict_data, horse_name)`` tuple: the frame returned by
        ``compare_df`` and the second value returned by
        ``preprocess_race_predict``.
    """
    # Race card plus race-level info, both keyed by race id.
    print("レース結果取得中")
    race_tables, race_infos = scrape_race_predict(parameters['predict_race_id'], parameters)
    race_table = _concat_keyed(race_tables, sort=False)
    df_infos = pd.DataFrame(list(race_infos.values()), index=list(race_infos.keys()))
    predict_addinfo = race_table.merge(df_infos, left_index=True, right_index=True, how='inner')
    predict_addinfo['date'] = pd.to_datetime(predict_addinfo['date'], format='%Y年%m月%d日')

    # Past results of every horse entered in the race.
    print("馬情報取得中")
    horse_id_list = predict_addinfo['horse_id'].unique()
    horse_results = scrape_horse_results(horse_id_list)
    df_horse_results = _concat_keyed(horse_results)

    # Jockey statistics, attached to each entry through its jockey_id.
    print("ジョッキー情報取得中")
    jockey_id_list = predict_addinfo['jockey_id'].unique()
    jockey_results = scrape_jockey_results(jockey_id_list)
    df_jockey_results = _concat_keyed(jockey_results)
    predict_addinfo = predict_addinfo.merge(
        df_jockey_results, left_on='jockey_id', right_index=True, how='left'
    )

    # Birthplace per entry. Iterate over the horse_id values themselves: the
    # frame is indexed by race id, so looking rows up with an integer counter
    # only worked through pandas' positional fallback and fails outright when
    # the race id is an integer. Taking the first row is also deterministic,
    # unlike picking an element out of a set.
    print("馬の生産地取得")
    predict_addinfo["Borned_place"] = [
        horse_results[horse_id]["Borned_place"].iloc[0]
        for horse_id in predict_addinfo['horse_id']
    ]

    # Merge the horses' past results into the race table.
    print("データ結合中")
    hr = HorseResults(df_horse_results)
    predict_all = hr.merge_all(predict_addinfo, n_samples=5)

    # Pedigree data, then the final preprocessing steps.
    print("血統情報取得中")
    add_blood_predict = add_blood_data(horse_id_list, predict_all)
    preprocess_df, horse_name = preprocess_race_predict(add_blood_predict)
    predict_data = compare_df(preprocess_df)

    return predict_data, horse_name

In [4]:
def main(model_lightgbm, parameters):
    """Entry point: run ``predict_lightgbm`` and hand back its result unchanged."""
    prediction = predict_lightgbm(model_lightgbm, parameters)
    return prediction


if __name__ == "__main__":
    # Runs the pipeline with the model and parameters loaded earlier in the
    # notebook; the return value is not kept here.
    main(model_lightgbm, parameters)

レース結果取得中
馬情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


ジョッキー情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


馬の生産地取得
データ結合中




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


血統情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




## notebook only

## 分析

In [5]:
# Run the same inference as main() above, this time keeping the returned values.
# This calls predict_lightgbm again, so the whole pipeline is executed a second time.
predict_data, horse_name = predict_lightgbm(model_lightgbm, parameters)

レース結果取得中
馬情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


ジョッキー情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


馬の生産地取得
データ結合中




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


血統情報取得中


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




In [6]:
# Per-row class probabilities and predicted class labels from the loaded model.
result_proba = model_lightgbm.predict_proba(predict_data)
result = model_lightgbm.predict(predict_data)

# One list per probability column. Assumes predict_proba yields at least three
# columns per row and one row per prediction in `result`.
# (A debug print of each row's number, predicted category and probabilities
# used to live here.)
category1 = [class_probs[0] for class_probs in result_proba]
category2 = [class_probs[1] for class_probs in result_proba]
category3 = [class_probs[2] for class_probs in result_proba]

In [7]:
# Result table: one row per prediction, labelled 1..len(result), holding the
# horse name, the predicted category and the three class probabilities.
cols = ["馬名","予想カテゴリー","カテゴリー1","カテゴリー2","カテゴリー3"]
idx = list(range(1, len(result) + 1))
df = pd.DataFrame(
    {
        "馬名": list(horse_name),
        "予想カテゴリー": result,
        "カテゴリー1": category1,
        "カテゴリー2": category2,
        "カテゴリー3": category3,
    },
    index=idx,
    columns=cols,
)

In [8]:
# Display the table ordered by the カテゴリー1 probability, highest first.
# NOTE(review): the recorded output shows 0.333333 in all three probability
# columns for every horse listed — confirm that predict_data matches the
# features the model was trained on before trusting this ranking.
df.sort_values('カテゴリー1', ascending=False)

Unnamed: 0,馬名,予想カテゴリー,カテゴリー1,カテゴリー2,カテゴリー3
1,ベステンダンク,3,0.333333,0.333333,0.333333
2,ハッピーアワー,3,0.333333,0.333333,0.333333
17,ストーミーシー,3,0.333333,0.333333,0.333333
16,リバティハイツ,3,0.333333,0.333333,0.333333
15,ギルデッドミラー,3,0.333333,0.333333,0.333333
14,メイケイダイハード,3,0.333333,0.333333,0.333333
13,ラセット,3,0.333333,0.333333,0.333333
12,ミッキーブリランテ,3,0.333333,0.333333,0.333333
11,ソーグリッタリング,3,0.333333,0.333333,0.333333
10,プリンスリターン,3,0.333333,0.333333,0.333333
