In [1]:
import create_population
import preprocessing
from feature_engineering import FeatureCreator
from feature_engineering_prediction import PredictionFeatureCreator

import prediction

import pandas as pd


import html5lib

from train_lightgbm_rank_niti import Trainer_lightgbm_rank_niti
from train_lightgbm_time import Trainer_lightgbm_time
from train_lightgbm_rank_kaiki import Trainer_lightgbm_rank_kaiki


from evaluation_lightgbm_rank_niti import Evaluator_lightgbm_rank_niti
from evaluation_lightgbm_time_kaiki import Evaluator_lightgbm_time_kaiki
from evaluation_lightgbm_rank_kaiki import Evaluator_lightgbm_rank_kaiki
from evaluation_pop import Evaluator_pop

%load_ext autoreload

In [25]:
%autoreload

In [3]:
population = create_population.create(from_="2023-01-01", to_="2023-12-31")


In [4]:
population

Unnamed: 0,race_id,date,horse_id
0,202306010101,2023-01-05,2020103575
1,202306010101,2023-01-05,2020107073
2,202306010101,2023-01-05,2020102562
3,202306010101,2023-01-05,2020106345
4,202306010101,2023-01-05,2020100039
...,...,...,...
46207,202309050912,2023-12-28,2017104873
46208,202309050912,2023-12-28,2019100653
46209,202309050912,2023-12-28,2017106240
46210,202309050912,2023-12-28,2018103205


In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.reset_option('display.precision')
pd.reset_option('display.float_format')

In [6]:
import json
from pathlib import Path


import re

import pandas as pd
import numpy as np
import ast
COMMON_DATA_DIR = Path("..", "..", "common", "data")
RAWDF_DIR = COMMON_DATA_DIR / "rawdf"
MAPPING_DIR = COMMON_DATA_DIR / "mapping"
POPULATION_DIR = Path("..", "data", "00_population")
OUTPUT_DIR = Path("..", "data", "01_preprocessed")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# カテゴリ変数を数値に変換するためのマッピング
with open(MAPPING_DIR / "sex.json", "r") as f:
    sex_mapping = json.load(f)
with open(MAPPING_DIR / "race_type.json", "r") as f:
    race_type_mapping = json.load(f)
with open(MAPPING_DIR / "around.json", "r") as f:
    around_mapping = json.load(f)
with open(MAPPING_DIR / "weather.json", "r") as f:
    weather_mapping = json.load(f)
with open(MAPPING_DIR / "ground_state.json", "r") as f:
    ground_state_mapping = json.load(f)
with open(MAPPING_DIR / "race_class.json", "r") as f:
    race_class_mapping = json.load(f)
with open(MAPPING_DIR / "place.json", "r") as f:
    place_mapping = json.load(f)


### preproでレース平均年齢、中央値、平均年齢切り捨て、切り捨て+季節のカテゴリを作成


In [9]:
#切り捨て+季節のみ、futureで行う
population_dir = POPULATION_DIR
populaton_filename = "population.csv"
input_dir = RAWDF_DIR
output_dir = OUTPUT_DIR
input_filename= "results.csv"
output_filename = "results.csv"
sex_mapping = sex_mapping

In [19]:
population = pd.read_csv(population_dir / populaton_filename, sep="\t")

# df = pd.read_csv(input_dir / input_filename, sep="\t").query(
#     "race_id in @population['race_id']"
# )
# `race_id`のリストを作成
population_race_ids = population['race_id'].tolist()

# クエリでリストを直接使用
df = pd.read_csv(input_dir / input_filename, sep="\t").query(
    "race_id in @population_race_ids"
)


df["rank"] = pd.to_numeric(df["着順"], errors="coerce")
df.dropna(subset=["rank"], inplace=True)
df["rank"] = df["rank"].astype(int)

# 時間を秒に変換
df["time"] = pd.to_datetime(df["タイム"], format="%M:%S.%f", errors="coerce")
df.dropna(subset=["time"], inplace=True)
df["time"] = (
    df["time"].dt.minute * 60
    + df["time"].dt.second
    + df["time"].dt.microsecond / 1000000
)
df["time"] = df["time"].astype(float)


# その他の列を整形
df["nobori"] = df["上り"].astype(float)
df["umaban"] = df["馬番"].astype(int)
df["tansho_odds"] = df["単勝"].astype(float)
df["popularity"] = df["人気"].astype(int)
df["impost"] = df["斤量"].astype(float)
df["wakuban"] = df["枠番"].astype(int)
df["sex"] = df["性齢"].str[0].map(sex_mapping)
df["age"] = df["性齢"].str[1:].astype(int)
df["weight"] = df["馬体重"].str.extract(r"(\d+)").astype(int)
df["weight_diff"] = df["馬体重"].str.extract(r"\((.+)\)").astype(int)
df["n_horses"] = df.groupby("race_id")["race_id"].transform("count")

# コーナー通過順を分割して列を作成
corner_cols = df['通過'].str.split('-', expand=True)
corner_cols.columns = [f'corner_{i+1}' for i in range(corner_cols.shape[1])]
# オブジェクト型のデータを整数型に変換する
corner_cols = corner_cols.apply(pd.to_numeric, errors='coerce').astype('Int64')  
# nullable int型を指定


# # time列の相対化
# tmp_df = df.groupby("race_id")["time"]
# df["time_relative"] = ((df["time"] - tmp_df.transform("mean")) / tmp_df.transform("std"))
# tmp_df = df.groupby("race_id")["rank"]
# df["rank_relative"] = ((df["rank"] - tmp_df.transform("mean")) / tmp_df.transform("std"))


# 元のデータフレームと結合
result_df = pd.concat([df, corner_cols], axis=1)
# NoneをNaNに置き換え
result_df = result_df.where(pd.notnull(result_df), np.nan)
df = result_df

# rank / n_horses の特徴量を作成（欠損値を含む行はNaNに設定）
df["rank_per_horse"] = df["rank"].where(df["rank"].notna(), np.nan) / df["n_horses"].where(df["n_horses"].notna(), np.nan)

# corner_1 / n_horses の特徴量を作成（欠損値を含む行はNaNに設定）
df["corner_1_per_horse"] = df["corner_1"].where(df["corner_1"].notna(), np.nan) / df["n_horses"].where(df["n_horses"].notna(), np.nan)

df["corner_2_per_horse"] = df["corner_2"].where(df["corner_2"].notna(), np.nan) / df["n_horses"].where(df["n_horses"].notna(), np.nan)
df["corner_3_per_horse"] = df["corner_3"].where(df["corner_3"].notna(), np.nan) / df["n_horses"].where(df["n_horses"].notna(), np.nan)
df["corner_4_per_horse"] = df["corner_4"].where(df["corner_4"].notna(), np.nan) / df["n_horses"].where(df["n_horses"].notna(), np.nan)
# NoneをNaNに置き換え
df = df.where(pd.notnull(df), np.nan)   
result_df = df

# for col in corner_cols.columns:
#     # ここでは result_df を使う
#     tmp_df = result_df.groupby("race_id")[col]
#     result_df[f"{col}_relative"] = ((result_df[col] - tmp_df.transform("mean")) / tmp_df.transform("std"))
# result_df = result_df.apply(lambda col: col.apply(lambda x: np.nan if pd.isna(x) else x))

        

# データが着順に並んでいることによるリーク防止のため、各レースを馬番順にソートする
result_df = result_df.sort_values(["race_id", "umaban"])

In [11]:
result_df 

Unnamed: 0,race_id,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,horse_id,jockey_id,owner_id,trainer_id,ﾀｲﾑ指数,通過,上り,調教ﾀｲﾑ,厩舎ｺﾒﾝﾄ,備考,馬主,賞金(万円),rank,time,nobori,umaban,tansho_odds,popularity,impost,wakuban,sex,age,weight,weight_diff,n_horses,corner_1,corner_2,corner_3,corner_4,rank_per_horse,corner_1_per_horse,corner_2_per_horse,corner_3_per_horse,corner_4_per_horse
362306,202301010101,5,1,1,ウィスピースノー,牝2,55.0,吉田隼人,1:10.3,1/2,23.9,5.0,434(-10),[西] 今野貞一,2021100648,1095,281008,1128,**,8-8,34.5,,,,水上ふじ子,55.0,5,70.3,34.5,1,23.9,5,55.0,1,1,2,434,-10,8,8,8,,,0.625000,1.0,1.0,,
362307,202301010101,6,2,2,ロードスタウト,牡2,55.0,鮫島克駿,1:10.7,2.1/2,61.8,7.0,454(-6),[西] 中村直也,2021100159,1157,170800,1186,**,5-6,35.1,,,,ロードホースクラブ,,6,70.7,35.1,2,61.8,7,55.0,2,0,2,454,-6,8,5,6,,,0.750000,0.625,0.75,,
362308,202301010101,7,3,3,コミックガール,牝2,53.0,佐々木大,1:10.9,1.1/4,18.8,4.0,404(-2),[東] 上原佑紀,2021100265,1197,320803,1192,**,5-6,35.3,,,,古賀禎彦,,7,70.9,35.3,3,18.8,4,53.0,3,1,2,404,-2,8,5,6,,,0.875000,0.625,0.75,,
362305,202301010101,4,4,4,デビルシズカチャン,牝2,55.0,ルメール,1:10.2,1.1/2,16.6,3.0,450(+2),[西] 武幸四郎,2021105553,5339,680031,1160,**,3-3,34.9,,,,カカムーチョレーシング,83.0,4,70.2,34.9,4,16.6,3,55.0,4,1,2,450,2,8,3,3,,,0.500000,0.375,0.375,,
362302,202301010101,1,5,5,サトミノキラリ,牡2,55.0,横山武史,1:09.5,,1.2,1.0,452(-4),[東] 鈴木伸尋,2021101429,1170,425031,1026,**,2-2,34.3,,,,田代洋己,550.0,1,69.5,34.3,5,1.2,1,55.0,5,0,2,452,-4,8,2,2,,,0.125000,0.25,0.25,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368104,202310030812,1,6,8,ジューンレインボー,牝4,53.0,田口貫太,2:40.0,,3.5,1.0,442(-2),[西] 武英智,2019102542,1208,17803,1161,**,4-4-2-1,36.5,,,,吉川潤,800.0,1,160.0,36.5,8,3.5,1,53.0,6,1,4,442,-2,12,4,4,2,1,0.083333,0.333333,0.333333,0.166667,0.083333
368109,202310030812,6,7,9,アマートカヴァロ,牡3,55.0,藤岡康太,2:42.6,3.1/2,15.2,8.0,468(0),[西] 笹田和秀,2020103754,1116,651031,1104,**,7-7-9-6,38.4,,,,猪又晶介,,6,162.6,38.4,9,15.2,8,55.0,7,0,3,468,0,12,7,7,9,6,0.500000,0.583333,0.583333,0.75,0.5
368107,202310030812,4,7,10,ルージュサクシード,牝3,50.0,今村聖奈,2:41.8,1.1/2,23.1,9.0,458(+6),[西] 松永幹夫,2020100400,1193,180800,1092,**,11-11-12-10,36.9,,,,東京ホースレーシング,120.0,4,161.8,36.9,10,23.1,9,50.0,7,1,3,458,6,12,11,11,12,10,0.333333,0.916667,0.916667,1.0,0.833333
368105,202310030812,2,8,11,シリンガバルガリス,セ3,55.0,幸英明,2:40.9,5,5.0,3.0,486(-6),[西] 松永幹夫,2020103663,732,226800,1092,**,5-5-2-2,37.3,,,,サンデーレーシング,320.0,2,160.9,37.3,11,5.0,3,55.0,8,2,3,486,-6,12,5,5,2,2,0.166667,0.416667,0.416667,0.166667,0.166667


In [22]:
"""
レース平均年齢、中央値、平均年齢切り捨て
"""
import pandas as pd

# 1. race_idごとの年齢の平均、中央値、平均（小数点以下切り捨て）を計算
race_age_stats = result_df.groupby('race_id').agg(
    mean_age=('age', 'mean'),
    median_age=('age', 'median'),
    mean_age_kirisute=('age', lambda x: int(x.mean()))  # 小数点以下切り捨て
).reset_index()

# 2. result_dfに統合（merge）し、_xや_yを回避する
result_df = result_df.merge(
    race_age_stats, 
    on='race_id', 
    how='left', 
    suffixes=('', '_drop')  # _drop を付けることで重複を回避
)

# 不要な列を削除（この場合は、"age_drop" などの列があれば削除する）
result_df = result_df.loc[:, ~result_df.columns.str.endswith('_drop')]

# 確認
result_df.tail(100)


Unnamed: 0,race_id,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,horse_id,jockey_id,owner_id,trainer_id,ﾀｲﾑ指数,通過,上り,調教ﾀｲﾑ,厩舎ｺﾒﾝﾄ,備考,馬主,賞金(万円),rank,time,nobori,umaban,tansho_odds,popularity,impost,wakuban,sex,age,weight,weight_diff,n_horses,corner_1,corner_2,corner_3,corner_4,rank_per_horse,corner_1_per_horse,corner_2_per_horse,corner_3_per_horse,corner_4_per_horse,mean_age,median_age,mean_age_kirisute
45802,202310030806,12,1,2,アオイリキマル,牡3,53.0,大久保友,1:47.4,3,199.8,16.0,500(0),[西] 清水久詞,2020101895,1194,443033,1110,**,4-3-3-5,39.3,,,,新谷幸義,,12,107.4,39.3,2,199.8,16,53.0,1,0,3,500,0,16,4,3,3.0,5.0,0.75,0.25,0.1875,0.1875,0.3125,3.0,3.0,3
45803,202310030806,16,2,3,ヒュプノス,牡3,56.0,水口優也,1:51.8,2.1/2,128.5,15.0,488(0),[西] 吉田直弘,2020106159,1133,368803,1101,**,12-12-14-15,42.6,,,,ウエスト．フォレスト．ステイブル,,16,111.8,42.6,3,128.5,15,56.0,2,0,3,488,0,16,12,12,14.0,15.0,1.0,0.75,0.75,0.875,0.9375,3.0,3.0,3
45804,202310030806,6,2,4,ヴォランテ,牡3,53.0,川端海翼,1:46.6,クビ,85.8,14.0,452(+4),[西] 羽月友彦,2020100593,1195,875006,1091,**,7-7-6-5,38.2,,,,永田和彦,,6,106.6,38.2,4,85.8,14,53.0,2,0,3,452,4,16,7,7,6.0,5.0,0.375,0.4375,0.4375,0.375,0.3125,3.0,3.0,3
45805,202310030806,7,3,5,モズマワシゲリ,牡3,56.0,団野大成,1:46.8,1.1/2,16.5,7.0,500(+4),[西] 鮫島一歩,2020102261,1180,5803,1046,**,10-10-11-8,38.0,,,,キャピタル・システム,,7,106.8,38.0,5,16.5,7,56.0,3,0,3,500,4,16,10,10,11.0,8.0,0.4375,0.625,0.625,0.6875,0.5,3.0,3.0,3
45806,202310030806,15,3,6,ホペロア,牡3,56.0,岩田望来,1:51.4,大,19.7,8.0,512(+2),[西] 友道康夫,2020103557,1174,858800,1061,**,13-13-14-16,42.2,,,,大塚亮一,,15,111.4,42.2,6,19.7,8,56.0,3,0,3,512,2,16,13,13,14.0,16.0,0.9375,0.8125,0.8125,0.875,1.0,3.0,3.0,3
45807,202310030806,1,4,7,ヘヴンズタイム,牡3,53.0,田口貫太,1:46.0,,12.9,6.0,486(+2),[西] 北出成人,2020105624,1208,788800,1078,**,2-2-2-2,38.1,,,,ノースヒルズ,550.0,1,106.0,38.1,7,12.9,6,53.0,4,0,3,486,2,16,2,2,2.0,2.0,0.0625,0.125,0.125,0.125,0.125,3.0,3.0,3
45808,202310030806,4,4,8,セザンワールド,牡3,56.0,国分優作,1:46.5,1.3/4,7.9,4.0,482(0),[西] 畑端省吾,2020106051,1125,523009,1188,**,9-9-8-8,37.9,,,,瀬山孝一,83.0,4,106.5,37.9,8,7.9,4,56.0,4,0,3,482,0,16,9,9,8.0,8.0,0.25,0.5625,0.5625,0.5,0.5,3.0,3.0,3
45809,202310030806,10,5,9,リードブロー,牡3,56.0,藤岡康太,1:46.9,ハナ,28.2,9.0,522(-2),[西] 安田翔伍,2020103657,1116,506800,1164,**,11-11-8-11,38.3,,,,シルクレーシング,,10,106.9,38.3,9,28.2,9,56.0,5,0,3,522,-2,16,11,11,8.0,11.0,0.625,0.6875,0.6875,0.5,0.6875,3.0,3.0,3
45810,202310030806,11,5,10,ジャスティンボルト,牡3,56.0,坂井瑠星,1:46.9,クビ,3.5,1.0,532(0),[西] 友道康夫,2020103624,1163,195031,1061,**,7-7-8-8,38.3,,,,三木正浩,,11,106.9,38.3,10,3.5,1,56.0,5,0,3,532,0,16,7,7,8.0,8.0,0.6875,0.4375,0.4375,0.5,0.5,3.0,3.0,3
45811,202310030806,13,6,11,サンライズプラーナ,牡3,56.0,国分恭介,1:48.7,8,30.2,11.0,538(+6),[西] 牧浦充徳,2020101534,1124,399803,1113,**,15-15-14-14,39.4,,,,ライフハウス,,13,108.7,39.4,11,30.2,11,56.0,6,0,3,538,6,16,15,15,14.0,14.0,0.8125,0.9375,0.9375,0.875,0.875,3.0,3.0,3


In [26]:
#futre_修正、馬場状態
# レース結果テーブルの前処理
results_preprocessed = preprocessing.process_results()
# 馬の過去成績テーブルの加工
horse_results_preprocessed = preprocessing.process_horse_results()
# レース情報テーブルの前処理
race_info_preprocessed = preprocessing.process_race_info()

In [28]:
from pathlib import Path

from tqdm.notebook import tqdm
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from tqdm.notebook import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO 

DATA_DIR = Path("..", "data")
POPULATION_DIR = DATA_DIR / "00_population"
INPUT_DIR = DATA_DIR / "01_preprocessed"
OUTPUT_DIR = DATA_DIR / "02_features"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

population_dir = POPULATION_DIR
poplation_filename = "population.csv"
input_dir = INPUT_DIR
results_filename = "results.csv"
race_info_filename = "race_info.csv"
horse_results_filename = "horse_results.csv"
jockey_leading_filename = "jockey_leading.csv"
trainer_leading_filename = "trainer_leading.csv"
peds_filename = "peds.csv"
sire_leading_filename = "sire_leading.csv"
output_dir = OUTPUT_DIR
output_filename = "features.csv"

population = pd.read_csv(population_dir / poplation_filename, sep="\t")
results = pd.read_csv(input_dir / results_filename, sep="\t")
race_info = pd.read_csv(input_dir / race_info_filename, sep="\t")
horse_results = pd.read_csv(input_dir / horse_results_filename, sep="\t")
jockey_leading = pd.read_csv(input_dir / jockey_leading_filename, sep="\t")
trainer_leading = pd.read_csv(
    input_dir / trainer_leading_filename, sep="\t"
)
peds = pd.read_csv(input_dir / peds_filename, sep="\t")
sire_leading = pd.read_csv(input_dir / sire_leading_filename, sep="\t")
output_dir = output_dir
output_filename = output_filename
agg_horse_per_group_cols_dfs = {}

In [45]:
df = (
    results[["race_id", "horse_id","mean_age_kirisute"]]
    .merge(race_info[["race_id", "race_type", "season","race_class"]], on="race_id")
)


In [46]:
df

Unnamed: 0,race_id,horse_id,mean_age_kirisute,race_type,season,race_class
0,202301010101,2021100648,2,1,2,1
1,202301010101,2021100159,2,1,2,1
2,202301010101,2021100265,2,1,2,1
3,202301010101,2021105553,2,1,2,1
4,202301010101,2021101429,2,1,2,1
...,...,...,...,...,...,...
45897,202310030812,2019102542,3,1,3,2
45898,202310030812,2020103754,3,1,3,2
45899,202310030812,2020100400,3,1,3,2
45900,202310030812,2020103663,3,1,3,2


In [36]:
df

Unnamed: 0,race_id,horse_id,mean_age_kirisute,race_type,season,race_class,date,age_season
0,202301010101,2021100648,2,1,2,1,2023-07-22,22
1,202301010101,2021100159,2,1,2,1,2023-07-22,22
2,202301010101,2021100265,2,1,2,1,2023-07-22,22
3,202301010101,2021105553,2,1,2,1,2023-07-22,22
4,202301010101,2021101429,2,1,2,1,2023-07-22,22
...,...,...,...,...,...,...,...,...
45897,202310030812,2019102542,3,1,3,2,2023-09-03,33
45898,202310030812,2020103754,3,1,3,2,2023-09-03,33
45899,202310030812,2020100400,3,1,3,2,2023-09-03,33
45900,202310030812,2020103663,3,1,3,2,2023-09-03,33


In [50]:

"""

dfに新たな列、race_gradeを作成して欲しい
作成ルールは以下の通りである
'age_season'の条件に引っかかった場合、それを優先すること
次点で"race_class"の条件にかかっても、'age_season'がある方を優先して変換すること


"race_class"列が0は55

"race_class"列が1は60

"race_class"列が2は70
2歳それ以外は68（20<='age_season'<30かつ、2<="race_class"列<5の行）
2歳G2,G3,OPは73（20<='age_season'<30かつ、5<="race_class"列<8の行）

"race_class"列が3は79
2歳G1は79（20<='age_season'<30かつ、8<="race_class"の行）
3歳春OPは80（30<='age_season'<33かつ、4<="race_class"列<6の行）
3歳春G2.G3は81（30<='age_season'<33かつ、6<="race_class"列<8の行）

"race_class"列が4は85
3歳春G1は86（30<='age_season'<33かつ、8<="race_class"の行）
3歳秋G2,G3は86（33<='age_season'<40かつ、5<="race_class"列<8の行）

"race_class"列が5は89
3歳秋G1は91（33<='age_season'<40かつ、8<="race_class"の行）

"race_class"列が6は92

"race_class"列が7は94

"race_class"列が8は98




これらを小さく（1/10 - 5）した列

G1 8	100
G2 7	95
G3 6	92
オープン5	89
1600万4	86
２勝クラス3	80
１勝クラス2	70
未勝利1	60
新馬0	55


クラス	芝	ダート
未勝利	６５（-１５）	６０（-２０）
500万下
Ｇ１を除く２歳ＯＰ	７５（-５）	７２（-８）
1000万下
２歳Ｇ１
Ｇ１を除く３歳春ＯＰ	８３（３）	８３（３）
1600万下
３歳春Ｇ１
３歳秋重賞	８８（８）	９０（１０）
ＯＰ（ただしダート重賞を除く）
３歳秋Ｇ１	９３（１３）	９５（１５）
ダート重賞（３歳を除く）	－	１００（２０）
古馬Ｇ１	９８（１８）	１０５（２５）
"""
# "mean_age_kirisute"と"season"を文字列に変換して結合し、int型に変換して新しい列 "age_season" を作成
df['age_season'] = (df['mean_age_kirisute'].astype(str) + df['season'].astype(str)).astype(int)

# race_gradeの作成
def calculate_race_grade(row):
    age_season = row['age_season']
    race_class = row['race_class']

    # 'age_season' に基づく条件を優先してチェック
    if 20 <= age_season < 30:
        if 2 <= race_class < 5:
            return 68
        elif 5 <= race_class < 8:
            return 73
        elif 8 <= race_class:
            return 78
    elif 30 <= age_season < 33:
        if 4 <= race_class < 6:
            return 80
        elif 6 <= race_class < 8:
            return 82
        elif 8 <= race_class:
            return 85
    elif 33 <= age_season < 40:
        if 5 <= race_class < 8:
            return 86
        elif 8 <= race_class:
            return 92
    
    if race_class == 0:
        return 55
    elif race_class == 1:
        return 60
    elif race_class == 2:
        return 70
    elif race_class == 3:
        return 79
    elif race_class == 4:
        return 85
    elif race_class == 5:
        return 89
    elif race_class == 6:
        return 91
    elif race_class == 7:
        return 94
    elif race_class == 8:
        return 98
    else:
        return np.nan  

# race_grade列を作成
df['race_grade'] = df.apply(calculate_race_grade, axis=1)

#race_grade_scaledの作成
df['race_grade_scaled'] = df['race_grade'] / 10 - 5

race_info[['age_season', 'race_grade', 'race_grade_scaled']] = df[['age_season', 'race_grade', 'race_grade_scaled']]


In [51]:
race_info

Unnamed: 0,race_id,date,race_type,around,course_len,weather,ground_state,race_class,place,season,place_season,place_season_type,kaisai_race,kaisai_race_type,day,day_type,kaisai_count,kaisai_count_type,place_season_day_type,day_condition,place_season_condition_type,place_season_condition_type_categori,place_course_category,place_course_tough,goal_range,curve,goal_slope,place_season_type_course_len,lap_type,race_day_count,race_date_day_count,goal_range_100,age_season,race_grade,race_grade_scaled
0,202306010101,2023-01-05,0,0,1200,1,0,1,6,4,64,640,1,10,1,10,1,10,64010,1,6410,-3,-1,-1,,,,6401200,-1,1,73803501,,22,60,1.0
1,202306010102,2023-01-05,0,0,1800,1,0,1,6,4,64,640,2,20,1,10,1,10,64010,1,6410,-3,-1,-1,,,,6401800,-1,2,73803502,,22,60,1.0
2,202306010103,2023-01-05,0,0,1800,1,0,0,6,4,64,640,3,30,1,10,1,10,64010,1,6410,-3,-1,-1,,,,6401800,-1,3,73803503,,22,60,1.0
3,202306010104,2023-01-05,0,0,1800,1,0,2,6,4,64,640,4,40,1,10,1,10,64010,1,6410,-3,-1,-1,,,,6401800,-1,4,73803504,,22,60,1.0
4,202306010105,2023-01-05,1,0,1600,1,0,1,6,4,64,641,5,51,1,11,1,11,64011,1,6411,3,602,1,310.0,5.0,3.0,6411600,4,5,73803505,3.100,22,60,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3324,202309050908,2023-12-28,0,0,1400,1,0,2,9,4,94,940,8,80,9,90,5,50,94090,2,9420,-2,-1,-1,,,,9401400,-1,8,73839208,,22,60,1.0
3325,202309050909,2023-12-28,0,0,1800,1,0,3,9,4,94,940,9,90,9,90,5,50,94090,2,9420,-2,-1,-1,,,,9401800,-1,9,73839209,,22,60,1.0
3326,202309050910,2023-12-28,1,0,2000,1,0,3,9,4,94,941,10,101,9,91,5,51,94091,2,9421,2,901,1,356.5,5.0,3.0,9412000,-1,10,73839210,3.565,22,60,1.0
3327,202309050911,2023-12-28,0,0,1800,1,0,5,9,4,94,940,11,110,9,90,5,50,94090,2,9420,-2,-1,-1,,,,9401800,-1,11,73839211,,22,60,1.0


In [44]:
horse_results


Unnamed: 0,horse_id,rank_per_horse,date,rank,prize,rank_diff,umaban,wakuban,weather,race_type,course_len,ground_state,race_class,n_horses,time,time_courselen,nobori,win,rentai,show,place,corner_1_per_horse,corner_2_per_horse,corner_3_per_horse,corner_4_per_horse,race_position,pace_diff,pace_category,corner_1,corner_2,corner_3,corner_4,pace_1,pace_2
0,2011106610,1.000000,2023-12-10,8.0,0.0,3.9,1,1,1,1,2400,0,3,8.0,151.9,0.063292,36.4,0,0,0,9,1.000000,1.000000,1.000000,0.875000,4,-4.5,4.0,8,8.0,8.0,7.0,38.2,33.7
1,2011106610,1.000000,2023-11-11,8.0,0.0,1.6,7,7,2,1,1600,0,3,8.0,96.1,0.060062,34.7,0,0,0,5,0.750000,0.750000,,,4,-1.0,3.0,6,6.0,,,35.9,34.9
2,2011106610,1.000000,2023-10-29,9.0,0.0,2.3,2,2,1,1,1800,0,5,9.0,110.2,0.061222,34.9,0,0,0,8,0.777778,0.777778,,,4,-3.1,4.0,7,7.0,,,36.5,33.4
3,2011106610,1.000000,2023-10-07,11.0,0.0,1.8,10,8,1,1,1600,0,5,11.0,95.8,0.059875,34.4,0,0,0,8,0.909091,0.909091,,,4,-2.5,4.0,10,10.0,,,36.0,33.5
4,2011106610,0.900000,2023-09-18,9.0,0.0,2.7,3,3,1,1,1600,0,3,10.0,94.5,0.059062,35.0,0,0,0,9,0.900000,0.900000,,,4,-1.5,4.0,9,9.0,,,34.7,33.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130271,2021102427,0.571429,2024-04-06,8.0,0.0,2.3,12,7,2,0,1800,2,1,14.0,117.5,0.065278,39.6,0,0,0,6,0.571429,0.642857,0.428571,0.500000,3,-1.1,4.0,8,9.0,6.0,7.0,39.1,38.0
130272,2021102427,0.928571,2023-12-28,13.0,0.0,2.5,7,5,1,1,1800,0,0,14.0,110.0,0.061111,36.0,0,0,0,9,0.642857,0.642857,,,3,-1.9,4.0,9,9.0,,,36.0,34.1
130273,2021103807,1.000000,2024-02-24,11.0,0.0,3.4,1,1,2,0,1800,2,1,11.0,118.9,0.066056,43.0,0,0,0,9,0.090909,0.090909,0.090909,0.181818,1,2.9,1.0,1,1.0,1.0,2.0,36.8,39.7
130274,2021103807,0.785714,2024-01-13,11.0,0.0,2.1,2,2,1,0,1800,0,1,14.0,117.9,0.065500,40.8,0,0,0,8,1.000000,0.857143,0.642857,0.571429,3,4.0,1.0,14,12.0,9.0,8.0,35.8,39.8


In [59]:

"""
ホースリザルトは、シーズンデータ入れなくてもいいかも
年齢は入ってるからそれで


dfに新たな列、race_gradeを作成して欲しい
作成ルールは以下の通りである
'age_season'の条件に引っかかった場合、それを優先すること
次点で"race_class"の条件にかかっても、'age_season'がある方を優先して変換すること


"race_class"列が0は55

"race_class"列が1は60

"race_class"列が2は70
2歳それ以外は68（20<='age_season'<30かつ、2<="race_class"列<5の行）
2歳G2,G3,OPは73（20<='age_season'<30かつ、5<="race_class"列<8の行）

"race_class"列が3は79
2歳G1は79（20<='age_season'<30かつ、8<="race_class"の行）
3歳春OPは80（30<='age_season'<33かつ、4<="race_class"列<6の行）
3歳春G2.G3は81（30<='age_season'<33かつ、6<="race_class"列<8の行）

"race_class"列が4は85
3歳春G1は86（30<='age_season'<33かつ、8<="race_class"の行）
3歳秋G2,G3は86（33<='age_season'<40かつ、5<="race_class"列<8の行）

"race_class"列が5は89
3歳秋G1は91（33<='age_season'<40かつ、8<="race_class"の行）

"race_class"列が6は92

"race_class"列が7は94

"race_class"列が8は98




これらを小さく（1/10 - 5）した列

G1 8	100
G2 7	95
G3 6	92
オープン5	89
1600万4	86
２勝クラス3	80
１勝クラス2	70
未勝利1	60
新馬0	55


クラス	芝	ダート
未勝利	６５（-１５）	６０（-２０）
500万下
Ｇ１を除く２歳ＯＰ	７５（-５）	７２（-８）
1000万下
２歳Ｇ１
Ｇ１を除く３歳春ＯＰ	８３（３）	８３（３）
1600万下
３歳春Ｇ１
３歳秋重賞	８８（８）	９０（１０）
ＯＰ（ただしダート重賞を除く）
３歳秋Ｇ１	９３（１３）	９５（１５）
ダート重賞（３歳を除く）	－	１００（２０）
古馬Ｇ１	９８（１８）	１０５（２５）
"""
# race_gradeの作成
def calculate_race_grade(row):
    race_class = row['race_class']
    
    if race_class == 0:
        return 55
    elif race_class == 1:
        return 60
    elif race_class == 2:
        return 70
    elif race_class == 3:
        return 79
    elif race_class == 4:
        return 85
    elif race_class == 5:
        return 89
    elif race_class == 6:
        return 91
    elif race_class == 7:
        return 94
    elif race_class == 8:
        return 98
    else:
        return np.nan  

# race_grade列を作成
df['race_grade'] = df.apply(calculate_race_grade, axis=1
#race_grade_scaledの作成
df['race_grade_scaled'] = df['race_grade'] / 10 - 5
horse_results[['race_grade', 'race_grade_scaled']] = df[['race_grade', 'race_grade_scaled']]


In [60]:
horse_results

Unnamed: 0,horse_id,rank_per_horse,date,rank,prize,rank_diff,umaban,wakuban,weather,race_type,course_len,ground_state,race_class,n_horses,time,time_courselen,nobori,win,rentai,show,place,corner_1_per_horse,corner_2_per_horse,corner_3_per_horse,corner_4_per_horse,race_position,pace_diff,pace_category,corner_1,corner_2,corner_3,corner_4,pace_1,pace_2,race_grade,race_grade_scaled
0,2011106610,1.000000,2023-12-10,8.0,0.0,3.9,1,1,1,1,2400,0,3,8.0,151.9,0.063292,36.4,0,0,0,9,1.000000,1.000000,1.000000,0.875000,4,-4.5,4.0,8,8.0,8.0,7.0,38.2,33.7,60.0,1.0
1,2011106610,1.000000,2023-11-11,8.0,0.0,1.6,7,7,2,1,1600,0,3,8.0,96.1,0.060062,34.7,0,0,0,5,0.750000,0.750000,,,4,-1.0,3.0,6,6.0,,,35.9,34.9,60.0,1.0
2,2011106610,1.000000,2023-10-29,9.0,0.0,2.3,2,2,1,1,1800,0,5,9.0,110.2,0.061222,34.9,0,0,0,8,0.777778,0.777778,,,4,-3.1,4.0,7,7.0,,,36.5,33.4,60.0,1.0
3,2011106610,1.000000,2023-10-07,11.0,0.0,1.8,10,8,1,1,1600,0,5,11.0,95.8,0.059875,34.4,0,0,0,8,0.909091,0.909091,,,4,-2.5,4.0,10,10.0,,,36.0,33.5,60.0,1.0
4,2011106610,0.900000,2023-09-18,9.0,0.0,2.7,3,3,1,1,1600,0,3,10.0,94.5,0.059062,35.0,0,0,0,9,0.900000,0.900000,,,4,-1.5,4.0,9,9.0,,,34.7,33.2,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130271,2021102427,0.571429,2024-04-06,8.0,0.0,2.3,12,7,2,0,1800,2,1,14.0,117.5,0.065278,39.6,0,0,0,6,0.571429,0.642857,0.428571,0.500000,3,-1.1,4.0,8,9.0,6.0,7.0,39.1,38.0,,
130272,2021102427,0.928571,2023-12-28,13.0,0.0,2.5,7,5,1,1,1800,0,0,14.0,110.0,0.061111,36.0,0,0,0,9,0.642857,0.642857,,,3,-1.9,4.0,9,9.0,,,36.0,34.1,,
130273,2021103807,1.000000,2024-02-24,11.0,0.0,3.4,1,1,2,0,1800,2,1,11.0,118.9,0.066056,43.0,0,0,0,9,0.090909,0.090909,0.090909,0.181818,1,2.9,1.0,1,1.0,1.0,2.0,36.8,39.7,,
130274,2021103807,0.785714,2024-01-13,11.0,0.0,2.1,2,2,1,0,1800,0,1,14.0,117.9,0.065500,40.8,0,0,0,8,1.000000,0.857143,0.642857,0.571429,3,4.0,1.0,14,12.0,9.0,8.0,35.8,39.8,,
