## SEDフォルダの中身の確認
SEDフォルダには接頭字SEDからはじまるものと、SRBから始まるものがあります。
SEDファイルには成績データ、SRBについては成績レースデータが記録されています。
仕様は以下です。
### SED
http://www.jrdb.com/program/Sed/sed_doc.txt

### SRB
http://www.jrdb.com/program/Srb/srb_doc.txt

In [25]:
# import modules
from jrdb import load
from jrdb import parse
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

In [None]:
# load text data (ex. SED)
loader = load.FileLoader()
data_dir = 'data/SED_2022'
filename = 'SED220105.txt'
text_data = loader.load(os.path.join(data_dir, filename))

# parse
parser = parse.JrdbDataParser()
df = parser.parse(text_data, 'SED', is_japanese=True)   # return pandas DataFrame

In [2]:
df

Unnamed: 0,レースキー,場コード,年,回,日,R,馬番,競走成績キー,血統登録番号,年月日,...,天候コード,コース,レース脚質,単勝,複勝,本賞金,収得賞金,レースペース流れ,馬ペース流れ,4角コース取り
0,06221101,06,22,1,1,1,1,1910217320220105,19102173,20220105,...,1,,3,,,0.0,0,33,33,1
1,06221101,06,22,1,1,1,2,1910428820220105,19104288,20220105,...,1,,3,,,78.0,0,33,33,3
2,06221101,06,22,1,1,1,3,1910612720220105,19106127,20220105,...,1,,4,,,0.0,0,33,33,3
3,06221101,06,22,1,1,1,4,1910354220220105,19103542,20220105,...,1,,2,,170,130.0,0,33,33,1
4,06221101,06,22,1,1,1,5,1910194320220105,19101943,20220105,...,1,,4,,,0.0,0,33,33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,07221112,07,22,1,1,12,12,1710157120220105,17101571,20220105,...,1,,3,,,0.0,0,32,23,3
317,07221112,07,22,1,1,12,13,1810489420220105,18104894,20220105,...,1,,1,1530,300,1110.0,600,32,32,2
318,07221112,07,22,1,1,12,14,1710133420220105,17101334,20220105,...,1,,3,,,0.0,0,32,23,3
319,07221112,07,22,1,1,12,15,1811001520220105,18110015,20220105,...,1,,2,,110,440.0,0,32,32,2


In [29]:
data_dir = 'data/SED_2022'

# get list of file names in directory
file_names = os.listdir(data_dir)

# filter file names to those starting with 'SED' and ending with '.txt'
sed_file_names = [f for f in file_names if f.startswith('SED') and f.endswith('.txt')]

# load text data and parse into data frames
dfs = []
for file_name in tqdm(sed_file_names):
    loader = load.FileLoader()
    text_data = loader.load(os.path.join(data_dir, file_name))
    parser = parse.JrdbDataParser()
    df = parser.parse(text_data, 'SED', is_japanese=True)
    dfs.append(df)

# concatenate data frames into one
SED_df = pd.concat(dfs, ignore_index=True)

100%|██████████| 109/109 [07:23<00:00,  4.07s/it]


In [30]:
# # get list of file names in directory
# file_names = os.listdir(data_dir)

# # filter file names to those starting with 'SED' and ending with '.txt'
# sed_file_names = [f for f in file_names if f.startswith('KYI') and f.endswith('.txt')]

# # load text data and parse into data frames
# dfs = []
# for file_name in tqdm(sed_file_names):
#     loader = load.FileLoader()
#     text_data = loader.load(os.path.join(data_dir, file_name))
#     parser = parse.JrdbDataParser()
#     df = parser.parse(text_data, 'SRB', is_japanese=True)
#     dfs.append(df)

# # concatenate data frames into one
# SRB_df = pd.concat(dfs, ignore_index=True)

In [31]:
SED_df

Unnamed: 0,レースキー,場コード,年,回,日,R,馬番,競走成績キー,血統登録番号,年月日,...,天候コード,コース,レース脚質,単勝,複勝,本賞金,収得賞金,レースペース流れ,馬ペース流れ,4角コース取り
0,06221101,06,22,1,1,1,1,1910217320220105,19102173,20220105,...,1,,3,,,0.0,0,33,33,1
1,06221101,06,22,1,1,1,2,1910428820220105,19104288,20220105,...,1,,3,,,78.0,0,33,33,3
2,06221101,06,22,1,1,1,3,1910612720220105,19106127,20220105,...,1,,4,,,0.0,0,33,33,3
3,06221101,06,22,1,1,1,4,1910354220220105,19103542,20220105,...,1,,2,,170,130.0,0,33,33,1
4,06221101,06,22,1,1,1,5,1910194320220105,19101943,20220105,...,1,,4,,,0.0,0,33,33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47706,09226912,09,22,6,9,12,12,1810522320221228,18105223,20221228,...,1,4,4,,,0.0,0,22,22,4
47707,09226912,09,22,6,9,12,13,1710613720221228,17106137,20221228,...,1,4,3,,,0.0,0,22,22,5
47708,09226912,09,22,6,9,12,14,1710046120221228,17100461,20221228,...,1,4,4,,,0.0,0,22,22,4
47709,09226912,09,22,6,9,12,15,1810638920221228,18106389,20221228,...,1,4,4,,,0.0,0,22,22,5


In [32]:
# SRB_df

In [33]:
# # merge SED_df and SRB_df on multiple columns
# merged_df = pd.merge(SED_df, SRB_df, on=['レースキー', '場コード', '年', '回', '日', 'R'], how='left')

In [34]:
# merged_df

In [35]:
# path/to/data/train/直下にSED_2022.csvとして保存
SED_df.to_csv('train_data/SED_2022.csv', index=False)

In [43]:
from sklearn.metrics import accuracy_score, f1_score

# 日付の新しいものを20%をテストデータとして分割
data_sorted = SED_df.sort_values('年月日', ascending=False)
split_index = int(len(data_sorted) * 0.8)
train_data = data_sorted.iloc[:split_index]
test_data = data_sorted.iloc[split_index:]

# 不要なカラムを除去
excluded_columns = [
    'Unnamed: 0', '年', '回', '日', 'R', '出遅', '位置取', '不利', '前不利', '中不利', '後不利', 'レース', 'コース取り',
    '上昇度コード', 'クラスコード', '馬体コード', '気配コード', '備考', 'コーナー順位1', 'コーナー順位2', 'コーナー順位3', 'コーナー順位4',
    '単勝', '複勝', 'レースペース流れ', '馬ペース流れ', '4角コース取り', '確定単勝オッズ', '確定複勝オッズ', '確定単勝人気順位'
]
train_data_reduced = train_data.drop(columns=excluded_columns, errors='ignore')
test_data_reduced = test_data.drop(columns=excluded_columns, errors='ignore')

# 3着以内を1、それ以外を0とする新しい目標変数を作成
y_train = (train_data['着順'] <= 3).astype(int)
y_test = (test_data['着順'] <= 3).astype(int)

# 欠損値の処理
imputer_reduced = SimpleImputer(strategy='mean')
X_train_reduced_imputed = imputer_reduced.fit_transform(train_data_reduced[numeric_features_reduced])
X_test_reduced_imputed = imputer_reduced.transform(test_data_reduced[numeric_features_reduced])

# LightGBMのGradient Boosting Classifierでモデルを学習
lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(X_train_reduced_imputed, y_train)

# テストデータに対する予測
predictions_reduced = lgb_classifier.predict(X_test_reduced_imputed)

# Accuracyの計算
accuracy_reduced = accuracy_score(y_test, predictions_reduced)

# F1 scoreの計算
f1_reduced = f1_score(y_test, predictions_reduced)

# 結果の表示
print("Accuracy:", accuracy_reduced)
print("F1 Score:", f1_reduced)

Accuracy: 0.8989835481504768
F1 Score: 0.7643031784841076
