In [1]:
import numpy as np
import pandas as pd
#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)

In [2]:
!ls ../../data/procecced/csv/race-2019.csv

ls: ../../data/procecced/csv/race-2019.csv: No such file or directory


In [3]:
race_df = pd.read_csv("../../data/processed/csv/race-2019.csv", sep=",")
horse_df = pd.read_csv("../../data/processed/csv/horse-2019.csv", sep=",")
# for year in range(2009, 2020):
#     race_tmp_df = pd.read_csv("./csv/race-"+str(year)+".csv", sep=",")
#     horse_tmp_df = pd.read_csv("./csv/horse-"+str(year)+".csv", sep=",")
#     race_df = pd.concat([race_df, race_tmp_df], axis=0)
#     horse_df = pd.concat([horse_df, horse_tmp_df], axis=0)

## 元データの確認

In [4]:
# race_id単位で重複したデータが存在しないか確認
print(len(race_df) == len(race_df['race_id'].unique()))
print(race_df.shape)
race_df.tail(2)

True
(0, 28)


Unnamed: 0,race_id,race_round,race_title,race_course,weather,ground_status,time,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3


In [5]:
# 出走馬数の確認
race_df["total_horse_number"].value_counts()

Series([], Name: total_horse_number, dtype: int64)

In [6]:
print(horse_df.shape)
horse_df.head(2)

(0, 19)


Unnamed: 0,race_id,rank,frame_number,horse_number,horse_id,sex_and_age,burden_weight,rider_id,goal_time,goal_time_dif,time_value,half_way_rank,last_time,odds,popular,horse_weight,tame_time,tamer_id,owner_id


## raceデータの整形

In [7]:
race_df.head(1)

Unnamed: 0,race_id,race_round,race_title,race_course,weather,ground_status,time,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3


### race_id
そのままでOK

In [8]:
# 一応確認
race_df["race_id"].dtypes

dtype('O')

### race_round
余分な空白とRを取り除く

In [9]:
race_df["race_round"].dtypes

dtype('O')

In [10]:
race_df['race_round'].unique()

array([], dtype=object)

In [11]:
race_df['race_round'] = race_df['race_round'].str.strip('R \n')

In [12]:
race_df['race_round'].unique()

array([], dtype=object)

In [13]:
race_df['race_round'] = race_df['race_round'].astype(int)
race_df["race_round"].dtypes

dtype('int64')

### race_title
いらないので削除

In [14]:
# もともとのカラムは不要なので削除
race_df.drop(['race_title'], axis=1, inplace=True)

### race_course
「ダ右1200m」などであれば、ダート・右回り・1200に分割して、それぞれ別のカラムにする。

新たに4つのカラムを追加
- 障害コースか？
- 地面のタイプは何か？
- 右回り・左回り・直線か？
- 距離は？

In [15]:
race_df["race_course"].unique()

array([], dtype=object)

In [16]:
# 正規表現で取得

# 障害か、地面のタイプは何か、左か、右か、直線か、
obstacle = race_df["race_course"].str.extract('(障)', expand=True)
ground_type = race_df["race_course"].str.extract('(ダ|芝)', expand=True)
is_left_right_straight = race_df["race_course"].str.extract('(左|右|直線)', expand=True)
distance = race_df["race_course"].str.extract('(\d+)m', expand=True)

obstacle.columns ={"is_obstacle"}
ground_type.columns ={"ground_type"}
is_left_right_straight.columns = {"is_left_right_straight"}
distance.columns = {"distance"}

race_df = pd.concat([race_df, obstacle], axis=1)
race_df = pd.concat([race_df, ground_type], axis=1)
race_df = pd.concat([race_df, is_left_right_straight], axis=1)
race_df = pd.concat([race_df, distance], axis=1)

In [17]:
# 'is_obstacle' 列の '障芝' を1に置き換え、Nanに0埋め
race_df['is_obstacle'] = race_df['is_obstacle'].replace('障', 1)
race_df.fillna(value={'is_obstacle': 0}, inplace=True)

Unnamed: 0,race_id,race_round,race_course,weather,ground_status,time,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3,is_obstacle,ground_type,is_left_right_straight,distance


In [18]:
print("is_obstacle:", race_df["is_obstacle"].unique())
print("ground_type:", race_df["ground_type"].unique())
print("is_left_right_straight:", race_df["is_left_right_straight"].unique())
print("distance isnull sum:", race_df["distance"].isnull().sum())

is_obstacle: []
ground_type: []
is_left_right_straight: []
distance isnull sum: 0


In [19]:
# もともとのカラムは不要なので削除
race_df.drop(['race_course'], axis=1, inplace=True)

In [20]:
race_df["distance"] = race_df["distance"].astype(int)

### weather
そのままone_hotエンコーディングしてデータを食わせても良さそうだが...

余分な文字列を取り除く。

また、少雨よりも雨が強いはず、小雪よりも雪が強いはず。これらの単純な雨量は別のデータを取ってこないと分からないが、大小関係は情報として入れられるはず。

In [21]:
race_df["weather"].unique()

array([], dtype=object)

In [22]:
race_df['weather'] = race_df['weather'].str.strip('天候 :')

In [23]:
race_df["weather"].unique()

array([], dtype=object)

In [24]:
weather_rain = race_df["weather"].str.extract('(小雨|雨)', expand=True)
weather_snow = race_df["weather"].str.extract('(小雪|雪)', expand=True)
weather_rain.columns ={"weather_rain"}
weather_snow.columns ={"weather_snow"}
race_df = pd.concat([race_df, weather_rain], axis=1)
race_df = pd.concat([race_df, weather_snow], axis=1)

race_df.fillna(value={'weather_rain': 0}, inplace=True)
race_df['weather_rain'] = race_df['weather_rain'].replace('小雨', 1)
race_df['weather_rain'] = race_df['weather_rain'].replace('雨', 2)
race_df.fillna(value={'weather_snow': 0}, inplace=True)
race_df['weather_snow'] = race_df['weather_snow'].replace('小雪', 1)
race_df['weather_snow'] = race_df['weather_snow'].replace('雪', 2)

In [25]:
print("weather_rain:", race_df["weather_rain"].value_counts())
print("weather_snow:", race_df["weather_snow"].value_counts())

weather_rain: Series([], Name: weather_rain, dtype: int64)
weather_snow: Series([], Name: weather_snow, dtype: int64)


### ground_status
芝かダートかは既に別カラムにあるので、状態を見る。
大小関係があるので数値として。

In [26]:
race_df["ground_status"].unique()

array([], dtype=object)

In [27]:
race_df['ground_status'] = race_df['ground_status'].replace('.*(稍重).*', 4,regex=True)
race_df['ground_status'] = race_df['ground_status'].replace('.*(重).*', 3,regex=True)
race_df['ground_status'] = race_df['ground_status'].replace('.*(不良).*', 2,regex=True)
race_df['ground_status'] = race_df['ground_status'].replace('.*(良).*', 1,regex=True)


In [28]:
print("ground_status:", race_df["ground_status"].value_counts())

ground_status: Series([], Name: ground_status, dtype: int64)


### time と dateをあわせてdatetimeに

In [29]:
race_df["time"] = race_df["time"].str.replace('発走 : (\d\d):(\d\d)(.|\n)*', r'\1時\2分')

In [30]:
race_df["date"] = race_df["date"] + race_df["time"]

In [31]:
race_df["date"] = pd.to_datetime(race_df['date'], format='%Y年%m月%d日%H時%M分')

In [32]:
# もともとのtimeは不要なので削除
race_df.drop(['time'], axis=1, inplace=True)

In [33]:
print(race_df["date"].dtype)
print("date isnull sum:", race_df["date"].isnull().sum())

datetime64[ns]
date isnull sum: 0


### where_racecourse
例:1回小倉3日目 の中から小倉を取り出す

In [34]:
race_df["where_racecourse"] = race_df["where_racecourse"].str.replace('\d*回(..)\d*日目', r'\1')


In [35]:
# 確認
race_df["where_racecourse"].unique()

array([], dtype=object)

###  馬の数や順位
- total_horse_number                 int64
- frame_number_first                 int64
- horse_number_first                 int64
- frame_number_second                int64
- horse_number_second                int64
- frame_number_third                 int64
- horse_number_third                 int64

これらはそのままでOK

### オッズから余分な「,」を除く
- tansyo                            object
- hukuren_first                     object
- hukuren_second                    object
- hukuren_third                     object
- renhuku3                          object
- rentan3                           object

数値と文字列が混在しているので面倒
```
race_df['tansyo'] = race_df['tansyo'].str.strip(',')
```
などとしてもだめ

In [36]:
race_df.columns

Index(['race_id', 'race_round', 'weather', 'ground_status', 'date',
       'where_racecourse', 'total_horse_number', 'frame_number_first',
       'horse_number_first', 'frame_number_second', 'horse_number_second',
       'frame_number_third', 'horse_number_third', 'tansyo', 'hukusyo_first',
       'hukusyo_second', 'hukusyo_third', 'wakuren', 'umaren', 'wide_1_2',
       'wide_1_3', 'wide_2_3', 'umatan', 'renhuku3', 'rentan3', 'is_obstacle',
       'ground_type', 'is_left_right_straight', 'distance', 'weather_rain',
       'weather_snow'],
      dtype='object')

In [37]:
race_df['tansyo'] = race_df['tansyo'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['hukusyo_first'] = race_df['hukusyo_first'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['hukusyo_second'] = race_df['hukusyo_second'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['hukusyo_third'] = race_df['hukusyo_third'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['wakuren'] = race_df['wakuren'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['umaren'] = race_df['umaren'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['wide_1_2'] = race_df['wide_1_2'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['wide_1_3'] = race_df['wide_1_3'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['wide_2_3'] = race_df['wide_2_3'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['umatan'] = race_df['umatan'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['renhuku3'] = race_df['renhuku3'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))
race_df['rentan3'] = race_df['rentan3'].apply(lambda x: int(x.replace(",", "")) if type(x) is str else int(x))


In [38]:
race_df[race_df['race_id']==200808010709]

Unnamed: 0,race_id,race_round,weather,ground_status,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3,is_obstacle,ground_type,is_left_right_straight,distance,weather_rain,weather_snow


In [39]:
# 確認
race_df['race_id'] = race_df['race_id'].astype(str)
#race_df['race_title'] = race_df['race_title'].astype(str)
print('dataframeの各列のデータ型を確認==>\n', race_df.dtypes)


race_df.head(1)

dataframeの各列のデータ型を確認==>
 race_id                           object
race_round                         int64
weather                           object
ground_status                     object
date                      datetime64[ns]
where_racecourse                  object
total_horse_number                object
frame_number_first                object
horse_number_first                object
frame_number_second               object
horse_number_second               object
frame_number_third                object
horse_number_third                object
tansyo                            object
hukusyo_first                     object
hukusyo_second                    object
hukusyo_third                     object
wakuren                           object
umaren                            object
wide_1_2                          object
wide_1_3                          object
wide_2_3                          object
umatan                            object
renhuku3                        

Unnamed: 0,race_id,race_round,weather,ground_status,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3,is_obstacle,ground_type,is_left_right_straight,distance,weather_rain,weather_snow


### race dataの保存

In [41]:
race_df.to_csv("../../data/processed/csv/cleaned_race_data.csv", index=False )

## horse data の整形

In [42]:
print(horse_df.shape)
print(horse_df.dtypes)
horse_df['race_id'] = horse_df['race_id'].astype(str)
horse_df['horse_id'] = horse_df['horse_id'].astype(str)
horse_df['tamer_id'] = horse_df['tamer_id'].astype(str)
horse_df['owner_id'] = horse_df['owner_id'].astype(str)
horse_df['rider_id'] = horse_df['rider_id'].astype(str)

horse_df.head(2)

(0, 19)
race_id          object
rank             object
frame_number     object
horse_number     object
horse_id         object
sex_and_age      object
burden_weight    object
rider_id         object
goal_time        object
goal_time_dif    object
time_value       object
half_way_rank    object
last_time        object
odds             object
popular          object
horse_weight     object
tame_time        object
tamer_id         object
owner_id         object
dtype: object


Unnamed: 0,race_id,rank,frame_number,horse_number,horse_id,sex_and_age,burden_weight,rider_id,goal_time,goal_time_dif,time_value,half_way_rank,last_time,odds,popular,horse_weight,tame_time,tamer_id,owner_id


In [43]:
# 何かとデータ分析で便利なので、レース日時情報をmerge
race_tmp_df = race_df[["race_id", "date"]]
horse_df = pd.merge(horse_df, race_tmp_df, on='race_id')
horse_df.head()

Unnamed: 0,rank,frame_number,horse_number,horse_id,sex_and_age,burden_weight,rider_id,goal_time,goal_time_dif,time_value,half_way_rank,last_time,odds,popular,horse_weight,tame_time,tamer_id,owner_id,race_id,date


### 使わなさそうな情報を削除
- time_value, tame_time(プレミアム会員向けの情報)
- goal_time_dif(自分で作成する)

In [44]:
horse_df.drop(['time_value'], axis=1, inplace=True)
horse_df.drop(['goal_time_dif'], axis=1, inplace=True)
horse_df.drop(['tame_time'], axis=1, inplace=True)

### race_id
そのままでOK

### rank
> - 降着・・・	「その走行妨害がなければ被害馬が加害馬に先着していた」と判断した場合、加害馬は被害馬の後ろに降着となります。
> - 失格・・・	「極めて悪質で他の騎手や馬に対する危険な行為によって、競走に重大な支障を生じさせた」と判断した場合、加害馬は失格となります。

> 注記：被害馬が落馬や疾病発症等により競走を中止した場合には、上記の「失格」に該当しない限り着順は到達順位のとおり確定します。


- 降格は降格フラグに分割、順位そのまま入れておく
- 取・除はそもそも参加していないので削除
- 失は順位が全く当てにならないので情報を削除
- 中は最後まで到達していないが参加はしている。ひとまず20位にしておく。goal_timeが無いので、大きめに取る必要がある。
- 12(再)は12で最後の模様。そのまま12にする

In [45]:
# 確認
horse_df[horse_df['rank'] =='中'].sort_values('date').head(2)
horse_df[horse_df['rank'] =='取'].sort_values('date').head(2)
horse_df[horse_df['rank'] =='除'].sort_values('date').head(2)
horse_df[horse_df['rank'] =='16(降)'].sort_values('date').head(2)
horse_df[horse_df['rank'] =='12(再)'].sort_values('date').head(2)

Unnamed: 0,rank,frame_number,horse_number,horse_id,sex_and_age,burden_weight,rider_id,goal_time,half_way_rank,last_time,odds,popular,horse_weight,tamer_id,owner_id,race_id,date


In [46]:
# 降格を別へ
is_down = horse_df["rank"].str.extract('(\(降\))', expand=True)
is_down.columns ={"is_down"}
horse_df = pd.concat([horse_df, is_down], axis=1)

horse_df.fillna(value={'is_down': 0}, inplace=True)
horse_df['is_down'] = horse_df['is_down'].replace('(降)', 1)

## 余分な文字を削除
horse_df['rank'] = horse_df['rank'].apply(lambda x: x.replace("(降)", ""))
horse_df['rank'] = horse_df['rank'].apply(lambda x: x.replace("(再)", ""))

In [47]:
"""- 取・除はそもそも参加していないので削除
- 失は順位が全く当てにならないので情報を削除
- 中は最後まで到達していないが参加はしている。ひとまず20位にしておく"""

horse_df = horse_df[(horse_df['rank'] != "取") & (horse_df['rank'] != "除") & (horse_df['rank'] != "失")]
horse_df['rank'] = pd.DataFrame(horse_df['rank'].mask(horse_df['rank'] == "中", 20))

In [48]:
# 確認
horse_df["rank"].value_counts()

Series([], Name: rank, dtype: int64)

### 姓と年齢をsplit

In [49]:
horse_df['sex_and_age'].unique()

array([], dtype=object)

In [50]:
# 性別を別へ

is_senba = horse_df["sex_and_age"].str.extract('(セ)', expand=True)
is_senba.columns ={"is_senba"}
horse_df = pd.concat([horse_df, is_senba], axis=1)

is_mesu = horse_df["sex_and_age"].str.extract('(牝)', expand=True)
is_mesu.columns ={"is_mesu"}
horse_df = pd.concat([horse_df, is_mesu], axis=1)

is_osu = horse_df["sex_and_age"].str.extract('(牡)', expand=True)
is_osu.columns ={"is_osu"}
horse_df = pd.concat([horse_df, is_osu], axis=1)


In [51]:
horse_df.fillna(value={'is_osu': 0}, inplace=True)
horse_df['is_osu'] = horse_df['is_osu'].replace('牡', 1)
horse_df.fillna(value={'is_mesu': 0}, inplace=True)
horse_df['is_mesu'] = horse_df['is_mesu'].replace('牝', 1)
horse_df.fillna(value={'is_senba': 0}, inplace=True)
horse_df['is_senba'] = horse_df['is_senba'].replace('セ', 1)
## 余分な文字を削除
horse_df['sex_and_age'] = horse_df['sex_and_age'].str.strip("牝牡セ")
horse_df['sex_and_age'] = horse_df['sex_and_age'].astype(int)

In [52]:
horse_df = horse_df.rename(columns={'sex_and_age': 'age'})

## goal_timeをtimedelta型にしてから秒に(last_timeも)

In [53]:
# nullになるのは、レースで「中」になった馬
print(horse_df['goal_time'].isnull().sum())
print(horse_df['last_time'].isnull().sum())

0
0


In [54]:
horse_df['goal_time'] = pd.to_datetime(horse_df['goal_time'], format='%M:%S.%f') - pd.to_datetime('00:00.0', format='%M:%S.%f')
horse_df['goal_time'] = horse_df['goal_time'].dt.total_seconds()

In [55]:
# 欠損値を最大値で埋める
horse_df.fillna(value={'goal_time': horse_df['goal_time'].max()}, inplace=True)
horse_df.fillna(value={'last_time': horse_df['last_time'].max()}, inplace=True)

Unnamed: 0,rank,frame_number,horse_number,horse_id,age,burden_weight,rider_id,goal_time,half_way_rank,last_time,odds,popular,horse_weight,tamer_id,owner_id,race_id,date,is_down,is_senba,is_mesu,is_osu


In [56]:
horse_df.dtypes

rank                     object
frame_number             object
horse_number             object
horse_id                 object
age                       int64
burden_weight            object
rider_id                 object
goal_time               float64
half_way_rank            object
last_time                object
odds                     object
popular                  object
horse_weight             object
tamer_id                 object
owner_id                 object
race_id                  object
date             datetime64[ns]
is_down                  object
is_senba                 object
is_mesu                  object
is_osu                   object
dtype: object

### goal_timeとレース距離から、平均速度を求める

In [57]:
# レース距離情報をmerge
race_tmp_df = race_df[["race_id", "distance"]]
horse_df = pd.merge(horse_df, race_tmp_df, on='race_id')

In [58]:
horse_df["distance"] = horse_df["distance"].astype(int)
horse_df["avg_velocity"] = horse_df["distance"]/horse_df["goal_time"]


### half_way_rank
splitして平均値を保持する（レースによってまちまちなので）

In [59]:
from statistics import mean
horse_df["half_way_rank"] = horse_df["half_way_rank"].apply(lambda x: mean([float(n) for n in (x.split("-"))]) if type(x) is str else float(x) )

In [60]:
horse_df[horse_df["rank"] == 20] = horse_df[horse_df["rank"] == 20].fillna({'half_way_rank': 20})
horse_df["half_way_rank"] = horse_df["half_way_rank"].fillna(horse_df['half_way_rank'].mean())
horse_df["half_way_rank"].isnull().sum()

0

In [61]:
horse_df["half_way_rank"] = horse_df["half_way_rank"].astype(float)

### horse_weight と diff の分離
「計不」は平均で穴埋め

In [62]:
horse_weight_dif = horse_df["horse_weight"].str.extract('\(([-|+]?\d*)\)', expand=True)
horse_weight_dif.columns ={"horse_weight_dif"}

horse_df = pd.concat([horse_df, horse_weight_dif], axis=1)

horse_df['horse_weight'] = horse_df['horse_weight'].replace('\(([-|+]?\d*)\)', '', regex=True)



In [63]:
horse_df['horse_weight'] = horse_df['horse_weight'].replace('計不', np.nan)
horse_df['horse_weight'] = horse_df['horse_weight'].astype(float)
horse_df['horse_weight_dif'] = horse_df['horse_weight_dif'].astype(float)

In [64]:
# 計不 の horse_idを探し、馬ごとの平均値で穴埋め
no_records = horse_df[horse_df['horse_weight'].isnull()]['horse_id']
for no_record_id in no_records:
    horse_df.loc[(horse_df['horse_id'] == no_record_id)&(horse_df['horse_weight'].isnull()), 'horse_weight'] = horse_df[horse_df['horse_id'] == no_record_id]['horse_weight'].mean() 
    horse_df.loc[(horse_df['horse_id'] == no_record_id)&(horse_df['horse_weight_dif'].isnull()), 'horse_weight_dif'] = 0 
    

In [65]:
horse_df.dtypes

rank                        object
frame_number                object
horse_number                object
horse_id                    object
age                          int64
burden_weight               object
rider_id                    object
goal_time                  float64
half_way_rank              float64
last_time                   object
odds                        object
popular                     object
horse_weight               float64
tamer_id                    object
owner_id                    object
date                datetime64[ns]
is_down                     object
is_senba                    object
is_mesu                     object
is_osu                      object
race_id                     object
distance                     int64
avg_velocity               float64
horse_weight_dif           float64
dtype: object

### burden_weight, horse_weight の比率を追加

In [66]:
horse_df['burden_weight_rate'] = horse_df['burden_weight']/horse_df['horse_weight']

### last_time
とりあえず放置するが、外れ値の扱いを考えたほうが良さそう。

In [67]:
horse_df.plot(kind='hist', y='last_time' , bins=50, figsize=(16,4), alpha=0.5)

TypeError: no numeric data to plot

In [None]:
horse_df[horse_df['last_time']<20]['race_id'].unique()

In [None]:
race_df[(race_df['race_id']=='200808010804') | (race_df['race_id']=='200806010208') | (race_df['race_id']=='200806010304')]

### odds

In [68]:
horse_df['odds']= horse_df['odds'].astype(float)

### horse dataの保存

In [69]:
print(horse_df.dtypes)
horse_df.head(3)

rank                          object
frame_number                  object
horse_number                  object
horse_id                      object
age                            int64
burden_weight                 object
rider_id                      object
goal_time                    float64
half_way_rank                float64
last_time                     object
odds                         float64
popular                       object
horse_weight                 float64
tamer_id                      object
owner_id                      object
date                  datetime64[ns]
is_down                       object
is_senba                      object
is_mesu                       object
is_osu                        object
race_id                       object
distance                       int64
avg_velocity                 float64
horse_weight_dif             float64
burden_weight_rate            object
dtype: object


Unnamed: 0,rank,frame_number,horse_number,horse_id,age,burden_weight,rider_id,goal_time,half_way_rank,last_time,odds,popular,horse_weight,tamer_id,owner_id,date,is_down,is_senba,is_mesu,is_osu,race_id,distance,avg_velocity,horse_weight_dif,burden_weight_rate


In [71]:
horse_df.to_csv("../../data/processed/csv/cleaned_horse_data.csv", index=False )