<a href="https://colab.research.google.com/github/monda00/horse-race-notebook/blob/master/make_import_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# アプリにインポートするデータ形式に変換する

# ライブラリ・データ読み込み

In [1]:
import numpy as np
import pandas as pd
import re
import collections

In [2]:
# Base directory (Google Drive path as seen from Colab) for every CSV read or written below.
# The trailing slash matters: file names are appended with plain string concatenation.
# NOTE(review): no drive.mount(...) call is visible in this notebook — presumably Drive is
# mounted before running; confirm.
DATA_PATH = '/content/drive/My Drive/data/horse-race/'

In [3]:
# Load the race-level table and the per-horse result table from DATA_PATH.
# NOTE(review): the frames displayed later contain 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# which suggests a previously saved index is being read back as data — confirm whether
# index_col should be set here.
race_df = pd.read_csv(DATA_PATH + 'race_ex.csv')
horse_df = pd.read_csv(DATA_PATH + 'horse_ex.csv')

# 前処理

アプリへのインポートに必要なデータだけを残し、インポート形式に合わせて整形する。

## ７頭以下のレースを削除

In [4]:
# Number of horse_df rows per race_id, sorted from most to fewest rows.
horse_df['race_id'].value_counts()

202008030303    18
201904021003    18
201910020707    18
201907010209    18
201908030105    18
                ..
201945051701     4
201946061610     4
202035052403     4
201947070506     3
201935071404     3
Name: race_id, Length: 24221, dtype: int64

In [5]:
# Remove races with too few runners from both frames.
# A race is dropped when horse_df holds MAX_DROPPED_FIELD_SIZE rows or fewer for its race_id.
MAX_DROPPED_FIELD_SIZE = 7

# Boolean Series indexed by race_id: True where the race is to be dropped.
drop_races_s = horse_df['race_id'].value_counts() <= MAX_DROPPED_FIELD_SIZE
drop_race_id = drop_races_s[drop_races_s].index

# One vectorized membership test per frame instead of re-filtering the whole
# frame once for every dropped race id.
race_df = race_df[~race_df['race_id'].isin(drop_race_id)]
horse_df = horse_df[~horse_df['race_id'].isin(drop_race_id)]

In [6]:
# Re-check the per-race row counts after the small races were removed.
horse_df['race_id'].value_counts()

201910010805    18
201910010107    18
201906050612    18
201908030105    18
201904030106    18
                ..
201936082609     8
201954021002     8
201950070403     8
201943080912     8
202047061904     8
Name: race_id, Length: 22630, dtype: int64

## 性別と年齢の分割

In [7]:
# Split the combined 'age' strings into a leading single character (kept as `gen`)
# and the 1-2 digit number that follows it (kept as `age`, still as strings).
# str.extract applies the same pattern once per value in a vectorized pass,
# replacing the per-row iloc lookup and the duplicated regex search.
sex_age_parts = horse_df['age'].str.extract(r'(.)(\d{1,2})')

# Fail loudly on values that do not match instead of carrying NaN into the export.
if sex_age_parts.isna().any().any():
  raise ValueError("horse_df['age'] contains a value that is not '<char><1-2 digits>'")

# Plain lists, one element per horse_df row, in row order.
gen = sex_age_parts[0].tolist()
age = sex_age_parts[1].tolist()

In [8]:
# Overwrite 'age' with the extracted digits and add 'gen' with the extracted leading
# character. Both are plain lists, so they are assigned by position and must contain
# exactly one element per horse_df row.
horse_df['age'] = age
horse_df['gen'] = gen

## 負担重量と体重と増減

In [9]:
# Keep the original 'weight' values under 'burden_weight' before a later cell overwrites
# 'weight' with the number parsed out of 'horse_weight'.
# Order-dependent: re-running this cell after that overwrite would copy the parsed values instead.
horse_df['burden_weight'] = horse_df['weight']

In [10]:
# Parse 'horse_weight' strings such as '511(-5)' into the leading number (`weight`, int)
# and the text inside the parentheses (`weight_diff`, str, sign preserved).
# Rows holding the placeholder '計不' keep that placeholder in both lists.
# Iterating the column directly avoids building a full row Series per iteration,
# and each value is matched against the pattern once instead of twice.
weight_pattern = re.compile(r'(.*)(\(.*?\))')

weight = []
weight_diff = []
for horse_weight in horse_df['horse_weight']:
  if horse_weight == '計不':
    weight.append('計不')
    weight_diff.append('計不')
    continue
  matched = weight_pattern.search(horse_weight)
  weight.append(int(matched.group(1)))
  # Strip the surrounding parentheses from the captured '(…)' group.
  weight_diff.append(matched.group(2).replace('(', '').replace(')', ''))

In [11]:
# Overwrite 'weight' with the parsed leading number and add 'weight_diff' with the
# parenthesised change; rows with the '計不' placeholder carry it in both columns.
# The lists are assigned by position, one element per horse_df row.
horse_df['weight'] = weight
horse_df['weight_diff'] = weight_diff

## 日時

In [12]:
# Join 'date' and 'start_time' with a space and parse the result into one datetime column.
# Relies on race_df['date'] still being a string here; a later cell converts 'date' itself
# to datetime, after which this concatenation would no longer work on re-run.
race_df['date_time'] = pd.to_datetime(race_df['date'] + ' ' + race_df['start_time'])

## 出走馬数

In [13]:
# Number of horse_df rows per race, attached to race_df by race_id.
# Mapping on the key (rather than assigning a list by position) stays correct even if
# race_df's row order differs from the order in which races first appear in horse_df.
horses_per_race = horse_df['race_id'].value_counts()

# A race with no rows in horse_df gets 0 rather than failing on a length mismatch.
race_df['race_horse_number'] = (
    race_df['race_id'].map(horses_per_race).fillna(0).astype(int)
)

## 計測不能の置換

In [14]:
# Replace every cell equal to '計不' with the integer 0, across all columns of horse_df
# (this includes the raw 'horse_weight' column, not only 'weight' / 'weight_diff').
# NOTE(review): after this step a placeholder row looks like weight 0 / diff 0 — confirm
# the importing app treats 0 as "not measured".
horse_df = horse_df.replace('計不', 0)

## データ数を制限（2020年6月1日以降のレースのみ）

In [15]:
# Convert the date columns to datetime so they can be compared against a date cutoff.
# race_df['date'] is converted only here, after 'date_time' was already built from its
# string form in an earlier cell.
race_df['date'] = pd.to_datetime(race_df['date'])
horse_df['race_date'] = pd.to_datetime(horse_df['race_date'])

In [16]:
# Keep only rows dated strictly after the cutoff; both frames use the same cutoff
# but are filtered independently on their own date column.
cutoff_date = '2020-05-31'

race_is_recent = race_df['date'] > cutoff_date
horse_is_recent = horse_df['race_date'] > cutoff_date

race_df = race_df[race_is_recent]
horse_df = horse_df[horse_is_recent]

# 確認

In [17]:
# Display the processed per-horse frame for a visual check.
horse_df

Unnamed: 0.1,Unnamed: 0,agari,age,frame_number,horse_number,horse_weight,jockey,name,popular,race_date,race_id,race_name,rank,time,weight,win,gen,burden_weight,weight_diff
0,0,38.1,3,7.0,13,511(-5),森泰斗,ワイルドホース,2.0,2020-06-22,202044062212,タイタン賞競走,1,1:28.5,511,3.7,牡,54.0,-5
1,1,38.1,5,6.0,11,475(-2),山崎誠士,クインズオライリー,12.0,2020-06-22,202044062212,タイタン賞競走,2,1:28.7,475,81.4,牝,54.0,-2
2,2,38.6,6,3.0,4,469(+2),西啓太,メダーリアフレイム,4.0,2020-06-22,202044062212,タイタン賞競走,3,1:28.8,469,8.6,セ,56.0,+2
3,3,37.9,4,8.0,14,526(+6),藤本現暉,ゴールドプリンス,8.0,2020-06-22,202044062212,タイタン賞競走,4,1:28.8,526,34.4,牡,56.0,+6
4,4,38.5,5,4.0,7,475(+6),藤田凌,イグレット,9.0,2020-06-22,202044062212,タイタン賞競走,5,1:28.9,475,34.8,牝,54.0,+6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10890,11978,36.5,4,2.0,2,462(+11),岩本怜,ユズ,9.0,2020-06-01,202035060102,C2,8,1:02.4,462,57.9,牝,54.0,+11
10891,11981,38.2,6,7.0,7,479(+4),鈴木祐,キタスクワート,8.0,2020-06-01,202035060101,C2,6,1:02.5,479,105.3,牡,56.0,+4
10892,11982,38.2,6,2.0,2,458(-2),山本聡哉,スエヒロワンダー,3.0,2020-06-01,202035060101,C2,7,1:02.6,458,5.5,牡,56.0,-2
10893,11983,38.4,5,3.0,3,451(+1),木村暁,エナジーウェーブ,5.0,2020-06-01,202035060101,C2,8,1:03.2,451,11.1,牝,54.0,+1


In [18]:
# Display the processed per-race frame for a visual check.
race_df

Unnamed: 0.1,Unnamed: 0,clockwise,date,distance,field_condition,field_type,name,place,race_id,race_round,start_time,weather,date_time,race_horse_number
0,0,右,2020-06-22,1400,不,ダ,タイタン賞競走,大井,202044062212,12R,20:50,雨,2020-06-22 20:50:00,15
1,1,右,2020-06-22,1800,不,ダ,ポートサイド賞競走,大井,202044062211,11R,20:10,雨,2020-06-22 20:10:00,15
2,2,右,2020-06-22,1200,不,ダ,C1九　十11,大井,202044062210,10R,19:30,雨,2020-06-22 19:30:00,14
3,3,右,2020-06-22,1400,不,ダ,C1九　十11,大井,202044062209,9R,18:55,雨,2020-06-22 18:55:00,14
4,4,右,2020-06-22,1600,不,ダ,C1九　十11,大井,202044062208,8R,18:20,雨,2020-06-22 18:20:00,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1104,左,2020-06-01,1200,良,ダ,C2七組,盛岡,202035060105,5R,14:20,晴,2020-06-01 14:20:00,10
996,1105,左,2020-06-01,1200,良,ダ,C2八組,盛岡,202035060104,4R,13:45,晴,2020-06-01 13:45:00,9
997,1106,左,2020-06-01,1200,良,ダ,C2九組,盛岡,202035060103,3R,13:10,晴,2020-06-01 13:10:00,9
998,1107,左,2020-06-01,1000,良,ダ,C2,盛岡,202035060102,2R,12:40,晴,2020-06-01 12:40:00,9


# エクスポート

In [19]:
# Columns written to the per-horse import file, in output order.
use_columns_horse = [
    'race_id',
    'horse_number',
    'frame_number',
    'age',
    'gen',
    'weight',
    'weight_diff',
    'burden_weight',
]

# Columns written to the per-race import file, in output order.
use_columns_race = [
    'race_id',
    'name',
    'place',
    'race_horse_number',
    'distance',
    'clockwise',
    'field_type',
    'field_condition',
    'weather',
    'date_time',
]

In [20]:
# Select the export columns from each frame, then write them under DATA_PATH
# without the DataFrame index.
horse_export = horse_df[use_columns_horse]
race_export = race_df[use_columns_race]

horse_export.to_csv(DATA_PATH + 'import_horse.csv', index=False)
race_export.to_csv(DATA_PATH + 'import_race.csv', index=False)

In [21]:
# Preview the per-horse columns that were exported.
horse_df[use_columns_horse]

Unnamed: 0,race_id,horse_number,frame_number,age,gen,weight,weight_diff,burden_weight
0,202044062212,13,7.0,3,牡,511,-5,54.0
1,202044062212,11,6.0,5,牝,475,-2,54.0
2,202044062212,4,3.0,6,セ,469,+2,56.0
3,202044062212,14,8.0,4,牡,526,+6,56.0
4,202044062212,7,4.0,5,牝,475,+6,54.0
...,...,...,...,...,...,...,...,...
10890,202035060102,2,2.0,4,牝,462,+11,54.0
10891,202035060101,7,7.0,6,牡,479,+4,56.0
10892,202035060101,2,2.0,6,牡,458,-2,56.0
10893,202035060101,3,3.0,5,牝,451,+1,54.0


In [22]:
# Preview the per-race columns that were exported.
race_df[use_columns_race]

Unnamed: 0,race_id,name,place,race_horse_number,distance,clockwise,field_type,field_condition,weather,date_time
0,202044062212,タイタン賞競走,大井,15,1400,右,ダ,不,雨,2020-06-22 20:50:00
1,202044062211,ポートサイド賞競走,大井,15,1800,右,ダ,不,雨,2020-06-22 20:10:00
2,202044062210,C1九　十11,大井,14,1200,右,ダ,不,雨,2020-06-22 19:30:00
3,202044062209,C1九　十11,大井,14,1400,右,ダ,不,雨,2020-06-22 18:55:00
4,202044062208,C1九　十11,大井,12,1600,右,ダ,不,雨,2020-06-22 18:20:00
...,...,...,...,...,...,...,...,...,...,...
995,202035060105,C2七組,盛岡,10,1200,左,ダ,良,晴,2020-06-01 14:20:00
996,202035060104,C2八組,盛岡,9,1200,左,ダ,良,晴,2020-06-01 13:45:00
997,202035060103,C2九組,盛岡,9,1200,左,ダ,良,晴,2020-06-01 13:10:00
998,202035060102,C2,盛岡,9,1000,左,ダ,良,晴,2020-06-01 12:40:00
