In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the learning dataset CSV into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('csv/boat_learning_data.csv')

In [5]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,race_date,place_id,race_no,bracket_no,is_miss,place_id.1,player_grade,branch,born_area,age,...,win_pattern,race_grade,distance,course_direction,weather,temperature,wind,wind_direction,water_temperature,wave_height
0,20190412,4,1,1,False,4,B1,長崎,長崎,37.0,...,,予選,1800,5,曇り,10.0,3.0,1.0,13.0,3.0
1,20190412,4,1,2,False,4,B2,東京,東京,27.0,...,,予選,1800,5,曇り,10.0,3.0,1.0,13.0,3.0
2,20190412,4,1,3,False,4,B1,福岡,福岡,32.0,...,,予選,1800,5,曇り,10.0,3.0,1.0,13.0,3.0
3,20190412,4,1,4,False,4,A2,福井,石川,28.0,...,,予選,1800,5,曇り,10.0,3.0,1.0,13.0,3.0
4,20190412,4,1,5,False,4,B1,東京,東京,50.0,...,まくり差し,予選,1800,5,曇り,10.0,3.0,1.0,13.0,3.0


In [6]:
# For every column, print its name, its dtype, and the value it holds in the
# row at position 5, to get a quick feel for the schema.
sample_row = df.iloc[5]
for col_name in df.columns:
    print(col_name, df[col_name].dtype, sample_row[col_name])

race_date int64 20190412
place_id int64 4
race_no int64 1
bracket_no int64 6
is_miss object False
place_id.1 int64 4
player_grade object B1
branch object 愛知
born_area object 愛知
age float64 43.0
weight float64 50.0
f_count float64 0.0
l_count float64 1.0
start_time_avg float64 0.2
first_rate_all float64 4.38
second_rate_all float64 27.68
third_rate_all float64 39.29
first_rate_area float64 4.88
second_rate_area float64 35.29
third_rate_area float64 52.94
motor_no float64 22.0
motor_within_second_rate float64 31.89
motor_within_third_rate float64 52.43
boat_no float64 58.0
boat_within_second_rate float64 41.14
boat_within_third_rate float64 59.43
pre_time float64 6.81
tilt_angle float64 -0.5
propeller object nan
parts object nan
adjust_weight float64 1.0
pre_start_timing object F.04
finish_order float64 5.0
player_race_time object  
start_timing float64 0.43
win_pattern object nan
race_grade object 予選
distance int64 1800
course_direction int64 5
weather object 曇り
temperature float64 10.0

In [7]:
# Inspect the dtype pandas inferred for the 'pre_start_timing' column.
df['pre_start_timing'].dtype

dtype('O')

ValueError: invalid literal for int() with base 10: '×'

In [14]:
# List the distinct values (NaN included) found in 'course_direction'.
df['course_direction'].unique()

array([ 1.,  3., 15., 13.,  9., 11., 12.,  4.,  6.,  5.,  7., 17., 14.,
       10.,  2.,  8., 16., nan])

In [22]:
# Part names that may appear in the 'parts' column, mapped to the number of
# characters in each name. parts_count() uses that length as the offset from
# the start of the name to the '×N' multiplier that may follow it.
PARTS = {
    part_name: len(part_name)
    for part_name in (
        'キャリボ',
        'ピストン',
        'リング',
        '電気',
        'キャブ',
        'ギヤ',
        'シリンダ',
        'シャフト',
    )
}
def parts_count(x, key, num):
    """Return how many units of part ``key`` are recorded in parts text ``x``.

    Args:
        x: Raw value from the ``parts`` column. It is converted with
            ``str()``, so NaN becomes ``'nan'`` and matches no part name.
        key: Part name to look for.
        num: Number of characters ``key`` occupies, i.e. the offset from the
            start of the name to the optional ``'×'`` multiplier marker.

    Returns:
        int: 0 when ``key`` does not occur in ``x``; the number written after
        ``'×'`` when that marker directly follows the part name; 1 otherwise.
    """
    text = str(x)
    start = text.find(key)
    if start == -1:
        return 0

    # Only honour a '×' that sits directly after THIS part name. Checking for
    # '×' anywhere in the string would let another part's multiplier leak in
    # (and index past the end of the string when this part comes last).
    suffix = text[start + num:]
    if suffix.startswith('×'):
        digits = ''
        for ch in suffix[1:]:
            if not ch.isdecimal():
                break
            digits += ch
        if digits:
            # Always return an int so the resulting column has a single dtype.
            return int(digits)
    return 1

def drop_miss(df):
    """Return the rows of ``df`` whose ``is_miss`` value compares equal to False."""
    not_missed = df['is_miss'].eq(False)
    return df.loc[not_missed]
    
def preprocessing(df):
    """Add calendar, cyclic-direction, flying-start and parts features to ``df``.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned.

    Columns read: ``race_date``, ``course_direction``, ``wind_direction``,
    ``pre_start_timing``, ``parts``.

    Columns written: ``month``, ``date``, ``month_cos/sin``, ``date_cos/sin``,
    ``course_direction_cos/sin``, ``wind_direction_cos/sin``,
    ``course_wind_direction_cos/sin``, ``pre_f_flg``, and for every name in
    ``PARTS`` a ``<name>_flg`` and ``<name>_count`` column.

    Args:
        df: DataFrame holding the columns listed above; ``race_date`` is an
            integer in YYYYMMDD form.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    two_pi = 2 * np.pi
    # Fixed periods keep the encoding independent of which rows are present.
    # Dividing by the observed maximum would, for a frame covering a single
    # month, map every row to the same angle.
    months_per_year = 12
    max_days_per_month = 31

    # Month and day of month, extracted from the YYYYMMDD integer.
    df['month'] = (df['race_date'] % 10000) // 100
    df['date'] = df['race_date'] % 100

    # Encode cyclic quantities as points on the unit circle.
    # Month
    month_angle = two_pi * df['month'] / months_per_year
    df['month_cos'] = np.cos(month_angle)
    df['month_sin'] = np.sin(month_angle)
    # Day of month
    date_angle = two_pi * df['date'] / max_days_per_month
    df['date_cos'] = np.cos(date_angle)
    df['date_sin'] = np.sin(date_angle)

    # NOTE(review): the direction period is still taken from the largest code
    # observed in the data; confirm the true number of direction codes and
    # replace these with constants so the encoding is not data dependent.
    course_frac = df['course_direction'] / df['course_direction'].max()
    wind_frac = df['wind_direction'] / df['wind_direction'].max()
    # Course direction
    df['course_direction_cos'] = np.cos(two_pi * course_frac)
    df['course_direction_sin'] = np.sin(two_pi * course_frac)
    # Wind direction
    df['wind_direction_cos'] = np.cos(two_pi * wind_frac)
    df['wind_direction_sin'] = np.sin(two_pi * wind_frac)
    # Wind direction relative to the course direction
    relative_angle = two_pi * (wind_frac - course_frac)
    df['course_wind_direction_cos'] = np.cos(relative_angle)
    df['course_wind_direction_sin'] = np.sin(relative_angle)

    # Flying-start flag for the pre-race start timing. The marker is matched
    # case-insensitively: a lowercase-only search never matches values
    # written with an uppercase 'F' such as 'F.01'.
    df['pre_f_flg'] = df['pre_start_timing'].apply(lambda v: 'F' in str(v).upper())

    # One presence flag and one count column per known part name.
    # PARTS maps name -> name length, so iterate over items(), not values().
    for key, num in PARTS.items():
        df[key + '_flg'] = df['parts'].apply(lambda v: key in str(v))
        df[key + '_count'] = df['parts'].apply(parts_count, key=key, num=num)

    return df

In [25]:
# Inspect the raw 'pre_start_timing' values (pandas truncates the display of
# a long Series to its first and last rows).
df['pre_start_timing']

0          .14
1          .13
2         F.01
3          .09
4         F.01
5         F.04
6          .08
7          .38
8          .16
9          .06
10         .08
11         .09
12         .07
13        F.04
14         .09
15         .06
16         .13
17         .02
18         .04
19         .08
20         .08
21        F.01
22         .06
23         .10
24         .09
25         .02
26         .16
27        F.02
28         .01
29        F.06
          ... 
202560    F.11
202561    F.09
202562     .01
202563    F.11
202564    F.07
202565    F.02
202566    F.07
202567     .01
202568     .10
202569     .31
202570     .16
202571     .07
202572     .13
202573     .04
202574     .04
202575     .31
202576     .03
202577     .02
202578     .09
202579    F.07
202580     .02
202581     .18
202582     .01
202583    F.04
202584    F.03
202585    F.01
202586     .07
202587    F.02
202588     .02
202589     .06
Name: pre_start_timing, Length: 202590, dtype: object