In [None]:
# ロジティック回帰で競馬予想

In [11]:
import pandas as pd

# ピクルファイルからデータを読み込む
results = pd.read_pickle('results.pickle')

# 関数を取得
def preprocessing(results):
  df = results.copy() # 関数にいれたデータを保持したまま新しいデータをコピー

  # 着順に数字以外の文字列が含まれているものを取り除く
  df = df[~(df['着 順'].astype(str).str.contains('\D'))] # ~は否定を示す,着順で1~9以外を省いたデータを取得
  df['着 順'] = df['着 順'].astype(int) # 整数型に変換
  
  # 性齢を性と年齢に分ける
  df['性'] = df['性齢'].map(lambda x: str(x)[0]) # xに値を代入していく
  df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int) 

  # 馬体重を体重と体重変化に分ける
  df['体重'] = df['馬体重'].str.split('(', expand = True )[0].astype(int) #数字型に変換
  df['体重変化'] = df['馬体重'].str.split('(', expand = True )[1].str[:-1].astype(int) #数字型に変換

  # データをint,floatに変換
  df['単勝'] = df['単勝'].astype(float)

  # 不要な列を削除
  df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1, inplace=True)
  
  return df

In [12]:
results2 = preprocessing(results)

In [13]:
results2

Unnamed: 0,着 順,枠 番,馬 番,馬名,斤量,騎手,単勝,人 気,性,年齢,体重,体重変化
201902010101,1,6,12,アドマイヤデルタ,56.0,吉田隼人,4.0,3.0,牡,3,490,32
201902010101,2,3,5,ティレニア,54.0,藤岡佑介,3.7,2.0,牝,3,442,4
201902010101,3,4,7,ドゥシャンパーニュ,54.0,北村友一,2.9,1.0,牝,3,428,-4
201902010101,4,5,10,シャイニーブランコ,56.0,岩田康誠,10.3,5.0,牡,3,442,2
201902010101,5,4,8,シュンカジョウ,54.0,松岡正海,57.5,8.0,牝,3,438,14
...,...,...,...,...,...,...,...,...,...,...,...,...
201910020812,10,7,12,オーシャンスケイプ,52.0,大野拓弥,45.7,10.0,牝,3,414,6
201910020812,11,3,3,プリヴェット,55.0,藤岡康太,29.8,9.0,牝,4,486,2
201910020812,12,5,7,アドマイヤクィーン,55.0,松山弘平,29.6,8.0,牝,4,476,6
201910020812,13,8,13,タムロドリーム,49.0,亀田温心,134.8,12.0,牝,3,472,2


In [15]:
# 4位以下は一つの関数にまとめる
clip_rank = lambda x: x if x < 4 else 4

results2['rank'] = results2['着 順'].map(clip_rank)

In [16]:
results2['rank'].value_counts()

rank
4    33745
1     3193
3     3188
2     3185
Name: count, dtype: int64

In [18]:
results2['騎手'].value_counts()

騎手
和田竜二    822
三浦皇成    804
松山弘平    802
岩田康誠    776
大野拓弥    709
       ... 
永島太郎      1
山下雅之      1
阪野学       1
竹吉徹       1
石川慎将      1
Name: count, Length: 187, dtype: int64

In [21]:
results2.drop(['着 順', '馬名'], axis=1, inplace=True)

In [22]:
results_d = pd.get_dummies(results2)

In [23]:
results_d

Unnamed: 0,枠 番,馬 番,斤量,単勝,人 気,年齢,体重,体重変化,rank,騎手_アヴドゥ,...,騎手_高田潤,騎手_高野和馬,騎手_鮫島克駿,騎手_鮫島良太,騎手_黒岩悠,騎手_黛弘人,騎手_Ｍ．デム,性_セ,性_牝,性_牡
201902010101,6,12,56.0,4.0,3.0,3,490,32,1,False,...,False,False,False,False,False,False,False,False,False,True
201902010101,3,5,54.0,3.7,2.0,3,442,4,2,False,...,False,False,False,False,False,False,False,False,True,False
201902010101,4,7,54.0,2.9,1.0,3,428,-4,3,False,...,False,False,False,False,False,False,False,False,True,False
201902010101,5,10,56.0,10.3,5.0,3,442,2,4,False,...,False,False,False,False,False,False,False,False,False,True
201902010101,4,8,54.0,57.5,8.0,3,438,14,4,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201910020812,7,12,52.0,45.7,10.0,3,414,6,4,False,...,False,False,False,False,False,False,False,False,True,False
201910020812,3,3,55.0,29.8,9.0,4,486,2,4,False,...,False,False,False,False,False,False,False,False,True,False
201910020812,5,7,55.0,29.6,8.0,4,476,6,4,False,...,False,False,False,False,False,False,False,False,True,False
201910020812,8,13,49.0,134.8,12.0,3,472,2,4,False,...,False,False,False,False,False,False,False,False,True,False


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = results_d.drop(['rank'], axis=1)
y = results_d['rank']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)

In [27]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
print(model.score(X_train, y_train), model.score(X_test, y_test))

0.7791338193092984 0.7791288286901646


In [33]:
#アンダーサンプリング
from imblearn.under_sampling import RandomUnderSampler

rank_1 = y_train.value_counts()[1]
rank_2 = y_train.value_counts()[2]
rank_3 = y_train.value_counts()[3]

rus = RandomUnderSampler(ratio={1:rank_1, 2:rank_2, 3:rank_3, 4:rank_1}, random_state=71)
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)

TypeError: RandomUnderSampler.__init__() got an unexpected keyword argument 'ratio'