## netkeiba.comからスクレイピング

In [None]:
!pip install lxml
!pip install html5lib
!pip install bs4
!pip install imblearn

In [None]:
import pandas as pd

In [None]:
# Sanity check on a single race page.
url = 'https://db.netkeiba.com/race/202009030811/'

# Fetch and parse the page once, then reuse the result; calling
# pd.read_html(url) twice would download the same page twice.
tables = pd.read_html(url)

print(type(tables[0]))
# -> DataFrame

tables[0]
# -> columns: 着順	枠番	馬番	馬名	性齢	斤量	騎手	タイム	着差	単勝	人気	馬体重	調教師

In [None]:
import time

def scrape_race_results(race_id_list: list) -> dict:
    """Scrape netkeiba.com result tables for the given race IDs.

    Each race ID is appended to ``https://db.netkeiba.com/race/`` and the
    first HTML table found on that page is stored under the ID, e.g.
    ``race_results['201901010101'] -> DataFrame`` with the columns
    着順, 枠番, 馬番, 馬名, 性齢, 斤量, 騎手, タイム, 着差, 単勝, 人気, 馬体重, 調教師.

    Args:
        race_id_list: Race IDs to fetch, e.g. ``['201901010101', ...]``.

    Returns:
        Dict mapping race ID to the DataFrame of the first table on its
        page. IDs whose page yields no table are skipped. If any other
        error (or a keyboard interrupt) occurs, scraping stops and the
        results collected so far are returned.
    """
    race_results = {}
    for race_id in race_id_list:
        url = f'https://db.netkeiba.com/race/{race_id}'
        try:
            race_results[race_id] = pd.read_html(url)[0]
        except (IndexError, ValueError):
            # pd.read_html raises ValueError when the page has no tables
            # (it never returns an empty list), so both exceptions are
            # treated as "no result for this ID" and the ID is skipped.
            continue
        except (Exception, KeyboardInterrupt) as exc:
            # Best effort: stop scraping but keep what was collected, and
            # say why instead of swallowing the error silently.
            print(f'scrape_race_results: stopped at {race_id}: {exc!r}')
            break
        finally:
            # Pause after every request, including skipped IDs, so the
            # site is not hit back-to-back.
            time.sleep(0.1)
    return race_results

In [None]:
# Race IDs follow a fixed pattern, so they can be generated mechanically:
# '2019' followed by place, kai, day and race number, each zero-padded to
# two digits. The rightmost field (race number) varies fastest.
race_id_list = [
    f'2019{place:02d}{kai:02d}{day:02d}{r:02d}'
    for place in range(1, 11)
    for kai in range(1, 6)
    for day in range(1, 9)
    for r in range(1, 13)
]
# -> ['201901010101', '201901010102', ...]

In [None]:
# Trial run: scrape only the first 50 generated race IDs.
test = scrape_race_results(race_id_list[:50])

In [None]:
# Label every row with its race ID so the rows stay identifiable once the
# per-race tables are stacked into one DataFrame.
for race_id, frame in test.items():
    print(frame.index)
    frame.index = [race_id] * len(frame)

# Stack all scraped races into a single DataFrame.
results = pd.concat(list(test.values()), sort=False)
# ここまでで、2019年のレースが一つのdfにまとめることができている

In [None]:
# NOTE(review): a bare expression is only displayed by a notebook when it is
# the last statement of the cell, so this line likely shows nothing.
results
# Persist the combined results table to 'results.pickle'.
results.to_pickle('results.pickle')

## データ整形　前処理

In [None]:
import pandas as pd

In [None]:
def preprocessing(results):
    """Clean a raw race-result table into typed, model-ready columns.

    The input is not modified. Rows whose 着順 (finishing position) contains
    any non-digit character are removed, then:

    * 着順 is cast to int,
    * 性齢 is split into 性 (first character) and 年齢 (the rest, as int),
    * 馬体重, formatted like ``'452(-4)'``, is split into 体重 (int before
      the parenthesis) and 体重変化 (int inside the parentheses),
    * 単勝 is cast to float,
    * タイム, 着差, 調教師, 性齢 and 馬体重 are dropped.

    Args:
        results: DataFrame containing at least the columns 着順, 性齢,
            馬体重, 単勝, タイム, 着差 and 調教師.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    df = results.copy()

    # Keep only rows whose finishing position is purely numeric. The raw
    # string r'\D' avoids Python's invalid-escape-sequence warning, and the
    # explicit copy avoids assigning into a filtered view
    # (SettingWithCopyWarning).
    has_non_digit = df['着順'].astype(str).str.contains(r'\D')
    df = df[~has_non_digit].copy()
    df['着順'] = df['着順'].astype(int)

    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

    # Split '体重(変化)' once and reuse both halves; the trailing ')' is
    # stripped from the second half before the integer cast.
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    df['体重変化'] = weight_parts[1].str[:-1].astype(int)

    df['単勝'] = df['単勝'].astype(float)

    return df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1)

In [None]:
# Run the preprocessing step on the combined results; this rebinds `test`
# to the cleaned DataFrame.
test = preprocessing(results)

In [None]:
def clip_rank(x):
    """Return x unchanged when it is below 4, otherwise 4."""
    if x < 4:
        return x
    return 4


# Lump every finishing position from 4th place down into a single class.
test['rank'] = test['着順'].map(clip_rank)

# Check category counts with e.g. test['馬名'].value_counts(), drop the
# columns that have too many distinct values, then one-hot encode the rest.
test.drop(columns=['着順', '馬名'], inplace=True)
test_d = pd.get_dummies(test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Explanatory variables: every column except the target.
X = test_d.drop(columns=['rank'])
# Target variable.
y = test_d['rank']

# Hold out 30% of the rows as test data, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# The model could be trained directly as below, but this notebook
# under-samples first.
# model = LogisticRegression()
# model.fit(X_train,y_train)

# print(model.score(X_train,y_train),model.score(X_test,y_test))

# Try predicting on the test data
# y_pred = model.predict(X_test)

# pd.Series(y_pred).value_counts()

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Count the training rows per rank class once and look up classes 1-3.
rank_counts = y_train.value_counts()
cnt_rank_1, cnt_rank_2, cnt_rank_3 = (rank_counts[label] for label in (1, 2, 3))

In [None]:
# Under-sample the training data: class 4 is cut down to the class-1 count,
# while classes 1-3 keep their current counts.
rus = RandomUnderSampler(
    sampling_strategy={
        1: cnt_rank_1,
        2: cnt_rank_2,
        3: cnt_rank_3,
        4: cnt_rank_1,
    },
    random_state=71,
)

# fit_resample is the supported name; the older fit_sample alias has been
# removed from imbalanced-learn and raises AttributeError there.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Train on the resampled data only.
model = LogisticRegression()
model.fit(X_train_rus, y_train_rus)

# Accuracy on the original (not resampled) training set and on the test set.
print(model.score(X_train, y_train), model.score(X_test, y_test))

In [None]:
# Predict the rank class for the held-out rows.
y_pred = model.predict(X_test)

# Put predictions next to the true labels for a side-by-side look.
comparison = {'pred': y_pred, 'actual': y_test}
pred_df = pd.DataFrame(comparison)

print(pred_df)