In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [2]:
def url_to_soup(url):
    req = requests.get(url)
    
    return BeautifulSoup(req.content, 'html.parser')

In [3]:
def get_race_data(result_url):
    soup = url_to_soup(result_url)
    
    going_ = soup.select("table.tb01")[3].text.replace('\n','　').split('　')[8]#予想レースの馬場状態
    wether_ = soup.select("table.tb01")[3].text.replace('\n','　').split('　')[6]#予想レースの天候
    len_ = int(soup.find(id="race-data01-a").get_text().replace('\n','').split('　')[3].replace(',','')[1:5])#予想レースの距離

    win = int(re.sub('\<.*?\>','',str(soup.find_all('tr', class_='bg-1chaku')[0]).split('</td>')[2]).replace('\n',''))
    
    return going_, wether_, len_, win

In [4]:
def horse_page_link(uma_info_url):
    soup = url_to_soup(uma_info_url)
    link_lst = ['https://www.nankankeiba.com'+x.get('href') for x in soup.find_all('a', class_='tx-mid tx-low')]
    
    return link_lst

In [5]:
def uma_info(url, going_, wether_, len_):
    uma_df = pd.io.html.read_html(url)#馬のページから過去レースの表を取得
    df = pd.DataFrame(uma_df[5])#表をdf形式にする
    
    feature = df.iloc[1:11,[1,4,5,8,9,10]]#表から欲しい特徴量の列だけ取得
    feature = pd.concat([feature,feature[5].str.split('/', expand=True),feature[8].str.split('/', expand=True)], axis=1).drop([5,8],axis=1)#天候と馬場をバラしてdfに追加
    feature.columns = range(0, len(feature.columns))#カラム名変えるために、カラムの番号をふる
    feature.rename(columns={0:'place',1:'len',2:'time',3:'gap',4:'wether',5:'going',6:'rank',7:'cnt'},inplace=True)#カラム名変え
    
    
    feature['place'].where((feature['place'] =='大井')|(feature['place']=='大井☆'), 0,inplace=True)#placeが大井と大井☆以外の要素を0にする
    feature.loc[(feature['place']=='大井')|(feature['place']=='大井☆'), 'place'] = 1#placeが大井か大井☆の要素を1にする
    
    
    feature['wether'].where(feature['wether'] == wether_, 0,inplace=True)#予想対象レースの天候と一致してなかったら0
    feature.loc[feature['wether']==wether_, 'wether'] = 1#予想対象レースの天候と一致してたら1

    
    feature['going'].where(feature['going'] == going_, 0,inplace=True)#予想対象レースの馬場と一致してなかったら0
    feature.loc[feature['going']==going_, 'going'] = 1#予想対象レースの馬場と一致してたら1
    
    #走行時間を秒に変換
    try:
        base_time = pd.to_datetime('00:00.0', errors='coerce', format='%M:%S.%f')
        feature['time'] = pd.to_datetime(feature['time'], errors='coerce', format='%M:%S.%f') - base_time
        feature['time'] = feature['time'].dt.total_seconds()
    except ValueError:
        base_time = pd.to_datetime('00.0', errors='coerce',format='%S.%f')
        feature['time'] = pd.to_datetime(feature['time'], errors='coerce', format='%S.%f') - base_time
        feature['time'] = feature['time'].dt.total_seconds()
    
    feature.loc[feature.gap.str.endswith(('除外','止','取消')),'gap'] = np.nan#レース中止の場合、gapの要素をNaNにする
    feature.loc[feature.len.str.endswith('芝')] = np.nan#JRAのレースのデータをNaNにする
    feature.fillna(feature.median(),inplace=True)#NaNに中央値を埋める
    
    
    feature['len'] = abs(pd.Series(feature['len'],dtype=int) - len_)#予想対象レースの距離との差を絶対値にして返す

    return feature

In [6]:
def pass_url(url):
    soup = url_to_soup(url)
    sp1 = soup.select('div em a')
    sp2 = str(sp1).split(',')
    sp3 = [i for i in sp2 if '大' in i]
    
    pr_url=[]
    ends = ['01','02','03','04','05','06','07','08','09','10','11','12']
    race_info_list=[]
    result_list=[]
    
    for i in range(len(sp3)):
        p = sp3[i].split('.')
        for x in range(len(ends)):
            pr_url.append(p[0]+ends[x]+'.'+p[1])
            
    for i in pr_url:
        race_info_list.append(i.split('\n')[0].split('"')[1].replace('program','race_info'))
        result_list.append(i.split('\n')[0].split('"')[1].replace('program','result'))
        
    return race_info_list,result_list



mother_url = ['https://www.nankankeiba.com/calendar/201204.do',
'https://www.nankankeiba.com/calendar/201210.do',
'https://www.nankankeiba.com/calendar/201304.do',
'https://www.nankankeiba.com/calendar/201310.do',
'https://www.nankankeiba.com/calendar/201404.do',
'https://www.nankankeiba.com/calendar/201410.do',
'https://www.nankankeiba.com/calendar/201504.do',
'https://www.nankankeiba.com/calendar/201510.do',
'https://www.nankankeiba.com/calendar/201604.do',
'https://www.nankankeiba.com/calendar/201610.do',
'https://www.nankankeiba.com/calendar/201704.do',
'https://www.nankankeiba.com/calendar/201710.do',
'https://www.nankankeiba.com/calendar/201804.do']

race_info_lst=[]
result_lst=[]
for url in mother_url:
    race_info_lst.append(pass_url(url)[0])
    result_lst.append(pass_url(url)[1])

new_race_info_lst = race_info_lst[0]
new_result_lst = result_lst[0]
for i in range(12):
    new_race_info_lst = new_race_info_lst+race_info_lst[i]
    new_result_lst = new_result_lst+result_lst[i]


new_race_info_lst.remove('/race_info/2018012420170301.do')
new_race_info_lst.remove('/race_info/2018012420170302.do')
new_race_info_lst.remove('/race_info/2018012420170303.do')
new_race_info_lst.remove('/race_info/2018012320170201.do')
new_race_info_lst.remove('/race_info/2018012320170202.do')
new_race_info_lst.remove('/race_info/2018012320170203.do')
new_race_info_lst.remove('/race_info/2018012320170204.do')
new_race_info_lst.remove('/race_info/2018012320170205.do')
new_race_info_lst.remove('/race_info/2018012320170206.do')
new_race_info_lst.remove('/race_info/2018012320170207.do')
new_race_info_lst.remove('/race_info/2018012320170208.do')
new_race_info_lst.remove('/race_info/2018012320170209.do')
new_race_info_lst.remove('/race_info/2018012320170210.do')
new_race_info_lst.remove('/race_info/2018012320170211.do')
new_race_info_lst.remove('/race_info/2018012320170212.do')
new_result_lst.remove('/result/2018012420170301.do')
new_result_lst.remove('/result/2018012420170302.do')
new_result_lst.remove('/result/2018012420170303.do')
new_result_lst.remove('/result/2018012320170201.do')
new_result_lst.remove('/result/2018012320170202.do')
new_result_lst.remove('/result/2018012320170203.do')
new_result_lst.remove('/result/2018012320170204.do')
new_result_lst.remove('/result/2018012320170205.do')
new_result_lst.remove('/result/2018012320170206.do')
new_result_lst.remove('/result/2018012320170207.do')
new_result_lst.remove('/result/2018012320170208.do')
new_result_lst.remove('/result/2018012320170209.do')
new_result_lst.remove('/result/2018012320170210.do')
new_result_lst.remove('/result/2018012320170211.do')
new_result_lst.remove('/result/2018012320170212.do')

result_ = ['https://www.nankankeiba.com'+x for x in new_result_lst]
race_info_ = ['https://www.nankankeiba.com'+x for x in new_race_info_lst]

In [7]:
def summary(result_, race_info_):
    going_ = get_race_data(result_)[0]
    wether_ = get_race_data(result_)[1]
    len_ = get_race_data(result_)[2]
    win = get_race_data(result_)[3]

    link_lst = horse_page_link(race_info_)
    
    df_ = pd.DataFrame()
    
    #頭数足りない時用の0埋めデータフレーム作り
    fill_z = np.zeros((10,8))
    zero_ = pd.DataFrame(fill_z)
    zero_.rename(columns={0:'place',1:'len',2:'time',3:'gap',4:'wether',5:'going',6:'rank',7:'cnt'},inplace=True)
    #レース数足りない時用の0埋めデータフレーム作り
    fill_z_ = np.zeros((1,8))
    zero_row = pd.DataFrame(fill_z_)
    zero_row.rename(columns={0:'place',1:'len',2:'time',3:'gap',4:'wether',5:'going',6:'rank',7:'cnt'},inplace=True)
    
    #df_に全特徴量をまとめる
    for i in range(len(link_lst)):
        if len(uma_info(link_lst[i], going_, wether_, len_).index) < 10:
            df_ = df_.append(uma_info(link_lst[i], going_, wether_, len_))
            for x in range(10 - len(uma_info(link_lst[i], going_, wether_, len_).index)):
                df_ = df_.append(zero_row)
        else:
            df_ = df_.append(uma_info(link_lst[i], going_, wether_, len_))
        
    #16頭立てじゃないとき用の0埋め
    for i in range(16-len(link_lst)):
        df_ = df_.append(zero_)
        
    
    #各インプット正規化
    df_['len'] = df_['len']/df_['len'].max()
    df_['time'] = df_['time']/df_['time'].max()
    df_['gap'] = pd.Series(df_['gap'],dtype=float)/pd.Series(df_['gap'],dtype=float).max()
    df_['rank'] = pd.Series(df_['rank'],dtype=float)/pd.Series(df_['rank'],dtype=float).max()
    df_['cnt'] = pd.Series(df_['cnt'],dtype=float)/pd.Series(df_['cnt'],dtype=float).max()
    
    #df_をflattenデータにする
    data_summary = df_.round(5).values.flatten()
    
    return data_summary ,win

In [9]:
sum_array = np.zeros((0,1280))

In [12]:
len(result_)

7869

In [11]:
from tqdm import tqdm
import time

for i in tqdm(range(100)):
    time.sleep(0.1)

for i in range(len(result_)):

	a=summary(result_[i], race_info_[i])[0]
	sum_array = np.vstack((sum_array,a))

100%|██████████| 100/100 [00:11<00:00,  8.66it/s]


TypeError: '>=' not supported between instances of 'str' and 'float'