In [None]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd
import os
from google.cloud import storage

In [None]:
def create_netkeiba_url_potential(year_start, year_end, racecourse_codes=("01",)):
    """
    Create a DataFrame with potential netkeiba URLs for scraping.

    Every candidate race ID is the concatenation of the year, the two-digit
    racecourse code, and the session number, day number and race number,
    each zero-padded to two digits. The URL is the race ID appended to
    ``https://db.netkeiba.com/race/``. Whether a race actually exists at a
    candidate URL is not checked here.

    Args:
        year_start (int): The starting year for scraping.
        year_end (int): The ending year for scraping (inclusive).
        racecourse_codes (sequence of str): Two-digit racecourse codes to
            enumerate. Defaults to ``("01",)``, the list that was previously
            hard-coded; pass "01" through "10" to cover every known course.

    Returns:
        pd.DataFrame: One row per candidate race with the columns
        ``url``, ``current_race_id``, ``racecourse_i`` (0-based position of
        the code within ``racecourse_codes``), ``place`` (racecourse name,
        ``""`` for an unknown code), ``race_number_i`` (0-based race number)
        and ``year``. Empty (with the same columns) when
        ``year_start > year_end``.
    """
    place_by_code = {
        "01": "札幌",
        "02": "函館",
        "03": "福島",
        "04": "新潟",
        "05": "東京",
        "06": "中山",
        "07": "中京",
        "08": "京都",
        "09": "阪神",
        "10": "小倉",
    }
    base_url = "https://db.netkeiba.com/race/"
    columns = ['url', 'current_race_id', 'racecourse_i', 'place', 'race_number_i', 'year']

    # The loop bounds are kept from the original code: it probes 7 sessions
    # and 13 days although its comments speak of 6 sessions and 12 days
    # (presumably a safety margin - TODO confirm).
    session_count = 6 + 1
    day_count = 12 + 1
    race_count = 12

    # Collect plain records and build the DataFrame once; appending with
    # .loc[len(df)] re-allocates the frame on every row.
    records = []
    for year in range(year_start, year_end + 1):
        for racecourse_i, code in enumerate(racecourse_codes):
            place = place_by_code.get(code, "")
            for session_number in range(1, session_count + 1):
                for event_day in range(1, day_count + 1):
                    # :02d pads single digits only. The previous code
                    # prefixed "0" unconditionally, so days 10+ became
                    # three characters ("010") and produced invalid IDs.
                    race_id = f"{year}{code}{session_number:02d}{event_day:02d}"
                    for race_number_i in range(race_count):
                        current_race_id = f"{race_id}{race_number_i + 1:02d}"
                        records.append([
                            base_url + current_race_id,
                            current_race_id,
                            racecourse_i,
                            place,
                            race_number_i,
                            year,
                        ])
    return pd.DataFrame(records, columns=columns)



In [None]:
# First and last year to collect (the last year is included).
year_start, year_end = 2019, 2019
netkeiba_url_potential_df = create_netkeiba_url_potential(
    year_start=year_start,
    year_end=year_end,
)
netkeiba_url_potential_df

In [None]:
def download_read_netkeiba_url_scraped_csv_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """
    Download a CSV file from a GCS bucket and read it into a DataFrame.

    Args:
    bucket_name (str): The name of the GCS bucket.
    source_blob_name (str): The name of the blob in the GCS bucket.
    destination_file_name (str): The name of the file to save the downloaded CSV.

    Returns:
    pd.DataFrame: DataFrame containing the data from the downloaded CSV file.
    An empty DataFrame is returned when the blob does not exist, when the
    local file is missing after the download, or when the CSV has no content,
    so the function never returns None.
    """
    # Client -> bucket -> blob handle; nothing is transferred yet.
    blob = storage.Client().bucket(bucket_name).blob(source_blob_name)

    if not blob.exists():
        print(f"The file {source_blob_name} does not exist in the bucket {bucket_name}.")
        return pd.DataFrame()

    blob.download_to_filename(destination_file_name)

    # Previously this path fell off the end of the function and returned None.
    if not os.path.exists(destination_file_name):
        print(f"The file {destination_file_name} was not found after downloading {source_blob_name}.")
        return pd.DataFrame()

    try:
        return pd.read_csv(destination_file_name)
    except pd.errors.EmptyDataError:
        # A zero-byte CSV means nothing has been recorded yet.
        print(f"The file {destination_file_name} is empty.")
        return pd.DataFrame()

In [None]:
# Fetch the list of URLs that have already been scraped from GCS.
netkeiba_url_scraped_df = download_read_netkeiba_url_scraped_csv_from_gcs(
    bucket_name="dev-kh-gcs-bucket",
    source_blob_name="data/netkeiba_url_scraped.csv",
    destination_file_name="netkeiba_url_scraped.csv",
)
netkeiba_url_scraped_df

In [None]:
def remove_scraped_urls(netkeiba_url_potential_df, netkeiba_url_scraped_df):
    """
    Drop candidate rows whose URL has already been scraped.

    Args:
        netkeiba_url_potential_df (pd.DataFrame): Candidate rows; must have a 'url' column.
        netkeiba_url_scraped_df (pd.DataFrame or None): Already scraped rows with a 'url' column.

    Returns:
        pd.DataFrame: The candidate rows whose 'url' is not present in the
        scraped frame (original index kept). When the scraped frame is None
        or empty, the candidate frame itself is returned unchanged.
    """
    nothing_scraped = netkeiba_url_scraped_df is None or netkeiba_url_scraped_df.empty
    if nothing_scraped:
        print("netkeiba_url_scraped_df is None or empty. Returning netkeiba_url_potential_df as is.")
        return netkeiba_url_potential_df

    # Boolean mask marking the candidates that were scraped before.
    already_scraped = netkeiba_url_potential_df['url'].isin(netkeiba_url_scraped_df['url'])
    return netkeiba_url_potential_df[~already_scraped]

In [None]:
# Show the candidate URLs that still have to be scraped.
remove_scraped_urls(
    netkeiba_url_potential_df=netkeiba_url_potential_df,
    netkeiba_url_scraped_df=netkeiba_url_scraped_df,
)

In [None]:
def _fetch_race_soup(url):
    """Download one race page and return it parsed with BeautifulSoup."""
    try:
        r = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        # Sending too many requests can fail, so wait 10 seconds and retry once.
        print(f"Error: {e}")
        print("Retrying in 10 seconds...")
        time.sleep(10)
        r = requests.get(url, timeout=30)
    # Decode explicitly as EUC-JP and drop undecodable bytes
    # (workaround kept from the original code).
    return BeautifulSoup(r.content.decode("euc-jp", "ignore"), "html.parser")


def _parse_race_info(soup):
    """Extract the race-level fields that are shared by every row of a race page.

    Raises IndexError when the page does not have the expected layout.
    """
    spans = soup.find_all("span")
    # The span describing the course is looked up at index 8, then 7, then 6,
    # exactly as the original nested try/except blocks did.
    last_error = None
    for span_index in (8, 7, 6):
        try:
            parts = str(spans[span_index]).split("/")
            course = parts[0].split(">")[1]
            sur = course[0]
            rou = course[1]
            dis = course.split("m")[0][-4:]
            con = parts[2].split(":")[1][1]
            wed = parts[1].split(":")[1][1]
            break
        except IndexError as e:
            last_error = e
    else:
        raise last_error

    smalltxt_fields = str(soup.find_all("p", class_="smalltxt")).split(">")[1].split(" ")
    return {
        'title': str(soup.find_all("h1")[1]).split(">")[1].split("<")[0],
        'date': smalltxt_fields[0],
        'detail': smalltxt_fields[1],
        'clas': smalltxt_fields[2].replace(u'\xa0', u' ').split(" ")[0],
        'sur': sur,
        'dis': dis,
        'rou': rou,
        'con': con,
        'wed': wed,
    }


def _parse_result_row(cols):
    """Turn the <td> cells of one result row into the horse-level CSV fields."""

    def optional_text(index):
        # These columns were already treated as optional by the original code.
        try:
            return cols[index].text.strip()
        except IndexError:
            return ''

    # Body weight looks like "480(+2)"; anything else is recorded as 0 / 0.
    weight_text = cols[14].text.strip()
    try:
        weight = int(weight_text.split("(")[0])
        weight_dif = int(weight_text.split("(")[1][0:-1])
    except (ValueError, IndexError):
        # IndexError: a number without "(...)" used to crash the whole run.
        weight = 0
        weight_dif = 0

    sex_age = cols[4].text.strip()
    return [
        cols[3].text.strip(),   # horse name
        cols[6].text.strip(),   # jockey name
        cols[2].text.strip(),   # horse number
        optional_text(7),       # finish time
        cols[12].text.strip(),  # odds
        optional_text(10),      # passing order
        cols[0].text.strip(),   # finishing position
        weight,                 # body weight
        weight_dif,             # body weight change
        sex_age[0],             # sex
        sex_age[1:],            # age (the original kept only one character)
        cols[5].text.strip(),   # carried weight
        optional_text(11),      # last-section time
        optional_text(13),      # popularity
    ]


def scrape_netkeiba_data(year_start, year_end):
    """
    Scrape netkeiba race results and write one CSV file per year.

    For every year the candidate URLs come from
    create_netkeiba_url_potential(year, year). Each candidate page is
    downloaded; pages without a result table are reported and skipped, and
    after two consecutive missing pages the remaining races of that day are
    not requested. The collected rows are written to 'data/<year>.csv'
    (SHIFT-JIS encoded, header row first).

    Args:
        year_start (int): The first year to scrape.
        year_end (int): The last year to scrape (inclusive).

    Returns:
        None. The results are written to disk and progress is printed.

    Raises:
        IndexError: When a race page does not have the expected layout.
        requests.exceptions.RequestException: When a page cannot be
            downloaded even after the single retry.
    """
    # Header row of the output file; the order must match the rows built below.
    header = ['race_id','馬','騎手','馬番','走破時間','オッズ','通過順','着順','体重','体重変化','性','齢','斤量','上がり','人気','レース名','日付','開催','クラス','芝・ダート','距離','回り','馬場','天気','場id','場名']

    os.makedirs('data', exist_ok=True)

    for year in range(year_start, year_end + 1):
        race_data_all = [header]
        candidates = create_netkeiba_url_potential(year, year)

        # NOTE(review): the previous draft also meant to stop scanning a
        # session once a day without any race was found, but its counter was
        # never incremented. That early exit is not implemented here, so
        # every candidate day is probed (two requests for a missing day).
        current_day = None
        consecutive_missing = 0
        skip_rest_of_day = False

        for cand in candidates.itertuples(index=False):
            # A race ID is the day's ID followed by a two-digit race number.
            day_key = cand.current_race_id[:-2]
            if day_key != current_day:
                current_day = day_key
                consecutive_missing = 0
                skip_rest_of_day = False
            if skip_rest_of_day:
                continue

            soup = _fetch_race_soup(cand.url)
            # Pick the result table.
            main_table = soup.find("table", {"class": "race_table_01 nk_tb_common"})
            if main_table is None:
                print('continue: ' + cand.url)
                consecutive_missing += 1
                if consecutive_missing == 2:
                    skip_rest_of_day = True
                continue
            consecutive_missing = 0

            # Race-level fields are identical for every row, so parse them once.
            info = _parse_race_info(soup)
            race_fields = [
                info['title'],   # race name
                info['date'],    # date
                info['detail'],  # meeting
                info['clas'],    # class
                info['sur'],     # turf or dirt
                info['dis'],     # distance
                info['rou'],     # direction
                info['con'],     # track condition
                info['wed'],     # weather
            ]

            # Skip the table's header row.
            for row in main_table.find_all("tr")[1:]:
                horse_fields = _parse_result_row(row.find_all("td"))
                race_data_all.append(
                    [cand.current_race_id] + horse_fields + race_fields + [cand.racecourse_i, cand.place]
                )

            # Show progress.
            print(info['detail'] + str(cand.race_number_i + 1) + "R")

        # One output file per year; adjust the location and file name as needed.
        with open('data/'+str(year)+'.csv', 'w', newline='', encoding="SHIFT-JIS") as f:
            csv.writer(f).writerows(race_data_all)
        print("終了")