1-Sep-2019 to 12-Jul-2020

In [1]:
#Import relevant modules
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time
import re
import pandas as pd

In [2]:
# Scraper state for the 2019/2020 season (1 Sep 2019 to 12 Jul 2020).
# URL_racing_res is the results page that will be fetched next; the date and
# race-number parts are extracted from it so they can be advanced separately.
URL_racing_res = (
    'https://racing.hkjc.com/racing/information/English/Racing/'
    'LocalResults.aspx?RaceDate=2020/07/15&RaceNo=1'
)
# 'YYYY/MM/DD' portion of the URL.
URL_date_suffix = re.search(r'\d{4}/\d{2}/\d{2}', URL_racing_res).group()
# Trailing digits of the URL (the race number), kept as a string.
URL_race_no_suffix = re.search(r'\d+$', URL_racing_res).group()
# Race days ('YYYY/MM/DD') that are treated as international events.
# NOTE(review): '2020/07/04' is listed twice; the first occurrence may be a
# typo for another date - confirm against the fixture list.
internation_race_day = [
    '2019/09/29', '2019/10/13', '2019/10/19', '2019/11/05',
    '2019/11/10', '2019/11/24', '2019/12/22', '2020/07/04',
    '2020/02/01', '2020/02/20', '2020/03/07', '2020/04/04',
    '2020/04/11', '2020/06/01', '2020/06/06', '2020/06/16',
    '2020/06/17', '2020/06/18', '2020/06/19', '2020/06/20',
    '2020/06/28', '2020/07/04',
]

In [3]:
# Start the Chrome session that is used to load the result pages.
# Change the path below to the location of chromedriver on your machine.
driver = webdriver.Chrome('/Users/Hei/Applications/chromedriver')


In [4]:
def create_pref_df():
    """Build the per-horse performance DataFrame from the current page.

    Reads the module-level ``soup``.

    Returns:
        tuple: ``(df_table, num_of_horse)`` where ``df_table`` holds one row
        of 12 stripped cell texts per group of cells in the first
        ``performance`` element, and ``num_of_horse`` is the number of
        ``<td>`` cells returned by ``get_html_from_soup`` divided by 12
        (truncated).

    Raises:
        IndexError: if the page has no element with class ``performance``
            (or ``get_html_from_soup`` finds no usable element).
    """
    global soup

    # Width of the performance table.
    num_of_columns = 12

    # The horse count is derived from the <td> cells found by
    # get_html_from_soup, independently of the rows built below.
    num_of_table_element = len(get_html_from_soup(soup))
    num_of_horse = num_of_table_element // num_of_columns

    list_columns = [
        'place', 'horse_no', 'horse', 'jockey', 'trainer', 'actual_weight',
        'declared_horse_weight', 'draw', 'lbw', 'running_position',
        'finish_time', 'win_odds',
    ]

    # Stripped text of every <td> inside the first "performance" element.
    performance_blocks = soup.find_all(class_="performance")
    cells_per_block = [block.find_all('td') for block in performance_blocks]
    cell_texts = [cell.text.strip() for cell in cells_per_block[0]]

    # Cut the flat list into rows of 12. The first group is skipped
    # (presumably the header row - confirm against the page markup) and any
    # incomplete trailing group is ignored.
    group_count = len(cell_texts) // num_of_columns
    rows = [
        cell_texts[num_of_columns * idx:num_of_columns * idx + num_of_columns]
        for idx in range(1, group_count)
    ]

    return pd.DataFrame(rows, columns=list_columns), num_of_horse

def get_html_from_soup(soup):
    """Return all ``<td>`` elements under the selected ``f_fs12`` element.

    Of the elements with class ``f_fs12``, the first one and the last two are
    discarded; the first remaining element is searched for ``<td>`` tags.

    Raises:
        IndexError: if nothing remains after discarding those elements.
    """
    matches = soup.find_all(class_="f_fs12")
    remaining = matches[1:-2]
    return remaining[0].find_all('td')

def create_race_info_df():
    """Build a DataFrame repeating the current race's details once per horse.

    Reads the module-level ``soup``, ``num_of_horse`` and ``URL_date_suffix``.
    The race details are parsed from the concatenated text of the ``<td>``
    cells of the first ``race_tab`` element; the location is parsed from the
    first ``f_fl f_fs13`` element.

    Returns:
        pandas.DataFrame: ``num_of_horse`` identical rows with the columns
        ``race, class, going, turf, prize, location, date``. With
        ``num_of_horse == 0`` an empty frame with those columns is returned.

    Raises:
        IndexError: if an expected element or ``findall`` pattern is missing.
        AttributeError: if a ``re.search`` pattern does not match.
    """
    global soup, num_of_horse, URL_date_suffix

    race_tabs = soup.find_all(class_="race_tab")
    header_text = "".join(cell.text for cell in race_tabs[0].find_all('td'))

    # Location is whatever follows the first "word + 2 or more spaces" run.
    banner = soup.find_all(class_="f_fl f_fs13")[0].text
    location = re.search(r'\w+\s{2,}(.+)$', banner).group(1)

    # Every pattern is a raw string so the regex escapes are not treated as
    # (invalid) Python string escapes. The parsing does not depend on the
    # horse, so it is done once rather than once per row.
    race_no = re.findall(r"RACE\s(\d+)", header_text)[0]
    race_index = re.findall(r"RACE.+(\(\d+\))", header_text)[0]
    race_class = re.search(r'\)(.+)Going', header_text).group(1)
    # The trailing \w+ requires a word character right after the going, so a
    # short alternative such as GOOD cannot win over GOOD TO FIRM.
    going = re.findall(r"Going \:(FIRM|GOOD TO FIRM|GOOD|GOOD TO YIELDING|YIELDING|YIELDING TO SOFT|SOFT|HEAVY|GOOD TO SOFT|WET FAST|FAST|SLOW|WET SLOW|RAIN AFFECTED|NORMAL WATERING)\w+", header_text)[0]
    course = re.findall(r"Course \:(.+)HK", header_text)[0]
    prize = re.findall(r"HK\$ \d+\,\d*\,*\d*", header_text)[0]

    race_row = [race_no + race_index, race_class, going, course, prize,
                location, URL_date_suffix]
    column_names = ['race', 'class', 'going', 'turf', 'prize', 'location', 'date']

    # Passing the columns to the constructor keeps the zero-row case valid.
    return pd.DataFrame([list(race_row) for _ in range(num_of_horse)],
                        columns=column_names)

def create_data_df(performance, race_info):
    """Join the performance and race-info frames side by side.

    The two frames are concatenated column-wise, aligned on their index.
    """
    frames = [performance, race_info]
    return pd.concat(frames, axis="columns")

def get_num_of_race():
    """Return the highest race number shown by the race-card tab images.

    Every ``<img>`` on the page (module-level ``soup``) whose markup contains
    a ``racecard_rt`` image source is inspected, and the number following
    ``racecard_rt_`` is collected.

    Raises:
        ValueError: if no such image is present (``max`` of an empty list).
        AttributeError: if a matching image has no digits after
            ``racecard_rt_``.
    """
    tab_image = re.compile('.+src="/racing/info/StaticFile/Images/Racing/racecard_rt.+')
    race_numbers = []
    for tag in soup.find_all('img'):
        markup = str(tag)
        if not tab_image.match(markup):
            continue
        found = re.search(r'Racing/racecard_rt_(\d+)', markup)
        race_numbers.append(int(found.group(1)))

    return max(race_numbers)

def get_next_race():
    """Advance the scraper to the next race and return its URL.

    Updates the module-level ``URL_date_suffix`` and ``URL_race_no_suffix``.
    If the current race day still has a higher race number, only the race
    number is incremented. Otherwise the date moves to the entry that
    precedes the current date in the page's date list, skipping any date
    listed in ``internation_race_day``, and the race number resets to 1.

    Raises:
        ValueError: if the current date is not in the page's date list.
    """
    global soup, URL_date_suffix, URL_race_no_suffix

    def as_url_date(label):
        # 'DD/MM/YYYY' (date-list format) -> 'YYYY/MM/DD' (URL format).
        return label[-4:] + label[2:6] + label[:2]

    last_race_no = get_num_of_race()
    following_race_no = int(URL_race_no_suffix) + 1

    if following_race_no <= last_race_no:
        # Same race day, next race.
        URL_race_no_suffix = str(following_race_no)
    else:
        # Race day exhausted: read the date list shown on the page.
        date_box = soup.find(class_="f_fs11")
        date_options = date_box.text.replace('\n', ',')[1:-1].split(',')

        # Current date rewritten from 'YYYY/MM/DD' to 'DD/MM/YYYY'.
        current_label = URL_date_suffix[-2:] + URL_date_suffix[4:8] + URL_date_suffix[:4]

        # NOTE(review): index - 1 wraps to the last entry when the current
        # date is the first one in the list - confirm this cannot happen.
        candidate = date_options[date_options.index(current_label) - 1]
        while as_url_date(candidate) in internation_race_day:
            candidate = date_options[date_options.index(candidate) - 1]

        URL_date_suffix = as_url_date(candidate)
        URL_race_no_suffix = '1'

    return ''.join([
        'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=',
        URL_date_suffix,
        '&RaceNo=',
        URL_race_no_suffix,
    ])

def check_race():
    """Tell whether the current page carries race details.

    Returns True when the module-level ``soup`` contains an element with
    class ``race_tab`` (the race is scheduled) and False otherwise (the race
    is treated as cancelled).
    """
    return bool(soup.find(class_="race_tab"))

def isIntRace():
    """Return True if the current date is listed as an international race day.

    Compares the module-level ``URL_date_suffix`` against
    ``internation_race_day``; returns False when it is not listed.
    """
    return URL_date_suffix in internation_race_day

In [None]:
# Accumulator for the whole run: performance columns followed by race-info columns.
df_output = pd.DataFrame(columns=[
    'place', 'horse_no', 'horse', 'jockey', 'trainer', 'actual_weight',
    'declared_horse_weight', 'draw', 'lbw', 'running_position', 'finish_time',
    'win_odds', 'race', 'class', 'going', 'turf', 'prize', 'location', 'date',
])
while True:
    # Load the page in Chrome and parse the rendered HTML.
    # If the page comes back incomplete, lengthen the sleep.
    driver.get(URL_racing_res)
    time.sleep(5)
    subhtml = driver.page_source
    soup = BeautifulSoup(subhtml, 'html.parser')

    # International race day: nothing is scraped, move straight on.
    if isIntRace():
        URL_racing_res = get_next_race()
        continue

    # Only scheduled races have data; cancelled ones are skipped.
    if check_race():
        # Performance table (also yields the horse count used by the
        # race-info step through the module-level num_of_horse).
        df_perf, num_of_horse = create_pref_df()
        # Race details repeated once per horse.
        df_race_info = create_race_info_df()
        # Side-by-side combination of the two frames.
        df_data = create_data_df(df_perf, df_race_info)
        # Append this race to the accumulated output.
        df_output = pd.concat([df_output, df_data], axis=0)

    # Stop after race 9 of 2020/07/15; otherwise advance to the next race.
    if URL_date_suffix == '2020/07/15' and URL_race_no_suffix == '9':
        break
    URL_racing_res = get_next_race()

In [None]:
# Display the accumulated results table.
df_output

In [None]:
# Display the date and race number the scraper currently points at.
URL_date_suffix,URL_race_no_suffix

In [None]:
# Display the most recently built results-page URL.
URL_racing_res

In [None]:
# Write the collected rows to a CSV file, leaving out the DataFrame index.
df_output.to_csv('2020_0715_performance.csv',index=False)