Todo
* Scrape home and away team names in scrape_match()
* Gather data from a season and build training pipeline
* Train on limited data and evaluate
* Gather and train on more data if the initial training run looks promising

In [None]:
from bs4 import BeautifulSoup
import datetime
import numpy as np
import pandas as pd
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import sys
import time
from tqdm.auto import tqdm
from webdriver_manager.chrome import ChromeDriverManager

sys.path.append('../..')
import ScraperFC as sfc

In [None]:
#=======================================================================================================================
def get_match_links(year=None, timeout=30):
    """
    Collect the match links from the oddsportal EPL results pages of one season.

    Uses the module-level Selenium ``driver``. Starting from the season's results
    page, the links on the current page are gathered and the "Next" pagination
    button is clicked, until a page without a "Next" button is reached.

    Args:
        year: Year in which the season ends (2021 requests the 2020-2021 page).
            year=None for current season.
        timeout: Maximum number of seconds to wait for a pagination button to be
            present after clicking "Next".

    Returns:
        List of unique match URLs, in the order in which they were first seen.

    Raises:
        selenium.common.exceptions.TimeoutException: If neither a "next" nor a
            "prev" button is found within ``timeout`` seconds of a click.
    """
    if not year:
        url = 'https://www.oddsportal.com/soccer/england/premier-league/results/'
    else:
        url = f'https://www.oddsportal.com/soccer/england/premier-league-{year-1}-{year}/results/'

    driver.get(url)

    def pagination_loaded(drv):
        # Truthy as soon as a "next" or "prev" button is present in the page source
        page = BeautifulSoup(drv.page_source, 'html.parser')
        return any(el.text.lower() in ('next', 'prev') for el in page.find_all('p'))

    match_links = list()
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser') # update soup
        anchors = soup.find_all('a',{'class': re.compile('flex-col')}, href=True)
        match_links += ['https://oddsportal.com'+el['href'] for el in anchors if 'premier-league' in el['href']]

        next_button = [el for el in soup.find_all('p') if 'next'==el.text.lower()]
        if not next_button:
            # No "Next" button on this page, so there is nothing further to visit
            break

        # Click through JavaScript after scrolling the button into view
        next_button = driver.find_element(By.XPATH, sfc.xpath_soup(next_button[0]))
        driver.execute_script('arguments[0].scrollIntoView()', next_button)
        driver.execute_script('arguments[0].click()', next_button)

        # Bounded wait for the next or prev buttons to load. WebDriverWait polls
        # with a pause between attempts instead of spinning on page_source forever.
        WebDriverWait(driver, timeout).until(pagination_loaded)

    # dict.fromkeys drops repeated links while keeping first-seen order
    return list(dict.fromkeys(match_links))

#=======================================================================================================================
def scrape_match(url):
    """
    Scrape one oddsportal match page into a single-row DataFrame.

    Uses the module-level Selenium ``driver``. The match date, the two team
    names and the final score are read from the page at ``url``; the 1X2 and
    over/under odds are then fetched by get_1X2odds_from_match() and
    get_OUodds_from_match(), each of which loads the page again itself.

    Args:
        url: URL of the match page.

    Returns:
        DataFrame with one row. Its first columns are 'Date', 'Team1', 'Team2',
        'Result' ('1' if the first team scored more, 'X' for a draw, '2'
        otherwise), 'Goals1', 'Goals2' and 'Total goals', followed by the
        columns of the two odds frames.

    Raises:
        IndexError: If fewer than two team names or no final result are found.
        ValueError: If the date or the score text cannot be parsed.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Date
    date = soup.find('div', {'class': re.compile('start-time')}).parent.text.replace('\xa0', ' ')
    date = datetime.datetime.strptime(date, '%A, %d %b %Y, %H:%M')

    # Team names: text of the <p> that shares a parent with each team logo image.
    # img.get() is used because indexing a tag that has no src attribute raises
    # KeyError, which would abort the whole match over an unrelated image.
    teams = [
        img.parent.find('p').text for img in soup.find_all('img')
        if 'team-logo' in (img.get('src') or '')
    ]
    team1 = teams[0]
    team2 = teams[1]

    # Goals and result: first <strong> whose parent text mentions "Final" and "result"
    final_result = [
        el.text for el in soup.find_all('strong')
        if re.search('(?=.*Final)(?=.*result)', el.parent.text)
    ][0]
    score_parts = final_result.split(':')
    goals1 = int(score_parts[0])
    goals2 = int(score_parts[1])
    result = '1' if goals1>goals2 else ('X' if goals1==goals2 else '2')

    match_info = pd.Series(
        {
            'Date': date,
            'Team1': team1,
            'Team2': team2,
            'Result': result,
            'Goals1': goals1,
            'Goals2': goals2,
            'Total goals': goals1 + goals2,
        },
        dtype=object,
    )

    # Scrape odds
    match1X2odds_df = get_1X2odds_from_match(url)
    matchOUodds_df = get_OUodds_from_match(url)

    # Turn the Series into a one-row frame and put the odds columns next to it
    match_df = match_info.to_frame().T
    match_df = pd.concat([match_df, match1X2odds_df, matchOUodds_df], axis=1)

    return match_df

#=======================================================================================================================
def get_1X2odds_from_match(url):
    """
    Scrape the 1X2 odds table of a match into a single-row DataFrame.

    Uses the module-level Selenium ``driver``. '#1X2' is appended to ``url``
    when it is not already part of it, and the page is loaded.

    Args:
        url: URL of the match page.

    Returns:
        DataFrame with one row and, per table row kept, the columns
        '<name> 1', '<name> X', '<name> 2' and '<name> po %'. <name> is the
        bookie name for bookie rows and the text of the row's first cell for
        the aggregate (average and max odds) rows. A price shown as '-' is
        stored as None.

    Raises:
        IndexError: If the page has fewer than two 'flex flex-col' divs.
    """
    if '#1X2' not in url:
        url += '#1X2'
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def to_float(text):
        # A lone dash is stored as None, as get_OUodds_from_match does
        return None if text == '-' else float(text)

    odds_df = pd.Series(dtype=object)

    odds_table = soup.find_all('div', {'class':'flex flex-col'})[1]
    for row in odds_table.find_all('div', {'class':re.compile('flex text-xs')}):
        bookie_info = row.find_all('a')
        odds = row.find_all('div', recursive=False)

        # Skip some rows
        if (
            len(odds) < 5 # Odds not formatted right (cells 0-4 are all read below)
            or 'coupon' in odds[0].text.lower() # Coupon row
            or odds[4].text == ' - ' # Odds are crossed out (payout column is a dash)
        ):
            continue

        if len(bookie_info) <= 1:
            # Average and max odds rows are labelled by their first cell
            label = odds[0].text
        else:
            # Row with odds from a bookie: the second link holds the bookie name
            label = bookie_info[1].text

        # Odds 0 is the bookie info div
        odds_df[f'{label} 1'] = to_float(odds[1].text)
        odds_df[f'{label} X'] = to_float(odds[2].text)
        odds_df[f'{label} 2'] = to_float(odds[3].text)
        odds_df[f'{label} po %'] = float(odds[4].text.replace('%',''))

    return odds_df.to_frame().T

#=======================================================================================================================
def get_OUodds_from_match(url, timeout=30):
    """
    Scrape the over/under odds of a match into a single-row DataFrame.

    Uses the module-level Selenium ``driver``. '#over-under' is appended to
    ``url`` when it is not already part of it, and the page is loaded. Every
    handicap row is clicked to expand its odds, and the rows of the odds table
    are then read.

    Args:
        url: URL of the match page.
        timeout: Maximum number of seconds to wait for the handicaps table,
            and for the odds table after each click.

    Returns:
        DataFrame with one row and, per odds row kept, the columns
        '<name> <handicap> over', '<name> <handicap> under' and
        '<name> <handicap> po %'. <name> is the bookie name for bookie rows and
        the text of the row's first cell for the aggregate rows. A price shown
        as '-' is stored as None.

    Raises:
        selenium.common.exceptions.TimeoutException: If a table that is waited
            for does not show up within ``timeout`` seconds.
    """
    if '#over-under' not in url:
        url += '#over-under'
    driver.get(url)

    def find_handicap_tables(drv):
        # Matching divs once at least two exist (index 1 is used), else False
        page = BeautifulSoup(drv.page_source, 'html.parser')
        tables = page.find_all('div', {'class':'min-md:px-[10px]'})
        return tables if len(tables) >= 2 else False

    def find_odds_tables(drv):
        # Matching divs once at least two exist (index 1 is used), else False
        page = BeautifulSoup(drv.page_source, 'html.parser')
        tables = page.find_all('div', {'class': 'flex flex-col'})
        return tables if len(tables) >= 2 else False

    # Wait for handicaps table to load. WebDriverWait polls with a pause and
    # gives up after `timeout` seconds instead of spinning forever.
    handicaps_table = WebDriverWait(driver, timeout).until(find_handicap_tables)[1]
    handicap_rows = handicaps_table.find_all('div', {'class':'relative flex flex-col'}, recursive=False)

    odds_df = pd.Series(dtype=object)
    for handicap_row in handicap_rows:
        handicap = handicap_row.find('p').text.replace('Over/Under','').strip()

        # Click on handicap row to expand odds
        row_button = driver.find_element(By.XPATH, sfc.xpath_soup(handicap_row.find('div')))
        driver.execute_script('arguments[0].scrollIntoView()', row_button)
        driver.execute_script('arguments[0].click()', row_button)

        # Wait for odds table to load
        odds_table = WebDriverWait(driver, timeout).until(find_odds_tables)[1]
        for odds_row in odds_table.find_all('div', {'class':re.compile('flex text-xs')}):
            bookie_info = odds_row.find_all('a')
            odds = odds_row.find_all('div', recursive=False)

            # Skip some rows
            if (
                len(odds) <= 1 # Odds not formatted right
                or 'coupon' in odds[0].text.lower() # Coupon row
                or any(el.text == ' - ' for el in odds) # Odds are crossed out (payout column is a dash)
            ):
                continue

            if len(bookie_info) <= 1:
                # Average and max odds rows: over, under, payout are cells 1-3
                label = odds[0].text
                first = 1
            else:
                # Bookie rows: over, under, payout are cells 2-4
                label = bookie_info[1].text
                first = 2

            if len(odds) < first + 3:
                # Row is too short to hold over, under and payout cells
                continue

            over_text = odds[first].text
            under_text = odds[first + 1].text
            odds_df[f'{label} {handicap} over'] = None if over_text=='-' else float(over_text)
            odds_df[f'{label} {handicap} under'] = None if under_text=='-' else float(under_text)
            odds_df[f'{label} {handicap} po %'] = float(odds[first + 2].text.replace('%',''))

    return odds_df.to_frame().T

In [None]:
# Start the Chrome session shared by all scraping functions.
# The content-settings preference below keeps Chrome from loading images.
prefs = {'profile.managed_default_content_settings.images': 2}
options = Options()
options.add_experimental_option('prefs', prefs)
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [None]:
# Scrape every match of each season and save one pickle file per season
for year in [2021, 2020, 2019, 2018]:
    match_links = get_match_links(year)
    match_dfs = []
    for link in tqdm(match_links, desc=f'Scraping {year} EPL match odds'):
        match_df = scrape_match(link)
        match_dfs.append(match_df)
    # Concatenate once per season: growing a DataFrame with pd.concat inside the
    # loop copies all previously scraped rows again for every match.
    if match_dfs:
        odds_df = pd.concat(match_dfs, axis=0, ignore_index=True)
    else:
        # pd.concat raises on an empty list, so fall back to an empty frame
        odds_df = pd.DataFrame()
    odds_df.to_pickle(f'epl_{year}_odds.pkl')

In [None]:
# Shut down the Selenium-controlled browser now that scraping is done
driver.close()
driver.quit()