# Import structure

## Setup

Import libraries

In [1]:
import requests
import time
import itertools
import heapq
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Variables

In [2]:
# URLs
url_table = "https://www.premierleague.com/tables"
url_fixtures = "https://www.premierleague.com/fixtures"

In [3]:
# Team abbreviations
dct_nicknames_to_abbr = {
    'Arsenal':"ARS",
    'Aston Villa':"AVL",
 'Bournemouth':"BOU",
 'Brentford':"BRE",
 'Brighton':"BHA",
 'Chelsea':"CHE",
 'Crystal Palace':"CRY",
 'Everton':"EVE",
 'Fulham':"FUL",
 'Leeds':"LEE",
 'Leicester': "LEI",
 'Liverpool': "LIV",
 'Man City': "MCI",
 'Man Utd': "MUN",
 'Newcastle': "NEW",
 "Nott'm Forest": "NFO",
 'Southampton': "SOU",
 'Spurs': "TOT",
 'West Ham': "WHU",
 'Wolves': "WOL"
                        }

# My team
my_team = "ARS"
my_team_points = 0
desired_positions = [1, 4, 17]

## Download the website data

Create the Google Chrome driver

In [4]:
# Parameter includes the path of the webdriver.
driver = webdriver.Chrome('chromedriver') 
# Minimize
driver.minimize_window

<bound method WebDriver.minimize_window of <selenium.webdriver.chrome.webdriver.WebDriver (session="3d86744b86d576f554ae5584f40ffd8f")>>

Access the first website and grab the EPL table

In [5]:
driver.get(url_table)

# this is just to ensure that the page is loaded
time.sleep(5)

# Accept All Cookies pop-up
wait = WebDriverWait(driver, 5)

try:
    wait.until(EC.element_to_be_clickable((By.XPATH, '//button[normalize-space()="Accept All Cookies"]'))).click()
except TimeoutException:
    pass

# this renders the JS code and stores all of the information in static HTML code.
html_table = driver.page_source

Access the second website and grab the fixtures

In [6]:
# Parse the fixtures page
driver.get(url_fixtures)

# this is just to ensure that the page is loaded
time.sleep(5)

# Scroll to end of page
SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# this renders the JS code and stores all of the information in static HTML code.
    
html_fixtures = driver.page_source

Close the browser

In [7]:
driver.close() # closing the webdriver

## Parse the web pages

### Parse information from the EPL table

In [8]:
# Make soup from the HTML of the League Table
soup_table = BeautifulSoup(html_table, "html.parser")

# Teamographic information
dct_table = {'team':[], 'badge': [], 'long_name': [], 'points': [], 'played': []}

# Grab all the rows
rows = soup_table.find("tbody", {"class": "isPL"}).find_all("tr", {"data-filtered-table-row-expander": None})

# Loop through each row to grab name and the points
for row in rows:
    # Grab the cell with the club badge and name
    team = row.find("td", {"class": "team"})
    # Grab the badge
    badge = team.find("img", {"class": "badge-image"})['src']
    # Grab the long name
    long_name = team.find("span", {"class": "long"}).text
    # Grab the short name
    short_name = team.find("span", {"class": "short"}).text
    # Grab the current points
    points = int(row.find("td", {"class": "points"}).text)
    # Calculate max points
    played = int(row.findAll("td")[3].text)
    
    # Add the team details to the dictionary
    dct_table['team'].append(short_name)
    dct_table['badge'].append(badge)
    dct_table['long_name'].append(long_name)
    dct_table['points'].append(points)
    dct_table['played'].append(played)
    
    if short_name == my_team:
        my_team_points = points
    
    
# Store the dictionary as a pandas DataFrame
df_table = pd.DataFrame.from_dict(dct_table)

In [9]:
# Calculate the max points
df_table['max_points'] = df_table['points'] + (38 - df_table['played'])*3

### Parse information from the fixtures

In [10]:
# Make soup from the HTML of the fixtures
soup_fixtures = BeautifulSoup(html_fixtures, "html.parser")

Grab the dates

In [11]:
elm_dates = soup_fixtures.find_all("div", class_='fixtures__matches-list')
match_dates = []
for elm_date in elm_dates:
    date_attr = elm_date.attrs["data-competition-matches-list"]
    date = datetime.strptime(date_attr, "%A %d %B %Y") if date_attr != "Date To Be Confirmed" else ""
    match_dates.append(date)

Grab the fixtures

In [12]:
elm_matches = soup_fixtures.find_all("li", class_="matchFixtureContainer")

Simulate the fixtures

Grab the table containing all the teams and their points

In [13]:
matches = []
for elm_match in elm_matches:

    # Get the teams
    home = dct_nicknames_to_abbr[elm_match['data-home']]
    away = dct_nicknames_to_abbr[elm_match['data-away']]

    # Create the points possibilities
    match = {"home":home, "away":away}

    # Append
    matches.append(match)

In [14]:
def make_ordinal(n):
    '''
    Convert an integer into its ordinal representation::

        make_ordinal(0)   => '0th'
        make_ordinal(3)   => '3rd'
        make_ordinal(122) => '122nd'
        make_ordinal(213) => '213th'
    '''
    n = int(n)
    if 11 <= (n % 100) <= 13:
        suffix = 'th'
    else:
        suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
    return str(n) + suffix

In [15]:
def oxford_comma(items):
    length = len(items)
    if length == 1:
        return items[0]
    if length == 2:
        return '{} and {}'.format(*items)
    return '{}, and {}'.format(', '.join(items[:-1]), items[-1])

In [16]:
def magic_number(my_team, positions, current_table, matches_to_play):
    
    def magic_number_helper(pos, current_table, matches):# Ge
        # Get the points total of the team right outside that position
        first_out = max_table.loc[pos, 'max_points']

        # Get all the team above the threshold, but cap it at the first 6 teams above the threshold
        filter_my_team = max_table['team']!=my_team
        filter_eq_threshold = max_table['max_points'] == first_out
        filter_gt_threshold = max_table['max_points'] > first_out
        filter_index = max_table.index.isin(range(max(0,pos-4), pos))
        filtered_table = max_table[filter_my_team & (filter_eq_threshold | (filter_gt_threshold & filter_index))].copy()
        
        # Get all the matches by the teams in question
        relevant_teams = list(filtered_table['team'])
        relevant_matches = []
        for match in matches:
            home = match['home']
            away = match['away']
            if home in relevant_teams and away in relevant_teams:
                possibilities = [{home: -3, away:0}, {home: -2, away:-2},{home:0, away:-3}]
                relevant_matches.append(possibilities)    
    
        simulation_points_min = []
        # Iterate through all simulation combinations so that we can adjust the actual probable ending tables
        for simulation in itertools.product(*relevant_matches):
            relevant_table_simulated = dict(filtered_table[['team', 'max_points']].values)
            for simulated_match in simulation:
                for team, value in simulated_match.items():
                    # Update the adjustment
                    relevant_table_simulated[team] += value
            simulated_first_out = heapq.nlargest(pos, relevant_table_simulated.values())[-1]
            simulation_points_min.append(simulated_first_out)
        return max(simulation_points_min)+1, relevant_matches, relevant_teams
        
    # Sort the table
    max_table = current_table.copy()
    max_table.sort_values(by=['max_points'])
    
    for pos in positions:
        magic_number, relevant_matches, teams = magic_number_helper(pos, max_table, matches)
        
        print(f"Position: \t\t{pos}")
        print(f"\tMagic number: \t{magic_number - my_team_points}")
        print(f"\tTeams: \t\t{oxford_comma(teams)}")
        print(f"\tMatches: \t{len(relevant_matches)}")
        print("")
              

magic_number("ARS", [1,4,17], df_table, matches)

Position: 		1
	Magic number: 	47
	Teams: 		MCI
	Matches: 	0

Position: 		4
	Magic number: 	38
	Teams: 		MCI, MUN, NEW, and TOT
	Matches: 	0

Position: 		17
	Magic number: 	20
	Teams: 		LEI, WOL, WHU, LEE, and EVE
	Matches: 	1

