In [1]:
import sys
sys.path.insert(0, '../../../ScraperFC') # import local ScraperFC
import ScraperFC as sfc

from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

import traceback
import pandas as pd
from bs4 import BeautifulSoup
import time
import numpy as np
from tqdm import tqdm

# Scrape matches from FBRef

In [None]:
# Scrape the EPL matches for each season in 2018-2022 from FBRef and cache
# every season to '<year>_matches.pkl'; the later cells read these files back
# and enrich them in place.
scraper = sfc.FBRef()
try:
    for year in range(2018, 2023):
        print(year)
        matches = scraper.scrape_matches(year, 'EPL')
        matches.to_pickle(f'{year}_matches.pkl')
finally:
    # Always release the scraper, but let any failure (including a manual
    # interrupt) propagate so the following cells do not run against pickle
    # files that are missing or left over from an earlier run.
    scraper.close()

# Add ELO scores for matches

In [None]:
# For mapping the team names on FBRef to the team names for ClubELO API.
# Teams that are absent from this dict are looked up under their FBRef name.
fbref2clubelo_teams = {
    'Hull City': 'Hull',
    'Leicester City': 'Leicester',
    'Stoke City': 'Stoke',
    'Swansea City': 'Swansea',
    'Tottenham Hotspur': 'Tottenham',
    'Crystal Palace': 'CrystalPalace',
    'West Bromwich Albion': 'WestBrom',
    'Manchester City': 'ManCity',
    'Manchester United': 'ManUnited',
    'West Ham United': 'WestHam',
    'Huddersfield Town': 'Huddersfield',
    'Brighton & Hove Albion': 'Brighton',
    'Newcastle United': 'Newcastle',
    'Cardiff City': 'Cardiff',
    'Wolverhampton Wanderers': 'Wolves',
    'Norwich City': 'Norwich',
    'Sheffield United': 'SheffieldUnited',
    'Aston Villa': 'AstonVilla',
    'Leeds United': 'Leeds',
}

# Reuse a single client for every lookup instead of constructing two per match.
clubelo = sfc.ClubElo()

for year in range(2018, 2023):
    print(year)
    matches = pd.read_pickle(f'{year}_matches.pkl')

    home_elos = []
    away_elos = []
    # Iterate the three needed columns directly; this visits each row exactly
    # once even if the index happens to contain duplicate labels.
    fixtures = zip(matches['Date'], matches['Home Team'], matches['Away Team'])
    for match_date, home_team, away_team in tqdm(fixtures, total=len(matches)):

        # Date and team names. Team names may need to be remapped to be found on ClubELO
        date = str(match_date)
        hteam = fbref2clubelo_teams.get(home_team, home_team)
        ateam = fbref2clubelo_teams.get(away_team, away_team)

        # Get the ELO scores
        helo = clubelo.scrape_team_on_date(hteam, date)
        aelo = clubelo.scrape_team_on_date(ateam, date)

        # -1 is the value this notebook treats as "ELO score not found". Fail
        # with the offending team and date right away: continuing would leave
        # the ELO lists shorter than `matches`, and the column assignment
        # below would then die with an unhelpful length-mismatch error.
        for team, elo in ((hteam, helo), (ateam, aelo)):
            if elo == -1:
                raise ValueError(
                    f'No ClubELO score found for team {team!r} on {date}; '
                    'add or fix its entry in fbref2clubelo_teams'
                )

        home_elos.append(helo)
        away_elos.append(aelo)

    matches['Home ELO'] = home_elos
    matches['Away ELO'] = away_elos

    # Overwrite the season file with the ELO-enriched matches
    matches.to_pickle(f'{year}_matches.pkl')

# Add FiveThirtyEight (538) data

In [2]:
# Team names as spelled by 538 -> the spelling used in the FBRef match data
fte2fbref_teams = {
    'AFC Bournemouth': 'Bournemouth',
    'Brighton and Hove Albion': 'Brighton & Hove Albion',
    'Newcastle': 'Newcastle United',
    'Wolverhampton': 'Wolverhampton Wanderers',
}

# 538 column name -> column name used in the saved match files.
# The order here is the order in which the new columns are appended.
fte_column_names = {
    'spi1': 'Home SPI',
    'spi2': 'Away SPI',
    'prob1': 'Prob Home Win',
    'prob2': 'Prob Away Win',
    'probtie': 'Prob Tie',
    'proj_score1': 'Home Proj Score',
    'proj_score2': 'Away Proj Score',
    'importance1': 'Home Importance',
    'importance2': 'Away Importance',
    'xg1': 'Home 538 xG',
    'xg2': 'Away 538 xG',
    'nsxg1': 'Home nsxG',
    'nsxg2': 'Away nsxG',
    'adj_score1': 'Home Adj Score',
    'adj_score2': 'Away Adj Score',
}

scraper = sfc.FiveThirtyEight()
try:
    for year in range(2018, 2023):
        print(year)

        # Load data from file and load 538 data
        matches = pd.read_pickle(f'{year}_matches.pkl')
        data = scraper.scrape_matches(year, 'EPL')

        # Set dtype of columns for merge
        matches['Date'] = matches['Date'].astype(str)
        data['date'] = data['date'].astype(str)

        # Rename some team names in 538 dataframe to match FBRef team names
        # (whole-value replacement, not substring replacement)
        data['team1'] = data['team1'].replace(fte2fbref_teams)
        data['team2'] = data['team2'].replace(fte2fbref_teams)

        # Both sources must list exactly the same home teams. Comparing sets
        # (rather than two np.unique arrays with ==) is well defined even when
        # the number of distinct names differs, and lets the message name the
        # teams that do not line up.
        fbref_teams = set(matches['Home Team'])
        fte_teams = set(data['team1'])
        assert fbref_teams == fte_teams, (
            f'{year}: team names differ between FBRef and 538: '
            f'{sorted(fbref_teams ^ fte_teams, key=str)}'
        )

        # Merge
        merged = matches.merge(data,
                               left_on=['Date', 'Home Team', 'Away Team'],
                               right_on=['date', 'team1', 'team2'],
                               suffixes=['', '_538'])

        # The merge is an inner join, so a match without a 538 row on the same
        # date is dropped silently; surface that instead of hiding it.
        if len(merged) != len(matches):
            print(f'WARNING: {year}: merge kept {len(merged)} of '
                  f'{len(matches)} FBRef matches')

        # Rename 538 column names
        for fte_name, new_name in fte_column_names.items():
            merged[new_name] = merged[fte_name].copy()

        # Delete columns with 538 column names
        merged = merged.drop(columns=data.columns)

        # Save
        merged.to_pickle(f'{year}_matches.pkl')
finally:
    # Always release the scraper, but let failures (a failed assertion, a
    # scrape error, a manual interrupt) propagate instead of being swallowed,
    # so a half-processed range of seasons cannot go unnoticed.
    scraper.close()

[WDM] - 

[WDM] - Current google-chrome version is 105.0.5195
[WDM] - Get LATEST driver version for 105.0.5195
[WDM] - Driver [C:\Users\Owner\.wdm\drivers\chromedriver\win32\105.0.5195.52\chromedriver.exe] found in cache


2018
2019
2020
2021
2022


# Scrape historic FPL data (disabled: all cells in this section are commented out)

In [None]:
# options = Options()
# # options.headless = True
# prefs = {'profile.managed_default_content_settings.images': 2} # don't load images
# options.add_experimental_option('prefs', prefs)
# driver = webdriver.Chrome(
#     service=ChromeService(ChromeDriverManager().install()),
#     options=options
# )

In [None]:
# year = 2022

# driver.get('https://www.fantasynutmeg.com/history')
# time.sleep(2)

# #### Load data for the season ####
# # select the season from the dropdown
# soup = BeautifulSoup(driver.page_source, 'html.parser')
# season_option_tag = soup.find('select').find('option', {'label': f'{year-1}-{str(year)[-2:]}'})
# driver.find_element(By.XPATH, sfc.xpath_soup(season_option_tag)).click()

# # click the update button to update the table
# update_button_tag = soup.find('button', {'ng-click': 'loadHistory()'})
# driver.find_element(By.XPATH, sfc.xpath_soup(update_button_tag)).click()

# time.sleep(2)

In [None]:
# ################################################################################
# #### Scroll through all rows ####
# season_df = pd.DataFrame()
# season_done = False
# while not season_done:

#     player_rows = BeautifulSoup(driver.page_source, 'html.parser')\
#         .find('div', {'id': 'ptsTable'})\
#         .find_all('div', {'role': 'row'})
    
#     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#     # Iterate across visible player rows
#     for player_row in player_rows[1:]:
        
#         # Player info
#         player_name = player_row.find_all('div', {'role': 'gridcell'})[0].getText()
#         team = player_row.find_all('div', {'role': 'gridcell'})[1].getText()
#         position = player_row.find_all('div', {'role': 'gridcell'})[2].getText()
        
#         # Click on player name to get gw data popup
#         cols = player_row.find_all('div', {'role': 'gridcell'})
#         driver.find_element(By.XPATH, sfc.xpath_soup(cols[0])).click()
#         time.sleep(5)
        
#         #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#         # Gather player gameweek data
#         player_df = pd.DataFrame()
#         player_done = False
#         while not player_done:
            
#             gw_rows = BeautifulSoup(driver.page_source, 'html.parser')\
#                 .find_all('div', {'role': 'presentation'})[-1]\
#                 .find_all('div', {'role': 'row'})
            
#             # Scrape visible rows
#             for gw_row in gw_rows[1:]:
#                 gw_data = np.array([
#                     tag.getText() \
#                     for tag in gw_row.find_all('div', {'role': 'gridcell'})
#                 ]).reshape(1,-1)
#                 gw_df = pd.DataFrame(gw_data)
#                 player_df = pd.concat([player_df, gw_df], axis=0, ignore_index=True)
            
#             # Scroll to last visible gameweek row
#             og_fixture = BeautifulSoup(driver.page_source, 'html.parser')\
#                 .find_all('div', {'role': 'presentation'})[-1]\
#                 .find_all('div', {'role': 'row'})[-1]\
#                 .find_all('div', {'role': 'gridcell'})[1].getText()
#             driver.execute_script(
#                 'arguments[0].scrollIntoView();',
#                 driver.find_element(By.XPATH, sfc.xpath_soup(gw_rows[-1]))
#             )
#             time.sleep(5)
#             new_fixture = BeautifulSoup(driver.page_source, 'html.parser')\
#                 .find_all('div', {'role': 'presentation'})[-1]\
#                 .find_all('div', {'role': 'row'})[-1]\
#                 .find_all('div', {'role': 'gridcell'})[1].getText()
#             if og_fixture == new_fixture:
#                 player_done = True
            
#         # Clean player df
#         player_df[0] = player_df[0].astype(int)
#         player_df = player_df.drop_duplicates(ignore_index=True)
#         player_df = player_df.sort_values(0).reset_index(drop=True)
#         player_df['Name'] = [player_name,] * 38
#         player_df['Team'] = [team,] * 38
#         player_df['Pos'] = [position,] * 38
        
#         # Close player gw popup
#         driver.find_element(
#             By.XPATH, 
#             sfc.xpath_soup(
#                 BeautifulSoup(driver.page_source, 'html.parser')\
#                     .find('button', {'class': 'close'})
#             ),
#         ).click()
#         time.sleep(5)
        
#         # Update season df
#         season_df = pd.concat([season_df, player_df], ignore_index=True, axis=0)

#     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#     # Scroll to last visible player row
#     og_first_player = player_rows[1].find('div', {'role': 'gridcell'}).getText()
#     driver.execute_script(
#         'arguments[0].scrollIntoView();',
#         driver.find_element(By.XPATH, sfc.xpath_soup(player_rows[-1]))
#     )
#     time.sleep(5)
#     new_first_player = BeautifulSoup(driver.page_source, 'html.parser')\
#         .find('div', {'id': 'ptsTable'})\
#         .find_all('div', {'role': 'row'})[1]\
#         .find('div', {'role': 'gridcell'}).getText()
    
#     print(og_first_player, new_first_player)
#     print('-'*80)
    
#     if og_first_player == new_first_player:
#         season_done = True
    
# # Clean season df
# season_df.columns = [
#     'GW', 'Fixture', 'Pts', 'MP', 'GS', 'A', 'CS', 'GC', 'OG', 'PS',
#     'PM', 'YC', 'RC', 'S', 'B', 'BPS', 'ICT', 'Cost', 'TX_IN', 'TX_OUT',
#     'Name', 'Team', 'Pos'
# ]

In [None]:
# season_df

In [None]:
# driver.close()
# driver.quit()