In [1]:
# Web scraping
from bs4 import BeautifulSoup
import requests

# Data manipulation
import pandas as pd
import numpy as np
import datetime as dt
import os 

In [2]:
def create_folder(directory, team_name):
    
    out_path = f'{directory}/{team_name}'
    try:
        if not os.path.exists(out_path):
            os.makedirs(out_path)
    except OSError:
        print(f'Error: Creating directory {out_path}')

#### Functions for creating team datasets

In [3]:
def create_team_dataset(link, table_id):
    
    # Create soup object
    fbref_link = requests.get(link).text
    soup = BeautifulSoup(fbref_link, 'lxml')

    # Find the table with team stats
    table = soup.find('table', id = table_id)
    
    # Get the column names from the first row of the table
    cols = []

    # Get date column from the row header
    cols.append(table.tbody.tr.th.get('data-stat'))

    # Get other column titles from the specific table cells
    for cell in table.tbody.tr.find_all('td'):
        cols.append(cell.get('data-stat'))

    # Create empty dataframe with column names from created list
    team_data = pd.DataFrame(columns = cols)
    
    # Iterate over all the rows in the table, and add to empty dataframe
    for row in table.tbody.find_all('tr'):

        # Initialize empty dict for the team
        team_dict = {}

        # Get the match date from row header
        team_dict['date'] = row.th.get('csk')

        # Iterate over all the cells and get the remaining column values
        for cell in row.find_all('td'):
            stat = cell.get('data-stat')
            value = cell.text
            team_dict[stat] = value

         # Append team-match row to dataframe        
        team_data = team_data.append(team_dict, ignore_index=True)
        
    return team_data

In [4]:
def clean_team_dataset(dataset):
    
    cols_to_keep = ['date', 'comp', 'round', 'dayofweek', 'venue', 'result', 'goals_for', 'goals_against', 'opponent', 
               'xg_for', 'xg_against', 'possession']

    dataset = dataset[cols_to_keep]

    # # Convert date to date format
    dataset['date'] = pd.to_datetime(dataset['date'])
    
    # Keep only premier league games
    dataset = dataset[dataset['comp'].str.strip() == 'Premier League']

    # Strip the round variable to only keep the number
    dataset['round'] = dataset['round'].str.split().str[1]

    # Replace empty space with missing values
    dataset = dataset.replace(r'^\s*$', np.nan, regex=True)
    
    # Drop rows where there is no match data
    dataset = dataset.dropna(subset = ['result'])

    # Create a list for object cols
    object_cols = ['date', 'dayofweek', 'round', 'venue', 'result','opponent']
    
    # Convert round to int64
    dataset['round'] = dataset['round'].astype('int64', errors='ignore')

    # # Except objectcols all columns should be numerical
    for col in dataset.columns:
        if col not in object_cols:
            dataset[col] = dataset[col].astype(float, errors='ignore')
            
    return dataset 

In [5]:
def save_team_dataset(dataset, directory, team_name):
    
    create_folder(directory, team_name)

    dataset.to_csv(f'{directory}/{team_name}/summary_team_data.csv', index=False)

#### Functions for creating squad datasets

In [6]:
def create_squad_dataset(link, table_id):
    
    fbref_link = requests.get(link).text
    soup = BeautifulSoup(fbref_link, 'lxml')
    
    # Create a soup object for the all-stats table
    table = soup.find('table', id = table_id)

    # Get the column names from the first row of the table
    cols = []

    # Get player name column from the row header
    cols.append(table.tbody.tr.th.get('data-stat'))

    # Get other column titles from the specific table cells
    for cell in table.tbody.tr.find_all('td'):
        cols.append(cell.get('data-stat'))

    # Create empty dataframe with column names from created list
    squad_data = pd.DataFrame(columns = cols)
    
    # Iterate over all the rows in the table, and add to empty dataframe
    for row in table.tbody.find_all('tr'):
    
        # Initialize empty dict for the player
        squad_dict = {}

        # Get the player's name from row header
        squad_dict['player'] = row.th.get('csk')

        # Iterate over all the cells and get the remaining column values
        for cell in row.find_all('td'):
            stat = cell.get('data-stat')
            if cell.get('data-stat') != 'matches':
                value = cell.text
                squad_dict[stat] = value
            else:
                link = 'https://fbref.com' + cell.a.get('href')
                squad_dict[stat] = link

         # Append player row to dataframe        
        squad_data = squad_data.append(squad_dict, ignore_index=True)
        
    return squad_data

In [7]:
def clean_squad_dataset(dataset):
    
    # Correct age variable
    dataset['age'] = dataset['age'].str[0:2]

    # Correct nationality variable
    dataset['nationality'] = dataset['nationality'].str.split().str[-1]

    # Replace empty space with missing values
    dataset = dataset.replace(r'^\s*$', np.nan, regex=True)

    # Except first 3 columns, all columns should be numerical
    for col in dataset.columns:
        if col not in ['player', 'nationality', 'position']:
            dataset[col] = dataset[col].astype(float, errors='ignore')
            
    return dataset

In [8]:
def save_squad_dataset(dataset, data_type, directory, team_name):
    
    create_folder(directory, team_name)

    dataset.to_csv(f'{directory}/{team_name}/{data_type}_squad_data.csv', index=False)

#### Functions for creating player datasets

In [9]:
def create_player_dataset(link, table_id):
    
    # Connect to FBref page
    fbref_link = requests.get(link).text
    soup = BeautifulSoup(fbref_link, 'lxml')

    # Create a soup object for the all-stats table
    table = soup.find('table', id = table_id)

    # Create list to get all player links
    player_links = []
    for row in table.tbody.find_all('tr'):
        for cell in row.find_all('td'):
            if cell.get('data-stat') == 'matches':
                link = 'https://fbref.com' + cell.a.get('href')
        player_links.append(link)
        
    # Get the column names of the eventual dataframe by just going to the first player link page
    player_link = player_links[0]
    player_page = requests.get(player_link).text
    soup = BeautifulSoup(player_page, 'lxml')

    # Get the div that contains the premier league filter
    player_prem_div = soup.find('div', class_ = 'filter')

    for filt in player_prem_div.find_all('div', class_ = ''):
        if filt.a.text.strip() == '2021-2022 Premier League':
            player_prem_link = 'https://fbref.com' + filt.a.get('href')

    # Connect to player prem page
    player_prem_page = requests.get(player_prem_link).text
    soup = BeautifulSoup(player_prem_page, 'lxml')

    # Create table object to parse through
    table = soup.find('table', id = 'matchlogs_11160')

    # Get the column names from the first row of the table
    cols = []

    # Append player name to column name
    cols.append('name')

    # Get match date from the row header
    cols.append(table.tbody.tr.th.get('data-stat'))

    # Get other column titles from the specific table cells
    for row in table.tbody.find_all('tr'):
        if row.get('class') == "unused_sub hidden":
            pass
        else:
            for cell in row.find_all('td'):
                if cell.get('data-stat') != 'match_report':
                    cols.append(cell.get('data-stat'))

    # Get unique list of columns while preserving order
    variables = list(dict.fromkeys(cols))

    # Create empty dataframe with column names from created list
    player_data = pd.DataFrame(columns = variables)
    
    # Loop through the links in the player_links list and extract info we need 
    for link in player_links:

        # First browse overall stats page 
        player_page = requests.get(link).text
        soup = BeautifulSoup(player_page, 'lxml')

        # Get the div that contains the premier league filter
        player_prem_div = soup.find('div', class_ = 'filter')

        # Loop through each filter and get the link for the filter that corresponds to the premier league
        for filt in player_prem_div.find_all('div', class_ = ''):
            if filt.a.text.strip() == '2021-2022 Premier League':
                player_prem_link = 'https://fbref.com' + filt.a.get('href')

        # Connect to premier league player stats page
        player_prem_page = requests.get(player_prem_link).text
        soup = BeautifulSoup(player_prem_page, 'lxml')

        # Get player name from the player page
        player_name = soup.find('h1', itemprop="name").span.text

        # Create table object to parse through player stats
        table = soup.find('table', id = 'matchlogs_11160')

        # Loop through each match stats and add to dataframe
        for row in table.tbody.find_all('tr'):

            # Initialize empty dict for the player
            player_dict = {}

            # Get the name from the previously stored variable
            player_dict['name'] = player_name

            # Get the match date from row header
            player_dict['date'] = row.th.get('csk')

            # Iterate over all the cells and get the remaining column values
            for cell in row.find_all('td'):
                if cell.get('data-stat') != 'match_report':
                    stat = cell.get('data-stat')
                    value = cell.text
                    player_dict[stat] = value

            # Append player row to dataframe        
            player_data = player_data.append(player_dict, ignore_index=True)
            
    return player_data

In [10]:
def clean_player_dataset(dataset):
    
    # Convert date to date format
    dataset['date'] = pd.to_datetime(dataset['date'])

    # Strip the round variable to only keep the number
    dataset['round'] = dataset['round'].str.split().str[1]

    # Replace empty space with missing values
    dataset = dataset.replace(r'^\s*$', np.nan, regex=True)

    # Create a list for object cols
    object_cols = ['name', 'date', 'dayofweek', 'round', 'venue', 'result', 'squad',
                   'opponent', 'game_started', 'position', 'bench_explain']

    # # Except first 3 columns, all columns should be numerical
    for col in dataset.columns:
        if col not in object_cols:
            dataset[col] = dataset[col].astype(float, errors='ignore')
            
    return dataset

In [11]:
def save_player_dataset(dataset, data_type, directory, team_name):
    
    create_folder(directory, team_name)

    dataset.to_csv(f'{directory}/{team_name}/{data_type}_player_data.csv', index=False)

#### Aggregate function for scraping FBref data

In [12]:
def scrape_fbref_data(team_name, link, team_table_id, table_ids, data_types, directory):
    
    print(f'Scraping team data for {team_name}')
    
    save_team_dataset(clean_team_dataset(create_team_dataset(link, team_table_id)), directory, team_name)
    
    for table_id, data_type in zip(table_ids, data_types):
        
        print(f'Scraping squad {data_type} data for {team_name}')
        save_squad_dataset(clean_squad_dataset(create_squad_dataset(link, table_id)), data_type, directory, team_name)
        
        print(f'Scraping player {data_type} data for {team_name}')
        save_player_dataset(clean_player_dataset(create_player_dataset(link, table_id)), data_type, directory, team_name)

#### Running scraping function for Arsenal F.C

In [13]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Arsenal'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats'
directory = '../../data/input/scraped'

In [14]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Arsenal


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Arsenal
Scraping player summary data for Arsenal
Scraping squad keeping data for Arsenal
Scraping player keeping data for Arsenal
Scraping squad shooting data for Arsenal
Scraping player shooting data for Arsenal
Scraping squad passing data for Arsenal
Scraping player passing data for Arsenal
Scraping squad goal_creation data for Arsenal
Scraping player goal_creation data for Arsenal
Scraping squad defense data for Arsenal
Scraping player defense data for Arsenal
Scraping squad possession data for Arsenal
Scraping player possession data for Arsenal


#### Running scraping function for Chelsea F.C. 

In [15]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Chelsea'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats'
directory = '../../data/input/scraped'

In [16]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Chelsea


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Chelsea
Scraping player summary data for Chelsea
Scraping squad keeping data for Chelsea
Scraping player keeping data for Chelsea
Scraping squad shooting data for Chelsea
Scraping player shooting data for Chelsea
Scraping squad passing data for Chelsea
Scraping player passing data for Chelsea
Scraping squad goal_creation data for Chelsea
Scraping player goal_creation data for Chelsea
Scraping squad defense data for Chelsea
Scraping player defense data for Chelsea
Scraping squad possession data for Chelsea
Scraping player possession data for Chelsea


#### Running scraping function for Liverpool F.C. 

In [19]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Liverpool'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats'
directory = '../../data/input/scraped'

In [20]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Liverpool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Liverpool
Scraping player summary data for Liverpool
Scraping squad keeping data for Liverpool
Scraping player keeping data for Liverpool
Scraping squad shooting data for Liverpool
Scraping player shooting data for Liverpool
Scraping squad passing data for Liverpool
Scraping player passing data for Liverpool
Scraping squad goal_creation data for Liverpool
Scraping player goal_creation data for Liverpool
Scraping squad defense data for Liverpool
Scraping player defense data for Liverpool
Scraping squad possession data for Liverpool
Scraping player possession data for Liverpool


#### Running scraping function for Man City F.C. 

In [21]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Man_City'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats'
directory = '../../data/input/scraped'

In [22]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Man_City


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Man_City
Scraping player summary data for Man_City
Scraping squad keeping data for Man_City
Scraping player keeping data for Man_City
Scraping squad shooting data for Man_City
Scraping player shooting data for Man_City
Scraping squad passing data for Man_City
Scraping player passing data for Man_City
Scraping squad goal_creation data for Man_City
Scraping player goal_creation data for Man_City
Scraping squad defense data for Man_City
Scraping player defense data for Man_City
Scraping squad possession data for Man_City
Scraping player possession data for Man_City


#### Running scraping function for West Ham F.C. 

In [23]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'West_Ham'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats'
directory = '../../data/input/scraped'

In [24]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for West_Ham


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for West_Ham
Scraping player summary data for West_Ham
Scraping squad keeping data for West_Ham
Scraping player keeping data for West_Ham
Scraping squad shooting data for West_Ham
Scraping player shooting data for West_Ham
Scraping squad passing data for West_Ham
Scraping player passing data for West_Ham
Scraping squad goal_creation data for West_Ham
Scraping player goal_creation data for West_Ham
Scraping squad defense data for West_Ham
Scraping player defense data for West_Ham
Scraping squad possession data for West_Ham
Scraping player possession data for West_Ham


#### Running scraping function for Man Utd F.C.

In [25]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Man_Utd'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/19538871/Manchester-United-Stats'
directory = '../../data/input/scraped'

In [26]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Man_Utd


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Man_Utd
Scraping player summary data for Man_Utd
Scraping squad keeping data for Man_Utd
Scraping player keeping data for Man_Utd
Scraping squad shooting data for Man_Utd
Scraping player shooting data for Man_Utd
Scraping squad passing data for Man_Utd
Scraping player passing data for Man_Utd
Scraping squad goal_creation data for Man_Utd
Scraping player goal_creation data for Man_Utd
Scraping squad defense data for Man_Utd
Scraping player defense data for Man_Utd
Scraping squad possession data for Man_Utd
Scraping player possession data for Man_Utd


#### Running scraping function for Tottenham F.C. 

In [27]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Tottenham'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats'
directory = '../../data/input/scraped'

In [28]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Tottenham


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Tottenham
Scraping player summary data for Tottenham
Scraping squad keeping data for Tottenham
Scraping player keeping data for Tottenham
Scraping squad shooting data for Tottenham
Scraping player shooting data for Tottenham
Scraping squad passing data for Tottenham
Scraping player passing data for Tottenham
Scraping squad goal_creation data for Tottenham
Scraping player goal_creation data for Tottenham
Scraping squad defense data for Tottenham
Scraping player defense data for Tottenham
Scraping squad possession data for Tottenham
Scraping player possession data for Tottenham


#### Running scraping function for Leicester F.C. 

In [29]:
# Set parameters
table_ids = ['stats_standard_11160', 'stats_keeper_adv_11160', 'stats_shooting_11160', 'stats_passing_11160',
            'stats_gca_11160', 'stats_defense_11160', 'stats_possession_11160']

data_types = ['summary', 'keeping', 'shooting', 'passing', 'goal_creation', 'defense', 'possession']

team_name = 'Leicester'
team_table_id = 'matchlogs_for'
link = 'https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats'
directory = '../../data/input/scraped'

In [30]:
# Run function
scrape_fbref_data(team_name = team_name, link = link, team_table_id = team_table_id,
                  table_ids = table_ids, data_types = data_types, directory = directory)

Scraping team data for Leicester


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Scraping squad summary data for Leicester
Scraping player summary data for Leicester
Scraping squad keeping data for Leicester
Scraping player keeping data for Leicester
Scraping squad shooting data for Leicester
Scraping player shooting data for Leicester
Scraping squad passing data for Leicester
Scraping player passing data for Leicester
Scraping squad goal_creation data for Leicester
Scraping player goal_creation data for Leicester
Scraping squad defense data for Leicester
Scraping player defense data for Leicester
Scraping squad possession data for Leicester
Scraping player possession data for Leicester
