In [137]:
import requests

In [138]:
# FBref page holding the 2023-2024 Premier League standings table
standings_url = (
    "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"
)

In [139]:
# Download the HTML of the standings page.
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports an HTTP error (for example a blocked or rate-limited
# request) right here instead of as a confusing parsing failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [140]:
# Library that helps parse HTML code
from bs4 import BeautifulSoup

In [141]:
# Parse the downloaded HTML. Naming the parser explicitly keeps the result the same
# on every machine (otherwise Beautiful Soup picks whichever parser happens to be
# installed and warns about it); "html.parser" ships with Python.
soup = BeautifulSoup(data.text, "html.parser")

In [142]:
# Select the standings table using a CSS selector
standings_table = soup.select('table.stats_table')[0] # "table" is the tag name and "stats_table" is the CSS class; [0] keeps only the first match


`.select` takes a CSS selector, which gives it more flexibility for matching elements by tag, class, id, or nesting, whereas `.find_all` looks elements up by tag name (optionally filtered by attributes).

In [143]:
# Collect every <a> (anchor) tag inside the standings table; these carry the links
links = standings_table.find_all("a")


In [144]:
# Replace each anchor tag with the value of its href attribute.
# Note: `links` is overwritten, so this cell cannot be re-run on its own output
# (the strings it produces have no .get method).
links = [anchor.get("href") for anchor in links]

In [145]:
# Keep only the squad links.
# The `href and` guard skips anchors that had no href attribute (None), which would
# otherwise raise a TypeError on the `in` test; this matches the shooting-link filter
# used later in the notebook.
links = [href for href in links if href and '/squads/' in href]

In [146]:
# Build absolute URLs by prefixing each site-relative squad path with the domain
team_urls = ["https://fbref.com" + path for path in links]

In [147]:
team_urls

['https://fbref.com/en/squads/b8fd03ef/2023-2024/Manchester-City-Stats',
 'https://fbref.com/en/squads/18bb7c10/2023-2024/Arsenal-Stats',
 'https://fbref.com/en/squads/822bd0ba/2023-2024/Liverpool-Stats',
 'https://fbref.com/en/squads/8602292d/2023-2024/Aston-Villa-Stats',
 'https://fbref.com/en/squads/361ca564/2023-2024/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/cff3d9bb/2023-2024/Chelsea-Stats',
 'https://fbref.com/en/squads/b2b47a98/2023-2024/Newcastle-United-Stats',
 'https://fbref.com/en/squads/19538871/2023-2024/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/2023-2024/West-Ham-United-Stats',
 'https://fbref.com/en/squads/47c64c55/2023-2024/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/d07537b9/2023-2024/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/4ba7cbea/2023-2024/Bournemouth-Stats',
 'https://fbref.com/en/squads/fd962109/2023-2024/Fulham-Stats',
 'https://fbref.com/en/squads/8cec06e1/2023-2024/Wolverhampton-Wanderers-Stat

## Extracting Match Stats using Pandas and Requests

In [148]:
# Take the first team URL from the list; the cells below scrape this single team
team_url = team_urls[0]

In [149]:
# Download the team page. The timeout prevents an indefinite hang, and
# raise_for_status() turns an HTTP error response into an immediate exception
# instead of letting error-page HTML flow into the parsing cells.
data = requests.get(team_url, timeout=30)
data.raise_for_status()

In [150]:
# Python data analysis library
import pandas as pd
from io import StringIO

# Parse the team page into DataFrames, keeping only the tables whose text matches
# "Scores & Fixtures". The result is a list, so later cells index it with [0].
matches = pd.read_html(
    StringIO(data.text),
    match="Scores & Fixtures",
)



In [151]:
matches[0].head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,,,55,81145.0,Kyle Walker,4-2-3-1,4-3-3,Stuart Attwell,Match Report,
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,1.9,0.3,65,21572.0,Kevin De Bruyne,4-2-3-1,5-4-1,Craig Pawson,Match Report,
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,,,74,,Kyle Walker,4-2-3-1,4-2-3-1,François Letexier,Match Report,
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,1.0,0.3,59,53419.0,Kyle Walker,4-2-3-1,4-3-3,Robert Jones,Match Report,
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,3.5,0.7,79,31336.0,Kyle Walker,4-2-3-1,3-5-2,Jarred Gillett,Match Report,


## Get Match Shooting Stats with Requests and Pandas

In [152]:
# Parse the team page so its links can be searched. The parser is named explicitly
# so the result does not depend on which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")

In [153]:
# Every anchor tag on the team page
links = soup.find_all("a")

In [154]:
# Swap each anchor tag for its href value (None when the tag has no href)
links = [anchor.get("href") for anchor in links]

In [155]:
# Keep only the links to the all-competitions shooting match log, skipping missing hrefs
links = [href for href in links if href and "all_comps/shooting/" in href]

In [156]:
links

['/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [157]:
# Download the HTML of the shooting stats page (the first matching link; the
# filtered list holds repeated copies of the same path).
# The timeout avoids an indefinite hang and raise_for_status() fails fast on an
# HTTP error response.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [158]:
# Parse the shooting page and keep the first table whose text matches "Shooting"
shooting = pd.read_html(
    StringIO(data.text),
    match="Shooting",
)[0]

## Cleaning and merging scraped data with Pandas

In [159]:
shooting.head()

Unnamed: 0_level_0,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,,,0,0,,,,,,Match Report
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,13.9,0.0,0,0,1.9,1.9,0.12,1.1,1.1,Match Report
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,,,0,0,,,,,,Match Report
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,17.9,0.0,0,0,1.0,1.0,0.07,0.0,0.0,Match Report
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,17.3,2.0,0,1,3.5,2.8,0.1,-1.5,-0.8,Match Report


In [160]:
# Drop the top level of the two-level column header so columns can be selected by
# their plain names ("Date", "Sh", ...).
# The nlevels guard makes this cell safe to re-run: once the header is flat,
# calling droplevel() again would raise because the only level cannot be removed.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [161]:
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,,,0,0,,,,,,Match Report
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,13.9,0.0,0,0,1.9,1.9,0.12,1.1,1.1,Match Report
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,,,0,0,,,,,,Match Report
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,17.9,0.0,0,0,1.0,1.0,0.07,0.0,0.0,Match Report
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,17.3,2.0,0,1,3.5,2.8,0.1,-1.5,-0.8,Match Report


In [162]:
# Join the fixtures table with the selected shooting columns, matching rows on "Date"
team_data = matches[0].merge(
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

## Scraping data for multiple seasons and teams with a loop

In [163]:
# Scale the method up to scrape several teams across several seasons.
# Seasons to scrape, newest first.
years = [2024, 2023]

In [164]:
# Accumulator for the scraped results: one DataFrame per team per season
all_matches: list = []

In [165]:
# Starting page for the scraping loop, which overwrites standings_url as it walks
# back through earlier seasons
standings_url = (
    "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"
)

In [None]:
import time

# Scrape the match log (fixtures merged with shooting stats) of every team listed on
# the standings page, once per entry in `years`.
#
# Side effects to be aware of when re-running this cell:
#   * `standings_url` is overwritten with the previous season's URL on each pass, so
#     it must be reset before the loop is run again.
#   * results are appended to `all_matches`, so clear that list first to avoid
#     duplicated rows.
for year in years:
    # Download the standings page; fail fast on a stalled connection or HTTP error
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()
    # Parse the HTML with an explicitly named parser so results are reproducible
    soup = BeautifulSoup(data.text, "html.parser")
    # "table" is the tag name and "stats_table" is the CSS class; keep the first match
    standings_table = soup.select('table.stats_table')[0]

    # Pull the href of every anchor in the table and keep only the squad links.
    # `l and` skips anchors that have no href (None) instead of raising a TypeError.
    links = [a.get("href") for a in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Queue up the previous season's standings page for the next pass of the loop
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # Scrape each team's match log individually
    for team_url in team_urls:
        # Last path segment minus the "-Stats" suffix, with dashes turned into spaces
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        # Pause after every request (not only after a successful merge) so skipped
        # teams are throttled too and the site is less likely to block the scraper
        time.sleep(1)

        # Locate the link to the all-competitions shooting match log
        soup = BeautifulSoup(data.text, "html.parser")
        links = [a.get("href") for a in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]

        # Download the shooting stats page and read its table into a DataFrame
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # Flatten the two-level column header; skipped if the header is already flat
        if shooting.columns.nlevels > 1:
            shooting.columns = shooting.columns.droplevel()
        time.sleep(1)

        # `matches` is already a DataFrame here (note the [0] above), so merge it
        # directly; indexing it with [0] again would look for a column named 0.
        # Teams whose shooting table lacks the expected columns (KeyError) or cannot
        # be merged (ValueError) are skipped, and the skip is reported.
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except (KeyError, ValueError) as error:
            print(f"Skipping {team_name} ({year}): {error}")
            continue

        # Keep only Premier League games. copy() gives an independent frame so the
        # column assignments below do not trigger SettingWithCopyWarning.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

In [None]:
# Stack the per-team, per-season DataFrames into a single DataFrame
match_df = pd.concat(objs=all_matches)

In [None]:
# Lower-case every column name.
# The assignment target must be `match_df` (the frame built by pd.concat); the
# previous target `matches_df` was never defined and raised a NameError.
match_df.columns = [name.lower() for name in match_df.columns]

In [None]:
# Save the combined match data to a CSV file named matches.csv
match_df.to_csv(path_or_buf="matches.csv")