Web Data Scraping (EPL Football Matches)
2021 - 2025

Reference List:
[Dataquest](https://www.youtube.com/watch?v=Nt7WJa2iu0s)
[fbref.com](https://fbref.com/en/comps/9/Premier-League-Stats)

In [5]:
import requests
import cloudscraper
import pandas as pd

In [6]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
scraper = cloudscraper.create_scraper()
data = scraper.get(standings_url)
print(data)
data = scraper.get(standings_url).text

<Response [429]>


In [None]:
print(data)

In [None]:
from bs4 import BeautifulSoup
import time

In [None]:
soup = BeautifulSoup(data)

In [None]:
standings_table = soup.select('table.stats_table')[0]

In [None]:
# find all a tags in the table (find_all finds tags)
links = standings_table.find_all('a')

In [None]:
# get the href property of each link via a list comprehension

# go through each a element and find the value of each href property
links = [l.get("href") for l in links]

# get the squad links only
links = [l for l in links if '/squads' in l]
print(links)

In [None]:
# transform links to full URLs
team_urls = [f"https://fbref.com{l}" for l in links]
print(team_urls)

Extract match stats (pandas and cloudscraper)

In [None]:
team_url = team_urls[0]
# Liverpool
# team_url = team_urls[3]

# get the html from team_url
data = scraper.get(team_url).text
print(data)

In [None]:
from io import StringIO

matches = pd.read_html(StringIO(data), match="Scores & Fixtures")

In [None]:
matches[0]

Get match shooting stats with cloudscraper and pandas

In [None]:
# find all a tags on the page
soup = BeautifulSoup(data)
links = soup.find_all('a')

In [None]:
# get the actual URL of the link
links = [l.get("href") for l in links]

# filter links to get the shooting stats link
links = [l for l in links if l and 'all_comps/shooting/' in l]

In [None]:
links

In [None]:
# download data
data = scraper.get(f"https://fbref.com{links[0]}")

In [None]:
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
print(shooting)

In [None]:
shooting.head()

In [None]:
# drop one index level to index based on the heading (drop the top index level)
shooting.columns = shooting.columns.droplevel()

shooting.head()

In [None]:
shooting["Date"]

In [None]:
# merge match and shooting stats dataframes together based on the date
team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [None]:
team_data.head()

In [None]:
team_data

Scraping data for multiple seasons and teams with a for loop

In [None]:
# list of target years (going backwards)
years = list(range(2025,2020,-1))
years

In [None]:
# list to hold DataFrames that contain match logs of each team in a season
all_matches = []

In [None]:
import time

# find starting URL
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# loop through all the years
for year in years:
    data = scraper.get(standings_url).text
    # parse with BeautifulSoup
    soup = BeautifulSoup(data)

    # look for stats_table as it holds individual team links for individual match data of each team
    standings_table = soup.select('table.stats_table')[0]
    # find all a tags in the table (find_all finds tags)
    links = standings_table.find_all('a')

    # get the href property of each link via a list comprehension
    #  go through each a element and find the value of each href property
    links = [l.get("href") for l in links]

    #  get the squad links only
    links = [l for l in links if '/squads/' in l]  
    # transform links to full URLs
    team_urls = [f"https://fbref.com{l}" for l in links]

    # grab URL for the previous season (select the first anchor tag with a prev class)
    # and get the href property
    previous_season = soup.select("a.prev")[0].get("href")
    # convert to an absolute URL
    standings_url = f"https://fbref.com/{previous_season}"

    
    # loop through each team URL and individually scrape the match logs of each team
    for team_url in team_urls:
        # get the team name by splitting by /, remove "-Stats" with nothing, and replace "-" with space
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # get the team URL
        data = scraper.get(team_url)
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        # find all a tags on the page
        soup = BeautifulSoup(data.text)
        # get the actual URL of the link
        links = [l.get("href") for l in soup.find_all('a')]
        # filter links to get the shooting stats link
        links = [l for l in links if l and 'all_comps/shooting/' in l]

        if not links:  # no shooting link -> skip
            continue

        # download data
        data = scraper.get(f"https://fbref.com{links[0]}")

        # try to read Shooting table stats safely
        try:
            shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        except ValueError:
            # no Shooting stats found -> skip
            continue

        # drop one index level to index based on the heading (drop the top index level)
        shooting.columns = shooting.columns.droplevel()

        # merge Shooting and Match stats tables
        # merge available Shooting statas with Match stats of a certain team, skip team if Shooting data is unavailable
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue

        # filter so that it only has Premier League matches
        team_data = team_data[team_data["Comp"] == "Premier League"]
        # add a Season column to distinguish the season
        team_data["Season"] = year
        # add a Team column to distinguish the team
        team_data["Team"] = team_name

        # save it to all_matches list
        all_matches.append(team_data)

        # sleep for 1 second to reduce scrape speed (to prevent from being blocked dueo to too many requests)
        time.sleep(15)


In [None]:
# combine all individual DataFrames into one
# concat func. takes a list of DataFrames as input and return a DataFrame
match_df =  pd.concat(all_matches)
# ensure all column names are lower case
match_df.columns = [c.lower() for c in match_df.columns]

# write to csv (matches.csv)
match_df.to_csv("matches.csv")