In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import drive

Below, we retrieve the data and parse it using BeautifulSoup.

In [5]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
data = requests.get(standings_url)
soup = BeautifulSoup(data.text)

Now, we want to retrieve the stats table, and retrieve the links associated with each Premier League team in the table.

In [None]:
# We want the standings table, which is a table element with a class of 'stats_table'
# .select() will return a list of matching elements, but we want the first one, thus
# we index at 0
standings_table = soup.select('table.stats_table')[0]
# We use .find_all() because .select() uses CSS selectors, and .find() and .find_all()
# search just for tags. Could use either, but not necessary
links = standings_table.find_all('a')
# Want to get href property of each link. Use list comprehension to achieve this.
# NOTE: There are other links in the table we will need to filter out
links = [l.get("href") for l in links]
# Filter out non-team links
links = [l for l in links if '/squads/' in l]
# Upon examining links, we can see that they only include the subdomains.
# Need to format links to include the main website domain.
team_urls = [f"https://fbref.com{l}" for l in links]

Scrape match statistics for each team

In [None]:
team_url = team_urls[0]
data = requests.get(team_url)

In [None]:
# Use pandas to turn the Scores & Fixtures table into a dataframe
matches = pd.read_html(data.text, match="Scores & Fixtures")
#matches[0]

Get match shooting stats

In [None]:
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'all_comps/shooting/' in l]
data = requests.get(f"https://fbref.com{links[0]}")
shooting = pd.read_html(data.text, match="Shooting")[0]
#shooting.head()
# Need to drop first index level to remove the upper level of headers
shooting.columns = shooting.columns.droplevel()

Merge the tables

In [None]:
team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
team_data.shape

(43, 25)

Scrape from multiple teams and seasons

In [6]:
years = list(range(2023, 2018, -1))
#print(years)
all_matches = []
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [7]:
for year in years:
  data = requests.get(standings_url)
  #print(data.text)
  soup = BeautifulSoup(data.text)
  standings_table = soup.select('table.stats_table')[0]
  links = [l.get("href") for l in standings_table.find_all('a')]
  links = [l for l in links if '/squads/' in l]
  team_urls = [f"https://fbref.com{l}" for l in links]

  previous_season = soup.select("a.prev")[0].get("href")
  standings_url = f"https://fbref.com{previous_season}"
  for team_url in team_urls:
    team_name = team_url.split('/')[-1].replace("-Stats", "").replace("-", " ")

    data = requests.get(team_url)
    matches = pd.read_html(data.text, match="Scores & Fixtures")

    soup = BeautifulSoup(data.text)
    links = [l.get("href") for l in soup.find_all('a')]
    links = [l for l in links if l and 'all_comps/shooting/' in l]
    data = requests.get(f"https://fbref.com{links[0]}")
    shooting = pd.read_html(data.text, match="Shooting")[0]
    shooting.columns = shooting.columns.droplevel()

    try:
      team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
    except ValueError:
      continue

    team_data = team_data[team_data["Comp"] == "Premier League"]
    team_data["Season"] = year
    team_data["Team"] = team_name
    all_matches.append(team_data)
    time.sleep(4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Team"] = team_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [None]:
match_df = pd.concat(all_matches)
#match_df
match_df.columns = [c.lower() for c in match_df.columns]
match_df

In [11]:
match_df.to_csv("matches.csv")