### Scrape PL Data From Multiple Seasons

In [1]:
import time
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
scraper = cloudscraper.create_scraper()

In [3]:
years = list(range(2025, 2022, -1)) # get previous three seasons
all_matches = [] # store match DFs in a list
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
for year in years:

    # Get the standings table from fbref.com
    data = scraper.get(standings_url) # get the url for page with PL table
    soup = BeautifulSoup(data.text) # parse through html with bs4 library
    time.sleep(5) 
    standings_table = soup.select('table.stats_table')[0] # grab first table instance on page (table_class.name_of_table)

    # Look for Individual team links in table
    links = standings_table.find_all('a') 
    links = [l.get("href") for l in links] 
    links = [l for l in links if '/squads' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    previous_season = soup.select('a.prev')[0].get('href') # get the standings url for the previous season
    standings_url = f"https://fbref.com/{previous_season}" # update standings_url to be the url for the previous season page

    # Loop thru each team url
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ") # clean up team name 

        # Get matches data
        match_data = scraper.get(team_url) # go to page to get Scores & Fixtures table 
        matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table

        # Get shooting data 
        soup = BeautifulSoup(match_data.text)
        time.sleep(5)
        links = soup.find_all('a')
        links = [l.get("href") for l in links]
        links = [l for l in links if l and "all_comps/shooting/" in l]
        shooting_data = scraper.get(f"https://fbref.com/{links[0]}") # download shooting stats
        shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # convert data to pd df

        # Clean and merge scraped data
        shooting.columns = shooting.columns.droplevel() # drop index level se we can filter based on columns

        try: 
            # shooting data may be empty for some teams
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue # ignore team

        team_data = team_data[team_data["Comp"] == "Premier League"]
        team_data["Season"] = year # preserve year
        team_data["Team"] = team_name # preserve team
        all_matches.append(team_data) # add each team season to all_matches list of dfs
        time.sleep(5) # help prevent 403 error 


  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table
  shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # convert data to pd df
  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table
  shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # convert data to pd df
  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table
  shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # convert data to pd df
  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table
  shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # convert data to pd df
  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table
  shooting = pd.read_html(shooting_data.text, match="Shooting")[0] # co

### Preview Final DF

In [15]:
# Join all dfs (each df is a season of matches for a given PL team)
match_df = pd.concat(all_matches) 
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2025,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2025,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2025,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2025,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,L,1,3,Newcastle Utd,...,Match Report,,4.0,3.0,17.3,0.0,0,0,2023,Southampton
43,2023-05-08,20:00,Premier League,Matchweek 35,Mon,Away,L,3,4,Nott'ham Forest,...,Match Report,,18.0,4.0,14.0,0.0,1,1,2023,Southampton
44,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Fulham,...,Match Report,,5.0,1.0,24.2,0.0,0,0,2023,Southampton
45,2023-05-21,14:00,Premier League,Matchweek 37,Sun,Away,L,1,3,Brighton,...,Match Report,,5.0,1.0,13.8,1.0,0,0,2023,Southampton


In [18]:
# Confirm there is no missing data
20 * 38 * 3

2280

In [20]:
# Lowercase column names
match_df.columns = [c.lower() for c in match_df.columns]
match_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2025,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2025,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2025,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2025,Liverpool


### Save .csv File

In [21]:
match_df.to_csv("pl_matches.csv")