### Scrape PL Data From Multiple Seasons

In [1]:
import time
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
scraper = cloudscraper.create_scraper()

In [3]:
years = list(range(2025, 2022, -1)) # get previous three seasons
all_matches = [] # store match DFs in a list
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
for year in years:

    # Get the standings table from fbref.com
    data = scraper.get(standings_url) # get the url for page with PL table
    soup = BeautifulSoup(data.text) # parse through html with bs4 library
    time.sleep(5) # sleep for 5 second to prevent webpage denying access for bot suspicsion 
    standings_table = soup.select('table.stats_table')[0] # grab first table instance on page (table_class.name_of_table)

    # Look for Individual team links in table
    links = standings_table.find_all('a') 
    links = [l.get("href") for l in links] 
    links = [l for l in links if '/squads' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Extract match stats using Pandas and CloudScraper
    team_url = team_urls[0] 
    match_data = scraper.get_(team_url) # Get html from team page now
    matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0] # Look for the Scores & Fixtures table

    # Get match and shooting stats
    soup = BeautifulSoup(match_data.text)
    links = soup.find_all('a')
    links = [l.get("href") for l in links]
    links = [l for l in links if l and "all_comps/shooting/" in l]
    shooting_data = scraper.get(f"https://fbref.com/{links[0]}") # download shooting stats
    shooting = pd.read_html(shooting_data.text, match="Shooting")[0]

    # Clean and merge scraped data
    shooting.columns = shooting.columns.droplevel() # drop index level se we can filter based on columns

    try: # shooting data may be empty for some teams
        team_data = matches.merge(shooting_data["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"])
    except ValueError:
        continue # ignore team
    

    previous_season = soup.select("a.prev")[0].get("href")


### Testing

In [5]:
data = scraper.get(standings_url) 
soup = BeautifulSoup(data.text) 
time.sleep(5) 
standings_table = soup.select('table.stats_table')[0]

# Look for Individual team links in table
links = standings_table.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if '/squads/' in l]

In [6]:
team_urls = [f"https://fbref.com{l}" for l in links]
team_urls

['https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/s

In [7]:
team_url = team_urls[0] 
match_data = scraper.get(team_url)
matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0]
matches.head()

  matches = pd.read_html(match_data.text, match = "Scores & Fixtures")[0]


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,0.5,62.0,30014.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson,Match Report,
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,0.5,62.0,60017.0,Virgil van Dijk,4-2-3-1,4-4-2,Stuart Attwell,Match Report,
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,1.4,47.0,73738.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor,Match Report,
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,0.4,68.0,60344.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Michael Oliver,Match Report,
4,2024-09-17,21:00,Champions Lg,League phase,Tue,Away,W,3,1,it Milan,3.1,0.6,51.0,59826.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Espen Eskås,Match Report,


In [17]:
soup = BeautifulSoup(match_data.text)
links = soup.find_all("a")
links = [l.get("href") for l in links]
links = [l for l in links if l and "all_comps/shooting/" in l]
# shooting_data = scraper.get(f"https://fbref.com/{links[0]}")
# shooting = pd.read_html(shooting_data.text, match = "Shooting")[0]
# shooting.head()
links[0]

'/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions'

In [None]:
0