In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
import pandas as pd

In [4]:
from io import StringIO

In [5]:
import time

In [6]:
# Accumulator for the scraped results: the scraping loop appends one
# DataFrame per team and season to this list.
all_matches = []

# Season labels to scrape, newest first. The order matters because the
# scraping loop starts at the current standings page and follows the
# "previous season" link once per entry.
years = sorted(range(2021, 2025), reverse=True)

In [7]:
# Starting point for the scrape. The scraping loop overwrites this with the
# "previous season" URL after each standings page it parses, so re-run this
# cell before re-running the loop.
standings_url = (
    "https://fbref.com/en/comps/9/Premier-League-Stats"
)

In [8]:
# Scrape match and shooting data for every team in each season.
#
# Starts at `standings_url` and walks backwards through the "previous season"
# link found on each standings page. For every team linked from the standings
# table, the "Scores & Fixtures" table is merged with the team's shooting
# table on "Date", filtered to Premier League rows, tagged with season and
# team name, and appended to `all_matches`.

REQUEST_DELAY_SECONDS = 10  # pause after every request to keep the request rate low


def _fetch(url):
    """GET `url`, then pause so consecutive requests stay spaced out.

    The pause happens here (rather than at the end of the team loop) so that
    it is never skipped by an early `continue` and also applies between the
    team-page request and the shooting-page request.

    Returns the `requests.Response`; callers check `status_code` themselves.
    """
    response = requests.get(url)
    time.sleep(REQUEST_DELAY_SECONDS)
    return response


for year in years:
    # Request the standings page for this season
    standings_response = _fetch(standings_url)
    print(f"Fetching data for {year} - Status: {standings_response.status_code}")
    if standings_response.status_code != 200:
        print(f"Failed to fetch standings for {year}.")
        # `standings_url` only advances after a successful fetch. Carrying on
        # would re-fetch this same page under the next (older) season label
        # and mislabel its data, so stop and keep what was collected so far.
        break

    # Parse the page with BeautifulSoup
    standings_soup = BeautifulSoup(standings_response.text, 'html.parser')

    # Extract the standings table (index 0 for first table)
    standings_table = standings_soup.select('table.stats_table')[0]

    # Extract all the team URLs; anchors without an href yield None and are skipped
    squad_links = [a.get("href") for a in standings_table.find_all('a')]
    squad_links = [l for l in squad_links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in squad_links]

    # Get the URL for the previous season to continue scraping
    previous_season = standings_soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # Loop through each team URL
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Request team data (matches & fixtures)
        team_response = _fetch(team_url)
        if team_response.status_code != 200:
            print(f"Failed to fetch data for {team_name}.")
            continue

        # Parse matches and fixtures table
        matches = pd.read_html(StringIO(team_response.text), match="Scores & Fixtures")[0]

        # Locate the link to the team's shooting page
        team_soup = BeautifulSoup(team_response.text, 'html.parser')
        shooting_links = [a.get("href") for a in team_soup.find_all('a')]
        shooting_links = [l for l in shooting_links if l and 'all_comps/shooting/' in l]

        if not shooting_links:
            # Previously skipped silently; report it so gaps in the data are visible
            print(f"No shooting link found for {team_name}.")
            continue

        # Request and parse the shooting data
        shooting_data_url = f"https://fbref.com{shooting_links[0]}"
        shooting_response = _fetch(shooting_data_url)
        if shooting_response.status_code != 200:
            # Don't try to parse an error page as a stats table
            print(f"Failed to fetch shooting data for {team_name}.")
            continue

        shooting = pd.read_html(StringIO(shooting_response.text), match="Shooting")[0]
        # The table is read with two header rows; drop the outer grouping level
        shooting.columns = shooting.columns.droplevel()

        # Merge matches with shooting data
        try:
            team_data = matches.merge(
                shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                on="Date",
            )
        except (KeyError, ValueError) as e:
            # KeyError: one of the selected shooting columns is missing;
            # ValueError: the merge itself was rejected by pandas.
            print(f"Merge failed for {team_name}: {e}")
            continue

        # Filter to only Premier League matches; copy so the column
        # assignments below write to an independent frame, not a slice
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        # Add season and team name to the data
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

Fetching data for 2024 - Status: 200
Fetching data for 2023 - Status: 200
Fetching data for 2022 - Status: 200
Fetching data for 2021 - Status: 200


In [9]:
# Combine every scraped team/season frame into one table and persist it.
if not all_matches:
    print("No data available to save.")
else:
    match_df = pd.concat(all_matches)

    # Normalise the column names to lowercase
    match_df.columns = list(map(str.lower, match_df.columns))

    row_count = match_df.shape[0]
    print(f"Number of rows in match_df: {row_count}")

    # Write the combined table to disk without the index column
    match_df.to_csv("matches_2020_2024.csv", index=False)

Number of rows in match_df: 2500
