In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import random
import time

prem_standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
#create year range
years = list(range(2024,2021, -1))

processed_year = []

#initialize every match list
all_matches = []


#loop to scrape all the data!!
for year in years:
    print(year)
    processed_year.append(year)
    data = requests.get(prem_standings_url)
    soup = BeautifulSoup(data.text, 'html.parser')
    standings_table = soup.find('table', class_ ='stats_table')
    
    # Check if standings_table is found
    if standings_table is not None:
        time.sleep(random.uniform(5, 15))


        links = [l.get("href") for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f"https://fbref.com{l}" for l in links]

        #for the previous season 
        previous_season = soup.select("a.prev")[0].get("href")
        previous_url = f"https://fbref.com{previous_season}"

        for team_url in team_urls:
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            match_data = requests.get(team_url)
            
            # Check if the request was successful
            if match_data.status_code == 200:
                matches = pd.read_html(StringIO(str(match_data.text)), match="Scores & Fixtures")
                soup = BeautifulSoup(match_data.text, 'html.parser')
                shoot_links = [l.get("href") for l in soup.find_all('a')]
                shoot_links = [l for l in shoot_links if l and 'all_comps/shooting/' in l]
                if shoot_links: # Check if shoot_links is not empty
                    shoot_data = requests.get(f"https://fbref.com{shoot_links[0]}")
                    shooting = pd.read_html(StringIO(str(shoot_data.text)), match="Shooting")[0]
                    shooting.columns = shooting.columns.droplevel()
                    try:
                        team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
                    except ValueError:
                        continue
                    team_data = team_data[team_data["Comp"] == "Premier League"]

                    team_data["Season"] = year
                    team_data["Team"] =  team_name
                    all_matches.append(team_data)
            else:
                # Handle the case where the request was not successful
                print(f"Failed to fetch data for {team_name} - Status code: {match_data.status_code}")
            
            time.sleep(10)
    else:
        print("Standings table not found. Check the website structure or your selector.")
len(all_matches)

2024
2023
2022


60

In [3]:
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Match Report,,18,5,14.8,0.0,0,0,2024,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Match Report,,19,8,13.6,1.0,0,0,2024,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Match Report,,11,3,13.4,0.0,0,0,2024,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Match Report,,14,5,14.9,0.0,0,0,2024,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Match Report,,19,12,16.6,0.0,0,0,2024,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,2024-08-31,15:00,Premier League,Matchweek 3,Sat,Away,D,1.0,1.0,Nott'ham Forest,...,Match Report,,11,2,20.7,0.0,0,0,2022,Wolverhampton Wanderers
4,2024-09-15,16:30,Premier League,Matchweek 4,Sun,Home,L,1.0,2.0,Newcastle Utd,...,Match Report,,12,5,15.2,0.0,0,0,2022,Wolverhampton Wanderers
6,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Away,L,1.0,3.0,Aston Villa,...,Match Report,,10,4,14.9,0.0,0,0,2022,Wolverhampton Wanderers
7,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Home,L,1.0,2.0,Liverpool,...,Match Report,,8,3,22.9,0.0,0,0,2022,Wolverhampton Wanderers


In [4]:
match_df.to_csv("matches.csv")