# Premier League Stats

In [1]:
# This project scrapes information about one team (in this case the first-place team): all of the team's match results
# and its shooting statistics, which are then joined into a single table.
#
# After that, the same steps are repeated in a loop for every team in the Premier League, and the result is saved to a CSV file.

In [2]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the Premier League stats page
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
data = requests.get(url)
# Stop here with a clear HTTP error (e.g. when the site refuses the request)
# instead of failing later with a confusing "table not found" IndexError
data.raise_for_status()
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser happens
# to be installed (and warns about it), so results can differ between machines
soup = BeautifulSoup(data.text, "html.parser")

In [4]:
# The standings table is the first table carrying the "stats_table" class;
# its anchors are collected so the teams' urls can be extracted from them
stats_tables = soup.select("table.stats_table")
standings_table = stats_tables[0]
links = standings_table.find_all("a")

In [5]:
# Build the list of absolute team urls from the anchors of the standings table.
# The hrefs are read straight from standings_table (not from `links`), so this cell
# can be re-run without first re-running the cell that created `links`.
hrefs = [a.get("href") for a in standings_table.find_all("a")]
# An anchor without an href yields None; drop those before the substring test,
# which would otherwise raise a TypeError
links = [h for h in hrefs if h and '/squads/' in h]
team_urls = [f"https://fbref.com{l}" for l in links]

In [6]:
# Select a team to inspect: the first url collected from the standings table
# (presumably the first-place team, assuming the table is ordered by rank)
team_url = team_urls[0]

In [7]:
# Fetch the team page and parse every table matching "Scores & Fixtures".
# read_html returns a list of DataFrames, so `matches` is a list here.
data = requests.get(team_url)
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [8]:
# Find the links on the team page that lead to its shooting statistics.
# The parser is named explicitly so the result does not depend on which
# optional parsers are installed (and no "guessed parser" warning is raised).
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
# Anchors without an href yield None, hence the truthiness check before the substring test
links = [l for l in links if l and 'all_comps/shooting/' in l]

In [9]:
# Download the first shooting page found; the href is appended directly to the site root
shooting_url = f"https://fbref.com{links[0]}"
data = requests.get(shooting_url)

In [10]:
# Parse the first table matching "Shooting" and drop the outermost level
# of its column header so the columns can be addressed by plain names
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
shooting.columns = shooting.columns.droplevel(0)

In [11]:
# Attach the shooting stats to the match results; rows are paired up by their "Date" value
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
team_data = pd.merge(matches[0], shooting[shooting_columns], on="Date")

In [12]:
# Set-up for collecting data from several seasons
# NOTE(review): the standings url below carries no season, while the labels in `years` are fixed.
# The saved output of this notebook shows matches dated 2023-08 labelled Season 2022, so the
# labels look out of step with the seasons actually scraped - confirm the intended convention.
years = [2022, 2021]   # season labels, newest first
all_matches = []       # the season loop appends one DataFrame per team and season
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [14]:
# Iterate over the seasons: scrape the standings page, then follow its "previous season" link.
# Re-running this cell without first re-running the set-up cell appends duplicates to
# all_matches and continues from wherever standings_url was left.
REQUEST_PAUSE_SECONDS = 5  # pause after every request; tune to the site's rate limit
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def fetch_html(page_url):
    """Return the HTML text of page_url, then pause before the next request.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status, so a
    refused request is reported as such instead of as a missing table later on.
    """
    response = requests.get(page_url)
    response.raise_for_status()
    time.sleep(REQUEST_PAUSE_SECONDS)
    return response.text


for year in years:
    # Explicit parser: the result does not depend on which optional parsers are installed
    soup = BeautifulSoup(fetch_html(standings_url), "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    # Anchors without an href yield None; skip them before the substring test
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Remember where the previous season lives for the next iteration.
    # Strip any leading slash from the href so the url never contains "//" after the host.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com/{previous_season.lstrip('/')}"

    # Same process as for the single team: find its url, read its results and merge in the shooting stats
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-","")

        team_html = fetch_html(team_url)
        matches = pd.read_html(StringIO(team_html), match = "Scores & Fixtures")[0]

        team_soup = BeautifulSoup(team_html, "html.parser")
        links = [l.get("href") for l in team_soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        # fetch_html also pauses after this request, so the shooting request
        # is no longer followed immediately by the next team's request
        shooting_html = fetch_html(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(StringIO(shooting_html), match = "Shooting")[0]
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except ValueError as err:
            # Still skip the team, but say so instead of dropping it silently
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # .copy() gives an independent frame, so adding columns below does not
        # write into a slice of the merged frame (avoids SettingWithCopyWarning)
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
    

In [16]:
# Stack the per-team frames row-wise into one frame; the existing row labels are kept as they are
match_df = pd.concat(all_matches, axis=0, ignore_index=False)

In [17]:
# Show the combined frame as the cell's output (long frames are displayed truncated)
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2022,ManchesterCity
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2022,ManchesterCity
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2022,ManchesterCity
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2022,ManchesterCity
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2022,ManchesterCity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,L,1,3,Newcastle Utd,...,Match Report,,4.0,3.0,17.3,0.0,0,0,2021,Southampton
43,2023-05-08,20:00,Premier League,Matchweek 35,Mon,Away,L,3,4,Nott'ham Forest,...,Match Report,,18.0,4.0,14.0,0.0,1,1,2021,Southampton
44,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Fulham,...,Match Report,,5.0,1.0,24.2,0.0,0,0,2021,Southampton
45,2023-05-21,14:00,Premier League,Matchweek 37,Sun,Away,L,1,3,Brighton,...,Match Report,,5.0,1.0,13.8,1.0,0,0,2021,Southampton


In [18]:
# Save the combined frame next to the notebook; the row labels are written as the first column
match_df.to_csv("matches.csv", index=True)