## Scraping Premier League Data

In [None]:
# import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

#### Data URL

In [None]:
# FBref Premier League stats page; the scraping loop requests this URL to
# locate the standings table and the links to each team's page
prem_standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

#### Get Data for Each Team and Previous Years

In [None]:
# StringIO lets pandas parse HTML that was already downloaded; passing the
# raw string to read_html is deprecated in recent pandas versions
from io import StringIO

# seasons used to label the scraped rows, newest first
years = list(range(2024, 2022, -1))

# seconds to pause after every request so the site is not hammered
request_delay = 10

#initialize every match list
prem_team_matches = []

# standings page for the season currently being scraped; it starts at the
# configured URL and is moved to the "previous season" link at the end of
# each pass so that every year reads a different page
standings_url = prem_standings_url

# loop scrape
for year in years:

    data = requests.get(standings_url)
    # fail loudly on rate limiting / error pages instead of parsing them
    data.raise_for_status()
    time.sleep(request_delay)
    soup = BeautifulSoup(data.text, "html.parser")

    # filtering: keep only the squad links found in the first stats table;
    # anchors without an href yield None and are dropped
    prem_standings_table = soup.select('table.stats_table')[0]
    team_links = [l.get("href") for l in prem_standings_table.find_all('a')]
    team_links = [l for l in team_links if l and '/squads/' in l]

    # get urls
    team_urls = [f"https://fbref.com{l}" for l in team_links]

    # follow the "previous season" link on the next pass of the outer loop
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # scrape each individual team
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # download the team page and pause before the next request
        team_response = requests.get(team_url)
        team_response.raise_for_status()
        time.sleep(request_delay)

        # use pandas to extract the teams' match data
        matches = pd.read_html(StringIO(team_response.text),
                               match="Scores & Fixtures")

        # find the link to the team's shooting page
        temp_soup = BeautifulSoup(team_response.text, "html.parser")
        temp_links = [l.get("href") for l in temp_soup.find_all('a')]
        temp_links = [l for l in temp_links if l and 'all_comps/shooting/' in l]
        if not temp_links:
            # without shooting stats there is nothing to merge for this team
            print(f"No shooting link found for {team_name} ({year}), skipping")
            continue

        # get shooting data; sleep right after the request so the pause
        # still happens when the merge below is skipped
        shooting_data = requests.get(f"https://fbref.com{temp_links[0]}")
        shooting_data.raise_for_status()
        time.sleep(request_delay)
        shooting = pd.read_html(StringIO(shooting_data.text), match="Shooting")[0]

        # drop first unneeded level
        shooting.columns = shooting.columns.droplevel()

        try:
            # merge both dataframes
            team_data = matches[0].merge(shooting[["Date", "Sh",
                                                   "SoT", "Dist",
                                                   "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            print(f"Could not merge shooting data for {team_name} ({year}), skipping")
            continue

        # filter just premier league; copy so the column assignments below
        # write to an independent frame rather than a slice of the merge
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        prem_team_matches.append(team_data)

#### Combine All Dataframes

In [None]:
# stack every scraped team/season frame row-wise into a single table
match_df = pd.concat(prem_team_matches, axis=0)

#### Make All Columns Lowercase

In [None]:
# normalise the header: every column name becomes its lowercase form
match_df.columns = list(map(str.lower, match_df.columns))

#### Export Data as CSV File for Prediction Part of Project

In [None]:
# write the combined table (index included) to disk for the prediction step
match_df.to_csv("prem_matches.csv", index=True)