In [229]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import time
import random

In [None]:
# Site root; hrefs scraped from fbref pages are appended to this to build absolute urls.
base_url = "https://fbref.com"

In [17]:
def _browser_headers(sec_ch_ua, user_agent, sec_fetch_site):
    """
    Builds one browser-like header profile.
    Only the client-hint brand list, the user agent and the fetch-site value
    differ between profiles; all other fields are shared. Key order is fixed.
    """
    return {
        'authority': 'fbref.com',
        'cache-control': 'max-age=0',
        'sec-ch-ua': sec_ch_ua,
        'sec-ch-ua-mobile': '?0',
        'upgrade-insecure-requests': '1',
        'user-agent': user_agent,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': sec_fetch_site,
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'sec-ch-ua-platform': 'macOS',
        'accept-language': 'en-US,en;q=0.9',
    }


# Header profiles that crawl-style requests can pick from at random.
headers_list = [
    # Chrome 92 user agent.
    # NOTE(review): this user agent says Linux while the platform hint says macOS - confirm intended.
    _browser_headers(
        sec_ch_ua='"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        sec_fetch_site='none',
    ),
    # Brave 128 brand list with a Chrome 128 user agent on macOS.
    _browser_headers(
        sec_ch_ua='"Chromium";v="128", "Not;A=Brand";v="24", "Brave";v="128"',
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        sec_fetch_site='same-origin',
    ),
]


In [18]:
def crawl(url, min_delay=2, max_delay=20, request_timeout=30):
    """
    Sends a GET request to the url and returns the response.

    Before the request, sleeps for a random whole number of seconds between
    min_delay and max_delay (inclusive) to avoid hitting rate limits, and
    picks one of the header sets in headers_list at random.

    Parameters:
        url: address to fetch.
        min_delay: lower bound, in seconds, of the random pause (default 2).
        max_delay: upper bound, in seconds, of the random pause (default 20).
        request_timeout: value passed to requests.get as its timeout so a
            stalled connection raises instead of blocking forever (default 30).

    Returns:
        The response object returned by requests.get.

    Raises:
        Exception: if the response status is 429 or its body contains the
            'Rate Limited Request (429 error)' marker.
    """
    delay = random.randint(min_delay, max_delay)
    # Announce the pause before it starts so the wait is visible while it happens
    print('sleeping for ', delay)
    time.sleep(delay)

    # Choose a random set of headers to include in the request
    headers = random.choice(headers_list)

    # Send the request; without a timeout requests would wait indefinitely
    response = requests.get(url, headers=headers, timeout=request_timeout)

    # Treat either the HTTP status or the rate-limit page body as a 429
    rate_limited = (
        response.status_code == 429
        or 'Rate Limited Request (429 error)' in response.text
    )
    if rate_limited:
        raise Exception('Rate limit error')

    return response


In [249]:
def get_teams_url(url):
    """
    Collects absolute squad-page urls from a standings page.

    Fetches url with crawl, takes the first element matching
    'table.stats_table', and keeps every anchor in it whose href contains
    '/squads/'. Each kept href is prefixed with base_url.

    Parameters:
        url: address of the standings page.

    Returns:
        List of absolute url strings, in document order.

    Raises:
        IndexError: if the page has no 'table.stats_table' element.
    """
    data = crawl(url)
    # Name the parser explicitly so results do not depend on which parsers are installed
    soup = BeautifulSoup(data.text, 'html.parser')

    tables = soup.select('table.stats_table')
    if not tables:
        raise IndexError(f'no table.stats_table found at {url}')
    standings_table = tables[0]

    # Anchors without an href yield None; skip them instead of failing on the `in` test
    hrefs = (anchor.get('href') for anchor in standings_table.find_all('a'))
    teams_url = [f'{base_url}{href}' for href in hrefs if href and '/squads/' in href]

    return teams_url

In [248]:
def get_team_data(url):
    """
    Builds one frame of a team's matches joined with its shooting stats.

    Fetches the team page at url and reads the table matching
    'Scores & Fixtures'. It then follows the first link on that page whose
    href contains '/all_comps/shooting', reads the table matching 'Shooting',
    drops the top level of its column index, and merges the columns
    Date, Sh, SoT, PK, FK, PKatt and Dist onto the matches by 'Date'.

    Parameters:
        url: address of the team page.

    Returns:
        DataFrame produced by merging matches with the shooting columns on 'Date'.

    Raises:
        ValueError: if no shooting link is found on the page, or if
            pandas.read_html finds no table matching the requested text.
    """
    team_page = crawl(url)
    matches = pd.read_html(StringIO(team_page.text), match='Scores & Fixtures')[0]

    # Name the parser explicitly so results do not depend on which parsers are installed
    soup = BeautifulSoup(team_page.text, 'html.parser')
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    # lstrip('/') keeps a single slash after base_url whether or not the href starts with one
    shooting_links = [
        f"{base_url}/{href.lstrip('/')}"
        for href in hrefs
        if href is not None and '/all_comps/shooting' in href
    ]
    if not shooting_links:
        # ValueError matches what read_html raises for missing tables
        raise ValueError(f'no shooting link found on {url}')

    shooting_page = crawl(shooting_links[0])
    shooting = pd.read_html(StringIO(shooting_page.text), match='Shooting')[0]
    # The shooting table is read with a two-level header; keep only the inner level
    shooting.columns = shooting.columns.droplevel()

    shooting_columns = ['Date', 'Sh', 'SoT', 'PK', 'FK', 'PKatt', 'Dist']
    team_data = matches.merge(shooting[shooting_columns], on='Date')

    return team_data

In [None]:
# Seasons to scrape, identified by their ending year (newest first).
years = list(range(2022, 2020, -1))
all_matches = []

for year in years:
    season = f'{year - 1}-{year}'
    standing_url = f'{base_url}/en/comps/9/{season}/{season}-Premier-League-Stats'
    teams_url = get_teams_url(standing_url)

    for team in teams_url:
        # The last path segment looks like '<Team-Name>-Stats'; strip() removes the
        # space left behind once 'Stats' is dropped and dashes become spaces.
        team_name = team.split('/')[-1].replace('Stats', '').replace('-', ' ').strip()

        try:
            team_data = get_team_data(team)
            # assign() returns a new frame, so nothing is written into the filtered slice
            team_data = team_data[team_data['Comp'] == 'Premier League'].assign(
                Name=team_name,
                Season=year,
            )

            all_matches.append(team_data)
        except ValueError as err:
            # Report the skipped team instead of dropping it silently
            print(f'skipping {team_name} ({season}): {err}')
            continue

# ignore_index gives the combined frame unique row labels instead of repeating each team's own
match_df = pd.concat(all_matches, ignore_index=True)
match_df.columns = [c.lower() for c in match_df.columns]

match_df.to_csv('matches.csv')

In [None]:
# Manual check: fetch the site root through crawl() to see what the server returns.
data = crawl(base_url)
# Show only the start of the page; echoing the whole document floods the cell output.
data.text[:500]