In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [14]:
def get_matches(main_url,years):
    """Scrape every squad's "Scores & Fixtures" table, one season per entry in ``years``.

    The league page at ``main_url`` is fetched, every ``/squads/`` link found in
    its first ``table.stats_table`` is visited, and the page's ``a.prev`` link
    is then used as the league page for the next entry in ``years``.

    Args:
        main_url: URL of the league stats page for the first season to scrape.
        years: Iterable of season labels. Each label is only written into the
            ``year`` column of the tables scraped for that season; it is not
            used to build any URL, so the order must match the order in which
            ``a.prev`` links are followed (newest first).

    Returns:
        A ``(matches, export_links)`` tuple of parallel lists: ``matches[i]``
        is the DataFrame read from the squad page ``export_links[i]``, with
        lower-cased column names and an added ``year`` column.

    Raises:
        requests.HTTPError: A page answered with an error status (for example
            when the site rejects or rate-limits the request).
        IndexError: A league page has no ``table.stats_table``, or another
            season was requested but the previous page had no ``a.prev`` link.
    """
    matches = []
    export_links = []
    season_url = main_url
    for year in years:
        if season_url is None:
            # Only fail when a further season is actually requested, so the
            # oldest available season can still be scraped.
            raise IndexError(f'no previous-season link available for year {year}')

        response = requests.get(season_url, timeout=30)
        response.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        stat_table = soup.select('table.stats_table')[0]

        # Anchors without an href yield None; skip them instead of crashing
        # on the substring test.
        hrefs = [a.get('href') for a in stat_table.find_all('a')]
        links = [f'https://fbref.com{h}' for h in hrefs if h and '/squads/' in h]

        previous_season = soup.select_one('a.prev')
        if previous_season is not None and previous_season.get('href'):
            season_url = f"https://fbref.com{previous_season.get('href')}"
        else:
            season_url = None

        for l in links:
            export_links.append(l)
            print(f'{len(export_links)}: match proccesing link: {l}')
            squad_page = requests.get(l, timeout=30)
            squad_page.raise_for_status()
            # read_html expects a file-like object; passing raw HTML as a
            # string is deprecated.
            match = pd.read_html(StringIO(squad_page.text), match='Scores & Fixtures')[0]
            match.columns = [c.lower() for c in match.columns]
            match['year'] = year
            matches.append(match)

    return matches,export_links


def get_shootings(url):
    """Scrape the "Shooting" table linked from a squad page.

    The first link on ``url`` whose href contains ``/all_comps/shooting/`` is
    followed and the first table matching ``'Shooting'`` on that page is read.

    Args:
        url: Squad page URL. The team name is derived from its last path
            segment by lower-casing it, removing ``-stats`` and replacing
            ``-`` with a space.

    Returns:
        The shooting DataFrame with the top header level dropped (when the
        header has more than one level), lower-cased column names and an added
        ``team`` column holding the derived team name.

    Raises:
        requests.HTTPError: A page answered with an error status.
        IndexError: No ``/all_comps/shooting/`` link was found on the page.
    """
    team_name = url.split('/')[-1].lower().replace('-stats',"").replace('-'," ")

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    hrefs = (a.get('href') for a in soup.find_all('a'))
    shooting_paths = [h for h in hrefs if h and '/all_comps/shooting/' in h]
    if not shooting_paths:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(f'no /all_comps/shooting/ link found on {url}')

    shooting_link = f'https://fbref.com{shooting_paths[0]}'
    shooting_page = requests.get(shooting_link, timeout=30)
    shooting_page.raise_for_status()

    # read_html expects a file-like object; passing raw HTML as a string is
    # deprecated.
    shooting = pd.read_html(StringIO(shooting_page.text), match = 'Shooting')[0]
    # droplevel() raises on a single-level header, so only drop when the table
    # really has a multi-level header.
    if shooting.columns.nlevels > 1:
        shooting.columns = shooting.columns.droplevel()
    shooting.columns = [c.lower() for c in shooting.columns]
    shooting['team'] = team_name
    return shooting


def merge_tables(data1,data2,filtered_columns,on_column):
    """Left-join a column subset of ``data2`` onto ``data1``.

    Args:
        data1: Left table; all of its rows are kept.
        data2: Right table; only ``filtered_columns`` take part in the merge.
        filtered_columns: Columns of ``data2`` to keep (must include the key).
        on_column: Name of the key column(s) shared by both tables.

    Returns:
        The merged DataFrame, or ``None`` (after printing ``'not found'``)
        when the selection or merge raises ``ValueError``.
    """
    try:
        right_subset = data2[filtered_columns]
        result = pd.merge(left=data1, right=right_subset, on=on_column, how='left')
    except ValueError:
        print('not found')
        result = None
    return result



def combine_all_data(years, main_url=None, merge_columns=None):
    """Scrape match and shooting tables and combine them into one DataFrame.

    For every squad link returned by ``get_matches`` the shooting table is
    fetched with ``get_shootings``; each match table is then merged with its
    shooting table on ``'date'`` via ``merge_tables`` and the merged tables are
    concatenated.

    Args:
        years: Season labels, passed through to ``get_matches``.
        main_url: League page to start from. Defaults to the notebook-level
            ``url`` variable when omitted.
        merge_columns: Shooting-table columns to merge in. Defaults to the
            notebook-level ``columns`` variable when omitted.

    Returns:
        A single DataFrame with the rows of every successfully merged table.

    Raises:
        ValueError: Raised by ``pd.concat`` when no table could be merged.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working.
    if main_url is None:
        main_url = url
    if merge_columns is None:
        merge_columns = columns

    #get matches tables
    matches,main_links = get_matches(main_url,years)

    #get shootings tables
    shootings = []
    for count, l in enumerate(main_links, start=1):
        # Report the squad link actually being fetched, not the league URL.
        print(f'{count}: shooting proccesing link: {l}')
        shootings.append(get_shootings(l))

    #merge tables
    all_matches = []
    for match, shooting in zip(matches, shootings):
        merged = merge_tables(match, shooting, merge_columns, 'date')
        # merge_tables returns None when a merge fails; leave those out.
        if merged is not None:
            all_matches.append(merged)

    #concat all tables
    data_combined = pd.concat(all_matches)
    return data_combined


In [15]:
# League page the scrape starts from; older seasons are reached from here.
url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

# Shooting-table columns merged into each match table ('date' is the join key).
columns = [
    'date',
    'team',
    'sh',
    'sot',
    'dist',
    'fk',
    'pk',
]

# Season labels, newest first.
years = [2022, 2021]

# Scrape everything and persist the combined table.
final_data = combine_all_data(years)
final_data.to_csv('matches.csv')


1: match proccesing link: https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats
2: match proccesing link: https://fbref.com/en/squads/822bd0ba/Liverpool-Stats
3: match proccesing link: https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats
4: match proccesing link: https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats
5: match proccesing link: https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
6: match proccesing link: https://fbref.com/en/squads/19538871/Manchester-United-Stats
7: match proccesing link: https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats
8: match proccesing link: https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats
9: match proccesing link: https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats
10: match proccesing link: https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats
11: match proccesing link: https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats
12: match proccesing link: https://fbref.com/en/squads/47c64c55/

In [18]:
# (rows, columns) of the combined match/shooting table.
final_data.shape

(1915, 26)

In [20]:
# Number of rows carrying each season label.
final_data['year'].value_counts()

2021    960
2022    955
Name: year, dtype: int64

In [21]:
# Number of rows per team.
# NOTE(review): counts differ between teams, presumably because the scraped
# tables also contain non-league fixtures - confirm before modelling.
final_data['team'].value_counts()

chelsea                     120
manchester city             119
liverpool                   116
tottenham hotspur           113
leicester city              111
manchester united           110
arsenal                     103
west ham united             100
everton                      90
southampton                  89
brighton and hove albion     87
burnley                      86
crystal palace               84
wolverhampton wanderers      84
aston villa                  83
newcastle united             83
leeds united                 82
brentford                    44
norwich city                 43
fulham                       43
sheffield united             43
watford                      41
west bromwich albion         41
Name: team, dtype: int64