In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Premier League standings page on fbref; use HTTPS like every other fbref URL
# built in this notebook, instead of relying on an HTTP -> HTTPS redirect.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Download the standings page. Raise immediately on an HTTP error status so a
# blocked or failed request is not parsed as if it were the standings page.
data = requests.get(standings_url)
data.raise_for_status()

In [4]:
# Build a BeautifulSoup tree from the downloaded standings HTML
soup = BeautifulSoup(markup=data.text)

In [None]:
# Select the first <table> with CSS class "stats_table" (assumed to be the league
# standings table -- confirm against the page). Raises IndexError if none is found.
standings_table = soup.select('table.stats_table')[0]


In [None]:
# Gather every <a> element found inside the standings table
links = standings_table.find_all(name='a')

In [None]:
# Collect the href of every anchor in the standings table (None when an anchor
# has no href). Built from `standings_table` rather than from `links` itself so
# that re-running this cell does not call .get() on already-extracted strings.
links = [a.get('href') for a in standings_table.find_all('a')]

In [None]:
# Keep only squad (team page) links. Anchors without an href yield None, so
# check truthiness first to avoid a TypeError from the `in` test.
links = [l for l in links if l and '/squads/' in l]

In [None]:
# Prefix each site-relative squad link with the fbref host to get absolute URLs
team_urls = [
    f"https://fbref.com{href}"
    for href in links
]

In [None]:
# Pick the first team's URL to prototype the per-team scraping steps that follow
team_url = team_urls[0]

In [None]:
# Download the selected team's page; fail loudly on an HTTP error status
# instead of handing an error page to the parsers below.
data = requests.get(team_url)
data.raise_for_status()

In [None]:
# Parse every HTML table whose text matches "Scores & Fixtures" into a list of
# DataFrames. The HTML is wrapped in StringIO because passing a literal HTML
# string to read_html is deprecated in recent pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [None]:
# Preview the first five rows of the fixtures table
matches[0].head(5)

### getting match shooting stats with requests and pandas

In [None]:
# Parse the team page HTML so its links can be searched
soup = BeautifulSoup(markup=data.text)

In [None]:
# Gather every <a> element on the team page
links = soup.find_all(name='a')

In [None]:
# Collect the href of every anchor on the team page (None when absent). Built
# from `soup` rather than from `links` itself so the cell can be re-run safely.
links = [a.get('href') for a in soup.find_all('a')]

In [None]:
# Keep only hrefs pointing at the all-competitions shooting page;
# the truthiness check skips anchors that had no href (None)
links = [
    href
    for href in links
    if href and 'all_comps/shooting' in href
]

In [None]:
# Inspect the result: the link was repeated 4 times, but it is the same link each time
links

In [None]:
# Download the team's shooting page (first matching link); fail loudly on an
# HTTP error status instead of handing an error page to read_html.
data = requests.get(f"https://fbref.com{links[0]}")
data.raise_for_status()

In [None]:
# Parse the first table matching "Shooting" into a DataFrame. The HTML is
# wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]


In [None]:
# Preview the first five rows of the raw shooting table
shooting.head(5)

### cleaning and merging scraped data with pandas

In [None]:
# Flatten the shooting table's multi-level column header by dropping the outer level.
# Guarded so that re-running this cell is a no-op instead of raising on
# columns that have already been flattened.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [None]:
# Preview the shooting table again after flattening its column header
shooting.head(5)

In [None]:
# Attach per-match shooting stats to the fixtures table, joined on match date
# (merge defaults to an inner join, so only dates present in both frames remain).
# The column list includes "SoT" so it matches the merge in the multi-season loop.
team_data = matches[0].merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [None]:
# Preview the first five rows of the merged fixtures + shooting table
team_data.head(5)

In [None]:
# Compare (rows, columns) of the fixtures table and the shooting table
(matches[0].shape, shooting.shape)

### scraping data for multiple seasons and teams with a loop

In [24]:
# Season labels to assign, newest first: 2022 down to 2018
years = [*range(2022, 2017, -1)]
# Accumulator for the per-team, per-season DataFrames
all_matches = list()
# Standings page the season walk starts from
standing_url = "http://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Scrape Premier League fixtures + shooting stats for every team in each season.
#
# The walk starts at the standings page in `standing_url` and follows that page's
# "previous season" link once per entry in `years`. `years` must therefore be
# ordered newest-first, and its first entry is assumed to be the season the
# starting page currently shows -- TODO confirm before trusting the Season column.
season_url = standing_url
all_matches = []  # reset so re-running this cell does not append duplicate rows

for year in years:
    season_page = requests.get(season_url)
    season_page.raise_for_status()  # fail loudly on 4xx/5xx (e.g. rate limiting)
    season_soup = BeautifulSoup(season_page.text)
    standings_table = season_soup.select('table.stats_table')[0]

    # Anchors without an href yield None, so check truthiness before the `in` test.
    squad_links = [a.get('href') for a in standings_table.find_all('a')]
    squad_links = [href for href in squad_links if href and '/squads/' in href]
    team_urls = [f'https://fbref.com{href}' for href in squad_links]

    # Advance to the previous season. This must feed the request made at the top
    # of the next iteration; otherwise every `year` re-scrapes the same page.
    previous_season = season_soup.select('a.prev')[0].get('href')
    season_url = f'https://fbref.com{previous_season}'

    for team_url in team_urls:
        # Throttle at the top of the iteration so skipped teams are paced too.
        time.sleep(3)

        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', '')
        team_page = requests.get(team_url)
        team_page.raise_for_status()
        # Distinct names (team_matches / team_shooting) keep this loop from
        # clobbering the `matches` / `shooting` objects of the exploratory cells.
        team_matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        team_soup = BeautifulSoup(team_page.text)
        shooting_links = [a.get('href') for a in team_soup.find_all('a')]
        shooting_links = [href for href in shooting_links if href and 'all_comps/shooting/' in href]
        if not shooting_links:
            # No shooting page linked for this team/season: nothing to merge.
            print(f"Skipping {team_name} ({year}): no shooting page link found")
            continue

        shooting_page = requests.get(f'https://fbref.com{shooting_links[0]}')
        shooting_page.raise_for_status()
        team_shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        # The shooting table comes back with multi-level columns; drop the outer level.
        team_shooting.columns = team_shooting.columns.droplevel()

        try:
            team_data = team_matches.merge(
                team_shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']],
                on='Date',
            )
        except (KeyError, ValueError) as err:
            # Selecting a column the table lacks raises KeyError (not ValueError),
            # so both are treated as "no usable shooting stats for this team".
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # .copy() so the column assignments below act on an independent frame
        # rather than on a filtered view (avoids SettingWithCopyWarning).
        team_data = team_data[team_data['Comp'] == "Premier League"].copy()
        team_data['Season'] = year
        team_data['Team'] = team_name
        all_matches.append(team_data)


In [None]:
# Stack every team/season frame into one table with lower-cased column names,
# then preview the first rows
match_df = pd.concat(all_matches).rename(columns=str.lower)
match_df.head(5)

In [None]:
# Persist the combined table to disk (the DataFrame index is written as the first column)
match_df.to_csv(path_or_buf="matches.csv")