In [1]:
# Import necessary packages
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Testing scraping Data from `FBREF.COM`

### Getting `Team Stats`

In [None]:
# URL of the Premier League standings page on fbref. HTTPS is used so it
# matches the absolute team links built from it further down.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Download the standings page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting it surface later as a confusing parsing failure.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [None]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed, and so bs4 does
# not warn about having to guess one.
soup = BeautifulSoup(data.text, "html.parser")

In [None]:
# Take the first <table> on the page that carries the "stats_table" class
stats_tables = soup.select('table.stats_table')
standings_table = stats_tables[0]

In [None]:
# Gather every <a> element that sits inside the standings table
links = standings_table.find_all(name='a')

In [None]:
# Keep only the team ("/squads/") hrefs from the standings table.
# The anchors are read from standings_table rather than from `links`, so this
# cell can be re-run even though it rebinds `links` to plain strings.
# The `href and` guard skips anchors without an href, for which get() returns
# None and the substring test would raise a TypeError.
hrefs = (a.get('href') for a in standings_table.find_all('a'))
links = [href for href in hrefs if href and '/squads/' in href]

In [None]:
# Prefix each relative team link with the site root to get absolute URLs
team_urls = ["https://fbref.com" + href for href in links]

In [None]:
# Try the extraction on a single team first: the first team link found in the
# standings table.
team_url = team_urls[0]
data = requests.get(team_url, timeout=30)  # response for the team page
data.raise_for_status()  # stop here on an HTTP error rather than while parsing

In [None]:
# Build DataFrames from the tables in the team page whose text matches
# "Scores & Fixtures"; read_html returns them as a list.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in newer pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [None]:
# Preview the first five rows of the scraped fixtures table
matches[0].head(n=5)

### Getting match `Shooting Stats`

In [None]:
# Same steps as before, this time for the team's "Shooting" stats page.
soup = BeautifulSoup(data.text, "html.parser")  # explicit parser: no guessing, no warning
hrefs = (a.get('href') for a in soup.find_all('a'))
# Keep hrefs pointing at the all-competitions shooting page; anchors without an
# href (get() returns None) are skipped.
links = [href for href in hrefs if href and 'all_comps/shooting' in href]

# Important: the link was repeated 4 times on the page but is the same link,
# so the first match is enough.
# The response gets its own name so `data` still holds the team page and this
# cell can be re-run without first re-running the team-page request.
shooting_page = requests.get(f"https://fbref.com{links[0]}", timeout=30)
shooting_page.raise_for_status()
# StringIO: passing a literal HTML string to read_html is deprecated in newer pandas.
shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
shooting.head()

### Cleaning and Merging scraped data

In [None]:
# Flatten the multi-level column header of the shooting table by dropping its
# top level. The nlevels check makes the cell safe to re-run: calling
# droplevel() on columns that are already flat raises an error.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [None]:
# First five rows of the shooting table after dropping the column level
shooting.head(n=5)

In [None]:
# Join the fixtures table with the selected shooting columns, matching rows on "Date"
shooting_columns = ["Date", "Sh", "Dist", "FK", "PK", "PKatt"]
team_data = matches[0].merge(shooting[shooting_columns], on="Date")

In [None]:
# Preview of the merged dataframe for this single team
team_data.head(n=5)

In [None]:
# (rows, columns) of each dataframe; note that some rows from matches are not present in shooting
(matches[0].shape, shooting.shape)

# Scraping data for multiple seasons and teams for predictions

In [2]:
# Seasons to scrape, newest first: range(2022, 2020, -1) yields 2022 and 2021.
years = list(range(2022, 2020, -1))
all_matches = []  # one DataFrame per team and season gets appended here
# Standings page the season loop starts from (HTTPS, like the team links built from it).
standing_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Scrape fixtures and shooting stats for every team, one season per entry in `years`.
#
# `season_url` is the page that is fetched on each pass AND the variable that is
# advanced to the previous-season link, so each pass scrapes a different season.
# `standing_url` itself is never modified, so the loop always starts from the
# same page when the cell is re-run.
# NOTE(review): all_matches is only appended to; re-run the cell that resets it
# before re-running this one, otherwise rows are duplicated.
# NOTE(review): the Season label assumes the page at `standing_url` is the
# years[0] season — confirm before trusting the label.

REQUEST_DELAY_SECONDS = 2     # pause after each successful request
REQUEST_TIMEOUT_SECONDS = 30  # give up on a stalled connection instead of hanging


def _fetch_html(url):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError for a 4xx/5xx response. After a successful
    download it sleeps REQUEST_DELAY_SECONDS so consecutive requests are
    spaced out.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    time.sleep(REQUEST_DELAY_SECONDS)
    return response.text


season_url = standing_url
for year in years:
    season_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    standings_table = season_soup.select('table.stats_table')[0]  # first stats table

    # Team links: hrefs inside the standings table containing "/squads/".
    # Anchors without an href (get() returns None) are skipped.
    hrefs = (a.get('href') for a in standings_table.find_all('a'))
    team_urls = [f'https://fbref.com{href}' for href in hrefs if href and '/squads/' in href]

    # The next pass follows this page's "previous season" link.
    previous_season = season_soup.select('a.prev')[0].get('href')
    season_url = f'https://fbref.com{previous_season}'

    for team_url in team_urls:
        # Team name from the last URL segment: drop "-Stats", then remove dashes.
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', '')

        team_html = _fetch_html(team_url)
        # StringIO: passing a literal HTML string to read_html is deprecated in newer pandas.
        matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

        # Page-level objects get their own names so the season-level soup and
        # links are not overwritten inside the team loop.
        team_soup = BeautifulSoup(team_html, "html.parser")
        hrefs = (a.get('href') for a in team_soup.find_all('a'))
        shooting_links = [href for href in hrefs if href and 'all_comps/shooting/' in href]
        if not shooting_links:
            print(f"Skipping {team_name} ({year}): no shooting page link found")
            continue

        shooting_html = _fetch_html(f'https://fbref.com{shooting_links[0]}')
        shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()  # flatten the multi-level header

        # Best effort: a team whose tables cannot be merged is skipped, but the
        # skip is reported instead of passing silently.
        try:
            team_data = matches.merge(
                shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']], on='Date'
            )
        except ValueError as err:
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # Keep Premier League rows only. assign() returns a new frame, so the
        # Season/Team columns are not written onto a filtered slice
        # (which triggers pandas' SettingWithCopyWarning).
        team_data = (
            team_data[team_data['Comp'] == "Premier League"]
            .assign(Season=year, Team=team_name)
        )
        all_matches.append(team_data)


In [4]:
# Stack the per-team frames collected in all_matches into one dataframe
match_df = pd.concat(all_matches)

# Lowercase every column name so later code can refer to them without capitals
match_df.columns = list(map(str.lower, match_df.columns))

# Preview of the final dataframe to be used for predictions
match_df.head(n=5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,W,3,0,Crystal Palace,...,Match Report,,13.0,6.0,20.8,4.0,0.0,0.0,2022,Chelsea
2,2021-08-22,16:30,Premier League,Matchweek 2,Sun,Away,W,2,0,Arsenal,...,Match Report,,22.0,5.0,14.6,0.0,0.0,0.0,2022,Chelsea
3,2021-08-28,17:30,Premier League,Matchweek 3,Sat,Away,D,1,1,Liverpool,...,Match Report,,6.0,2.0,12.3,0.0,0.0,0.0,2022,Chelsea
4,2021-09-11,17:30,Premier League,Matchweek 4,Sat,Home,W,3,0,Aston Villa,...,Match Report,,12.0,4.0,22.5,0.0,0.0,0.0,2022,Chelsea
6,2021-09-19,16:30,Premier League,Matchweek 5,Sun,Away,W,3,0,Tottenham,...,Match Report,,20.0,9.0,13.2,0.0,0.0,0.0,2022,Chelsea


In [5]:
# Write the combined dataframe (index included) to matches.csv in the working directory
match_df.to_csv(path_or_buf="matches.csv")