In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import re
import random



### Scrape match links

After scraping the match links we will sample 20 games for each season of the league and then scrape the statistcs from the web.

In [None]:
# season links for premier league and turkish super league
LEAGUES = {
    "Premier League 24-25": "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
    "Turkish Super Lig 24-25": "https://fbref.com/en/comps/26/schedule/Super-Lig-Scores-and-Fixtures",
    "Premier League 23-24": "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures",
    "Turkish Super Lig 23-24": "https://fbref.com/en/comps/26/2023-2024/schedule/2023-2024-Super-Lig-Scores-and-Fixtures"
}

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
]

def get_match_links(league_url):
    """
    This function collects the links for each game. These collected links are further used to scrape the game statistics for each one.
    """
    # send a get request to the web server
    response = requests.get(league_url)
    # beatuify the html response with BeatifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    match_links = []
    
    # select rows from the schedule table
    rows = soup.select("table.stats_table tbody tr")

    # loop through each row, meaning each game, and collect the respective game link to further 
    # scrape the game statistics for that game
    for row in rows:
        match_report = row.select_one("td[data-stat='match_report'] a")
        score = row.select_one("td[data-stat='score']")
        
        # check if match report link exists and score is not empty (meaning the match has been played)
        if match_report and score and score.text.strip():
            match_links.append("https://fbref.com" + match_report["href"])

    return match_links

In [202]:
all_matches = []
leagues = []

for league, url in LEAGUES.items():
    print(f"Scraping match links for {league}...")
    match_links = get_match_links(url)
    all_matches.extend(match_links)
    leagues.extend([league]*len(match_links))

Scraping match links for Premier League 24-25...
Scraping match links for Turkish Super Lig 24-25...
Scraping match links for Premier League 23-24...
Scraping match links for Turkish Super Lig 23-24...


In [205]:
df = pd.DataFrame([np.array(leagues),np.array(all_matches)]).T
df = df.rename({0:'league', 1:'match_link'},axis=1)
df.to_csv("match_links.csv",index=False)

### Scrape game stats

In [206]:
df = pd.read_csv("match_links.csv")
df.head()

Unnamed: 0,league,match_link
0,Premier League 24-25,https://fbref.com/en/matches/cc5b4244/Manchest...
1,Premier League 24-25,https://fbref.com/en/matches/a1d0d529/Ipswich-...
2,Premier League 24-25,https://fbref.com/en/matches/34557647/Newcastl...
3,Premier League 24-25,https://fbref.com/en/matches/71618ace/Everton-...
4,Premier League 24-25,https://fbref.com/en/matches/4efc72e4/Nottingh...


In [208]:
# randomly sample 20 samples from each season
n = 20
leagues = df['league'].unique() # get unique seasons

sampled_df = pd.DataFrame() # init empty dataframe

# loop through each season and randomly sample 20 data
for l in leagues:
    # filter the dataframe by season and randomly sample 20 datapoints
    sampled_df = pd.concat([sampled_df, df[df.league == l].sample(n)])

In [None]:
sampled_df.to_csv("samples.csv",index=False) # it only holds league info and the game web link

In [4]:
sampled_df = pd.read_csv("samples.csv")
sampled_df

Unnamed: 0,league,match_link
0,Premier League 24-25,https://fbref.com/en/matches/08966ea6/Manchest...
1,Premier League 24-25,https://fbref.com/en/matches/6c829b8f/Ipswich-...
2,Premier League 24-25,https://fbref.com/en/matches/4d72ec87/West-Ham...
3,Premier League 24-25,https://fbref.com/en/matches/c6439e5b/Arsenal-...
4,Premier League 24-25,https://fbref.com/en/matches/99eb6105/Leiceste...
...,...,...
75,Turkish Super Lig 23-24,https://fbref.com/en/matches/43c37a75/Galatasa...
76,Turkish Super Lig 23-24,https://fbref.com/en/matches/0a2f5ce0/Istanbul...
77,Turkish Super Lig 23-24,https://fbref.com/en/matches/71c034ba/Antalyas...
78,Turkish Super Lig 23-24,https://fbref.com/en/matches/0e7c77d3/Hatayspo...


In [None]:
def extract_game_stats(match_url):
    # Headers to mimic a real browser request
    HEADERS = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
     
    response = requests.get(match_url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    
    score_box = soup.find("div", {"class" : "scorebox"})
    # get home and away teams divs
    home_team_div = score_box.find_all("div", {"class" : ""})[0]
    away_team_div = score_box.find_all("div", {"class" : ""})[3]
    
    game_date = score_box.find("div", {"class" : "scorebox_meta"}).find("a").text
    
    # hold the game statistcs in a dictionary
    home_team_stats = {"team" : home_team_div.find("a").text, "score" : int(home_team_div.find("div",{"class" : "score"}).text)}
    away_team_stats = {"team" : away_team_div.find("a").text, "score" : int(away_team_div.find("div",{"class" : "score"}).text)}
    
    stats = soup.find("div", {"id": "team_stats"}) # stats div
    extra_stats = soup.find("div", {"id": "team_stats_extra"}) # extra stats div

    # get the number of yellow card statistic for home and away teams seperately
    home_yellow_cards = len(soup.find_all("div", {"class" : "cards"})[0].find_all("span", {"class" : "yellow_card"}))
    home_red_cards = len(soup.find_all("div", {"class" : "cards"})[0].find_all("span", {"class" : "yellow_red_card"})) # two yellow = red
    home_red_cards += len(soup.find_all("div", {"class" : "cards"})[0].find_all("span", {"class" : "red_card"})) # direct red card

    away_yellow_cards = len(soup.find_all("div", {"class" : "cards"})[1].find_all("span", {"class" : "yellow_card"}))
    away_red_cards = len(soup.find_all("div", {"class" : "cards"})[1].find_all("span", {"class" : "yellow_red_card"})) # two yellow = red
    away_red_cards += len(soup.find_all("div", {"class" : "cards"})[1].find_all("span", {"class" : "red_card"})) # direct red card
    
    home_team_stats['yellow_cards'] = home_yellow_cards
    home_team_stats['red_cards'] = home_red_cards

    away_team_stats['yellow_cards'] = away_yellow_cards
    away_team_stats['red_cards'] = away_red_cards
    
    # define a number patter to search in the div of html
    number_pattern = r'\d{1,3}(?:,\d{3})*\.?\d*'
    # iterate over the stats div
    for idx,s in enumerate(extra_stats):
        # if foul keyword exists, process to get the number of fouls for each team
        if "Fouls" in s.text:
            numbers = re.findall(number_pattern + "Fouls" + number_pattern, s.text)[0]
            home_fouls, away_fouls = numbers.split("Fouls")

            home_team_stats['fouls'] = int(home_fouls)
            away_team_stats['fouls'] = int(away_fouls)
        # if corners keyword exists, process to get the number of corners for each team
        if "Corners" in s.text:
            numbers = re.findall(number_pattern + "Corners" + number_pattern, s.text)[0]
            home_corners, away_corners = numbers.split("Corners")

            home_team_stats['corners'] = int(home_corners)
            away_team_stats['corners'] = int(away_corners)
        # if crosses keyword exists, process to get the number of crosses for each team
        if "Crosses" in s.text:
            numbers = re.findall(number_pattern + "Crosses" + number_pattern, s.text)[0]
            home_crosses, away_crosses = numbers.split("Crosses")

            home_team_stats['crosses'] = int(home_crosses)
            away_team_stats['crosses'] = int(away_crosses)
        # if interceptions keyword exists, process to get the number of interceptions for each team
        if "Interceptions" in s.text:
            numbers = re.findall(number_pattern + "Interceptions" + number_pattern, s.text)[0]
            home_interceptions, away_interceptions = numbers.split("Interceptions")

            home_team_stats['interceptions'] = int(home_interceptions)
            away_team_stats['interceptions'] = int(away_interceptions)
        # if offsides keyword exists, process to get the number of offsides for each team
        if "Offsides" in s.text:
            numbers = re.findall(number_pattern + "Offsides" + number_pattern, s.text)[0]
            home_offsides, away_offsides = numbers.split("Offsides")

            home_team_stats['offsides'] = int(home_offsides)
            away_team_stats['offsides'] = int(away_offsides)
            
    return game_date, home_team_stats, away_team_stats

In [8]:
stats_df = pd.DataFrame()

In [None]:
# iterate over each game
for _,row in sampled_df.iterrows():
    print(f"Scraping {row['match_link']} ...")
    # extract game statistics for the game
    date, home_stats, away_stats = extract_game_stats(row['match_link'])
    
    dic = {}
    for k,v in home_stats.items():
        dic[("Home", k)] = v
    for k,v in away_stats.items():
        dic[("Away", k)] = v

    # create data frame from the dictionary
    df = pd.DataFrame(dic, index=[0])
    df.columns = pd.MultiIndex.from_tuples(df.columns)
    df['date'] = date
    df['league'] = row['league']
    df['ball_play_time'] = np.nan # manually fill this value from mackolik mobile application
    # add the game df to the general df
    stats_df = pd.concat([stats_df, df])
    
    time.sleep(5) # sleep for 5 seconds between each request to reduce the overhead in the web server

Scraping https://fbref.com/en/matches/08966ea6/Manchester-United-Nottingham-Forest-December-7-2024-Premier-League ...
Scraping https://fbref.com/en/matches/6c829b8f/Ipswich-Town-Manchester-City-January-19-2025-Premier-League ...
Scraping https://fbref.com/en/matches/4d72ec87/West-Ham-United-Wolverhampton-Wanderers-December-9-2024-Premier-League ...
Scraping https://fbref.com/en/matches/c6439e5b/Arsenal-Southampton-October-5-2024-Premier-League ...
Scraping https://fbref.com/en/matches/99eb6105/Leicester-City-Crystal-Palace-January-15-2025-Premier-League ...
Scraping https://fbref.com/en/matches/09db2a2f/Tottenham-Hotspur-Manchester-United-February-16-2025-Premier-League ...
Scraping https://fbref.com/en/matches/837f0304/Nottingham-Forest-Wolverhampton-Wanderers-August-31-2024-Premier-League ...
Scraping https://fbref.com/en/matches/0b39252e/Wolverhampton-Wanderers-Arsenal-January-25-2025-Premier-League ...
Scraping https://fbref.com/en/matches/67a0c715/Chelsea-Manchester-City-August-18

Scraping https://fbref.com/en/matches/7f42f736/Kayserispor-Antalyaspor-January-29-2024-Super-Lig ...


In [10]:
stats_df

Unnamed: 0_level_0,Home,Home,Home,Home,Home,Home,Home,Home,Home,Away,Away,Away,Away,Away,Away,Away,Away,Away,date,league,ball_play_time
Unnamed: 0_level_1,team,score,yellow_cards,red_cards,fouls,corners,crosses,interceptions,offsides,team,...,yellow_cards,red_cards,fouls,corners,crosses,interceptions,offsides,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Manchester United,2,0,0,10,5,23,9,0.0,Nottingham Forest,...,2,0,13,3,7,8,1.0,"Saturday December 7, 2024",Premier League 24-25,
0,Ipswich Town,0,0,0,4,4,12,8,1.0,Manchester City,...,1,0,7,7,8,7,1.0,"Sunday January 19, 2025",Premier League 24-25,
0,West Ham United,2,5,0,12,11,22,8,4.0,Wolverhampton Wanderers,...,4,0,17,0,13,4,1.0,"Monday December 9, 2024",Premier League 24-25,
0,Arsenal,3,0,0,10,13,32,9,1.0,Southampton,...,3,0,9,1,7,9,0.0,"Saturday October 5, 2024",Premier League 24-25,
0,Leicester City,0,0,0,7,4,22,9,1.0,Crystal Palace,...,0,0,6,3,12,9,3.0,"Wednesday January 15, 2025",Premier League 24-25,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Galatasaray,2,4,0,21,7,23,10,4.0,Trabzonspor,...,3,0,7,1,7,8,2.0,"Saturday August 19, 2023",Turkish Super Lig 23-24,
0,İstanbulspor,0,4,0,15,0,9,12,0.0,İstanbul Başakşehir,...,0,0,9,7,21,12,3.0,"Sunday September 17, 2023",Turkish Super Lig 23-24,
0,Antalyaspor,2,2,0,22,1,14,15,,Samsunspor,...,1,0,11,4,23,8,,"Saturday September 23, 2023",Turkish Super Lig 23-24,
0,Hatayspor,1,4,0,8,4,20,4,4.0,Pendikspor,...,1,0,10,6,22,8,0.0,"Saturday January 13, 2024",Turkish Super Lig 23-24,


In [None]:
stats_df.isna().sum() # there are 2 na values for offsides, fill them manually
# also as mentioned before ball_play_time values are null, we are going to fill them manually

Home            team              0
                score             0
                yellow_cards      0
                red_cards         0
                fouls             0
                corners           0
                crosses           0
                interceptions     0
                offsides          2
Away            team              0
                score             0
                yellow_cards      0
                red_cards         0
                fouls             0
                corners           0
                crosses           0
                interceptions     0
                offsides          2
date                              0
league                            0
ball_play_time                   80
dtype: int64

In [11]:
stats_df.to_csv("dataset.csv",index=False)

In [None]:
# after manual data ingestion, we can inspect the final form of the dataframe
final_df = pd.read_csv("dataset_filled.csv")
final_df.head()

Unnamed: 0,Home,Home.1,Home.2,Home.3,Home.4,Home.5,Home.6,Home.7,Home.8,Away,...,Away.2,Away.3,Away.4,Away.5,Away.6,Away.7,Away.8,date,league,ball_play_time
0,team,score,yellow_cards,red_cards,fouls,corners,crosses,interceptions,offsides,team,...,yellow_cards,red_cards,fouls,corners,crosses,interceptions,offsides,,,
1,Manchester United,2,0,0,10,5,23,9,0,Nottingham Forest,...,2,0,13,3,7,8,1,"Saturday December 7, 2024",Premier League 24-25,54:26 / 98:00
2,Ipswich Town,0,0,0,4,4,12,8,1,Manchester City,...,1,0,7,7,8,7,1,"Sunday January 19, 2025",Premier League 24-25,63:49 / 94:03
3,West Ham United,2,5,0,12,11,22,8,4,Wolverhampton Wanderers,...,4,0,17,0,13,4,1,"Monday December 9, 2024",Premier League 24-25,53:46 / 100:23
4,Arsenal,3,0,0,10,13,32,9,1,Southampton,...,3,0,9,1,7,9,0,"Saturday October 5, 2024",Premier League 24-25,56:12 / 102:09


In [26]:
# fix the column names
for idx,col in enumerate(final_df.columns[:-3]):
    col_name = col.split(".")[0]
    col_name = col_name + "_" + final_df.iloc[0,idx]
    final_df.rename(columns={col:col_name},inplace=True)

In [29]:
final_df = final_df[1:] # remove first row that holds the column names
final_df.head()

Unnamed: 0,Home_team,Home_score,Home_yellow_cards,Home_red_cards,Home_fouls,Home_corners,Home_crosses,Home_interceptions,Home_offsides,Away_team,...,Away_yellow_cards,Away_red_cards,Away_fouls,Away_corners,Away_crosses,Away_interceptions,Away_offsides,date,league,ball_play_time
1,Manchester United,2,0,0,10,5,23,9,0,Nottingham Forest,...,2,0,13,3,7,8,1,"Saturday December 7, 2024",Premier League 24-25,54:26 / 98:00
2,Ipswich Town,0,0,0,4,4,12,8,1,Manchester City,...,1,0,7,7,8,7,1,"Sunday January 19, 2025",Premier League 24-25,63:49 / 94:03
3,West Ham United,2,5,0,12,11,22,8,4,Wolverhampton Wanderers,...,4,0,17,0,13,4,1,"Monday December 9, 2024",Premier League 24-25,53:46 / 100:23
4,Arsenal,3,0,0,10,13,32,9,1,Southampton,...,3,0,9,1,7,9,0,"Saturday October 5, 2024",Premier League 24-25,56:12 / 102:09
5,Leicester City,0,0,0,7,4,22,9,1,Crystal Palace,...,0,0,6,3,12,9,3,"Wednesday January 15, 2025",Premier League 24-25,62:22 / 98:33
