#1 Setup and imports

In [51]:
import subprocess
import sys

# Install the third-party packages this notebook uses into the interpreter that
# runs this kernel (sys.executable).
# - No shell redirection: ">nul" only means "discard" on Windows cmd, and hiding
#   pip's output also hides install failures.
# - check=True raises CalledProcessError when pip fails, so "Install done" is
#   only printed after a successful install.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "--quiet",
        "pandas", "numpy", "matplotlib", "seaborn", "plotly", "scikit-learn",
        # Used by the scraping cells further down (requests.get, BeautifulSoup
        # with the "lxml" parser).
        "requests", "beautifulsoup4", "lxml",
    ],
    check=True,
)
print("Install done")

Install done


In [None]:
#Core libraries
import pandas as pd
import numpy as np 
#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px 
#Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, classification_report, accuracy_score, confusion_matrix
#Data preprocessing libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

Matplotlib is building the font cache; this may take a moment.


#2 Data Collection

- *Clubs Data :*


Points, wins, draws, losses, goals for/against, xG/xGA

In [48]:
import pandas as pd
import warnings

# Seasons are identified by the year in which they end (2025 -> 2024/2025).
seasons = [2025, 2024, 2023, 2022, 2021]
top5_standings = []

for season in seasons:
    url = f"https://fbref.com/en/comps/9/{season-1}-{season}/{season-1}-{season}-Premier-League-Stats"
    tables = pd.read_html(url)
    # The first table on the page is used as the league table.
    # NOTE(review): relies on page layout — confirm if FBref changes it.
    df = tables[0]
    # Flatten a possible MultiIndex header into single strings ("level0 level1").
    df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in df.columns.values]

    # Select the first 5 rows (top 5 teams) and tag them with the season label.
    # .assign() returns a new frame, so nothing is written into a slice of `df`
    # (the original `top5.loc[:, 'Season'] = ...` on df.head(5) is the
    # SettingWithCopy pattern, which was only hidden by a global
    # warnings.filterwarnings("ignore")).
    top5 = df.head(5).assign(Season=f"{season-1}/{season}")
    top5_standings.append(top5)

# One frame with 5 rows per season, re-indexed 0..N-1.
standings_df = pd.concat(top5_standings, ignore_index=True)
print(standings_df[['Season', 'Squad', 'Pts', 'W', 'D', 'L', 'GF', 'GA', 'GD','Rk']])

       Season            Squad  Pts   W   D   L  GF  GA  GD  Rk
0   2024/2025        Liverpool   84  25   9   4  86  41  45   1
1   2024/2025          Arsenal   74  20  14   4  69  34  35   2
2   2024/2025  Manchester City   71  21   8   9  72  44  28   3
3   2024/2025          Chelsea   69  20   9   9  64  43  21   4
4   2024/2025    Newcastle Utd   66  20   6  12  68  47  21   5
5   2023/2024  Manchester City   91  28   7   3  96  34  62   1
6   2023/2024          Arsenal   89  28   5   5  91  29  62   2
7   2023/2024        Liverpool   82  24  10   4  86  41  45   3
8   2023/2024      Aston Villa   68  20   8  10  76  61  15   4
9   2023/2024        Tottenham   66  20   6  12  74  61  13   5
10  2022/2023  Manchester City   89  28   5   5  94  33  61   1
11  2022/2023          Arsenal   84  26   6   6  88  43  45   2
12  2022/2023   Manchester Utd   75  23   6   9  58  43  15   3
13  2022/2023    Newcastle Utd   71  19  14   5  68  33  35   4
14  2022/2023        Liverpool   67  19 

Squad market value

In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Send a browser-like User-Agent header to avoid the site's blocking of
# automated clients.
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2025"
headers = {"User-Agent": "Mozilla/5.0"}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
if table is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError("Could not find the 'items' table on the Transfermarkt page")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    cells = row.find_all("td")
    # Only rows with at least 7 cells carry the three values read below.
    if len(cells) >= 7:
        # Cell positions (1 = club, 5 = average value, 6 = total value) follow
        # the page layout — NOTE(review): confirm if the page changes.
        club = cells[1].get_text(strip=True)
        avg_value = cells[5].get_text(strip=True)
        total_value = cells[6].get_text(strip=True)
        data.append([club, avg_value, total_value])

# Values are kept as the scraped text (e.g. "€1.34bn"); no numeric conversion here.
sm_df = pd.DataFrame(data, columns=["Club", "Average Market Value", "Total Market Value"])
print(sm_df.head(10))


                     Club Average Market Value Total Market Value
0         Manchester City              €39.37m            €1.34bn
1              Chelsea FC              €28.92m            €1.30bn
2              Arsenal FC              €46.88m            €1.17bn
3            Liverpool FC              €37.19m            €1.08bn
4       Tottenham Hotspur              €25.00m           €849.90m
5       Manchester United              €25.54m           €817.20m
6        Newcastle United              €22.83m           €684.88m
7  Brighton & Hove Albion              €16.62m           €664.60m
8             Aston Villa              €19.30m           €559.80m
9          Crystal Palace              €15.38m           €461.30m


Manager changes

In [95]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.transfermarkt.com/premier-league/trainerwechsel/wettbewerb/GB1/plus/?saison_id=2024"
headers = {"User-Agent": "Mozilla/5.0"}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
if table is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError("Could not find the 'items' table on the Transfermarkt page")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    cells = row.find_all("td")
    # The new manager is read from cells[10], so a usable row needs at least
    # 11 cells. The previous guard (< 7) let rows with 7-10 cells through,
    # which then raised IndexError below.
    if len(cells) < 11:
        continue

    # The club name is the title attribute of the <a> tag in the first cell;
    # fall back to the link text, or to the cell text when there is no link.
    club_tag = cells[0].find("a")
    club = club_tag.get("title", club_tag.text.strip()) if club_tag else cells[0].get_text(strip=True)

    # The new manager is located in the 11th cell; strip surrounding whitespace.
    new_manager = cells[10].get_text(strip=True)

    data.append([club, new_manager])

# Only the first four rows are kept.
# NOTE(review): the reason for the limit of 4 is not visible here — confirm.
managers_df = pd.DataFrame(data, columns=["Club", "New Manager"]).head(4)
print(managers_df)

                Club      New Manager
0       Brentford FC    Keith Andrews
1     Leicester City  Martí Cifuentes
2  Tottenham Hotspur     Thomas Frank
3     Southampton FC       Will Still


Transfers in/out (value, type)



Team strength metrics

- *Player Data :*

Goals, xG, shots, minutes played

In [26]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup
import json 

seasons = [2024, 2023, 2022, 2021, 2020]
players_data = []
for season in seasons:
    url = f"https://understat.com/league/EPL/{season}"

    # Best effort per season: a failure is reported and the loop moves on.
    # The request itself is inside the try, with a timeout and a status check,
    # so a stalled or failing request for one season does not abort the rest.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        scripts = soup.find_all('script')

        # The player payload is read from the 4th <script> tag.
        # NOTE(review): positional lookup depends on the page layout — confirm.
        strings = scripts[3].string

        # The JSON sits between "('" and "');" as an escaped string literal;
        # unicode_escape turns the escape sequences back into characters.
        ind_start = strings.index("('")+2
        ind_end = strings.index("');")
        json_data = strings[ind_start:ind_end]
        json_data = json_data.encode('utf-8').decode('unicode_escape')
        data = json.loads(json_data)

        for player in data[:50]: # Limit to first 50 players for brevity
            player_info = {
                "Season": season,
                "name" : player['player_name'],
                "Minutes played": player['time'],
                "Games played": player['games'],
                "Goals": player['goals'],
                "Assists": player['assists'],
                "xG": player['xG'],
                "xA": player['xA'],
                "Shots": player['shots'],
                "Key Passes": player['key_passes'],
                "npg": player['npg'],
                "npxG": player['npxG'],
            }
            players_data.append(player_info)
    except (requests.RequestException, IndexError, AttributeError,
            TypeError, ValueError, KeyError) as e:
        # Covers network/HTTP errors, a missing or empty script tag, missing
        # markers, malformed JSON and missing player fields. Other exception
        # types (i.e. programming errors) are no longer swallowed.
        print(f"Error processing season {season}: {e}")

if not players_data:
    # Later cells index players_df['name']; fail here with a clear message
    # rather than with a KeyError on an empty frame.
    raise RuntimeError("No player data could be collected from Understat")

# Values are stored exactly as delivered by the payload; no dtype conversion here.
players_df = pd.DataFrame(players_data)
print(players_df)

     Season             name Minutes played Games played Goals Assists  \
0      2024    Mohamed Salah           3392           38    29      18   
1      2024   Alexander Isak           2822           34    23       6   
2      2024   Erling Haaland           2749           31    22       3   
3      2024       Chris Wood           3024           36    20       3   
4      2024     Bryan Mbeumo           3419           38    20       7   
..      ...              ...            ...          ...   ...     ...   
245    2020          Rodrigo           1285           26     7       2   
246    2020      Richarlison           2883           34     7       3   
247    2020    Ferrán Torres           1305           24     7       2   
248    2020  Mason Greenwood           1825           31     7       2   
249    2020      Timo Werner           2605           35     6       8   

                     xG                  xA Shots Key Passes npg  \
0     27.70626749098301  15.858334187418222

Player Position and Team

In [55]:
import requests
import pandas as pd
# Change the pandas display options to show all columns and increase the width
pd.set_option('display.max_columns', None)      # Show all columns
pd.set_option('display.width', 2000)            # Increase display width
pd.set_option('display.max_colwidth', None)     # Show full column content

url = "https://fantasy.premierleague.com/api/bootstrap-static/"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of failing later
# while decoding or indexing an error body.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# element_type id -> position label (the 'singular_name' field).
positions = {pos['id']: pos['singular_name'] for pos in data['element_types']}

# Lower-cased "first_name second_name" -> position label. When two players
# share the same full name, the later entry wins.
name_to_position = {}
for player in data['elements']:
    full_name = f"{player['first_name']} {player['second_name']}".lower()
    name_to_position[full_name] = positions.get(player['element_type'], "Position not found")

def get_player_position(player_name):
    """Look up a player's position by full name, ignoring case.

    Reads the module-level ``name_to_position`` mapping and returns
    "Position not found" when the lower-cased name is not a key in it.
    """
    key = player_name.lower()
    if key in name_to_position:
        return name_to_position[key]
    return "Position not found"

def get_player_team(player_name, bootstrap=None):
    """Return the team name of the first player matching ``player_name``.

    A player matches when ``player_name`` equals, case-insensitively, the
    player's "first_name second_name", ``web_name``, ``first_name`` or
    ``second_name``.

    Parameters
    ----------
    player_name : str
        Name to look up.
    bootstrap : dict, optional
        Payload with ``'elements'`` and ``'teams'`` lists. Defaults to the
        module-level ``data``. Passing it explicitly avoids depending on
        whichever cell last rebound ``data``.

    Returns
    -------
    str
        The team's ``name``, or "Team not found" when no matching player has
        a team id present in ``'teams'``.
    """
    source = data if bootstrap is None else bootstrap
    # Lower-case the query once instead of once per comparison.
    wanted = player_name.lower()

    for player in source['elements']:
        candidates = (
            f"{player['first_name']} {player['second_name']}",
            player['web_name'],
            player['first_name'],
            player['second_name'],
        )
        if any(candidate.lower() == wanted for candidate in candidates):
            team_name = next(
                (team['name'] for team in source['teams'] if team['id'] == player['team']),
                None,
            )
            # If the matched player's team id is unknown, keep scanning the
            # remaining players (same as the original nested loops).
            if team_name is not None:
                return team_name
    return "Team not found"

# Derive the two new columns from the player name.
players_df['Team Name'] = players_df['name'].apply(get_player_team)
players_df['Position'] = players_df['name'].apply(get_player_position)

# Reorder so that 'Team Name' and 'Position' sit directly after 'name'.
name_idx = list(players_df.columns).index('name')
# Drop the two columns from the list first so they are not listed twice.
cols = [col for col in players_df.columns if col not in ['Team Name', 'Position']]
new_cols = list(cols)
new_cols[name_idx+1:name_idx+1] = ['Team Name', 'Position']
players_df = players_df[new_cols]

print(players_df.head(15))

    Season                  name       Team Name            Position Minutes played Games played Goals Assists                  xG                  xA Shots Key Passes npg                npxG     Penalty Taker
0     2024         Mohamed Salah       Liverpool          Midfielder           3392           38    29      18   27.70626749098301  15.858334187418222   130         89  20  20.855747912079096  Player not found
1     2024        Alexander Isak       Newcastle             Forward           2822           34    23       6  22.356988068670034    5.44870379474014    99         42  19  19.312312599271536  Player not found
2     2024        Erling Haaland        Man City             Forward           2749           31    22       3   23.95459282770753  3.5812273556366563   109         29  19  20.909917432814837  Player not found
3     2024            Chris Wood   Nott'm Forest             Forward           3024           36    20       3  15.638655036687851   3.044111367315054    68    

Team attacking style

Penalty taker? (Yes/No)

In [61]:
# Define a helper that reports whether a player is a designated penalty taker, based on the FPL API's penalties_order field.
import requests

# Fetch the payload once up front so the per-player lookup below does not
# call the API for every row.
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
# A timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, timeout=30)
# raise_for_status() raises requests.HTTPError (a subclass of Exception) on a
# 4xx/5xx response; its message includes the status code and URL, unlike the
# previous generic Exception.
response.raise_for_status()
data = response.json()

def is_penalty_taker(player_name, bootstrap=None):
    """Report whether the first player matching ``player_name`` takes penalties.

    A player matches when ``player_name`` equals, case-insensitively, the
    player's "first_name second_name", ``web_name``, ``first_name`` or
    ``second_name``. Only the first match is considered.

    Parameters
    ----------
    player_name : str
        Name to look up.
    bootstrap : dict, optional
        Payload with an ``'elements'`` list. Defaults to the module-level
        ``data``. Passing it explicitly avoids depending on whichever cell
        last rebound ``data``.

    Returns
    -------
    str
        "Yes" when the matched player's ``penalties_order`` is a positive
        number, "No" when it is missing/None/0, and "Player not found" when
        no player matches.
    """
    source = data if bootstrap is None else bootstrap
    # Lower-case the query once instead of once per comparison.
    wanted = player_name.lower()

    for player in source['elements']:
        candidates = (
            f"{player['first_name']} {player['second_name']}",
            player['web_name'],
            player['first_name'],
            player['second_name'],
        )
        if any(candidate.lower() == wanted for candidate in candidates):
            order = player['penalties_order']
            # `order` may be None, hence the truthiness check before comparing.
            return "Yes" if order and order > 0 else "No"
    return "Player not found"
# Flag each player as penalty taker ("Yes" / "No" / "Player not found").
players_df['Penalty Taker'] = players_df['name'].apply(is_penalty_taker)

# Reorder so that 'Penalty Taker' sits directly after 'npxG'.
npxg = list(players_df.columns).index('npxG')
cols = [col for col in players_df.columns if col != 'Penalty Taker']
new_cols = list(cols)
new_cols.insert(npxg+1, 'Penalty Taker')

players_df = players_df[new_cols]
print(players_df.head(10))

   Season                  name       Team Name            Position Minutes played Games played Goals Assists                  xG                  xA Shots Key Passes npg                npxG     Penalty Taker
0    2024         Mohamed Salah       Liverpool          Midfielder           3392           38    29      18   27.70626749098301  15.858334187418222   130         89  20  20.855747912079096               Yes
1    2024        Alexander Isak       Newcastle             Forward           2822           34    23       6  22.356988068670034    5.44870379474014    99         42  19  19.312312599271536               Yes
2    2024        Erling Haaland        Man City             Forward           2749           31    22       3   23.95459282770753  3.5812273556366563   109         29  19  20.909917432814837               Yes
3    2024            Chris Wood   Nott'm Forest             Forward           3024           36    20       3  15.638655036687851   3.044111367315054    68         

#3 Data Cleaning & Preprocessing

#4 Build the Prediction Models

#5 Evaluate Models

#6 Simulate Next Season

#7 Visualizations