#1 Setup and imports

In [1]:
import sys
!{sys.executable} -m pip install pandas numpy matplotlib seaborn plotly scikit-learn >nul 2>&1
print("Install done")

Install done


In [2]:
#Core libraries
import pandas as pd
import numpy as np
#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, classification_report, accuracy_score, confusion_matrix
#Data preprocessing libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

#2 Data Collection

- *Clubs Data :*


Points, wins, draws, losses, goals for/against, xG/xGA

In [3]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
seasons = [2025,2024,2023,2022,2021]
top5_standings = []

for season in seasons:
    url = f"https://fbref.com/en/comps/9/{season-1}-{season}/{season-1}-{season}-Premier-League-Stats"
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in df.columns.values]

    #Select the first 5 teams (top 5 teams)
    top5 = df.head(5)
    top5.loc[:, 'Season'] = f"{season-1}/{season}"
    top5_standings.append(top5)

standings_df = pd.concat(top5_standings, ignore_index=True)
print(standings_df[['Season', 'Squad', 'Pts', 'W', 'D', 'L', 'GF', 'GA', 'GD','Rk']])

HTTPError: HTTP Error 403: Forbidden

Squad market value

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# we will try to use requests with headers to avoid the automated blocking by the website
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2025"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    cells = row.find_all("td")
    if len(cells) >= 7:
        club = cells[1].get_text(strip=True)
        avg_value = cells[5].get_text(strip=True)
        total_value = cells[6].get_text(strip=True)
        data.append([club, avg_value, total_value])

sm_df = pd.DataFrame(data, columns=["Club", "Average Market Value", "Total Market Value"])
print(sm_df)


Manager changes

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.transfermarkt.com/premier-league/trainerwechsel/wettbewerb/GB1/plus/?saison_id=2024"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    # We only want rows with at least 7 cells
    cells = row.find_all("td")
    if len(cells) < 7:
        continue

    # We extract the club name from the <a> tag in the first cell
    # it's the title attribute of the <a> tag
    club_tag = cells[0].find("a")
    club = club_tag.get("title", club_tag.text.strip()) if club_tag else cells[0].get_text(strip=True)

    #The new manager located in the 11th cell
    #stripping the text to remove any extra spaces
    new_manager = cells[10].get_text(strip=True)

    data.append([club, new_manager])

managers_df = pd.DataFrame(data, columns=["Club", "New Manager"]).head(4)
print(managers_df)

Managers Names


In [None]:
# !pip install html5lib
import requests
import warnings
import pandas as pd

warnings.filterwarnings("ignore")
managers_df = []
seasons = [2025, 2024, 2023, 2022, 2021]

for season in seasons:
    url = f"https://www.transfermarkt.com/premier-league/trainer/pokalwettbewerb/GB1/plus/0?saison_id={season}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve data for season {season}")
    tables = pd.read_html(response.text)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", class_="items")
    rows = table.find_all("tr", class_=["odd", "even"])

    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 4:
            continue
        club_img = cells[3].find("img")
        club_name = club_img['alt'] if club_img else cells[2].get_text(strip=True)
        manager_name = cells[1].get_text(strip=True)
        managers_df.append({
            "Season": f"{season-1}/{season}",
            "Club": club_name,
            "Manager Name": manager_name,
        })
managers_df = pd.DataFrame(managers_df)
print(managers_df.tail(20))



Goals, xG, shots, games played

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
seasons = [2025, 2024, 2023, 2022, 2021]
teams_df = []
for season in seasons:
    url = f"https://fbref.com/en/comps/9/{season-1}-{season}/{season-1}-{season}-Premier-League-Stats"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {season-1}/{season}")

    tables = pd.read_html(response.text)
    standings = tables[0]
    standings.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in standings.columns.values]
    for _,row in standings.iterrows():
        if row['Squad'].endswith(" Utd"):
            row['Squad'] = row['Squad'].replace(" Utd", " United")
        elif row['Squad'].startswith("Nott'ham"):
            row['Squad'] = row['Squad'].replace("Nott'ham", "Nottingham")
        elif row['Squad'].startswith("Brighton"):
            row['Squad'] = row['Squad'].replace("Brighton", "Brighton & Hove Albion")
        elif row['Squad'].startswith("Wolves"):
            row['Squad'] = row['Squad'].replace("Wolves", "Wolverhampton Wanderers")


        club_mask = (sm_df['Club'] == row['Squad']) | (sm_df['Club'] == row['Squad'] + " FC") | (sm_df['Club'] == "AFC "+row['Squad']) | (sm_df['Club'] == row['Squad']+" AFC") | (sm_df['Club'] == row['Squad']+" United") | (sm_df['Club'] == row['Squad']+" Hotspur")
        matched = sm_df.loc[club_mask, 'Total Market Value']
        squad_value = matched.values[0] if not matched.empty else None
        manager_row = managers_df[
                    (managers_df['Club'] == row['Squad']) |
                    (managers_df['Club'] == row['Squad'] + " FC") |
                    (managers_df['Club'] == "AFC "+row['Squad']) |
                    (managers_df['Club'] == row['Squad'] + " United") |
                    (managers_df['Club'] == row['Squad'] + " City") |
                    (managers_df['Club'] == row['Squad'] + " Hotspur") |
                    (managers_df['Club'].str.replace(" FC", "", regex=False) == row['Squad']) |
                    (managers_df['Club'].str.replace(" AFC", "", regex=False) == row['Squad']) |
                    (managers_df['Club'].str.replace(" United", "", regex=False) == row['Squad']) |
                    (managers_df['Club'].str.replace(" City", "", regex=False) == row['Squad']) |
                    (managers_df['Club'].str.replace(" Hotspur", "", regex=False) == row['Squad']) &
                    (managers_df['Season'] == f"{season-1}/{season}")]
        manager_name = manager_row['Manager Name'].values[0] if not manager_row.empty else None
        team_dict = {
            "Season": f"{season-1}/{season}",
            "Ranking": row['Rk'],
            "Team name": row['Squad'],
            "Games Played": row['MP'],
            "Wins": row['W'],
            "Draws": row['D'],
            "Losses": row['L'],
            "Goals For": row['GF'],
            "Goals Against": row['GA'],
            "Goals Difference": row['GD'],
            "Points": row['Pts'],
            "Points Per Game": row['Pts/MP'],
            "xG": row['xG'],
            "xGa": row['xGA'],
            "Squad Value": squad_value,
            "Manager name": manager_name,
            "Top 6 Label": "Yes" if row['Rk'] <= 6 else "No",
            "Bottom 3 Label": "Yes" if row['Rk'] >= 18 else "No",
        }
        teams_df.append(team_dict)
teams_df = pd.DataFrame(teams_df)
print(teams_df.head(20))


- *Player Data :*

Goals, xG, shots, minutes played

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json

seasons = [2024, 2023, 2022, 2021, 2020]
players_data = []
for season in seasons:
    url = f"https://understat.com/league/EPL/{season}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    scripts = soup.find_all('script')

    try:
        strings = scripts[3].string

        ind_start = strings.index("('")+2
        ind_end = strings.index("');")
        json_data = strings[ind_start:ind_end]
        json_data = json_data.encode('utf-8').decode('unicode_escape')
        data = json.loads(json_data)
        # print(json.dumps(data, indent=2))


        for player in data[:50]: # Limit to first 50 players for brevity
            player_info = {
                "Season": f"{season}/{season+1}",
                "name" : player['player_name'],
                "Team name": player['team_title'],
                "Minutes played": player['time'],
                "Games played": player['games'],
                "Goals": player['goals'],
                "Assists": player['assists'],
                "xG": player['xG'],
                "xA": player['xA'],
                "Shots": player['shots'],
                "Key Passes": player['key_passes'],
                "npg": player['npg'],
                "npxG": player['npxG'],
            }
            players_data.append(player_info)
    except Exception as e:
        print(f"Error processing season {season}: {e}")

players_df = pd.DataFrame(players_data)
print(players_df)

Player Position and Team

In [None]:
import requests
import pandas as pd
#Channge the pandas display options to show all columns and increase width
pd.set_option('display.max_columns', None)      # Show all columns
pd.set_option('display.width', 2000)            # Increase display width
pd.set_option('display.max_colwidth', None)     # Show full column content

url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
data = response.json()
positions = {pos['id']: pos['singular_name'] for pos in data['element_types']}
name_to_position = {}
for player in data['elements']:
    full_name = f"{player['first_name']} {player['second_name']}".lower()
    name_to_position[full_name] = positions.get(player['element_type'], "Position not found")

def get_player_position(player_name):
    return name_to_position.get(player_name.lower(), "Position not found")

def get_player_team(player_name):
    for player in data['elements']:
        full_name = f"{player['first_name']} {player['second_name']}".lower()
        if (full_name.lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() or
            player['first_name'].lower() == player_name.lower() or
            player['second_name'].lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() ):
            for team in data['teams']:
                if team['id'] == player['team']:
                    return team['name']
    return "Team not found"

# Add team name and position just after the player name column

players_df['Position'] = players_df['name'].apply(get_player_position)

# Move 'Team Name' and 'Position' right after 'name'
cols = list(players_df.columns)
name_idx = cols.index('name')
# Remove if already present to avoid duplicates
cols = [c for c in cols if c not in ['Position']]
new_cols = cols[:name_idx+1] + ['Position'] + cols[name_idx+1:]
players_df = players_df[new_cols]

print(players_df.head(15))

Penalty taker? (Yes/No)

In [None]:
# We will simply define a function to check either a player is a penalty taker or not and what is his order if sp
import requests

# we fetch the data once for efficiency
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
if response.status_code != 200:
    raise Exception("Failed to fetch data from the API")
data = response.json()

def is_penalty_taker(player_name):
    for player in data['elements']:
        full_name = f"{player['first_name']} {player['second_name']}"
        if (full_name.lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() or
            player['first_name'].lower() == player_name.lower() or
            player['second_name'].lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower()):
            order = player['penalties_order']
            if order and order > 0:
                return "Yes"
            else:
                return "No"
    return "Player not found"
players_df['Penalty Taker'] = players_df['name'].apply(is_penalty_taker)

cols = list(players_df.columns)
npxg = cols.index('npxG')
cols = [c for c in cols if c != 'Penalty Taker']
new_cols = cols[:npxg+1] + ['Penalty Taker'] + cols[npxg+1:]

players_df = players_df[new_cols]
print(players_df.head(10))

#3 Data Cleaning & Preprocessing

- Player Data

In [None]:
#lets make sure that the numeric columns are in the right format
numeric_cols = ['Minutes played', 'Games played', 'Goals', 'Assists', 'xG', 'xA', 'Shots', 'Key Passes', 'npg', 'npxG']
for col in numeric_cols:
    players_df[col] = pd.to_numeric(players_df[col], errors='coerce')

# Convert categorical columns to string type
categorical_cols = ['name', 'Team name', 'Position', 'Penalty Taker']
for col in categorical_cols:
    players_df[col] = players_df[col].astype(str)
    players_df[col] = players_df[col].str.strip()

#lets convert the 'Season' column to a string type or a consistent datetime format
players_df['Season'] = players_df['Season'].astype(str)

#lets check for any missing values
print(players_df.isnull().sum())

# lets check the data types of the columns
print(players_df.dtypes)

Filter and Clean Player Data

In [None]:
#lets keep only attacking players
attacking_positions = ['Forward', 'Midfielder']
players_df = players_df[players_df['Position'].isin(attacking_positions)]
#lets keep only players with more than 1000 minutes played
players_df = players_df[players_df['Minutes played'] > 1000]
#Lets drop or fill any missing values
players_df = players_df.dropna(subset=['name', 'Team name', 'Position', 'Penalty Taker'])
# Switch Penalty Taker to a binary column
players_df['Penalty Taker'] = players_df['Penalty Taker'].apply(lambda x: 1 if x == 'Yes' else 0)
#Lets add derived columns for goals per game and assists per game
players_df['Goals per Game'] = players_df['Goals'] / players_df['Games played']
players_df['Assists per Game'] = players_df['Assists'] / players_df['Games played']
#Lets add a column for total contributions (goals + assists)
players_df['Total Contributions'] = players_df['Goals'] + players_df['Assists']
#Lets add a column for xG per game
players_df['xG per Game'] = players_df['xG'] / players_df['Games played']
#Lets add a column for xA per game
players_df['xA per Game'] = players_df['xA'] / players_df['Games played']
#Lets add a column for npg per game
players_df['npg per Game'] = players_df['npg'] / players_df['Games played']
#Lets add a column for npxG per game
players_df['npxG per Game'] = players_df['npxG'] / players_df['Games played']
print(players_df.tail(20))

- Teams Data

In [None]:
#lets make sure that the numeric columns are in the right format
numeric_cols = ['Ranking', 'Games Played', 'Wins', 'Draws', 'Losses', 'Goals For', 'Goals Against', 'Goals Difference', 'Points', 'Points Per Game', 'xG', 'xGa', 'Squad Value']
for col in numeric_cols:
    teams_df[col] = pd.to_numeric(teams_df[col], errors='coerce')

# Convert categorical columns to string type
categorical_cols = ['Team name', 'Manager name']
for col in categorical_cols:
    teams_df[col] = teams_df[col].astype(str)
    teams_df[col] = teams_df[col].str.strip()
#lets convert the 'Season' column to a string type or a consistent datetime format
teams_df['Season'] = teams_df['Season'].astype(str)
#lets check for any missing values
print(teams_df.isnull().sum())
# lets check the data types of the columns
print(teams_df.dtypes)


Filter and Clean Teams Data

In [None]:
#Lets switch the Top 6 Label and Bottom 3 Label to a binary column
teams_df['Top 6 Label'] = teams_df['Top 6 Label'].apply(lambda x: 1 if x == 'Yes' else 0)
teams_df['Bottom 3 Label'] = teams_df['Bottom 3 Label'].apply(lambda x: 1 if x == 'Yes' else 0)

#Lets add derived columns for goals per game and points per game
teams_df['Goals per Game'] = teams_df['Goals For'] / teams_df['Games Played']
teams_df['Points per Game'] = teams_df['Points'] / teams_df['Games Played']

#xG difference
teams_df['xG Difference'] = teams_df['xG'] - teams_df['xGa']
#Win Ratio
teams_df['Win Ratio'] = teams_df['Wins'] / teams_df['Games Played']
#Goal Efficiency
teams_df['Goal Efficiency'] = teams_df['Goals For'] / teams_df['xG']
#Defensive Leakiness
teams_df['Defensive Leakiness'] = teams_df['Goals Against'] / teams_df['xGa']

print(teams_df.tail(20))

Encode categorical columns & Merge Team Data into Player Data

In [None]:
#Lets encode categorical columns of players_df and teams_df and then merge them
# Encode categorical columns
players_categorical_cols = ['name', 'Team name', 'Position']
teams_categorical_cols = ['Team name', 'Manager name']
# Create a LabelEncoder instance
label_encoder = LabelEncoder()
# Encode players_df categorical columns
for col in players_categorical_cols:
    players_df[col] = label_encoder.fit_transform(players_df[col])
# Encode teams_df categorical columns
for col in teams_categorical_cols:
    teams_df[col] = label_encoder.fit_transform(teams_df[col])
# Now we will merge the players_df and teams_df on the 'Team name' and 'season' columns
merged_df = pd.merge(players_df, teams_df, left_on=['Team name', 'Season'], right_on=['Team name', 'Season'], suffixes=('_player', '_team'))

# Display the merged DataFrame
print(merged_df.head(20))


#4 Build the Prediction Models

In [None]:
# First thing, we will train a classifier (as option A) to predict whether a team will finish in the top 6 or not (binary classification).
# We will use the Random Forest Classifier for this task
# Let's split the data into features and target variable
X = merged_df.drop(columns=['Top 6 Label', 'Bottom 3 Label', 'Team name', 'Season', 'Manager name'])
y = merged_df['Top 6 Label']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Training the model

In [None]:
# Let's initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)
# let's link the predictions to the team names (decoding the labels)
predicted_teams = label_encoder.inverse_transform(y_pred)
# Create a DataFrame to hold the predictions
predictions_df = pd.DataFrame({
    'Team': predicted_teams,
    'Predicted Top 6': y_pred
})
# Display the predictions
print(predictions_df)



#5 Evaluate Models

In [None]:
#Let's evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
# We can also visualize the feature importances
feature_importances = rf_classifier.feature_importances_
feature_names = X.columns
# Create a DataFrame for feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
# Plot the feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

#6 Simulate Next Season

#7 Visualizations