#1 Setup and imports

In [29]:
import sys
!{sys.executable} -m pip install pandas numpy matplotlib seaborn plotly scikit-learn >nul 2>&1
!pip install selenium webdriver-manager
print("Install done")

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.3.2-py3-none-any.whl.metadata (5.2 kB)
Downloading selenium-4.38.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading trio-0.32.0-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━

In [30]:
#Core libraries
import pandas as pd
import numpy as np
#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, classification_report, accuracy_score, confusion_matrix
#Data preprocessing libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

#2 Data Collection

- *Clubs Data :*


Points, wins, draws, losses, goals for/against, xG/xGA

In [34]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import time
import warnings

warnings.filterwarnings("ignore")

seasons = [2024, 2023, 2022, 2021, 2020]
teams_data = []

for season in seasons:
    url = f"https://understat.com/league/EPL/{season}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')
        scripts = soup.find_all('script')

        # Find the script containing teams data (usually in a different script tag than players)
        for script in scripts:
            if script.string and 'teamsData' in script.string:
                strings = script.string

                # Extract teams data
                ind_start = strings.index("JSON.parse('") + len("JSON.parse('")
                ind_end = strings.index("')", ind_start)
                json_data = strings[ind_start:ind_end]
                json_data = json_data.encode('utf-8').decode('unicode_escape')
                teams = json.loads(json_data)

                # Process each team
                for team_id, team_data in teams.items():
                    team_info = {
                        "Season": f"{season}/{season+1}",
                        "Team name": team_data['title'],
                        "Games Played": len(team_data['history']),
                        "Points": sum([match['pts'] for match in team_data['history']]),
                        "Wins": sum([1 for match in team_data['history'] if match['result'] == 'w']),
                        "Draws": sum([1 for match in team_data['history'] if match['result'] == 'd']),
                        "Losses": sum([1 for match in team_data['history'] if match['result'] == 'l']),
                        "Goals For": sum([match['scored'] for match in team_data['history']]),
                        "Goals Against": sum([match['missed'] for match in team_data['history']]),
                        "xG": sum([float(match['xG']) for match in team_data['history']]),
                        "xGA": sum([float(match['xGA']) for match in team_data['history']]),
                    }
                    teams_data.append(team_info)

                print(f"✓ Successfully scraped {season}/{season+1} - {len(teams)} teams")
                break

        time.sleep(2)  # Be respectful with delays

    except Exception as e:
        print(f"✗ Error processing season {season}: {e}")

teams_df = pd.DataFrame(teams_data)

# Calculate additional metrics
teams_df['Goals Difference'] = teams_df['Goals For'] - teams_df['Goals Against']
teams_df['xG Difference'] = teams_df['xG'] - teams_df['xGA']
teams_df['Points Per Game'] = teams_df['Points'] / teams_df['Games Played']

# Sort by season and points
teams_df = teams_df.sort_values(['Season', 'Points'], ascending=[True, False])

print(f"\nTotal teams scraped: {len(teams_df)}")
print("\n" + "="*80)
print(teams_df[['Season', 'Team name', 'Games Played', 'Points', 'Wins', 'Draws', 'Losses',
                'Goals For', 'Goals Against', 'Goals Difference', 'xG', 'xGA']].head(25))

✓ Successfully scraped 2024/2025 - 20 teams
✓ Successfully scraped 2023/2024 - 20 teams
✓ Successfully scraped 2022/2023 - 20 teams
✓ Successfully scraped 2021/2022 - 20 teams
✓ Successfully scraped 2020/2021 - 20 teams

Total teams scraped: 100

       Season                Team name  Games Played  Points  Wins  Draws  \
92  2020/2021          Manchester City            38      86    27      5   
93  2020/2021        Manchester United            38      74    21     11   
91  2020/2021                Liverpool            38      69    20      9   
86  2020/2021                  Chelsea            38      67    19     10   
83  2020/2021                Leicester            38      66    20      6   
87  2020/2021                 West Ham            38      65    19      8   
88  2020/2021                Tottenham            38      62    18      8   
89  2020/2021                  Arsenal            38      61    18      7   
81  2020/2021                  Everton            38      59

Squad market value

In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# we will try to use requests with headers to avoid the automated blocking by the website
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2025"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    cells = row.find_all("td")
    if len(cells) >= 7:
        club = cells[1].get_text(strip=True)
        avg_value = cells[5].get_text(strip=True)
        total_value = cells[6].get_text(strip=True)
        data.append([club, avg_value, total_value])

sm_df = pd.DataFrame(data, columns=["Club", "Average Market Value", "Total Market Value"])
print(sm_df)


                       Club Average Market Value Total Market Value
0                Arsenal FC              €52.56m            €1.31bn
1           Manchester City              €46.64m            €1.21bn
2              Liverpool FC              €44.39m            €1.15bn
3                Chelsea FC              €36.66m            €1.14bn
4         Tottenham Hotspur              €31.74m           €920.60m
5          Newcastle United              €25.93m           €751.83m
6         Manchester United              €28.08m           €730.20m
7         Nottingham Forest              €23.03m           €644.70m
8               Aston Villa              €23.74m           €546.00m
9    Brighton & Hove Albion              €20.02m           €520.60m
10           Crystal Palace              €16.15m           €484.53m
11          AFC Bournemouth              €17.92m           €448.10m
12             Brentford FC              €14.03m           €434.90m
13               Everton FC              €17.28m

Manager changes

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.transfermarkt.com/premier-league/trainerwechsel/wettbewerb/GB1/plus/?saison_id=2024"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table", class_="items")
rows = table.find_all("tr", class_=["odd", "even"])

data = []
for row in rows:
    # We only want rows with at least 7 cells
    cells = row.find_all("td")
    if len(cells) < 7:
        continue

    # We extract the club name from the <a> tag in the first cell
    # it's the title attribute of the <a> tag
    club_tag = cells[0].find("a")
    club = club_tag.get("title", club_tag.text.strip()) if club_tag else cells[0].get_text(strip=True)

    #The new manager located in the 11th cell
    #stripping the text to remove any extra spaces
    new_manager = cells[10].get_text(strip=True)

    data.append([club, new_manager])

managers_df = pd.DataFrame(data, columns=["Club", "New Manager"]).head(4)
print(managers_df)

                Club      New Manager
0       Brentford FC    Keith Andrews
1     Leicester City  Martí Cifuentes
2  Tottenham Hotspur     Thomas Frank
3     Southampton FC       Will Still


Managers Names


In [17]:
# !pip install html5lib
import requests
import warnings
import pandas as pd

warnings.filterwarnings("ignore")
managers_df = []
seasons = [2025, 2024, 2023, 2022, 2021]

for season in seasons:
    url = f"https://www.transfermarkt.com/premier-league/trainer/pokalwettbewerb/GB1/plus/0?saison_id={season}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve data for season {season}")
    tables = pd.read_html(response.text)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", class_="items")
    rows = table.find_all("tr", class_=["odd", "even"])

    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 4:
            continue
        club_img = cells[3].find("img")
        club_name = club_img['alt'] if club_img else cells[2].get_text(strip=True)
        manager_name = cells[1].get_text(strip=True)
        managers_df.append({
            "Season": f"{season-1}/{season}",
            "Club": club_name,
            "Manager Name": manager_name,
        })
managers_df = pd.DataFrame(managers_df)
print(managers_df.tail(20))



       Season                     Club         Manager Name
80  2020/2021          Manchester City        Pep Guardiola
81  2020/2021               Arsenal FC         Mikel Arteta
82  2020/2021         Newcastle United           Eddie Howe
83  2020/2021              Aston Villa           Unai Emery
84  2020/2021             Leeds United         Daniel Farke
85  2020/2021           Crystal Palace       Oliver Glasner
86  2020/2021               Chelsea FC         Enzo Maresca
87  2020/2021   Brighton & Hove Albion      Fabian Hürzeler
88  2020/2021             Liverpool FC            Arne Slot
89  2020/2021               Burnley FC         Scott Parker
90  2020/2021        Manchester United         Rúben Amorim
91  2020/2021               Everton FC          David Moyes
92  2020/2021        Tottenham Hotspur         Thomas Frank
93  2020/2021             Brentford FC        Keith Andrews
94  2020/2021           Leicester City      Martí Cifuentes
95  2020/2021          West Ham United  

Goals, xG, shots, games played

In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import warnings

warnings.filterwarnings("ignore")

seasons = [2024, 2023, 2022, 2021, 2020]  # Understat uses different year format
teams_df = []

for season in seasons:
    url = f"https://understat.com/league/EPL/{season}"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}/{season+1}")
            continue

        soup = BeautifulSoup(response.text, 'lxml')
        scripts = soup.find_all('script')

        # Find the script containing teams data
        teams_found = False
        for script in scripts:
            if script.string and 'teamsData' in script.string:
                strings = script.string

                # Extract teams data
                ind_start = strings.index("JSON.parse('") + len("JSON.parse('")
                ind_end = strings.index("')", ind_start)
                json_data = strings[ind_start:ind_end]
                json_data = json_data.encode('utf-8').decode('unicode_escape')
                teams_data = json.loads(json_data)

                # Process each team
                for team_id, team_info in teams_data.items():
                    # Calculate stats from match history
                    matches = team_info['history']

                    wins = sum([1 for m in matches if m['result'] == 'w'])
                    draws = sum([1 for m in matches if m['result'] == 'd'])
                    losses = sum([1 for m in matches if m['result'] == 'l'])
                    goals_for = sum([m['scored'] for m in matches])
                    goals_against = sum([m['missed'] for m in matches])
                    xg = sum([float(m['xG']) for m in matches])
                    xga = sum([float(m['xGA']) for m in matches])
                    points = sum([m['pts'] for m in matches])
                    games_played = len(matches)

                    # Match with squad value if available (from sm_df)
                    squad_value = None
                    if 'sm_df' in globals():
                        club_mask = (
                            (sm_df['Club'] == team_info['title']) |
                            (sm_df['Club'] == team_info['title'] + " FC") |
                            (sm_df['Club'] == "AFC " + team_info['title']) |
                            (sm_df['Club'] == team_info['title'] + " AFC")
                        )
                        matched = sm_df.loc[club_mask, 'Total Market Value']
                        squad_value = matched.values[0] if not matched.empty else None

                    # Match with manager if available (from managers_df)
                    manager_name = None
                    if 'managers_df' in globals():
                        manager_row = managers_df[
                            ((managers_df['Club'] == team_info['title']) |
                             (managers_df['Club'] == team_info['title'] + " FC") |
                             (managers_df['Club'] == "AFC " + team_info['title'])) &
                            (managers_df['Season'] == f"{season}/{season+1}")
                        ]
                        manager_name = manager_row['Manager Name'].values[0] if not manager_row.empty else None

                    team_dict = {
                        "Season": f"{season}/{season+1}",
                        "Team name": team_info['title'],
                        "Games Played": games_played,
                        "Wins": wins,
                        "Draws": draws,
                        "Losses": losses,
                        "Goals For": goals_for,
                        "Goals Against": goals_against,
                        "Goals Difference": goals_for - goals_against,
                        "Points": points,
                        "Points Per Game": round(points / games_played, 2) if games_played > 0 else 0,
                        "xG": round(xg, 1),
                        "xGa": round(xga, 1),
                        "xG Difference": round(xg - xga, 1),
                        "Squad Value": squad_value,
                        "Manager name": manager_name,
                    }
                    teams_df.append(team_dict)

                teams_found = True
                print(f"✓ Successfully scraped {season}/{season+1} - {len(teams_data)} teams")
                break

        if not teams_found:
            print(f"✗ Could not find teams data for {season}/{season+1}")

        time.sleep(2)  # Delay between requests

    except Exception as e:
        print(f"✗ Error processing season {season}/{season+1}: {e}")

# Convert to DataFrame
teams_df = pd.DataFrame(teams_df)

# Sort by season and points to get rankings
teams_df = teams_df.sort_values(['Season', 'Points'], ascending=[True, False])

# Add ranking within each season
teams_df['Ranking'] = teams_df.groupby('Season')['Points'].rank(ascending=False, method='min').astype(int)

# Add Top 6 and Bottom 3 labels
teams_df['Top 6 Label'] = teams_df['Ranking'].apply(lambda x: "Yes" if x <= 6 else "No")
teams_df['Bottom 3 Label'] = teams_df['Ranking'].apply(lambda x: "Yes" if x >= 18 else "No")

# Reorder columns
columns_order = ['Season', 'Ranking', 'Team name', 'Games Played', 'Wins', 'Draws', 'Losses',
                 'Goals For', 'Goals Against', 'Goals Difference', 'Points', 'Points Per Game',
                 'xG', 'xGa', 'xG Difference', 'Squad Value', 'Manager name', 'Top 6 Label', 'Bottom 3 Label']
teams_df = teams_df[columns_order]

print(f"\n{'='*80}")
print(f"Total teams scraped: {len(teams_df)}")
print(f"{'='*80}\n")
print(teams_df.head(20))

✓ Successfully scraped 2024/2025 - 20 teams
✓ Successfully scraped 2023/2024 - 20 teams
✓ Successfully scraped 2022/2023 - 20 teams
✓ Successfully scraped 2021/2022 - 20 teams
✓ Successfully scraped 2020/2021 - 20 teams

Total teams scraped: 100

       Season  Ranking                Team name  Games Played  Wins  Draws  \
92  2020/2021        1          Manchester City            38    27      5   
93  2020/2021        2        Manchester United            38    21     11   
91  2020/2021        3                Liverpool            38    20      9   
86  2020/2021        4                  Chelsea            38    19     10   
83  2020/2021        5                Leicester            38    20      6   
87  2020/2021        6                 West Ham            38    19      8   
88  2020/2021        7                Tottenham            38    18      8   
89  2020/2021        8                  Arsenal            38    18      7   
81  2020/2021        9                  Everton    

- *Player Data :*

Goals, xG, shots, minutes played

In [37]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json

seasons = [2024, 2023, 2022, 2021, 2020]
players_data = []
for season in seasons:
    url = f"https://understat.com/league/EPL/{season}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    scripts = soup.find_all('script')

    try:
        strings = scripts[3].string

        ind_start = strings.index("('")+2
        ind_end = strings.index("');")
        json_data = strings[ind_start:ind_end]
        json_data = json_data.encode('utf-8').decode('unicode_escape')
        data = json.loads(json_data)
        # print(json.dumps(data, indent=2))


        for player in data[:50]: # Limit to first 50 players for brevity
            player_info = {
                "Season": f"{season}/{season+1}",
                "name" : player['player_name'],
                "Team name": player['team_title'],
                "Minutes played": player['time'],
                "Games played": player['games'],
                "Goals": player['goals'],
                "Assists": player['assists'],
                "xG": player['xG'],
                "xA": player['xA'],
                "Shots": player['shots'],
                "Key Passes": player['key_passes'],
                "npg": player['npg'],
                "npxG": player['npxG'],
            }
            players_data.append(player_info)
    except Exception as e:
        print(f"Error processing season {season}: {e}")

players_df = pd.DataFrame(players_data)
print(players_df)

        Season             name          Team name Minutes played  \
0    2024/2025    Mohamed Salah          Liverpool           3392   
1    2024/2025   Alexander Isak   Newcastle United           2822   
2    2024/2025   Erling Haaland    Manchester City           2749   
3    2024/2025       Chris Wood  Nottingham Forest           3024   
4    2024/2025     Bryan Mbeumo          Brentford           3419   
..         ...              ...                ...            ...   
245  2020/2021          Rodrigo              Leeds           1285   
246  2020/2021      Richarlison            Everton           2883   
247  2020/2021    Ferrán Torres    Manchester City           1305   
248  2020/2021  Mason Greenwood  Manchester United           1825   
249  2020/2021      Timo Werner            Chelsea           2605   

    Games played Goals Assists                  xG                  xA Shots  \
0             38    29      18   27.70626749098301  15.858334187418222   130   
1          

Player Position and Team

In [38]:
import requests
import pandas as pd
#Channge the pandas display options to show all columns and increase width
pd.set_option('display.max_columns', None)      # Show all columns
pd.set_option('display.width', 2000)            # Increase display width
pd.set_option('display.max_colwidth', None)     # Show full column content

url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
data = response.json()
positions = {pos['id']: pos['singular_name'] for pos in data['element_types']}
name_to_position = {}
for player in data['elements']:
    full_name = f"{player['first_name']} {player['second_name']}".lower()
    name_to_position[full_name] = positions.get(player['element_type'], "Position not found")

def get_player_position(player_name):
    return name_to_position.get(player_name.lower(), "Position not found")

def get_player_team(player_name):
    for player in data['elements']:
        full_name = f"{player['first_name']} {player['second_name']}".lower()
        if (full_name.lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() or
            player['first_name'].lower() == player_name.lower() or
            player['second_name'].lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() ):
            for team in data['teams']:
                if team['id'] == player['team']:
                    return team['name']
    return "Team not found"

# Add team name and position just after the player name column

players_df['Position'] = players_df['name'].apply(get_player_position)

# Move 'Team Name' and 'Position' right after 'name'
cols = list(players_df.columns)
name_idx = cols.index('name')
# Remove if already present to avoid duplicates
cols = [c for c in cols if c not in ['Position']]
new_cols = cols[:name_idx+1] + ['Position'] + cols[name_idx+1:]
players_df = players_df[new_cols]

print(players_df.head(15))

       Season                  name            Position                Team name Minutes played Games played Goals Assists                  xG                  xA Shots Key Passes npg                npxG
0   2024/2025         Mohamed Salah          Midfielder                Liverpool           3392           38    29      18   27.70626749098301  15.858334187418222   130         89  20  20.855747912079096
1   2024/2025        Alexander Isak             Forward         Newcastle United           2822           34    23       6  22.356988068670034    5.44870379474014    99         42  19  19.312312599271536
2   2024/2025        Erling Haaland             Forward          Manchester City           2749           31    22       3   23.95459282770753  3.5812273556366563   109         29  19  20.909917432814837
3   2024/2025            Chris Wood             Forward        Nottingham Forest           3024           36    20       3  15.638655036687851   3.044111367315054    68         22  17 

Penalty taker? (Yes/No)

In [39]:
# We will simply define a function to check either a player is a penalty taker or not and what is his order if sp
import requests

# we fetch the data once for efficiency
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
if response.status_code != 200:
    raise Exception("Failed to fetch data from the API")
data = response.json()

def is_penalty_taker(player_name):
    for player in data['elements']:
        full_name = f"{player['first_name']} {player['second_name']}"
        if (full_name.lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower() or
            player['first_name'].lower() == player_name.lower() or
            player['second_name'].lower() == player_name.lower() or
            player['web_name'].lower() == player_name.lower()):
            order = player['penalties_order']
            if order and order > 0:
                return "Yes"
            else:
                return "No"
    return "Player not found"
players_df['Penalty Taker'] = players_df['name'].apply(is_penalty_taker)

cols = list(players_df.columns)
npxg = cols.index('npxG')
cols = [c for c in cols if c != 'Penalty Taker']
new_cols = cols[:npxg+1] + ['Penalty Taker'] + cols[npxg+1:]

players_df = players_df[new_cols]
print(players_df.head(10))

      Season                  name            Position                Team name Minutes played Games played Goals Assists                  xG                  xA Shots Key Passes npg                npxG     Penalty Taker
0  2024/2025         Mohamed Salah          Midfielder                Liverpool           3392           38    29      18   27.70626749098301  15.858334187418222   130         89  20  20.855747912079096               Yes
1  2024/2025        Alexander Isak             Forward         Newcastle United           2822           34    23       6  22.356988068670034    5.44870379474014    99         42  19  19.312312599271536               Yes
2  2024/2025        Erling Haaland             Forward          Manchester City           2749           31    22       3   23.95459282770753  3.5812273556366563   109         29  19  20.909917432814837               Yes
3  2024/2025            Chris Wood             Forward        Nottingham Forest           3024           36    20   

#3 Data Cleaning & Preprocessing

- Player Data

In [41]:
# Let's make sure that the numeric columns are in the right format
numeric_cols = ['Minutes played', 'Games played', 'Goals', 'Assists', 'xG', 'xA', 'Shots', 'Key Passes', 'npg', 'npxG']
for col in numeric_cols:
    players_df[col] = pd.to_numeric(players_df[col], errors='coerce')

# Convert categorical columns to string type
categorical_cols = ['name', 'Team name', 'Position', 'Penalty Taker']
for col in categorical_cols:
    players_df[col] = players_df[col].astype(str)
    players_df[col] = players_df[col].str.strip()

# Let's convert the 'Season' column to a string type
players_df['Season'] = players_df['Season'].astype(str)

# Check for any missing values
print("Missing values per column:")
print(players_df.isnull().sum())
print("\n" + "="*50)

# Check the data types of the columns
print("\nData types:")
print(players_df.dtypes)
print("\n" + "="*50)

# Additional checks:
print(f"\nTotal rows: {len(players_df)}")
print(f"Total columns: {len(players_df.columns)}")
print(f"\nUnique seasons: {players_df['Season'].unique()}")
print(f"Unique teams: {players_df['Team name'].nunique()}")
print(f"Unique positions: {players_df['Position'].unique()}")

# Check for any data quality issues
print("\n" + "="*50)
print("Data Quality Checks:")
print(f"Players with 0 minutes played: {(players_df['Minutes played'] == 0).sum()}")
print(f"Players with 0 games played: {(players_df['Games played'] == 0).sum()}")
print(f"Players with negative values in any numeric column: {(players_df[numeric_cols] < 0).any().any()}")

# Display sample of cleaned data
print("\n" + "="*50)
print("Sample of cleaned data:")
print(players_df.head(10))

Missing values per column:
Season            0
name              0
Position          0
Team name         0
Minutes played    0
Games played      0
Goals             0
Assists           0
xG                0
xA                0
Shots             0
Key Passes        0
npg               0
npxG              0
Penalty Taker     0
dtype: int64


Data types:
Season             object
name               object
Position           object
Team name          object
Minutes played      int64
Games played        int64
Goals               int64
Assists             int64
xG                float64
xA                float64
Shots               int64
Key Passes          int64
npg                 int64
npxG              float64
Penalty Taker      object
dtype: object


Total rows: 250
Total columns: 15

Unique seasons: ['2024/2025' '2023/2024' '2022/2023' '2021/2022' '2020/2021']
Unique teams: 31
Unique positions: ['Midfielder' 'Forward' 'Position not found']

Data Quality Checks:
Players with 0 minutes p

Filter and Clean Player Data

In [42]:
# Let's keep only attacking players
attacking_positions = ['Forward', 'Midfielder']
players_df = players_df[players_df['Position'].isin(attacking_positions)]
print(f"After filtering positions: {len(players_df)} players")

# Let's keep only players with more than 1000 minutes played
players_df = players_df[players_df['Minutes played'] > 1000]
print(f"After filtering minutes (>1000): {len(players_df)} players")

# Let's drop or fill any missing values
players_df = players_df.dropna(subset=['name', 'Team name', 'Position', 'Penalty Taker'])
print(f"After dropping missing values: {len(players_df)} players")

# Switch Penalty Taker to a binary column
# Handle different possible values (Yes, No, Player not found, etc.)
players_df['Penalty Taker'] = players_df['Penalty Taker'].apply(
    lambda x: 1 if str(x).lower() in ['yes', '1', 'true'] else 0
)

# IMPORTANT: Avoid division by zero
# Let's add derived columns for goals per game and assists per game
players_df['Goals per Game'] = players_df.apply(
    lambda row: row['Goals'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)
players_df['Assists per Game'] = players_df.apply(
    lambda row: row['Assists'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Let's add a column for total contributions (goals + assists)
players_df['Total Contributions'] = players_df['Goals'] + players_df['Assists']

# Let's add a column for xG per game
players_df['xG per Game'] = players_df.apply(
    lambda row: row['xG'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Let's add a column for xA per game
players_df['xA per Game'] = players_df.apply(
    lambda row: row['xA'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Let's add a column for npg per game
players_df['npg per Game'] = players_df.apply(
    lambda row: row['npg'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Let's add a column for npxG per game
players_df['npxG per Game'] = players_df.apply(
    lambda row: row['npxG'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Additional useful metrics
players_df['Shots per Game'] = players_df.apply(
    lambda row: row['Shots'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)
players_df['Key Passes per Game'] = players_df.apply(
    lambda row: row['Key Passes'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)
players_df['Minutes per Game'] = players_df.apply(
    lambda row: row['Minutes played'] / row['Games played'] if row['Games played'] > 0 else 0, axis=1
)

# Shot conversion rate (Goals / Shots)
players_df['Shot Conversion'] = players_df.apply(
    lambda row: row['Goals'] / row['Shots'] if row['Shots'] > 0 else 0, axis=1
)

# xG overperformance (Goals - xG)
players_df['xG Overperformance'] = players_df['Goals'] - players_df['xG']

print("\n" + "="*80)
print("Feature Engineering Complete!")
print(f"Total features: {len(players_df.columns)}")
print("\n" + "="*80)
print(players_df.tail(20))
print("\n" + "="*80)
print("\nSummary statistics of new features:")
print(players_df[['Goals per Game', 'Assists per Game', 'xG per Game',
                  'Total Contributions', 'Shot Conversion']].describe())

After filtering positions: 121 players
After filtering minutes (>1000): 120 players
After dropping missing values: 120 players

Feature Engineering Complete!
Total features: 27

        Season                   name    Position                 Team name  Minutes played  Games played  Goals  Assists         xG        xA  Shots  Key Passes  npg       npxG  Penalty Taker  Goals per Game  Assists per Game  Total Contributions  xG per Game  xA per Game  npg per Game  npxG per Game  Shots per Game  Key Passes per Game  Minutes per Game  Shot Conversion  xG Overperformance
185  2021/2022       Leandro Trossard  Midfielder                  Brighton            2830            34      8        3   7.961571  5.277916     73          47    7   7.200403              1        0.235294          0.088235                   11     0.234164     0.155233      0.205882       0.211777        2.147059             1.382353         83.235294         0.109589            0.038429
186  2021/2022          Jack Har

- Teams Data

In [43]:
# Let's make sure that the numeric columns are in the right format
numeric_cols = ['Ranking', 'Games Played', 'Wins', 'Draws', 'Losses', 'Goals For',
                'Goals Against', 'Goals Difference', 'Points', 'Points Per Game',
                'xG', 'xGa', 'Squad Value']

for col in numeric_cols:
    teams_df[col] = pd.to_numeric(teams_df[col], errors='coerce')

# Convert categorical columns to string type
categorical_cols = ['Team name', 'Manager name']
for col in categorical_cols:
    teams_df[col] = teams_df[col].astype(str)
    teams_df[col] = teams_df[col].str.strip()

# Let's convert the 'Season' column to a string type
teams_df['Season'] = teams_df['Season'].astype(str)

# Check for any missing values
print("Missing values per column:")
print(teams_df.isnull().sum())
print("\n" + "="*80)

# Check the data types of the columns
print("\nData types:")
print(teams_df.dtypes)
print("\n" + "="*80)

# Additional validation checks
print("\nData Quality Checks:")
print(f"Total teams: {len(teams_df)}")
print(f"Unique seasons: {teams_df['Season'].nunique()}")
print(f"Teams per season: {teams_df.groupby('Season').size()}")
print(f"Unique team names: {teams_df['Team name'].nunique()}")

# Check for data consistency
print("\n" + "="*80)
print("Data Consistency Checks:")

# Check if Wins + Draws + Losses = Games Played
teams_df['Games Check'] = teams_df['Wins'] + teams_df['Draws'] + teams_df['Losses']
inconsistent_games = teams_df[teams_df['Games Check'] != teams_df['Games Played']]
print(f"Teams with inconsistent game counts: {len(inconsistent_games)}")

# Check if Goals Difference = Goals For - Goals Against
teams_df['GD Check'] = teams_df['Goals For'] - teams_df['Goals Against']
inconsistent_gd = teams_df[teams_df['GD Check'] != teams_df['Goals Difference']]
print(f"Teams with inconsistent goal difference: {len(inconsistent_gd)}")

# Check if Points make sense (3*W + 1*D)
teams_df['Points Check'] = (teams_df['Wins'] * 3) + teams_df['Draws']
inconsistent_points = teams_df[teams_df['Points Check'] != teams_df['Points']]
print(f"Teams with inconsistent points: {len(inconsistent_points)}")

# Drop temporary check columns
teams_df = teams_df.drop(columns=['Games Check', 'GD Check', 'Points Check'])

# Check for outliers
print("\n" + "="*80)
print("Outlier Detection:")
print(f"Teams with 0 games played: {(teams_df['Games Played'] == 0).sum()}")
print(f"Teams with negative values: {(teams_df[numeric_cols] < 0).any().any()}")
print(f"Teams with missing squad value: {teams_df['Squad Value'].isnull().sum()}")
print(f"Teams with missing manager: {(teams_df['Manager name'] == 'None').sum() + teams_df['Manager name'].isnull().sum()}")

# Display summary statistics
print("\n" + "="*80)
print("Summary Statistics:")
print(teams_df[['Points', 'Goals For', 'Goals Against', 'xG', 'xGa']].describe())

# Display sample of cleaned data
print("\n" + "="*80)
print("Sample of cleaned data (top 5 teams from latest season):")
print(teams_df.sort_values(['Season', 'Ranking'], ascending=[False, True]).head(5))

Missing values per column:
Season                0
Ranking               0
Team name             0
Games Played          0
Wins                  0
Draws                 0
Losses                0
Goals For             0
Goals Against         0
Goals Difference      0
Points                0
Points Per Game       0
xG                    0
xGa                   0
xG Difference         0
Squad Value         100
Manager name          0
Top 6 Label           0
Bottom 3 Label        0
dtype: int64


Data types:
Season               object
Ranking               int64
Team name            object
Games Played          int64
Wins                  int64
Draws                 int64
Losses                int64
Goals For             int64
Goals Against         int64
Goals Difference      int64
Points                int64
Points Per Game     float64
xG                  float64
xGa                 float64
xG Difference       float64
Squad Value         float64
Manager name         object
Top 6 Label   

Filter and Clean Teams Data

In [44]:
# Let's switch the Top 6 Label and Bottom 3 Label to a binary column
# Handle different possible values (Yes, No, or other)
teams_df['Top 6 Label'] = teams_df['Top 6 Label'].apply(
    lambda x: 1 if str(x).lower() in ['yes', '1', 'true'] else 0
)
teams_df['Bottom 3 Label'] = teams_df['Bottom 3 Label'].apply(
    lambda x: 1 if str(x).lower() in ['yes', '1', 'true'] else 0
)

print(f"Top 6 teams: {teams_df['Top 6 Label'].sum()}")
print(f"Bottom 3 teams: {teams_df['Bottom 3 Label'].sum()}")

# Let's add derived columns with division by zero protection
# Goals per game and points per game
teams_df['Goals per Game'] = teams_df.apply(
    lambda row: row['Goals For'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)
teams_df['Points per Game'] = teams_df.apply(
    lambda row: row['Points'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)

# xG difference
teams_df['xG Difference'] = teams_df['xG'] - teams_df['xGa']

# Win Ratio
teams_df['Win Ratio'] = teams_df.apply(
    lambda row: row['Wins'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)

# Goal Efficiency (Goals For / xG)
teams_df['Goal Efficiency'] = teams_df.apply(
    lambda row: row['Goals For'] / row['xG'] if row['xG'] > 0 else 0, axis=1
)

# Defensive Leakiness (Goals Against / xGA)
teams_df['Defensive Leakiness'] = teams_df.apply(
    lambda row: row['Goals Against'] / row['xGa'] if row['xGa'] > 0 else 0, axis=1
)

# Additional useful features
print("\n" + "="*80)
print("Adding additional features...")

# Draw Ratio
teams_df['Draw Ratio'] = teams_df.apply(
    lambda row: row['Draws'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)

# Loss Ratio
teams_df['Loss Ratio'] = teams_df.apply(
    lambda row: row['Losses'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)

# Clean Sheet Percentage (approximation: when GA = 0 per game, need match-level data for exact)
teams_df['Goals Against per Game'] = teams_df.apply(
    lambda row: row['Goals Against'] / row['Games Played'] if row['Games Played'] > 0 else 0, axis=1
)

# Attack Strength (Goals For - xG, positive means overperforming)
teams_df['Attack Overperformance'] = teams_df['Goals For'] - teams_df['xG']

# Defense Strength (xGA - Goals Against, positive means strong defense)
teams_df['Defense Overperformance'] = teams_df['xGa'] - teams_df['Goals Against']

# Overall Efficiency (combining attack and defense)
teams_df['Overall Efficiency'] = teams_df['Attack Overperformance'] + teams_df['Defense Overperformance']

# Points per xG difference (how well team converts xG advantage to points)
teams_df['Points per xG Diff'] = teams_df.apply(
    lambda row: row['Points'] / row['xG Difference'] if row['xG Difference'] > 0 else 0, axis=1
)

print("\n" + "="*80)
print("Feature Engineering Complete!")
print(f"Total features: {len(teams_df.columns)}")

# Validation checks
print("\n" + "="*80)
print("Validation Checks:")
print(f"Teams with Goal Efficiency > 2 (unusual): {(teams_df['Goal Efficiency'] > 2).sum()}")
print(f"Teams with Goal Efficiency < 0.5 (unusual): {(teams_df['Goal Efficiency'] < 0.5).sum()}")
print(f"Teams with Win Ratio > 0.8 (dominant): {(teams_df['Win Ratio'] > 0.8).sum()}")
print(f"Teams with Win Ratio < 0.2 (struggling): {(teams_df['Win Ratio'] < 0.2).sum()}")

# Summary statistics
print("\n" + "="*80)
print("Summary Statistics of New Features:")
print(teams_df[['Win Ratio', 'Goal Efficiency', 'Defensive Leakiness',
               'xG Difference', 'Attack Overperformance', 'Defense Overperformance']].describe())

print("\n" + "="*80)
print("Sample Data (Last 20 teams):")
print(teams_df.tail(20))

# Optional: Display correlation between key metrics
print("\n" + "="*80)
print("Correlation with Points:")
correlation_cols = ['Win Ratio', 'Goals per Game', 'Goal Efficiency', 'xG Difference',
                   'Defensive Leakiness', 'Attack Overperformance']
print(teams_df[correlation_cols + ['Points']].corr()['Points'].sort_values(ascending=False))

Top 6 teams: 30
Bottom 3 teams: 15

Adding additional features...

Feature Engineering Complete!
Total features: 31

Validation Checks:
Teams with Goal Efficiency > 2 (unusual): 0
Teams with Goal Efficiency < 0.5 (unusual): 0
Teams with Win Ratio > 0.8 (dominant): 0
Teams with Win Ratio < 0.2 (struggling): 14

Summary Statistics of New Features:
        Win Ratio  Goal Efficiency  Defensive Leakiness  xG Difference  Attack Overperformance  Defense Overperformance
count  100.000000       100.000000           100.000000      100.00000               100.00000               100.000000
mean     0.386053         0.958329             0.963865       -0.00500                -2.05900                 2.064000
std      0.164748         0.132177             0.109884       25.84087                 7.09712                 6.515537
min      0.052632         0.602410             0.735043      -59.70000               -20.80000               -18.300000
25%      0.289474         0.881216             0.879

Encode categorical columns & Merge Team Data into Player Data

In [45]:
# IMPORTANT: Don't encode before merging!
# Merge first, then encode if needed for modeling

# First, let's standardize team names to ensure proper matching
# (in case there are slight variations)
def standardize_team_name(name):
    name = str(name).strip()
    # Add any team name standardization here if needed
    return name

players_df['Team name'] = players_df['Team name'].apply(standardize_team_name)
teams_df['Team name'] = teams_df['Team name'].apply(standardize_team_name)

print(f"Players dataframe shape: {players_df.shape}")
print(f"Teams dataframe shape: {teams_df.shape}")
print(f"\nUnique teams in players_df: {players_df['Team name'].nunique()}")
print(f"Unique teams in teams_df: {teams_df['Team name'].nunique()}")

# Check for team name mismatches before merging
players_teams = set(players_df['Team name'].unique())
teams_teams = set(teams_df['Team name'].unique())
print(f"\nTeams in players but not in teams: {players_teams - teams_teams}")
print(f"Teams in teams but not in players: {teams_teams - players_teams}")

# Merge the dataframes WITHOUT encoding first
merged_df = pd.merge(
    players_df,
    teams_df,
    on=['Team name', 'Season'],
    how='inner',  # Use 'left' if you want to keep all players even without team data
    suffixes=('_player', '_team')
)

print(f"\n{'='*80}")
print(f"Merged dataframe shape: {merged_df.shape}")
print(f"Players before merge: {len(players_df)}")
print(f"Players after merge: {len(merged_df)}")
print(f"Players lost in merge: {len(players_df) - len(merged_df)}")

# NOW encode categorical columns AFTER merging
# Keep original values and create separate encoded columns
from sklearn.preprocessing import LabelEncoder

# Create separate encoders for each column to preserve mappings
encoders = {}

# Encode player names
encoders['name'] = LabelEncoder()
merged_df['name_encoded'] = encoders['name'].fit_transform(merged_df['name'])

# Encode team names
encoders['Team name'] = LabelEncoder()
merged_df['Team name_encoded'] = encoders['Team name'].fit_transform(merged_df['Team name'])

# Encode positions
encoders['Position'] = LabelEncoder()
merged_df['Position_encoded'] = encoders['Position'].fit_transform(merged_df['Position'])

# Encode manager names (handle NaN values first)
merged_df['Manager name'] = merged_df['Manager name'].fillna('Unknown')
encoders['Manager name'] = LabelEncoder()
merged_df['Manager name_encoded'] = encoders['Manager name'].fit_transform(merged_df['Manager name'])

# Print encoding mappings for reference
print(f"\n{'='*80}")
print("Encoding Mappings:")
print(f"\nPositions: {dict(enumerate(encoders['Position'].classes_))}")
print(f"Unique teams: {len(encoders['Team name'].classes_)}")
print(f"Unique managers: {len(encoders['Manager name'].classes_)}")
print(f"Unique players: {len(encoders['name'].classes_)}")

# Display the merged DataFrame
print(f"\n{'='*80}")
print("Sample of merged data:")
print(merged_df[['Season', 'name', 'Team name', 'Position', 'Goals',
                 'Points', 'Ranking', 'Manager name']].head(20))

# Show columns
print(f"\n{'='*80}")
print(f"Total columns in merged dataframe: {len(merged_df.columns)}")
print("\nAll columns:")
print(merged_df.columns.tolist())

Players dataframe shape: (120, 27)
Teams dataframe shape: (100, 31)

Unique teams in players_df: 25
Unique teams in teams_df: 27

Teams in players but not in teams: {'Chelsea,Manchester City', 'Arsenal,Brighton', 'Arsenal,Newcastle United'}
Teams in teams but not in players: {'Luton', 'Sheffield United', 'Norwich', 'West Bromwich Albion', 'Watford'}

Merged dataframe shape: (117, 56)
Players before merge: 120
Players after merge: 117
Players lost in merge: 3

Encoding Mappings:

Positions: {0: 'Forward', 1: 'Midfielder'}
Unique teams: 22
Unique managers: 17
Unique players: 55

Sample of merged data:
       Season                  name                Team name    Position  Goals  Points  Ranking    Manager name
0   2024/2025         Mohamed Salah                Liverpool  Midfielder     29      84        1       Arne Slot
1   2024/2025        Alexander Isak         Newcastle United     Forward     23      66        5      Eddie Howe
2   2024/2025        Erling Haaland          Mancheste

#4 Build the Prediction Models

In [46]:
# CRITICAL: Remove data leakage - don't include outcome variables!
# Variables like 'Points', 'Ranking', 'Wins' directly determine Top 6 Label

print("="*80)
print("Building Prediction Model - Removing Data Leakage")
print("="*80)

# List of columns that cause DATA LEAKAGE (these are outcomes, not predictors)
leakage_columns = [
    # Target variables
    'Top 6 Label', 'Bottom 3 Label',

    # Direct outcomes (these DETERMINE the target)
    'Points', 'Ranking', 'Points Per Game', 'Points per Game',
    'Wins', 'Draws', 'Losses', 'Games Played',
    'Goals For', 'Goals Against', 'Goals Difference',

    # Identifiers (not useful for prediction)
    'Team name', 'Season', 'Manager name', 'name',

    # Encoded versions of identifiers (if they exist)
    'Team name_encoded', 'name_encoded', 'Manager name_encoded'
]

# Also remove player-level outcome columns that are aggregates
player_outcome_cols = [
    'Goals', 'Assists', 'Total Contributions',
    'Goals per Game', 'Assists per Game'
]

# Remove all leakage columns
columns_to_drop = [col for col in leakage_columns + player_outcome_cols
                   if col in merged_df.columns]

print(f"\nDropping {len(columns_to_drop)} columns to prevent data leakage:")
print(columns_to_drop)

# PREDICTIVE FEATURES (what we know BEFORE the season)
# These should be things like:
# - xG metrics (predictive of future performance)
# - Squad value (team quality indicator)
# - Previous season performance (would need to add)
# - Player quality metrics (xG, xA, shots, key passes)
# - Manager experience

X = merged_df.drop(columns=columns_to_drop)
y = merged_df['Top 6 Label']

print(f"\n{'='*80}")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution: {y.value_counts()}")

# Check for any remaining non-numeric columns
non_numeric = X.select_dtypes(include=['object']).columns.tolist()
if non_numeric:
    print(f"\nWARNING: Non-numeric columns found: {non_numeric}")
    print("These need to be encoded or dropped.")
    X = X.drop(columns=non_numeric)

# Check for missing values
print(f"\n{'='*80}")
print("Missing values in features:")
missing = X.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
    print("\nFilling missing values with median...")
    X = X.fillna(X.median())
else:
    print("No missing values found.")

# Display feature columns
print(f"\n{'='*80}")
print(f"Final feature set ({len(X.columns)} features):")
print(X.columns.tolist())

# TIME-BASED SPLIT (much better than random split for time series data)
print(f"\n{'='*80}")
print("Using time-based split instead of random split...")

# Check available seasons
seasons_available = sorted(merged_df['Season'].unique())
print(f"Available seasons: {seasons_available}")

if len(seasons_available) >= 2:
    # Use earlier seasons for training, latest for testing
    train_seasons = seasons_available[:-1]
    test_seasons = [seasons_available[-1]]

    train_idx = merged_df['Season'].isin(train_seasons)
    test_idx = merged_df['Season'].isin(test_seasons)

    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    print(f"\nTraining seasons: {train_seasons}")
    print(f"Testing season: {test_seasons}")
    print(f"Training set size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
    print(f"Testing set size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")
else:
    print("\nNot enough seasons for time-based split. Using random split...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")

# Check class distribution
print(f"\n{'='*80}")
print("Class Distribution:")
print(f"Training set - Top 6: {y_train.sum()} | Not Top 6: {(~y_train.astype(bool)).sum()}")
print(f"Testing set - Top 6: {y_test.sum()} | Not Top 6: {(~y_test.astype(bool)).sum()}")

# Optional: Scale features (helpful but not required for Random Forest)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n{'='*80}")
print("Data preparation complete! Ready for model training.")

Building Prediction Model - Removing Data Leakage

Dropping 24 columns to prevent data leakage:
['Top 6 Label', 'Bottom 3 Label', 'Points', 'Ranking', 'Points Per Game', 'Points per Game', 'Wins', 'Draws', 'Losses', 'Games Played', 'Goals For', 'Goals Against', 'Goals Difference', 'Team name', 'Season', 'Manager name', 'name', 'Team name_encoded', 'name_encoded', 'Manager name_encoded', 'Goals', 'Assists', 'Total Contributions', 'Assists per Game']

Features shape: (117, 36)
Target shape: (117,)
Target distribution: Top 6 Label
0    67
1    50
Name: count, dtype: int64

These need to be encoded or dropped.

Missing values in features:
Squad Value    117
dtype: int64

Filling missing values with median...

Final feature set (35 features):
['Minutes played', 'Games played', 'xG_player', 'xA', 'Shots', 'Key Passes', 'npg', 'npxG', 'Penalty Taker', 'Goals per Game_player', 'xG per Game', 'xA per Game', 'npg per Game', 'npxG per Game', 'Shots per Game', 'Key Passes per Game', 'Minutes per G

Training the model

In [47]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import (classification_report, accuracy_score, confusion_matrix,
                             roc_auc_score, precision_recall_curve, roc_curve)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

print("="*80)
print("BUILDING AND TRAINING TOP 6 PREDICTION MODEL")
print("="*80)

# ============================================================================
# STEP 1: Baseline Model - Simple Random Forest
# ============================================================================
print("\n" + "="*80)
print("STEP 1: Training Baseline Random Forest Model")
print("="*80)

# Initialize baseline model
rf_baseline = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Train baseline model
rf_baseline.fit(X_train, y_train)

# Predictions
y_pred_baseline = rf_baseline.predict(X_test)
y_pred_proba_baseline = rf_baseline.predict_proba(X_test)[:, 1]

# Evaluate baseline
print("\nBaseline Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba_baseline):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_baseline,
                          target_names=['Not Top 6', 'Top 6']))

# ============================================================================
# STEP 2: Cross-Validation to Check Model Stability
# ============================================================================
print("\n" + "="*80)
print("STEP 2: Cross-Validation on Training Set")
print("="*80)

# Use stratified k-fold to maintain class distribution
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(rf_baseline, X_train, y_train,
                            cv=cv, scoring='accuracy', n_jobs=-1)

print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Also check ROC-AUC in CV
cv_scores_auc = cross_val_score(rf_baseline, X_train, y_train,
                                cv=cv, scoring='roc_auc', n_jobs=-1)
print(f"Mean CV ROC-AUC: {cv_scores_auc.mean():.4f} (+/- {cv_scores_auc.std() * 2:.4f})")

# ============================================================================
# STEP 3: Hyperparameter Tuning with GridSearchCV
# ============================================================================
print("\n" + "="*80)
print("STEP 3: Hyperparameter Tuning (this may take a few minutes...)")
print("="*80)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', None]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV ROC-AUC: {grid_search.best_score_:.4f}")

# Get best model
rf_tuned = grid_search.best_estimator_

# ============================================================================
# STEP 4: Train Multiple Models and Compare
# ============================================================================
print("\n" + "="*80)
print("STEP 4: Training Multiple Models for Comparison")
print("="*80)

models = {
    'Random Forest (Tuned)': rf_tuned,
    'Random Forest (Baseline)': rf_baseline,
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    )
}

results = {}

for name, model in models.items():
    # Train model (skip if already trained)
    if name not in ['Random Forest (Tuned)', 'Random Forest (Baseline)']:
        model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"\n{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  ROC-AUC: {roc_auc:.4f}")

# Find best model
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']

print(f"\n{'='*80}")
print(f"BEST MODEL: {best_model_name}")
print(f"ROC-AUC: {results[best_model_name]['roc_auc']:.4f}")
print(f"{'='*80}")

# ============================================================================
# STEP 5: Detailed Evaluation of Best Model
# ============================================================================
print("\n" + "="*80)
print("STEP 5: Detailed Evaluation of Best Model")
print("="*80)

y_pred_best = results[best_model_name]['predictions']
y_pred_proba_best = results[best_model_name]['probabilities']

# Classification Report
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_best,
                          target_names=['Not Top 6', 'Top 6']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix:")
print(cm)

# ============================================================================
# STEP 6: Feature Importance Analysis
# ============================================================================
print("\n" + "="*80)
print("STEP 6: Feature Importance Analysis")
print("="*80)

if hasattr(best_model, 'feature_importances_'):
    # Get feature importances
    importances = best_model.feature_importances_
    feature_names = X_train.columns

    # Create DataFrame
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    print("\nTop 20 Most Important Features:")
    print(importance_df.head(20).to_string(index=False))

    # Plot feature importances
    plt.figure(figsize=(12, 8))
    top_features = importance_df.head(20)
    sns.barplot(data=top_features, x='Importance', y='Feature')
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

# ============================================================================
# STEP 7: Visualizations
# ============================================================================
print("\n" + "="*80)
print("STEP 7: Creating Visualizations")
print("="*80)

# 1. Confusion Matrix Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Top 6', 'Top 6'],
            yticklabels=['Not Top 6', 'Top 6'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# 2. ROC Curve
plt.figure(figsize=(10, 6))
for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    auc = result['roc_auc']
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 3. Precision-Recall Curve
plt.figure(figsize=(10, 6))
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_best)
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve - {best_model_name}')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 4. Probability Distribution
plt.figure(figsize=(10, 6))
plt.hist(y_pred_proba_best[y_test == 0], bins=20, alpha=0.5, label='Not Top 6', color='blue')
plt.hist(y_pred_proba_best[y_test == 1], bins=20, alpha=0.5, label='Top 6', color='red')
plt.xlabel('Predicted Probability of Top 6')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Probabilities')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# ============================================================================
# STEP 8: Save the Best Model
# ============================================================================
print("\n" + "="*80)
print("STEP 8: Saving Best Model")
print("="*80)

import joblib

# Save model
model_filename = f'top6_predictor_{best_model_name.replace(" ", "_").lower()}.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved as: {model_filename}")

# Save scaler if used
if 'scaler' in globals():
    scaler_filename = 'scaler.pkl'
    joblib.dump(scaler, scaler_filename)
    print(f"Scaler saved as: {scaler_filename}")

# Save feature names
feature_names_filename = 'feature_names.pkl'
joblib.dump(X_train.columns.tolist(), feature_names_filename)
print(f"Feature names saved as: {feature_names_filename}")

# ============================================================================
# STEP 9: Make Predictions with Team Names
# ============================================================================
print("\n" + "="*80)
print("STEP 9: Predictions with Team Names")
print("="*80)

# Get test set with team names
test_idx = merged_df['Season'].isin(test_seasons) if 'test_seasons' in locals() else X_test.index
predictions_df = pd.DataFrame({
    'Season': merged_df.loc[test_idx, 'Season'].values,
    'Team': merged_df.loc[test_idx, 'Team name'].values,
    'Actual Top 6': y_test.values,
    'Predicted Top 6': y_pred_best,
    'Probability': y_pred_proba_best
})

# Sort by probability
predictions_df = predictions_df.sort_values('Probability', ascending=False)

print("\nPredictions for Test Set:")
print(predictions_df.to_string(index=False))

# Check prediction accuracy by team
print("\n" + "="*80)
print("Prediction Accuracy by Team:")
correct_predictions = predictions_df[predictions_df['Actual Top 6'] == predictions_df['Predicted Top 6']]
print(f"Correct Predictions: {len(correct_predictions)}/{len(predictions_df)} ({len(correct_predictions)/len(predictions_df)*100:.1f}%)")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*80)
print("MODEL TRAINING COMPLETE - SUMMARY")
print("="*80)
print(f"\nBest Model: {best_model_name}")
print(f"Test Accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"Test ROC-AUC: {results[best_model_name]['roc_auc']:.4f}")
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"\nModel saved and ready for deployment!")
print("="*80)

BUILDING AND TRAINING TOP 6 PREDICTION MODEL

STEP 1: Training Baseline Random Forest Model

Baseline Model Performance:
Accuracy: 0.9118
ROC-AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

   Not Top 6       1.00      0.85      0.92        20
       Top 6       0.82      1.00      0.90        14

    accuracy                           0.91        34
   macro avg       0.91      0.93      0.91        34
weighted avg       0.93      0.91      0.91        34


STEP 2: Cross-Validation on Training Set

Cross-Validation Accuracy Scores: [0.94117647 1.         1.         0.9375     1.        ]
Mean CV Accuracy: 0.9757 (+/- 0.0595)
Mean CV ROC-AUC: 1.0000 (+/- 0.0000)

STEP 3: Hyperparameter Tuning (this may take a few minutes...)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits

Best parameters: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Be

ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Let's initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)
# let's link the predictions to the team names (decoding the labels)
predicted_teams = label_encoder.inverse_transform(y_pred)
# Create a DataFrame to hold the predictions
predictions_df = pd.DataFrame({
    'Team': predicted_teams,
    'Predicted Top 6': y_pred
})
# Display the predictions
print(predictions_df)



#5 Evaluate Models

In [None]:
#Let's evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
# We can also visualize the feature importances
feature_importances = rf_classifier.feature_importances_
feature_names = X.columns
# Create a DataFrame for feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
# Plot the feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

#6 Simulate Next Season

#7 Visualizations