In [None]:
import pandas as pd
# Read the three CSV inputs in one pass; df, df2 and df3 are the names
# the cells below refer to.
df, df2, df3 = map(
    pd.read_csv,
    ('/final_rolling.csv', '/match_by_match.csv', '/prem_with_elo.csv'),
)

In [None]:
# Keep only the rows of df2 whose competition is the Premier League and
# discard four columns in the same step.
prem_df = (
    df2.loc[df2['Comp'] == 'Premier League']
       .drop(columns=['Match Report', 'Notes', 'Time', 'Day'])
)
prem_df

In [None]:
# Attach a match date to every row of df. Series.map with a Series argument
# looks each `original_index` value up among the index labels of prem_df, so
# rows whose original_index is not a prem_df index label get NaN.
date_map = prem_df['Date']

df['Date'] = df['original_index'].map(date_map)

# Column order of the exported table.
new_order = [
    'Season',
    'Team',
    'Date',
    'original_index',
    'xG',
    'xG Difference',
    'xG_roll_avg',
    'xG_diff_roll_avg',
    'Points',
    'Elo'
]

# .copy() makes final_dataset an independent frame. Later cells assign to
# final_dataset['Date']; on a bare column slice of df that assignment raises
# SettingWithCopyWarning.
final_dataset = df[new_order].copy()

# NOTE(review): absolute, user-specific output path, and a different location
# from the '/final_rolling.csv' read at the top of the notebook. Confirm the
# intended destination before sharing this notebook.
final_dataset.to_csv('/Users/ogizelenovic/Downloads/final_rolling.csv', encoding='utf-8-sig', index=True)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the player-level stats table. A missing file is reported on stdout
# and the exception is re-raised so the cell stops here.
master_csv_path = '/MASTER.csv'
try:
    player_stats_df = pd.read_csv(master_csv_path)
except FileNotFoundError as missing_file:
    print("ERROR: Could not find a file.")
    print(f"Details: {missing_file}")
    raise

# --- initial data cleaning and preparation
print("\nStep 1: Preparing source data...")
# Dates are parsed to datetimes and Season is cast to str because the
# merges further down join on these columns.
df3['Date'] = pd.to_datetime(df3['Date'])
final_dataset['Date'] = pd.to_datetime(final_dataset['Date'])
player_stats_df['Season'] = player_stats_df['Season'].astype(str)

# --- aggregating player stats to create team-level stats
print("Step 2: Aggregating player stats...")

# Convert the stat columns in one pass; entries that cannot be parsed as
# numbers become NaN (errors='coerce').
numeric_cols = ['Age', 'Min', 'Gls', 'Ast', 'npxG', 'xAG', 'PrgC', 'PrgP']
player_stats_df[numeric_cols] = player_stats_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Only players with more than 450 minutes contribute to the squad aggregates.
played_enough = player_stats_df['Min'] > 450
significant_players = player_stats_df.loc[played_enough].copy()

# One row per (Season, Squad): mean age plus summed npxG, PrgP and goals.
team_level_features = (
    significant_players
    .groupby(['Season', 'Squad'])
    .agg(
        squad_avg_age=pd.NamedAgg(column='Age', aggfunc='mean'),
        squad_total_npxg=pd.NamedAgg(column='npxG', aggfunc='sum'),
        squad_total_prgp=pd.NamedAgg(column='PrgP', aggfunc='sum'),
        squad_total_gls=pd.NamedAgg(column='Gls', aggfunc='sum'),
    )
    .reset_index()
)

# --- create final dataframe
# model_df starts as a copy of df3. The rolling xG columns are attached twice:
# first matched on Team (stored with a `_home` suffix), then matched on
# Opponent (stored with an `_away` suffix).
rolling_features_to_merge = final_dataset.loc[:, ['Season', 'Team', 'Date', 'xG_roll_avg', 'xG_diff_roll_avg']].copy()

model_df = (
    df3.copy()
    .merge(rolling_features_to_merge, on=['Season', 'Date', 'Team'], how='left')
    .rename(columns={'xG_roll_avg': 'xG_roll_avg_home', 'xG_diff_roll_avg': 'xG_diff_roll_avg_home'})
)

# Both sides carry a 'Team' column in this merge, so the suffixes apply to it:
# the right-hand copy is dropped and the left-hand one gets its name back.
model_df = (
    model_df
    .merge(
        rolling_features_to_merge,
        left_on=['Season', 'Date', 'Opponent'],
        right_on=['Season', 'Date', 'Team'],
        how='left',
        suffixes=('_home_team', '_away_team'),
    )
    .rename(columns={'xG_roll_avg': 'xG_roll_avg_away', 'xG_diff_roll_avg': 'xG_diff_roll_avg_away'})
    .drop(columns=['Team_away_team'])
    .rename(columns={'Team_home_team': 'Team'})
)

# Squad aggregates matched on Team. The columns stay unsuffixed here and
# receive `_home` in the next merge, where they collide with the Opponent copy.
model_df = (
    model_df
    .merge(team_level_features, left_on=['Season', 'Team'], right_on=['Season', 'Squad'], how='left')
    .drop(columns=['Squad'])
)

# Squad aggregates matched on Opponent (`_away`).
model_df = (
    model_df
    .merge(
        team_level_features,
        left_on=['Season', 'Opponent'],
        right_on=['Season', 'Squad'],
        how='left',
        suffixes=('_home', '_away'),
    )
    .drop(columns=['Squad'])
)

# --- feature engineering
print("Step 4: Engineering final features...")

# TimeSeriesSplit (used below) always trains on the leading rows and tests on
# the rows that follow, so the frame has to be in chronological order. The
# stable sort keeps the existing order of matches played on the same date.
model_df = model_df.sort_values('Date', kind='stable').reset_index(drop=True)

# Home-minus-away difference features.
model_df['elo_diff'] = model_df['Home Elo'] - model_df['Away Elo']
model_df['xg_roll_avg_diff'] = model_df['xG_roll_avg_home'] - model_df['xG_roll_avg_away']
model_df['squad_total_npxg_diff'] = model_df['squad_total_npxg_home'] - model_df['squad_total_npxg_away']

features = [
    'Home Elo', 'Away Elo', 'elo_diff',
    'xG_roll_avg_home', 'xG_roll_avg_away', 'xg_roll_avg_diff',
    'xG_diff_roll_avg_home', 'xG_diff_roll_avg_away',
    'squad_avg_age_home', 'squad_total_npxg_home', 'squad_total_prgp_home',
    'squad_avg_age_away', 'squad_total_npxg_away', 'squad_total_prgp_away',
    'squad_total_npxg_diff'
]

# Missing values are deliberately left in X at this point. Filling them with 0
# before the split would leave nothing for the per-fold training-mean
# imputation inside the loop to fill. .copy() makes X an independent frame.
X = model_df[features].copy()
y = model_df['Result']

# --- train binary classification model
print("Step 5 (Revised): Training and evaluating a Binary (Win/No-Win) Model...")

# ### CHANGE 1: Import the new model and tools
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

# Binary target: 1 for a 'W' result, 0 for 'D' or 'L'.
y_binary = y.map({'W': 1, 'D': 0, 'L': 0})


# --- cross validation split
tscv = TimeSeriesSplit(n_splits=5)
accuracies = []
roc_aucs = []
all_true = []
all_preds = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_binary.iloc[train_index], y_binary.iloc[test_index]

    # Impute with statistics from the training fold only, so nothing from the
    # test fold leaks into the fit. A column with no observed value in the
    # training fold has a NaN mean; the trailing fillna(0) covers that case.
    imputation_values = X_train.mean()
    X_train = X_train.fillna(imputation_values).fillna(0)
    X_test = X_test.fillna(imputation_values).fillna(0)

    # max_iter is increased to ensure convergence.
    binary_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)

    # fitting model on training data
    binary_model.fit(X_train, y_train)

    # predicting on unseen data
    predicted_outcomes = binary_model.predict(X_test)
    predicted_probabilities = binary_model.predict_proba(X_test)[:, 1] # Get prob of class '1' (Win)

    accuracy = accuracy_score(y_test, predicted_outcomes)
    accuracies.append(accuracy)

    # ROC AUC measure
    roc_auc = roc_auc_score(y_test, predicted_probabilities)
    roc_aucs.append(roc_auc)

    # Pooled across folds for the classification report below.
    all_true.extend(y_test)
    all_preds.extend(predicted_outcomes)

    print(f"Fold Accuracy: {accuracy:.4f} | Fold ROC AUC: {roc_auc:.4f}")

# --- final eval
print("\n--- Binary Model Out-of-Sample Performance ---")
print(f"Average Accuracy across all folds: {np.mean(accuracies):.4f} ({np.mean(accuracies):.2%})")
print(f"Average ROC AUC across all folds:  {np.mean(roc_aucs):.4f}")

print(classification_report(all_true, all_preds, target_names=['No-Win', 'Win']))

# --- feature importance
# binary_model is whatever the final loop iteration fitted, so these are the
# coefficients of the last fold only.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': binary_model.coef_[0]
}).sort_values('coefficient', ascending=False)

print("\n--- Feature Importance (from last fold's model) ---")
print(feature_importance)

In [None]:
import pandas as pd
import numpy as np
import random
from collections import deque
from tqdm import tqdm # A progress bar for the simulation
from statsmodels.miscmodels.ordinal_model import OrderedModel
from pandas.api.types import CategoricalDtype

# Re-read the player stats table for this cell. If the file is missing, the
# error is printed and re-raised rather than swallowed.
try:
    player_stats_df = pd.read_csv('/MASTER.csv')
except FileNotFoundError as not_found:
    for message in ("ERROR: Could not find a file.", f"Details: {not_found}"):
        print(message)
    raise

# --- final Elo state
print("\nStep 1: Calculating final Elo ratings from historical data...")

# --- data prep
df3['Date'] = pd.to_datetime(df3['Date'])
K = 25    # rating points exchanged per unit of (actual - expected) score
HFA = 40  # rating points added to the home side when computing the expected score
result_points_elo = {'W': 1, 'D': 0.5, 'L': 0}
all_historical_teams = sorted(set(df3['Team'].unique()).union(df3['Opponent'].unique()))
# Every team that appears as Team or Opponent starts on 1500.
elo_score = dict.fromkeys(all_historical_teams, 1500)
previous_season = None

# --- looping through matches to get final elos
historical_matches = df3[df3['Date'] < '2024-08-01'].copy().sort_values('Date')

match_stream = zip(
    historical_matches['Season'],
    historical_matches['Team'],
    historical_matches['Opponent'],
    historical_matches['Result'],
)
for current_season, hometeam, awayteam, match_result in match_stream:
    # When the season label changes, every rating keeps two thirds of its
    # distance from 1500.
    if previous_season is not None and current_season != previous_season:
        elo_score.update({
            team: 1500 + (2/3) * (rating - 1500)
            for team, rating in elo_score.items()
        })
    previous_season = current_season

    elo_h, elo_a = elo_score[hometeam], elo_score[awayteam]

    # Expected score of the Team side, with the HFA bonus on its rating.
    expected_h = 1 / (1 + 10**((elo_a - (elo_h + HFA)) / 400))
    actual_h = result_points_elo[match_result]

    # Zero-sum update: what one side gains the other loses.
    rating_shift = K * (actual_h - expected_h)
    elo_score[hometeam] += rating_shift
    elo_score[awayteam] -= rating_shift

# --- assemble features and train OrderedModel

# --- assemble the full feature set (model_df) ---
final_dataset['Date'] = pd.to_datetime(final_dataset['Date'])
player_stats_df['Season'] = player_stats_df['Season'].astype(str)

# Coerce the stat columns to numbers; unparseable entries become NaN.
numeric_cols = ['Age', 'Min', 'Gls', 'Ast', 'npxG', 'xAG', 'PrgC', 'PrgP']
player_stats_df[numeric_cols] = player_stats_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Squad aggregates are built from players with more than 450 minutes.
significant_players = player_stats_df.loc[player_stats_df['Min'] > 450]
team_level_features = (
    significant_players
    .groupby(['Season', 'Squad'])
    .agg(
        squad_avg_age=('Age', 'mean'),
        squad_total_npxg=('npxG', 'sum'),
        squad_total_prgp=('PrgP', 'sum'),
        squad_total_gls=('Gls', 'sum'),
    )
    .reset_index()
)

rolling_features_to_merge = final_dataset.loc[:, ['Season', 'Team', 'Date', 'xG_roll_avg', 'xG_diff_roll_avg']]

# Rolling form matched on Team (`_home` columns).
model_df = (
    df3.copy()
    .merge(rolling_features_to_merge, on=['Season', 'Date', 'Team'], how='left')
    .rename(columns={'xG_roll_avg': 'xG_roll_avg_home', 'xG_diff_roll_avg': 'xG_diff_roll_avg_home'})
)

# Rolling form matched on Opponent (`_away` columns). The suffixes only hit
# the duplicated 'Team' column, which is then restored to its original name.
model_df = (
    model_df
    .merge(
        rolling_features_to_merge,
        left_on=['Season', 'Date', 'Opponent'],
        right_on=['Season', 'Date', 'Team'],
        how='left',
        suffixes=('_home_team', '_away_team'),
    )
    .rename(columns={'xG_roll_avg': 'xG_roll_avg_away', 'xG_diff_roll_avg': 'xG_diff_roll_avg_away'})
    .drop(columns=['Team_away_team'])
    .rename(columns={'Team_home_team': 'Team'})
)

# Squad aggregates matched on Team, then on Opponent; the second merge is
# where the `_home` / `_away` suffixes get applied.
model_df = (
    model_df
    .merge(team_level_features, left_on=['Season', 'Team'], right_on=['Season', 'Squad'], how='left')
    .drop(columns=['Squad'])
)
model_df = (
    model_df
    .merge(
        team_level_features,
        left_on=['Season', 'Opponent'],
        right_on=['Season', 'Squad'],
        how='left',
        suffixes=('_home', '_away'),
    )
    .drop(columns=['Squad'])
)

# --- engineer features and define X, y
model_df['elo_diff'] = model_df['Home Elo'] - model_df['Away Elo']
model_df['xg_roll_avg_diff'] = model_df['xG_roll_avg_home'] - model_df['xG_roll_avg_away']
model_df['squad_total_npxg_diff'] = model_df['squad_total_npxg_home'] - model_df['squad_total_npxg_away']

features = [
    'Home Elo', 'Away Elo', 'elo_diff',
    'xG_roll_avg_home', 'xG_roll_avg_away', 'xg_roll_avg_diff',
    'xG_diff_roll_avg_home', 'xG_diff_roll_avg_away',
    'squad_avg_age_home', 'squad_total_npxg_home', 'squad_total_prgp_home',
    'squad_avg_age_away', 'squad_total_npxg_away', 'squad_total_prgp_away',
    'squad_total_npxg_diff',
]

# Missing feature values are replaced with 0 before fitting.
X = model_df[features].fillna(0)
y = model_df['Result'].loc[X.index]

# --- train final OrderedModel
# Encode Result as an ordered categorical, so the integer codes are
# L=0 < D=1 < W=2.
cat_type = CategoricalDtype(categories=['L', 'D', 'W'], ordered=True)
y_ordered = y.astype(cat_type).cat.codes

# A missing or unrecognised Result is encoded as -1. Left in, it would be
# fitted as a fourth outcome class and predict() would return four
# probabilities, which the three-outcome sampling in the simulation cannot use.
# Those rows are excluded from the fit; when there are none the fit is unchanged.
has_known_result = y_ordered >= 0
if not has_known_result.all():
    print(f"Excluding {int((~has_known_result).sum())} rows without a W/D/L result from the fit.")

# NOTE(review): X and y come from every row of df3, whereas the Elo pass only
# uses rows dated before 2024-08-01. If df3 contains 2024-25 matches they are
# part of this training set. Confirm that is intended.
trained_model_result = OrderedModel(y_ordered[has_known_result], X[has_known_result]).fit(method='bfgs', disp=0)
print("Predictive model trained successfully on all historical data.")


# --- setting up and running 2024-25 season

# --- initial state for sim
# Shallow copy, so the simulation never writes into elo_score itself.
final_elo_ratings = dict(elo_score)

# Latest-dated 2023-2024 row per team, reduced to its two rolling columns:
# {team: {'xG_roll_avg': ..., 'xG_diff_roll_avg': ...}}.
season_23_24_rows = final_dataset[final_dataset['Season'] == '2023-2024']
latest_row_per_team = season_23_24_rows.sort_values('Date').drop_duplicates('Team', keep='last')
last_known_form = (
    latest_row_per_team
    .set_index('Team')[['xG_roll_avg', 'xG_diff_roll_avg']]
    .to_dict('index')
)

# 2023-2024 squad aggregates keyed by squad name.
squad_features_23_24 = (
    team_level_features.loc[team_level_features['Season'] == '2023-2024']
    .set_index('Squad')
    .to_dict('index')
)

# --- standardising names across the board
teams_24_25_raw = [
    'AFC Bournemouth',
    'Arsenal',
    'Aston Villa',
    'Brentford',
    'Brighton & Hove Albion',
    'Chelsea',
    'Crystal Palace',
    'Everton',
    'Fulham',
    'Ipswich Town',
    'Leicester City',
    'Liverpool',
    'Manchester City',
    'Manchester United',
    'Newcastle United',
    'Nottingham Forest',
    'Southampton',
    'Tottenham Hotspur',
    'West Ham United',
    'Wolverhampton Wanderers',
]
# Display name -> name used as the key elsewhere in this cell. Teams that are
# not listed here keep their display name unchanged.
name_map = {
    'AFC Bournemouth': 'Bournemouth',
    'Brighton & Hove Albion': 'Brighton-and-Hove-Albion',
    'Aston Villa': 'Aston-Villa',
    'Ipswich Town': 'Ipswich-Town',
    'Leicester City': 'Leicester-City',
    'Manchester City': 'Manchester-City',
    'Manchester United': 'Manchester-United',
    'Newcastle United': 'Newcastle-United',
    'Nottingham Forest': 'Nottingham-Forest',
    'Tottenham Hotspur': 'Tottenham-Hotspur',
    'West Ham United': 'West-Ham-United',
    'Wolverhampton Wanderers': 'Wolverhampton-Wanderers',
    'Crystal Palace': 'Crystal-Palace',
}
teams_24_25 = [name_map.get(raw_name, raw_name) for raw_name in teams_24_25_raw]

# --- creating features and running sim loop
# One seed drives both the fixture shuffle and every simulated result, so a
# re-run of this cell reproduces the same probabilities.
SEED = 42
sim_rng = np.random.default_rng(SEED)


def _recent_mean(history, default=1.3):
    """Return the mean of a team's recent values, or `default` while the history is empty."""
    return np.mean(history) if history else default


# Every ordered (home, away) pairing of distinct teams, shuffled once. All
# simulations play the fixtures in this same order.
fixtures = pd.DataFrame(
    [{'HomeTeam': h, 'AwayTeam': a} for h in teams_24_25 for a in teams_24_25 if h != a]
).sample(frac=1, random_state=SEED).reset_index(drop=True)
fixture_pairs = list(zip(fixtures['HomeTeam'], fixtures['AwayTeam']))
N_SIMULATIONS = 100
season_winner_counts = {team: 0 for team in teams_24_25}

# Season-start state, built once because it is identical for every simulation.
# Teams with no historical rating start on 1450, and teams with no 2023-24
# squad aggregates get fixed placeholder values.
season_start_elo = final_elo_ratings.copy()
for team in teams_24_25:
    if team not in season_start_elo:
        season_start_elo[team] = 1450
    if team not in squad_features_23_24:
        squad_features_23_24[team] = {'squad_avg_age': 26.5, 'squad_total_npxg': 38.0, 'squad_total_prgp': 1600.0, 'squad_total_gls': 45.0}

# Between-season regression: every rating keeps two thirds of its distance from 1500.
for team in season_start_elo:
    season_start_elo[team] = 1500 + (2/3) * (season_start_elo[team] - 1500)

for i in tqdm(range(N_SIMULATIONS)):
    sim_elo = season_start_elo.copy()
    sim_table = {team: {'points': 0, 'gd': 0} for team in teams_24_25}
    # Rolling form is tracked over each team's last five simulated matches.
    sim_form = {team: {'xG_history': deque(maxlen=5), 'xGA_history': deque(maxlen=5)} for team in teams_24_25}

    for home_team, away_team in fixture_pairs:
        elo_h, elo_a = sim_elo[home_team], sim_elo[away_team]

        # Until a team has played a simulated match, its rolling values default to 1.3.
        home_xG_roll_avg = _recent_mean(sim_form[home_team]['xG_history'])
        home_xGA_roll_avg = _recent_mean(sim_form[home_team]['xGA_history'])
        away_xG_roll_avg = _recent_mean(sim_form[away_team]['xG_history'])
        away_xGA_roll_avg = _recent_mean(sim_form[away_team]['xGA_history'])

        home_squad = squad_features_23_24[home_team]
        away_squad = squad_features_23_24[away_team]

        # Single-row frame, reordered to the column order the model was fitted with.
        match_features = pd.DataFrame([{
            'Home Elo': elo_h, 'Away Elo': elo_a, 'elo_diff': elo_h - elo_a,
            'xG_roll_avg_home': home_xG_roll_avg, 'xG_diff_roll_avg_home': home_xG_roll_avg - home_xGA_roll_avg,
            'xG_roll_avg_away': away_xG_roll_avg, 'xG_diff_roll_avg_away': away_xG_roll_avg - away_xGA_roll_avg,
            'xg_roll_avg_diff': home_xG_roll_avg - away_xG_roll_avg,
            'squad_avg_age_home': home_squad['squad_avg_age'], 'squad_total_npxg_home': home_squad['squad_total_npxg'],
            'squad_total_prgp_home': home_squad['squad_total_prgp'], 'squad_avg_age_away': away_squad['squad_avg_age'],
            'squad_total_npxg_away': away_squad['squad_total_npxg'], 'squad_total_prgp_away': away_squad['squad_total_prgp'],
            'squad_total_npxg_diff': home_squad['squad_total_npxg'] - away_squad['squad_total_npxg']
        }])[features].fillna(0)

        probabilities = trained_model_result.predict(match_features).iloc[0].values
        simulated_result = sim_rng.choice([0, 1, 2], p=probabilities) # 0=L, 1=D, 2=W

        # No scoreline is simulated: a decided match counts as a one-goal margin.
        if simulated_result == 2: points_h, points_a, gd_h = 3, 0, 1
        elif simulated_result == 1: points_h, points_a, gd_h = 1, 1, 0
        else: points_h, points_a, gd_h = 0, 3, -1

        sim_table[home_team]['points'] += points_h
        sim_table[away_team]['points'] += points_a
        sim_table[home_team]['gd'] += gd_h
        sim_table[away_team]['gd'] -= gd_h

        # Elo update from the simulated result, using the pre-match ratings.
        actual_h = {2: 1, 1: 0.5, 0: 0}[simulated_result]
        expected_h = 1 / (1 + 10**((elo_a - (elo_h + HFA)) / 400))
        rating_shift = K * (actual_h - expected_h)
        sim_elo[home_team] += rating_shift
        sim_elo[away_team] -= rating_shift

        # Synthetic xG derived from the two pre-match ratings (floored at 0.2)
        # and pushed into both teams' rolling histories.
        mock_xg_home = max(0.2, 1.25 + (elo_h - 1500)/200 - (elo_a - 1500)/250)
        mock_xg_away = max(0.2, 1.25 + (elo_a - 1500)/200 - (elo_h - 1500)/250)
        sim_form[home_team]['xG_history'].append(mock_xg_home)
        sim_form[home_team]['xGA_history'].append(mock_xg_away)
        sim_form[away_team]['xG_history'].append(mock_xg_away)
        sim_form[away_team]['xGA_history'].append(mock_xg_home)

    # Rank by points, then goal difference; the top entry is this simulation's champion.
    final_table = sorted(sim_table.items(), key=lambda item: (item[1]['points'], item[1]['gd']), reverse=True)
    winner = final_table[0][0]
    season_winner_counts[winner] += 1

# --- presenting results
print("\n--- Predicted 2024-25 Season Winner Probabilities ---")

# Share of simulations won by each team, as a percentage.
winner_probabilities = {}
for team, count in season_winner_counts.items():
    winner_probabilities[team] = (count / N_SIMULATIONS) * 100
sorted_winners = sorted(winner_probabilities.items(), key=lambda entry: entry[1], reverse=True)

# Map the standardised names back to display names for printing.
reverse_name_map = dict(zip(name_map.values(), name_map.keys()))
for team_std, prob in sorted_winners:
    # Teams that never won a simulation are not listed.
    if not prob > 0:
        continue
    team_display = reverse_name_map.get(team_std, team_std)
    print(f"{team_display:<30} {prob:.1f}%")