@milnerLFC --ben rabat

In [None]:
import os
import datetime
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier

from sklearn.metrics import  accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from tensorflow import keras
from tensorflow.keras import layers

# from ann_output import binary_ann_df

from utils import list_seasons, custom_accuracy_margin, accuracy_double_strict, accuracy_double_margin, confusion_classification
from download_data import download_league_season,download_fixtures,load_data,load_fixtures
from ann_utils import split_data, scale_datasets

# from mlp_classifier import mlp_model_fit

from predictables import table, table_bet, table_filter, table_pred

In [None]:
# Notebook-wide tunables.
first_year = 15  # passed to list_seasons() as first_year
bet_amount_base = 10  # passed to table_bet() as the base bet amount
min_entropy = 0.5  # lower entropy bound passed to table()
max_entropy = 1.6  # upper entropy bound passed to table()

In [None]:
# Month (1-12) from which a calendar year counts as the start of a new season.
# NOTE(review): July is an assumption for the leagues listed below - confirm
# against when the data source starts publishing new-season files.
SEASON_ROLLOVER_MONTH = 7


def season_start_year(today):
    """Return the calendar year in which the season containing *today* began.

    Dates before ``SEASON_ROLLOVER_MONTH`` belong to the season that started
    in the previous calendar year (e.g. February 2025 -> 2024).
    """
    if today.month >= SEASON_ROLLOVER_MONTH:
        return today.year
    return today.year - 1


# Starting year of the season in progress. Using the raw calendar year would
# point at a season that has not begun yet during January-June.
current_year = season_start_year(datetime.date.today())
# Four-digit season code such as "2425"; the second half wraps modulo 100.
current_season = f"{current_year % 100:02d}{(current_year + 1) % 100:02d}"
key = "E0"
leagues_keys = [key, "E1", "SC0", "SP1", "F1", "D1", "I1"]
base_path = os.getcwd()
folder_path = os.path.join(base_path, "resources", "seasons")

# Names of the season CSV files already on disk for the tracked leagues.
# A missing folder (first run) simply means nothing has been downloaded yet.
if os.path.isdir(folder_path):
    existing_csv = {
        entry.name for entry in os.scandir(folder_path)
        if entry.is_file()
        and entry.name.endswith(".csv")
        and any(league_key in entry.name for league_key in leagues_keys)
    }
else:
    existing_csv = set()

# Column groups used to select and merge data further down.
venue_infos = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam']
odds = ['PSH', 'PSD', 'PSA']
columns_to_check = venue_infos + odds
result_cols = ['FTHG', 'FTAG', 'FTR']  # 'HTR','HTHG','HTAG'
shot_cols = ['HS', 'AS', 'HST', 'AST']
card_cols = ['HR', 'AR']
usecols = venue_infos + result_cols + shot_cols + card_cols + odds
# False -> work on every key in leagues_keys; True -> only on `key`.
unique_key = False

In [None]:
# Download the fixtures file into <base_path>/resources, then load it.
filename = "fixtures.csv"
resources_dir = os.path.join(base_path, "resources")
download_fixtures(resources_dir, filename)
fixtures = load_fixtures(
    base_path,
    filename,
    columns_to_check,
    current_season,
    leagues_keys,
    unique_key=unique_key,
)

In [None]:
# fixtures

In [None]:
all_seasons = list_seasons(first_year=first_year, last_year=current_year)

# Either only the primary league or every tracked league, depending on unique_key.
keys_to_fetch = [key] if unique_key else leagues_keys

for season in all_seasons:
    for k in keys_to_fetch:
        filename = f"{k}_{season}.csv"
        # The current season is always re-downloaded; older ones only if missing.
        if season == current_season or filename not in existing_csv:
            download_league_season(folder_path, filename, season, k)

In [None]:
# Load the season files from folder_path, keeping the columns listed in usecols.
league_data = load_data(folder_path, all_seasons, usecols, leagues_keys, unique_key=unique_key)

In [None]:
# Goal difference: FTHG minus FTAG (positive when FTHG is larger).
league_data['FTGD'] = league_data['FTHG'] - league_data['FTAG']

In [None]:
# Print a summary of league_data.
league_data.info()

In [None]:
# Annotated correlation heatmap over the numeric columns of league_data.
sns.heatmap(league_data.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".1f")
plt.show()

In [None]:
# Descriptive statistics of league_data.
league_data.describe()

In [None]:
alpha = 0.15      # smoothing factor used by the ewm() features
shift_val = 1     # lag applied before aggregating, so a row never sees itself
win_size = 4      # window length used by the rolling() features
seasons = league_data.groupby("Season")

Hcols = ['HPointsH','HGFH','HGCH','HSH','HSTH']
Acols = ['APointsA','AGFA','AGCA','ASA','ASTA']

# 'Season' becomes part of the merge/dedup key. Guarded so that re-running
# this cell does not insert it a second time.
if 'Season' not in columns_to_check:
    columns_to_check.insert(1, 'Season')


def _prior_expanding_mean(frame, team_col, value_cols):
    """Per-team expanding mean of *value_cols*, lagged by ``shift_val`` rows."""
    return frame.groupby(team_col, group_keys=False)[value_cols].apply(
        lambda x: x.shift(shift_val).expanding().mean()
    )


# Frames are collected and concatenated once, instead of growing a DataFrame
# inside the loop.
season_frames = []
for season_label, season_df in seasons:

    if season_label == current_season:
        # Append fixtures that are not already present in the season data.
        # NOTE(review): the merge key includes the odds columns, so a fixture
        # whose odds differ from the season file counts as new - confirm this
        # is intended.
        temp_seas_merged_df = pd.merge(season_df, fixtures, on=columns_to_check, how='outer', indicator=True)
        unique_rows = temp_seas_merged_df[temp_seas_merged_df['_merge'] == 'right_only'].drop('_merge', axis=1)

        season_df = pd.concat([season_df, unique_rows], ignore_index=True)
        season_df = season_df.drop_duplicates(subset=columns_to_check, keep='first')
    else:
        # Work on a copy: the groupby hands out slices of league_data.
        season_df = season_df.copy()

    # Home team statistics (home games only, within this season)
    season_df[['HGFH_m', 'HGCH_m']] = _prior_expanding_mean(season_df, 'HomeTeam', ['FTHG', 'FTAG'])
    # 3 / 1 / 0 points for FTR == 'H' / 'D' / anything else (missing FTR included).
    season_df['HPointsH'] = np.select(
        [season_df['FTR'] == 'H', season_df['FTR'] == 'D'], [3, 1], default=0
    )
    season_df['HPtsH_m'] = _prior_expanding_mean(season_df, 'HomeTeam', 'HPointsH')
    season_df[['HSH_m', 'HSTH_m']] = _prior_expanding_mean(season_df, 'HomeTeam', ['HS', 'HST'])

    # Away team statistics (away games only, within this season)
    season_df[['AGFA_m', 'AGCA_m']] = _prior_expanding_mean(season_df, 'AwayTeam', ['FTAG', 'FTHG'])
    season_df['APointsA'] = np.select(
        [season_df['FTR'] == 'A', season_df['FTR'] == 'D'], [3, 1], default=0
    )
    season_df['APtsA_m'] = _prior_expanding_mean(season_df, 'AwayTeam', 'APointsA')
    season_df[['ASA_m', 'ASTA_m']] = _prior_expanding_mean(season_df, 'AwayTeam', ['AS', 'AST'])

    season_frames.append(season_df)

seasonal_df = (
    pd.concat(season_frames).reset_index(drop=True) if season_frames else pd.DataFrame()
)

In [None]:
# Combine Date and Time into one timestamp (day-first dates). Unparseable or
# missing values become NaT because of errors='coerce'.
# `infer_datetime_format` was dropped: it is deprecated in pandas 2.x.
# NOTE(review): a row with a missing Time ends up as NaT here - confirm that
# load_data always provides a Time value.
seasonal_df['DateTime'] = pd.to_datetime(
    seasonal_df['Date'] + ' ' + seasonal_df['Time'],
    dayfirst=True,
    errors='coerce',
)
seasonal_df = seasonal_df.sort_values(by='DateTime').reset_index(drop=True)
# seasonal_df.tail(20)

In [None]:
# Rows without any result (FTHG, FTAG and FTR all missing), key columns only.
no_result_yet = (
    seasonal_df['FTHG'].isna()
    & seasonal_df['FTAG'].isna()
    & seasonal_df['FTR'].isna()
)
fixtures = seasonal_df.loc[no_result_yet, columns_to_check].copy()

In [None]:
# Display the fixtures frame.
fixtures

In [None]:
def _rolling_mean(values):
    """Mean over the last ``win_size`` rows (at least one required)."""
    return values.rolling(window=win_size, min_periods=1).mean()


def _ewm_mean(values):
    """Exponentially weighted mean with smoothing factor ``alpha``."""
    return values.ewm(alpha=alpha, adjust=False).mean()


def _lagged_by_team(team_col, value_cols, reducer):
    """Apply *reducer* per team to *value_cols* lagged by ``shift_val`` rows."""
    grouped = seasonal_df.groupby(team_col, group_keys=False)[value_cols]
    return grouped.apply(lambda x: reducer(x.shift(shift_val)))


# Rolling-window goals and points
seasonal_df[['HGFH_r', 'HGCH_r']] = _lagged_by_team('HomeTeam', ['FTHG', 'FTAG'], _rolling_mean)
seasonal_df[['AGFA_r', 'AGCA_r']] = _lagged_by_team('AwayTeam', ['FTAG', 'FTHG'], _rolling_mean)
seasonal_df['HPtsH_r'] = _lagged_by_team('HomeTeam', 'HPointsH', _rolling_mean)
seasonal_df['APtsA_r'] = _lagged_by_team('AwayTeam', 'APointsA', _rolling_mean)

# Exponentially weighted goals
seasonal_df[['HGFH_e', 'HGCH_e']] = _lagged_by_team('HomeTeam', ['FTHG', 'FTAG'], _ewm_mean)
seasonal_df[['AGFA_e', 'AGCA_e']] = _lagged_by_team('AwayTeam', ['FTAG', 'FTHG'], _ewm_mean)

# Exponentially weighted shots / shots on target
seasonal_df[['HSH_e', 'HSTH_e']] = _lagged_by_team('HomeTeam', ['HS', 'HST'], _ewm_mean)
seasonal_df[['ASA_e', 'ASTA_e']] = _lagged_by_team('AwayTeam', ['AS', 'AST'], _ewm_mean)

# The raw per-match points were only needed as input to the aggregates above.
seasonal_df = seasonal_df.drop(columns=['HPointsH', 'APointsA'])

In [None]:
# Every club appearing as home or away side, in alphabetical order.
home_sides = set(seasonal_df['HomeTeam'].unique())
away_sides = set(seasonal_df['AwayTeam'].unique())
teams = sorted(home_sides.union(away_sides))
# seasons = seasonal_df.groupby("Season")

In [None]:
# Venue-independent per-team statistics; each gets an H- and an A-prefixed column.
joint_cols = ["GF","GC","SF","SC","STF","STC","Pts"]

for col in joint_cols:
    # Initialised as float: these columns are later filled with float
    # averages, and writing floats into int columns triggers pandas'
    # incompatible-dtype deprecation warning.
    seasonal_df[f"H{col}"] = 0.0
    seasonal_df[f"A{col}"] = 0.0

In [None]:
# joint_cols = ["Team","GF","GC","SF","SC","STF","STC"]
# for season_label, season_df in seasons:
#     season_teams = sorted(set(season_df['HomeTeam'].unique()) | set(season_df['AwayTeam'].unique()))
#     for team in season_teams:
#         team_stats = pd.DataFrame(columns=joint_cols)
#         team_homes = season_df[season_df["HomeTeam"] == team][['HomeTeam','FTHG','FTAG','HS','AS','HST','AST']]
#         team_aways = season_df[season_df["AwayTeam"] == team][['AwayTeam','FTAG','FTHG','AS','HS','AST','HST']]
#         team_homes = team_homes.rename(columns=dict(zip(team_homes.columns, joint_cols)))
#         team_aways = team_aways.rename(columns=dict(zip(team_aways.columns, joint_cols)))

#         team_stats = pd.concat([team_homes, team_aways])
#         team_stats = team_stats.sort_index()
        
#         team_stats[['GF_m', 'GC_m']] = team_stats.groupby('Team', group_keys=False)[['GF', 'GC']].apply(lambda x: x.shift(shift_val).expanding().mean())
#         team_stats['Points'] = team_stats.apply(lambda row: 3 if row['GF'] > row['GC'] else 1 if row['GF'] == row['GC'] else 0, axis=1)
#         team_stats['Pts_m'] = team_stats.groupby('Team', group_keys=False)['Points'].apply(lambda x: x.shift(shift_val).expanding().mean())
    
    
#         print(team_stats)

In [None]:
# Column layouts that do not depend on the team, built once.
columns = ["Team", "Opponent"] + joint_cols
merge_cols = ['Team', 'GF_e', 'GC_e', 'SF_e', 'SC_e', 'STF_e', 'STC_e', 'Pts_r']
home_cols = [f"H{col}" for col in joint_cols]
away_cols = [f"A{col}" for col in joint_cols]

for team in teams:
    # Home and away games from the team's point of view: the renaming maps the
    # match columns onto Team / Opponent / GF / GC / SF / SC / STF / STC
    # (zip stops at the shorter list, so 'Pts' is not mapped here).
    team_homes = seasonal_df[seasonal_df["HomeTeam"] == team][['HomeTeam','AwayTeam','FTHG','FTAG','HS','AS','HST','AST']]
    team_aways = seasonal_df[seasonal_df["AwayTeam"] == team][['AwayTeam','HomeTeam','FTAG','FTHG','AS','HS','AST','HST']]
    team_homes = team_homes.rename(columns=dict(zip(team_homes.columns, columns)))
    team_homes['HomeTeam'] = team_homes['Team']
    team_homes['AwayTeam'] = team_homes['Opponent']
    team_aways = team_aways.rename(columns=dict(zip(team_aways.columns, columns)))
    # Fixed: these used to read from team_homes, whose index never overlaps
    # team_aways, so both columns came out entirely NaN.
    team_aways['AwayTeam'] = team_aways['Team']
    team_aways['HomeTeam'] = team_aways['Opponent']

    # Index labels are seasonal_df row labels, so sorting restores row order.
    team_stats = pd.concat([team_homes, team_aways]).sort_index()

    # 3 / 1 / 0 points for win / draw / otherwise.
    # NOTE(review): rows with missing goals compare False on both tests and so
    # score 0 points - confirm that is acceptable for unplayed fixtures.
    team_stats['Points'] = np.select(
        [team_stats['GF'] > team_stats['GC'], team_stats['GF'] == team_stats['GC']],
        [3, 1],
        default=0,
    )
    by_team = team_stats.groupby('Team', group_keys=False)
    team_stats['Pts_r'] = by_team['Points'].apply(lambda x: x.shift(shift_val).rolling(window=win_size, min_periods=1).mean())

    team_stats[['GF_r', 'GC_r']] = by_team[['GF', 'GC']].apply(lambda x: x.shift(shift_val).rolling(window=win_size, min_periods=1).mean())
    # Fixed: the `_r` shot columns were computed with ewm(), duplicating `_e`.
    team_stats[['SF_r', 'STF_r', 'SC_r', 'STC_r']] = by_team[['SF', 'STF', 'SC', 'STC']].apply(lambda x: x.shift(shift_val).rolling(window=win_size, min_periods=1).mean())

    team_stats[['GF_e', 'GC_e']] = by_team[['GF', 'GC']].apply(lambda x: x.shift(shift_val).ewm(alpha=alpha, adjust=False).mean())
    team_stats[['SF_e', 'STF_e', 'SC_e', 'STC_e']] = by_team[['SF', 'STF', 'SC', 'STC']].apply(lambda x: x.shift(shift_val).ewm(alpha=alpha, adjust=False).mean())

    merge_df = team_stats[merge_cols]

    # True on rows where the team was the home side.
    mask_home_stats = (merge_df['Team'] == team_stats['HomeTeam'])
    mask_away_stats = ~mask_home_stats

    # Write the values back positionally: merge_cols[1:] lines up one-to-one
    # with home_cols / away_cols (GF_e -> HGF, ..., Pts_r -> HPts).
    team_stats_home_idx = team_stats[mask_home_stats].index
    seasonal_df.loc[team_stats_home_idx, home_cols] = team_stats.loc[team_stats_home_idx, merge_cols[1:]].values

    team_stats_away_idx = team_stats[mask_away_stats].index
    seasonal_df.loc[team_stats_away_idx, away_cols] = team_stats.loc[team_stats_away_idx, merge_cols[1:]].values


In [None]:
def fill_nan_with_avg_last_4(series, window=None):
    """Fill missing values with the rolling mean of the preceding observations.

    Args:
        series: pandas Series to fill.
        window: rolling window length. When omitted, the notebook-level
            ``win_size`` is used, which keeps the previous behaviour.

    Returns:
        A new Series in which every NaN is replaced by the mean of the
        non-missing values in the window ending at that position. A NaN stays
        NaN when that window holds no value at all.
    """
    if window is None:
        window = win_size
    rolling_mean = series.rolling(window=window, min_periods=1).mean()
    return series.fillna(rolling_mean)

In [None]:
team_homes = seasonal_df.groupby("HomeTeam")
team_aways = seasonal_df.groupby("AwayTeam")

# Expanding-mean columns ('_m'), split by the side they describe. Previously
# every '_m' column was filled inside both loops, so an away-side column could
# be filled from rows grouped by the *home* club (i.e. from other clubs'
# values) and vice versa.
season_mean_cols = [col for col in seasonal_df.columns if '_m' in col]
home_mean_cols = [col for col in season_mean_cols if col.startswith('H')]
away_mean_cols = [col for col in season_mean_cols if col.startswith('A')]

for name, group in team_homes:
    seasonal_df.loc[group.index, home_mean_cols] = group[home_mean_cols].apply(fill_nan_with_avg_last_4)

for name, group in team_aways:
    seasonal_df.loc[group.index, away_mean_cols] = group[away_mean_cols].apply(fill_nan_with_avg_last_4)



In [None]:
# Columns allowed to stay empty; any other missing value discards the row.
no_drop_columns = result_cols + shot_cols + card_cols + ['FTGD']  # 'HTHG','HTAG','HTR',
required_cols = [col for col in seasonal_df.columns if col not in no_drop_columns]
features_df = seasonal_df.dropna(subset=required_cols).copy()

In [None]:
# Print a summary of features_df.
features_df.info()

In [None]:
# Short alias for the same DataFrame object; assignments land in features_df.
fd = features_df

# Attacking strength: own scoring form plus the opponent's conceding form.
fd['HoffStr'] = fd['HGF'] + fd['AGCA_e'] + fd['HGFH_e'] + fd['AGC']
fd['AoffStr'] = fd['AGF'] + fd['HGCH_e'] + fd['AGFA_e'] + fd['HGC']

# Home-minus-away differences for shots, shots on target and goals.
fd['ShDiff'] = fd['HSF'] + fd['HSH_e'] - fd['ASF'] - fd['ASA_e']
fd['ShtDiff'] = fd['HSTF'] + fd['HSTH_e'] - fd['ASTF'] - fd['ASTA_e']
fd['GFdiff'] = fd['HGF'] + fd['HGFH_e'] - fd['AGF'] - fd['AGFA_e']
fd['GCdiff'] = fd['HGC'] + fd['HGCH_e'] - fd['AGC'] - fd['AGCA_e']

# Composite per-side scores and their difference.
fd['PCH'] = fd['HGFH_e'] + fd['HSH_e'] + fd['AGCA_e']
fd['PCA'] = fd['AGFA_e'] + fd['ASA_e'] + fd['HGCH_e']
fd['PCdiff'] = fd['PCH'] - fd['PCA']

fd['Ptsdiff'] = fd['HPts'] - fd['APts']

In [None]:
# features_df['BTTS'] = features_df.apply(lambda row: 1 if row['FTHG'] > 0 and row['FTAG'] > 0 else 0, axis=1)

# features_df['2more1'] = features_df.apply(lambda row: 1 if (row['FTHG'] + row['FTAG'] - row['HTHG'] + row['HTAG']) > (row['HTHG'] + row['HTAG']) else 0, axis=1)

# features_df['over15'] = features_df.apply(lambda row: 1 if row['FTHG'] + row['FTAG'] > 1.5 else 0, axis=1)
# features_df['over25'] = features_df.apply(lambda row: 1 if row['FTHG'] + row['FTAG'] > 2.5 else 0, axis=1)
# features_df['over35'] = features_df.apply(lambda row: 1 if row['FTHG'] + row['FTAG'] > 3.5 else 0, axis=1)

# features_df['Hover05'] = features_df.apply(lambda row: 1 if row['FTHG'] > 0.5 else 0, axis=1)
# features_df['Hover15'] = features_df.apply(lambda row: 1 if row['FTHG'] > 1.5 else 0, axis=1)
# features_df['Hover25'] = features_df.apply(lambda row: 1 if row['FTHG'] > 2.5 else 0, axis=1)
# features_df['Hover35'] = features_df.apply(lambda row: 1 if row['FTHG'] > 3.5 else 0, axis=1)

# features_df['Aover05'] = features_df.apply(lambda row: 1 if row['FTAG'] > 0.5 else 0, axis=1)
# features_df['Aover15'] = features_df.apply(lambda row: 1 if row['FTAG'] > 1.5 else 0, axis=1)
# features_df['Aover25'] = features_df.apply(lambda row: 1 if row['FTAG'] > 2.5 else 0, axis=1)
# features_df['Aover35'] = features_df.apply(lambda row: 1 if row['FTAG'] > 3.5 else 0, axis=1)

In [None]:
# One 0/1 indicator column per full-time result code.
for outcome in ('H', 'D', 'A'):
    features_df[outcome] = (features_df['FTR'] == outcome).astype(int)

# Correlation of the home-side features with the home goals (FTHG).
home_view = ['HSH_m', 'HPtsH_m', 'HPts', 'HGF', 'HGFH_e', 'AGC',
             'HoffStr', 'HSH_e', 'HSF', 'PCH', 'FTHG']
home_corr = features_df[home_view].corr(numeric_only=True)
sns.heatmap(home_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()  # HPtsH_e

In [None]:
# Correlation of the away-side features with the away goals (FTAG).
# 'HGCH_e' was listed twice, which duplicated a row and column of the heatmap.
away_view = ['ASA_m', 'APtsA_m', 'APts', 'AGF', 'AGFA_e', 'HGCH_e',
             'HGC', 'ASA_e', 'ASF', 'PCA', 'FTAG']
sns.heatmap(features_df[away_view].corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
plt.show()  # APtsA_e

In [None]:
# Correlation of the difference features with the goal difference (FTGD).
diff_view = ['ShDiff', 'ShtDiff', 'GFdiff', 'GCdiff', 'PCdiff',
             'Ptsdiff', 'HoffStr', 'AoffStr', 'FTGD']
diff_corr = features_df[diff_view].corr(numeric_only=True)
sns.heatmap(diff_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()  # APtsA_e

In [None]:
# Any column whose name contains 'over'.
over_cols = [name for name in features_df.columns if 'over' in name]

# Descriptive (non-feature) columns carried alongside the model inputs.
game_infos = ['Div', 'DateTime', 'HomeTeam', 'AwayTeam'] + result_cols + card_cols  # 'BTTS','2more1']

# Model inputs: home-side block followed by the mirrored away-side block.
home_inputs = ['HGF', 'HGFH_e', 'AGC', 'AGCA_e', 'HSH_e', 'HSF', 'HPts']  # 'HPtsH_m', 'HPts', 'PCH',
away_inputs = ['AGF', 'AGFA_e', 'HGC', 'HGCH_e', 'ASA_e', 'ASF', 'APts']  # 'APtsA_m', 'APts','PCA',
#    'ShtDiff','GFdiff'#'ShDiff','HoffStr', 'AoffStr',
input_features = home_inputs + away_inputs

# input_features  = ['HoffStr','HGFH_e','HSF', #'HPtsH_m', 'HPts', 'PCH',
#                    'AoffStr','AGFA_e','ASF', #'APtsA_m', 'APts','PCA',
#                    'ShDiff','ShtDiff', 'GFdiff'#,'GCdiff'
#                    ]

In [None]:
selected_features = game_infos + over_cols + input_features + odds

# Independent copy restricted to the selected columns.
data = features_df[selected_features].copy()

# Integer target: draw -> 0, home win -> 1, away win -> 2.
result_codes = {'H': 1, 'D': 0, 'A': 2}
data['R'] = data['FTR'].map(result_codes)


In [None]:
# Name of the label column created from FTR.
target = 'R'

# Requested test-set size handed to split_data.
test_size = 311#len(leagues_keys)*36

# Split into train / test / prediction parts with their metadata, then scale the three feature sets.
X_train,y_train,meta_train,X_test,y_test,meta_test,X_pred,meta_pred = split_data(data,fixtures,test_size,game_infos,over_cols,odds,target, no_red = True)
X_train_scaled,X_test_scaled,X_pred_scaled = scale_datasets(X_train,X_test,X_pred)

In [None]:
# Replace the requested size by the number of test labels actually returned.
test_size = len(y_test)
test_size

In [None]:
num_classes = len(np.unique(y_train))

# One-hot targets, built once and reused for fitting, validation and evaluation.
y_train_onehot = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_onehot = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Two ReLU hidden layers followed by a softmax over the result classes.
network_layers = [
    layers.Dense(2**4, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(2**5, activation='relu'),
    # layers.Dense(2**3, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
]
model = keras.Sequential(network_layers)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(
    X_train_scaled,
    y_train_onehot,
    epochs=35,
    batch_size=2**5,
    validation_data=(X_test_scaled, y_test_onehot),
)

loss, accuracy = model.evaluate(X_test_scaled, y_test_onehot)
print(f'Loss on Test Set: {loss}')
print(f'Accuracy on Test Set: {accuracy}')


# num_classes = 1  # For binary classification, you only need one output neuron

# model = keras.Sequential([
#     layers.Dense(2**5, activation='tanh', input_shape=(X_train.shape[1],)),
#     layers.Dense(2**4, activation='gelu'),
#     layers.Dense(num_classes, activation='sigmoid')
# ])

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(X_train_scaled, y_train, epochs=50, batch_size=2**5, validation_data=(X_test_scaled, y_test))

# loss, accuracy = model.evaluate(X_test_scaled, y_test)
# print(f'Loss on Test Set: {loss}')
# print(f'Accuracy on Test Set: {accuracy}')

In [None]:
# first_layer_weights = model.layers[0].get_weights()[0]

# feature_importances = np.sum(np.abs(first_layer_weights), axis=1)

# feature_importances /= np.sum(feature_importances)

# sorted_indices = np.argsort(feature_importances)[::-1]

# sorted_feature_names = [input_features[i] for i in sorted_indices]

# plt.figure(figsize=(10, 6))
# plt.bar(range(X_train.shape[1]), feature_importances[sorted_indices])
# plt.xticks(range(X_train.shape[1]), sorted_feature_names, rotation=45, ha='right')  # Rotate feature names for better visibility
# plt.xlabel('Feature')
# plt.ylabel('Normalized Importance')
# plt.title('Feature Importances in the First Layer')
# plt.tight_layout()
# plt.show()

In [None]:
ann_probabilities = model.predict(X_pred_scaled)

# Column index equals the class code of 'R': 0 = draw, 1 = home, 2 = away.
probd_ann, probh_ann, proba_ann = (ann_probabilities[:, idx] for idx in range(3))

# result_df_ann = table(probh_ann,probd_ann,proba_ann,meta_test,meta_pred,min_entropy,max_entropy)
# confusion_classification(result_df_ann, test_size)

In [None]:
# ann_bet_df = table_bet(result_df_ann,bet_amount_base)

# total_hypothetical_winnings = ann_bet_df['result'].head(test_size).sum()
# print(f"Total Hypothetical Winnings: {round(total_hypothetical_winnings,2)} euros over {test_size} games (with total bet : {ann_bet_df['base_bet'].sum()}€)")

In [None]:
# pred_table = table_pred(result_df_ann,len(fixtures))
# np.round(pred_table.head(10),2)

In [None]:
# np.round(pred_table.sort_values('Odd').head(3),2)

In [None]:
# pred_table_sorted = np.round(pred_table.sort_values('Bprob',ascending = False).head(3),2)
# pred_table_sorted

In [None]:
# pred_table_sorted.base_bet.sum()

-------

### MLP

In [None]:
def mlp_model_fit(X_train_scaled,y_train):
    """Grid-search an MLPClassifier and return the best fitted estimator.

    Args:
        X_train_scaled: training feature matrix.
        y_train: training labels.

    Returns:
        The ``best_estimator_`` of the grid search, already fitted on the
        full training data.
    """
    mlp_model = MLPClassifier(activation='relu',
                              max_iter=800,
                              random_state=42)

    # Parameter grid for the MLP search (currently a single architecture).
    param_grid = {'hidden_layer_sizes': [(2**4, 2**3)]}  # (2**3, 2**3),(2**5, 2**3),

    # 5-fold cross-validated search scored on accuracy.
    grid_search_mlp = GridSearchCV(mlp_model, param_grid, cv=5, scoring='accuracy', verbose=3, return_train_score=True)
    grid_search_mlp.fit(X_train_scaled, y_train)

    print("Best Parameters for MLP:")
    print(grid_search_mlp.best_params_)

    # GridSearchCV refits the best configuration on the whole training set
    # (refit=True is the default), so a second fit() would only repeat the
    # same training run.
    return grid_search_mlp.best_estimator_

In [None]:
mlp_model = mlp_model_fit(X_train_scaled, y_train)
mlp_probabilities = mlp_model.predict_proba(X_pred_scaled)

# The first test_size rows are scored against y_test; the most probable
# column index is used as the predicted label.
mlp_test_labels = np.argmax(mlp_probabilities[:test_size], axis=1)
accuracy_mlp = accuracy_score(y_test, mlp_test_labels)
print(f'Accuracy on Test Set (MLP): {accuracy_mlp}')

In [None]:
# Column index equals the class code of 'R': 0 = draw, 1 = home, 2 = away.
probd_mlp, probh_mlp, proba_mlp = (mlp_probabilities[:, idx] for idx in range(3))

# result_df_mlp = table(probh_mlp,probd_mlp,proba_mlp,meta_test,meta_pred,min_entropy,max_entropy)
# confusion_classification(result_df_mlp, test_size)

In [None]:
# mlp_bet_df = table_bet(result_df_mlp,bet_amount_base)

# total_hypothetical_winnings = mlp_bet_df['result'].head(test_size).sum()
# print(f"Total Hypothetical Winnings: {round(total_hypothetical_winnings,2)} euros over {test_size} games (with total bet : {mlp_bet_df['base_bet'].sum()}€)")

In [None]:
# pred_table = table_pred(result_df_mlp,len(fixtures))
# np.round(pred_table.head(10),2)

---

### Random Forest Classifier

In [None]:
# Define the hyperparameters for Random Forest
params = {
    'n_estimators':90,
    'criterion': 'gini',
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,'max_features': 'sqrt','max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'oob_score': False,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 0,'warm_start': False, 'class_weight': None,
}

# Create the Random Forest classifier
rfc = RandomForestClassifier(**params)

# Create the parameter grid for grid search
param_grid = {
    'max_depth': [20]
}

# Perform grid search to find the best hyperparameters for Random Forest
grid_search_rfc = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=1, return_train_score=True)
grid_search_rfc.fit(X_train_scaled, y_train)

# Train the model on the best hyperparameters found using grid search
best_rfc = grid_search_rfc.best_estimator_


In [None]:
# Print the best parameters
print("Best Parameters for RFC:")
print(grid_search_rfc.best_params_)

best_rfc.fit(X_train_scaled, y_train)

calibrated_model = CalibratedClassifierCV(best_rfc, cv='prefit', method='isotonic')
calibrated_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
predictions_rfc = calibrated_model.predict(X_test_scaled)

# Calculate accuracy on the test set
accuracy_rfc = accuracy_score(y_test, predictions_rfc)
print("Accuracy on test set: ", accuracy_rfc)

In [None]:
# Calibrated class probabilities for the prediction feature set.
rfc_probabilities = calibrated_model.predict_proba(X_pred_scaled)

In [None]:
# Column index equals the class code of 'R': 0 = draw, 1 = home, 2 = away.
probh_rfc = rfc_probabilities[:, 1]
probd_rfc = rfc_probabilities[:, 0]
proba_rfc = rfc_probabilities[:, 2]

--------------

### Ensemble Learning

In [None]:
scaler = MinMaxScaler()
coef = 2

# Sum of the three models' probabilities raised to `coef`, per outcome.
probh_square = probh_mlp**coef + probh_rfc**coef + probh_ann**coef
probd_square = probd_mlp**coef + probd_rfc**coef + probd_ann**coef
proba_square = proba_mlp**coef + proba_rfc**coef + proba_ann**coef

# Column order here is (away, draw, home); each column is min-max scaled.
combined_square = np.column_stack((proba_square, probd_square, probh_square))
combined_square = scaler.fit_transform(combined_square)

# Bookmaker implied probabilities (1 / decimal odds) for the last
# test_size + len(fixtures) rows of features_df.
# NOTE(review): this assumes those rows line up with the rows of X_pred -
# confirm against split_data before using combined_books as a model input.
bookies_rows = features_df[-(test_size+len(fixtures)):]
bookiesh = 1/bookies_rows['PSH']
bookiesd = 1/bookies_rows['PSD']
bookiesa = 1/bookies_rows['PSA']
# Fixed: this used to stack the *_square arrays again, so the bookmaker
# columns computed above were never used.
combined_books = np.column_stack((bookiesa, bookiesd, bookiesh))
combined_books = MinMaxScaler().fit_transform(combined_books)

# square_preds = np.argmax(square_norms, axis=1)

In [None]:
# Meta-features: ANN, MLP and RFC probability columns followed by the scaled squared-sum columns.
preds_test = np.column_stack((ann_probabilities, mlp_probabilities, rfc_probabilities,combined_square))#combined_books

In [None]:
# Number of most recent test rows used for the "recent" accuracy figure.
acc_last = len(leagues_keys)*20

# Meta-model fitted on the first test_size rows of the stacked predictions.
# NOTE(review): the accuracies below are computed on the same rows and labels
# (y_test) the meta-model was fitted on, so they are in-sample figures.
lr_model = LogisticRegression()
lr_model.fit(preds_test[:test_size], y_test)

# Predictions for every stacked row, not only the first test_size ones.
logis_preds = lr_model.predict(preds_test)
logis_proba = lr_model.predict_proba(preds_test)

# Column index equals the class code of 'R': 1 = home, 0 = draw, 2 = away.
probh = logis_proba[:,1]
probd = logis_proba[:,0]
proba = logis_proba[:,2]

accuracy_lr = accuracy_score(y_test, logis_preds[:test_size])
# Accuracy restricted to the last acc_last rows of the test part.
accuracy_last10 = accuracy_score(y_test[-acc_last:], logis_preds[test_size-acc_last:test_size])

f"accuracy last {test_size} : {accuracy_lr}, accuracy last {acc_last} : {accuracy_last10}"

In [None]:
# Threshold passed to the margin / double-chance accuracy helpers.
threshold_value = 0.055

In [None]:
# Meta-model probabilities for the first test_size rows, shared by the three metrics.
test_proba = logis_proba[:test_size]

df_margin, accuracy_margin = custom_accuracy_margin(y_test, test_proba, threshold=threshold_value)
df_double_chance_strict, accuracy_double = accuracy_double_strict(y_test, test_proba, threshold=threshold_value)
df_double_chance_margin, accuracy_double_marg = accuracy_double_margin(y_test, test_proba, threshold=threshold_value)

print(f"accuracy_margin with threshold {threshold_value}: {accuracy_margin}")
print(f"accuracy_double without margin {threshold_value}: {accuracy_double}")
print(f"accuracy_double_margin with threshold {threshold_value}: {accuracy_double_marg}")


In [None]:
# condition = (df['Diff Condition 1'] == True) | (df['Diff Condition 2'] == True)
# condition = (df['True Class'] == df['Second Prediction'])
# condition = (df['True Class'] != df['Predicted Class']) & (df['True Class'] != df['Second Prediction'])

# df[condition]

In [None]:
# Coefficients of the first row of coef_ only, i.e. those for lr_model.classes_[0].
coefficients = lr_model.coef_[0]

# Names follow the column order of preds_test. Each model's probability block
# is ordered by class code (0 = draw, 1 = home, 2 = away), and combined_square
# is stacked as (away, draw, home). The previous labels assumed
# (home, draw, away) everywhere, which mislabelled the home and draw columns
# of each model and the home and away columns of the squared block.
feature_names = ['annd', 'annh', 'anna',
                 'mlpd', 'mlph', 'mlpa',
                 'rfcd', 'rfch', 'rfca',
                 'squarea', 'squared', 'squareh']

# Associate feature names with their coefficients
feature_coefficients = dict(zip(feature_names, coefficients))

# Most influential (largest absolute coefficient) first
sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

# Display the sorted features and their coefficients
for feature, coefficient in sorted_features:
    print(f"{feature}: {coefficient}")

In [None]:
# Build the result table from the meta-model probabilities and keep an independent copy.
result = table(probh,probd,proba,meta_test,meta_pred,min_entropy,max_entropy).copy()

In [None]:
# Last ten rows of the result table, rounded to two decimals.
np.round(result.tail(10),2)

In [None]:
# Run the confusion/classification helper on the result table.
confusion_classification(result, test_size)

In [None]:
result_bet_df = table_bet(result, bet_amount_base)

# Totals over the first test_size rows only.
test_bets = result_bet_df[:test_size]
total_hypothetical_winnings = result_bet_df['result'].head(test_size).sum()
games_count = len(test_bets)
total_staked = test_bets['base_bet'].sum()
print(f"Total Hypothetical Winnings: {round(total_hypothetical_winnings,2)} euros over {games_count} games (with total bet : {total_staked}€)")

In [None]:
pred_table = table_pred(result, len(meta_pred))
pred_table['|'] = "|"  # visual separator column

# Reciprocal of each outcome column, stored as the matching "odd" column.
for outcome_col, odd_col in (('Home', 'Hodd'), ('Draw', 'Dodd'), ('Away', 'Aodd')):
    pred_table[odd_col] = 1 / pred_table[outcome_col]

# Show the first half here; the second half is shown in the next cell.
size_table = len(pred_table)
np.round(pred_table.head(size_table // 2), 2)

In [None]:
# Last size_table//2 rows of the prediction table, rounded to two decimals.
np.round(pred_table.tail(size_table//2),2)

--clermont n2
--metz n2
--everton
