In [1]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from Config import HYPERPARAETERS
import pandas as pd

# Constants of the Elo model used by the training / validation / test cells.
# Base of the exponential in the expected score: 1 / (1 + c**(rating_diff / d)).
c = 10
# Divisor applied to the rating difference before it is used as the exponent.
d = 400
# Step size: each update adds k * (actual_score - expected_score) to a rating.
k = 40
# Initial rating given to every team when the rating table is created.
R0 = 0

In [2]:
DATA_COLUMNS = ['league', 'season', 'week', 'home_team', 'away_team', 'result']

# Read only the columns the Elo pipeline uses; `week` is the single integer
# column, every other column is read as a string.
dataset: pd.DataFrame = pd.read_csv(
    'data/KaggleDataset_withBO.csv',
    encoding='utf-8',
    usecols=DATA_COLUMNS,
    dtype={column: (int if column == 'week' else str) for column in DATA_COLUMNS},
)

# One boolean mask per outcome label, in the order win / tie / loss.
conds = [dataset['result'] == outcome for outcome in ('win', 'tie', 'loss')]
choices = [1, 0.5, 0]

# Actual score per side: 'win' maps to alphaH = 1, 'tie' to 0.5, 'loss' to 0,
# and alphaA receives the mirrored value. Rows matching none of the masks fall
# back to np.select's default of 0 in both columns.
dataset['alphaH'] = np.select(conds, choices)
dataset['alphaA'] = np.select(conds, list(reversed(choices)))

# Rating table: one entry per team that appears as a home or away side, all
# starting at R0. The Series is created as float because every Elo update
# writes fractional values; with an integer R0 the Series would otherwise be
# int64 and the first update would depend on an implicit int -> float upcast,
# which recent pandas versions deprecate.
Elo_Scores = pd.Series(
    R0,
    index=np.unique(dataset[['home_team', 'away_team']]),
    dtype=float,
)


# For every league, the ordered list of (season, week) pairs that occur in the
# data. Order follows groupby's sorted keys: by season first, then by week.
week_lists = {
    league: [
        (season, week)
        for season, season_df in league_df.groupby('season')
        for week, _ in season_df.groupby('week')
    ]
    for league, league_df in dataset.groupby('league')
}


# Train / validation / test split, expressed as positions into week_lists[league].
# The last TestPortion of a league's weeks is the test split, the
# ValidationPortion before it is the eval split, and everything earlier is train.
test_share = HYPERPARAETERS.TestPortion.value
held_out_share = HYPERPARAETERS.TestPortion.value + HYPERPARAETERS.ValidationPortion.value

tvt_indcs = {}
for league, league_weeks in week_lists.items():
    n_weeks = len(league_weeks)
    eval_start = int((1 - held_out_share) * n_weeks)  # first eval position
    test_start = int((1 - test_share) * n_weeks)      # first test position
    tvt_indcs[league] = {
        'test': list(range(test_start, n_weeks)),
        'eval': list(range(eval_start, test_start)),
        'train': list(range(0, eval_start)),
    }

In [3]:
# Report, per league, the row range covered by each split and its size, then
# the sizes summed over all leagues.
# The size is computed as (last row - first row + 1) on the re-indexed league
# frame, i.e. it treats each split's rows as one contiguous run.
split_names = ['train', 'eval', 'test']
total = [0 for _ in split_names]
for league, league_df in dataset.groupby('league'):
    league_df = league_df.reset_index(drop=True)
    print(f'{league}')
    indcs = tvt_indcs[league]
    league_weeks = week_lists[league]
    season_week = league_df[['season', 'week']]
    for i, mode in enumerate(split_names):
        first_week = league_weeks[indcs[mode][0]]
        last_week = league_weeks[indcs[mode][-1]]
        # First row of the split's first week, last row of its last week.
        start = league_df.index[(season_week == first_week).all(axis=1).to_numpy()][0]
        end = league_df.index[(season_week == last_week).all(axis=1).to_numpy()][-1]
        n_rows = end - start + 1
        print(f'\t {mode}: [{start}, {end}] - count: {n_rows}')
        total[i] += n_rows
print(f'Overal')
for mode, n_rows in zip(split_names, total):
    print(f'\t {mode} count: {n_rows}')

Belgium Jupiler League
	 train: [0, 902] - count: 903
	 eval: [903, 1041] - count: 139
	 test: [1042, 1180] - count: 139
England Premier League
	 train: [0, 2353] - count: 2354
	 eval: [2354, 2647] - count: 294
	 test: [2648, 2953] - count: 306
France Ligue 1
	 train: [0, 2289] - count: 2290
	 eval: [2290, 2573] - count: 284
	 test: [2574, 2854] - count: 281
Germany 1. Bundesliga
	 train: [0, 1875] - count: 1876
	 eval: [1876, 2115] - count: 240
	 test: [2116, 2367] - count: 252
Italy Serie A
	 train: [0, 2160] - count: 2161
	 eval: [2161, 2429] - count: 269
	 test: [2430, 2703] - count: 274
Netherlands Eredivisie
	 train: [0, 1600] - count: 1601
	 eval: [1601, 1809] - count: 209
	 test: [1810, 2018] - count: 209
Portugal Liga ZON Sagres
	 train: [0, 906] - count: 907
	 eval: [907, 1059] - count: 153
	 test: [1060, 1229] - count: 170
Scotland Premier League
	 train: [0, 1213] - count: 1214
	 eval: [1214, 1370] - count: 157
	 test: [1371, 1528] - count: 158
Spain LIGA BBVA
	 train: [0, 

In [4]:
# Share of win / tie / loss results in the test split, per league and pooled
# over all leagues.
outcome_labels = ('win', 'tie', 'loss')
count = [0] * 3
total = 0
for league, league_df in dataset.groupby('league'):
    index_dict = tvt_indcs[league]
    league_weeks = week_lists[league]
    season_week = league_df[['season', 'week']]
    league_counts = [0] * 3
    league_total = 0
    for idx in index_dict['test']:
        week_mask = (season_week == league_weeks[idx]).all(axis=1)
        week_df = league_df.loc[week_mask, :]
        for slot, label in enumerate(outcome_labels):
            league_counts[slot] += (week_df['result'] == label).sum().item()
        league_total += len(week_df)
    # Fold this league's tallies into the pooled tallies.
    for slot, n_matches in enumerate(league_counts):
        count[slot] += n_matches
    total += league_total
    shares = [n_matches / league_total for n_matches in league_counts]
    print(f'{league} - win: {shares[0]: .4f} - tie: {shares[1]: .4f} - loss: {shares[2]: .4f}')
    print('='*50)
overall = [n_matches / total for n_matches in count]
print(f'Overal - win: {overall[0]: .4f} - tie: {overall[1]: .4f} - loss: {overall[2]: .4f}')


Belgium Jupiler League - win:  0.4604 - tie:  0.2302 - loss:  0.3094
England Premier League - win:  0.4281 - tie:  0.2810 - loss:  0.2908
France Ligue 1 - win:  0.4377 - tie:  0.2847 - loss:  0.2776
Germany 1. Bundesliga - win:  0.4325 - tie:  0.2460 - loss:  0.3214
Italy Serie A - win:  0.4416 - tie:  0.2591 - loss:  0.2993
Netherlands Eredivisie - win:  0.4306 - tie:  0.2249 - loss:  0.3445
Portugal Liga ZON Sagres - win:  0.4176 - tie:  0.2294 - loss:  0.3529
Scotland Premier League - win:  0.4430 - tie:  0.1899 - loss:  0.3671
Spain LIGA BBVA - win:  0.4965 - tie:  0.2326 - loss:  0.2708
Overal - win:  0.4439 - tie:  0.2475 - loss:  0.3086


In [5]:
#Train
# Replay every training week, league by league in week_lists order, and apply
# the Elo update to the shared rating table.
#
# The table is rebuilt at R0 first, as floats, for two reasons:
#   * re-running this cell would otherwise apply the training updates a second
#     time on top of already-updated ratings;
#   * the updates below are fractional, so the table must not be integer-typed.
Elo_Scores = pd.Series(float(R0), index=Elo_Scores.index)

for league, league_df in dataset.groupby('league'):
    index_dict = tvt_indcs[league]
    season_week = league_df[['season', 'week']]  # loop-invariant column view
    for idx in index_dict['train']:
        week_df = league_df.loc[np.all(season_week == week_lists[league][idx], axis=1), :]

        # Ratings going into this week, aligned row-by-row with week_df.
        currentH_elo = Elo_Scores[week_df['home_team']].to_numpy()
        currentA_elo = Elo_Scores[week_df['away_team']].to_numpy()

        # Expected score of each side given the rating difference.
        expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
        expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

        #Updating
        # Both sides are updated from the pre-week ratings read above.
        # NOTE(review): this assumes a team appears at most once per week; with
        # repeated labels only one of the assigned values would survive -- confirm.
        Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
        Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))

In [6]:
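# #Validation -- earlier variant, kept commented out: it selects a separate threshold per league (DataFrame indexed by league), whereas the active cell below selects one threshold for all leagues.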
# chosen_threshhold = pd.DataFrame(
#     {'thresh': 0, 'maxAcc': 0},
#     index=dataset['league'].unique()
# )

# for league, league_df in dataset.groupby('league'):
#     print(f'============== Validating over league: {league} ==============')
#     index_dict = tvt_indcs[league]
#     for t in [0.01, 0.03, 0.1, 0.3]:
#         tmp_Elo_Scores = Elo_Scores.copy()
#         correct, total = 0, 0
#         for idx in index_dict['eval']:
#             week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#             currentH_elo = tmp_Elo_Scores[week_df['home_team']].to_numpy()
#             currentA_elo = tmp_Elo_Scores[week_df['away_team']].to_numpy()
#             expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#             expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

#             out = (expectedH - expectedA)
#             conditions = [
#                 (np.abs(out) <= t),
#                 (out > t),
#                 (out < -t)
#             ]
#             choices = [0.5, 1, 0]
#             predictions = np.select(conditions, choices)
#             result = week_df['alphaH'].to_numpy()
#             correct += (predictions == result).sum()
#             total += result.shape[0]

#             #Updating
#             tmp_Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#             tmp_Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))
#         current_acc = correct / total
#         print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Validation Accuracy: {correct / total: .3f}')
#         if current_acc >= chosen_threshhold.loc[league, 'maxAcc']:
#             chosen_threshhold.loc[league, 'maxAcc'] = current_acc
#             chosen_threshhold.loc[league, 'thresh'] = t




# for league, league_df in dataset.groupby('league'):
#     index_dict = tvt_indcs[league]
#     for idx in index_dict['eval']:
#         week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#         currentH_elo = Elo_Scores[week_df['home_team']].to_numpy()
#         currentA_elo = Elo_Scores[week_df['away_team']].to_numpy()
#         expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#         expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))
        
#         #Updating
#         Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#         Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))


# print(f'{"#"*25} Testing {"#"*25}')

# #Test
# t_correct, t_total = 0, 0
# for league, league_df in dataset.groupby('league'):
#     print(f'============== Testing over league: {league} ==============')
#     index_dict = tvt_indcs[league]
#     t = chosen_threshhold.loc[league, 'thresh']
#     correct, total = 0, 0
#     for idx in index_dict['test']:
#         week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#         currentH_elo = Elo_Scores[week_df['home_team']].to_numpy()
#         currentA_elo = Elo_Scores[week_df['away_team']].to_numpy()
#         expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#         expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

#         out = (expectedH - expectedA)
#         conditions = [
#             (np.abs(out) <= t),
#             (out > t),
#             (out < -t)
#         ]
#         choices = [0.5, 1, 0]
#         predictions = np.select(conditions, choices)
#         result = week_df['alphaH'].to_numpy()
#         correct += (predictions == result).sum()
#         total += result.shape[0]

#         #Updating
#         Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#         Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))
#     current_acc = correct / total
#     t_correct += correct
#     t_total += total
#     print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Test Accuracy: {correct / total: .3f}')
# print(f'Overal Accuracy: {t_correct / t_total: .3f}')

In [7]:
#Validation
# Select a single decision threshold shared by every league (an earlier variant
# kept one threshold per league in a DataFrame indexed by league name), then
# replay the eval weeks on the real ratings and finally score the test weeks.
#
# NOTE: the eval replay and the test loop update Elo_Scores in place, so this
# cell is not idempotent -- rebuild and retrain the ratings before running it
# a second time.


def _rows_for_week(league_df, season_week):
    """Return the rows of ``league_df`` whose (season, week) equals ``season_week``."""
    return league_df.loc[np.all(league_df[['season', 'week']] == season_week, axis=1), :]


def _play_week(scores, week_df, thresh=None):
    """Replay one week of matches against ``scores`` and update it in place.

    Parameters
    ----------
    scores : pd.Series
        Ratings indexed by team name; modified in place.
    week_df : pd.DataFrame
        Matches of one week with ``home_team``, ``away_team``, ``alphaH`` and
        ``alphaA`` columns.
    thresh : float, optional
        When given, each match is also predicted from the pre-update ratings:
        tie (0.5) if ``|expected_home - expected_away| <= thresh``, home win (1)
        if the difference exceeds ``thresh``, home loss (0) if it is below
        ``-thresh``. When omitted, no prediction is made.

    Returns
    -------
    tuple
        ``(correct, played)``: number of predictions equal to ``alphaH`` (0 when
        ``thresh`` is omitted) and number of matches in ``week_df``.
    """
    home_elo = scores[week_df['home_team']].to_numpy()
    away_elo = scores[week_df['away_team']].to_numpy()
    expected_home = 1 / (1 + c**((away_elo - home_elo) / d))
    expected_away = 1 / (1 + c**((home_elo - away_elo) / d))
    actual_home = week_df['alphaH'].to_numpy()

    n_correct = 0
    if thresh is not None:
        out = expected_home - expected_away
        predictions = np.select(
            [np.abs(out) <= thresh, out > thresh, out < -thresh],
            [0.5, 1, 0],
        )
        n_correct = (predictions == actual_home).sum()

    #Updating
    scores.loc[week_df['home_team']] = home_elo + (k * (actual_home - expected_home))
    scores.loc[week_df['away_team']] = away_elo + (k * (week_df['alphaA'].to_numpy() - expected_away))
    return n_correct, actual_home.shape[0]


chosen_threshhold = {'thresh': 0, 'maxAcc': 0}

for t in [0.01, 0.03, 0.1, 0.3]:
    print(f'============== Validating over threshhold: {t} ==============')
    correct, total = 0, 0
    for league, league_df in dataset.groupby('league'):
        # Score on a throwaway copy so that trying a threshold never changes
        # the real ratings.
        tmp_Elo_Scores = Elo_Scores.copy()
        for idx in tvt_indcs[league]['eval']:
            week_df = _rows_for_week(league_df, week_lists[league][idx])
            week_correct, week_total = _play_week(tmp_Elo_Scores, week_df, thresh=t)
            correct += week_correct
            total += week_total
    current_acc = correct / total
    print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Validation Accuracy: {correct / total: .3f}')
    # `>=` means that on equal accuracy the later threshold in the list wins.
    if current_acc >= chosen_threshhold['maxAcc']:
        chosen_threshhold['maxAcc'] = current_acc
        chosen_threshhold['thresh'] = t


# Fold the eval weeks into the real ratings before testing.
for league, league_df in dataset.groupby('league'):
    for idx in tvt_indcs[league]['eval']:
        _play_week(Elo_Scores, _rows_for_week(league_df, week_lists[league][idx]))


print(f'{"#"*25} Testing {"#"*25}')


#Test
# Each test week is predicted from the ratings as they stand before that week,
# and the ratings are then updated with the week's results.
t_correct, t_total = 0, 0
t = chosen_threshhold['thresh']
for league, league_df in dataset.groupby('league'):
    print(f'============== Testing over league: {league} ==============')
    correct, total = 0, 0
    for idx in tvt_indcs[league]['test']:
        week_df = _rows_for_week(league_df, week_lists[league][idx])
        week_correct, week_total = _play_week(Elo_Scores, week_df, thresh=t)
        correct += week_correct
        total += week_total
    t_correct += correct
    t_total += total
    print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Test Accuracy: {correct / total: .3f}')
print(f'Overal Accuracy: {t_correct / t_total: .3f}')

Threshhold: 0.01 - (Correct, Total): (997, 2021) - Validation Accuracy:  0.493
Threshhold: 0.03 - (Correct, Total): (1000, 2021) - Validation Accuracy:  0.495
Threshhold: 0.1 - (Correct, Total): (974, 2021) - Validation Accuracy:  0.482
Threshhold: 0.3 - (Correct, Total): (877, 2021) - Validation Accuracy:  0.434
######################### Testing #########################
Threshhold: 0.03 - (Correct, Total): (59, 139) - Test Accuracy:  0.424
Threshhold: 0.03 - (Correct, Total): (140, 306) - Test Accuracy:  0.458
Threshhold: 0.03 - (Correct, Total): (123, 281) - Test Accuracy:  0.438
Threshhold: 0.03 - (Correct, Total): (114, 252) - Test Accuracy:  0.452
Threshhold: 0.03 - (Correct, Total): (131, 274) - Test Accuracy:  0.478
Threshhold: 0.03 - (Correct, Total): (110, 209) - Test Accuracy:  0.526
Threshhold: 0.03 - (Correct, Total): (86, 170) - Test Accuracy:  0.506
Threshhold: 0.03 - (Correct, Total): (83, 158) - Test Accuracy:  0.525
Threshhold: 0.03 - (Correct, Total): (153, 288) - Te

In [6]:
chosen_threshhold

Unnamed: 0,thresh,maxAcc
Belgium Jupiler League,0.01,0.494505
England Premier League,0.03,0.52562
France Ligue 1,0.1,0.467797
Germany 1. Bundesliga,0.01,0.518595
Italy Serie A,0.1,0.5
Netherlands Eredivisie,0.01,0.477273
Portugal Liga ZON Sagres,0.03,0.510714
Scotland Premier League,0.03,0.498423
Spain LIGA BBVA,0.03,0.487179


In [7]:
Elo_Scores

AC Ajaccio                 -117.018918
AC Arles-Avignon           -139.381876
ADO Den Haag                 38.631501
AJ Auxerre                  -62.481813
AS Monaco                   173.380321
                               ...    
Wigan Athletic              -21.629226
Willem II                   -73.063492
Wolverhampton Wanderers    -149.312759
Xerez Club Deportivo        -62.237358
Évian Thonon Gaillard FC    -57.569419
Length: 254, dtype: float64