In [1]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from Config import HYPERPARAETERS
import pandas as pd

#Constants
# Parameters of the Elo model used by the rating cells in this notebook.
# c: base of the power in the expected-score formula 1 / (1 + c**(rating_diff / d)).
c = 10
# d: divisor applied to the rating difference inside that exponent.
d = 400
# k: step size multiplying (actual score - expected score) in every rating update.
k = 40
# R0: rating every team starts from.
R0 = 0

In [2]:
# Columns read from the CSV. Every column is loaded as text except 'week',
# which is loaded as an integer.
DATA_COLUMNS = ['league', 'season', 'week', 'home_team', 'away_team', 'result']
column_dtypes = {name: (int if name == 'week' else str) for name in DATA_COLUMNS}
dataset: pd.DataFrame = pd.read_csv(
    'data/KaggleDataset_withBO.csv',
    encoding='utf-8',
    usecols=DATA_COLUMNS,
    dtype=column_dtypes,
)

# One boolean mask per outcome label, in the order win / tie / loss.
conds = [dataset['result'] == outcome for outcome in ('win', 'tie', 'loss')]
choices = [1, 0.5, 0]

# alphaH is the score credited to the home side (1 / 0.5 / 0 for win / tie / loss);
# alphaA uses the mirrored values for the away side. Rows whose 'result' matches
# none of the masks receive np.select's default of 0 in both columns.
dataset['alphaH'] = np.select(conds, choices)
dataset['alphaA'] = np.select(conds, list(reversed(choices)))

# One rating per team (every name that appears as home or away side), all
# starting at R0. The Series is created as float64 explicitly: R0 is an integer,
# so without the dtype the Series would be int64 and every later
# `Elo_Scores.loc[...] = <float array>` update would depend on silent upcasting,
# which recent pandas versions warn about or reject.
Elo_Scores = pd.Series(
    float(R0),
    index=np.unique(dataset[['home_team', 'away_team']]),
    dtype='float64',
)


# For every league, the list of (season, week) pairs that occur in the data,
# in groupby's key order (season first, then week within the season).
week_lists = {}
for league, league_df in dataset.groupby('league'):
    week_lists[league] = [
        (season, week)
        for season, season_df in league_df.groupby('season')
        for week, _ in season_df.groupby('week')
    ]


# Train / validation / test split, expressed as positions into week_lists[league].
# The last TestPortion of a league's weeks is 'test', the ValidationPortion
# before it is 'eval', and everything earlier is 'train'.
tvt_indcs = {}
test_portion = HYPERPARAETERS.TestPortion.value
held_out_portion = HYPERPARAETERS.TestPortion.value + HYPERPARAETERS.ValidationPortion.value
for league, league_weeks in week_lists.items():
    n_weeks = len(league_weeks)
    eval_start = int((1 - held_out_portion) * n_weeks)  # first validation week
    test_start = int((1 - test_portion) * n_weeks)      # first test week
    tvt_indcs[league] = {
        'test': list(range(test_start, n_weeks)),
        'eval': list(range(eval_start, test_start)),
        'train': list(range(0, eval_start)),
    }

In [6]:
# Report how many rows each split spans, per league and overall. For a split,
# `start` is the first row of its first week and `end` is the last row of its
# last week, both taken after re-indexing the league's rows from 0; the count is
# the size of that row span.
total = [0, 0, 0]
for league, league_df in dataset.groupby('league'):
    league_df = league_df.reset_index(drop=True)
    print(f'{league}')
    indcs = tvt_indcs[league]
    league_weeks = week_lists[league]
    season_week = league_df[['season', 'week']]
    for i, mode in enumerate(('train', 'eval', 'test')):
        in_first_week = np.all(season_week == league_weeks[indcs[mode][0]], axis=1)
        in_last_week = np.all(season_week == league_weeks[indcs[mode][-1]], axis=1)
        start = league_df[in_first_week].index[0]
        end = league_df[in_last_week].index[-1]
        print(f'\t {mode}: [{start}, {end}] - count: {end - start + 1}')
        total[i] += end - start + 1
print(f'Overal')
print(f'\t train count: {total[0]}')
print(f'\t eval count: {total[1]}')
print(f'\t test count: {total[2]}')

Belgium Jupiler League
	 train: [0, 629] - count: 630
	 eval: [630, 902] - count: 273
	 test: [903, 1180] - count: 278
England Premier League
	 train: [0, 1748] - count: 1749
	 eval: [1749, 2353] - count: 605
	 test: [2354, 2953] - count: 600
France Ligue 1
	 train: [0, 1699] - count: 1700
	 eval: [1700, 2289] - count: 590
	 test: [2290, 2854] - count: 565
Germany 1. Bundesliga
	 train: [0, 1391] - count: 1392
	 eval: [1392, 1875] - count: 484
	 test: [1876, 2367] - count: 492
Italy Serie A
	 train: [0, 1608] - count: 1609
	 eval: [1609, 2160] - count: 552
	 test: [2161, 2703] - count: 543
Netherlands Eredivisie
	 train: [0, 1160] - count: 1161
	 eval: [1161, 1600] - count: 440
	 test: [1601, 2018] - count: 418
Portugal Liga ZON Sagres
	 train: [0, 626] - count: 627
	 eval: [627, 906] - count: 280
	 test: [907, 1229] - count: 323
Scotland Premier League
	 train: [0, 896] - count: 897
	 eval: [897, 1213] - count: 317
	 test: [1214, 1528] - count: 315
Spain LIGA BBVA
	 train: [0, 1537] -

In [3]:
#Train
# Replay the training weeks of every league in order and update the shared
# Elo_Scores Series after each week. All expected scores of a week are computed
# from the ratings held before that week's updates are written back.
# NOTE(review): the vectorised write-back presumably assumes a team appears at
# most once per (season, week) - confirm against the data.
for league, league_df in dataset.groupby('league'):
    index_dict = tvt_indcs[league]
    season_week = league_df[['season', 'week']]
    for idx in index_dict['train']:
        week_mask = np.all(season_week == week_lists[league][idx], axis=1)
        week_df = league_df.loc[week_mask, :]
        home_teams = week_df['home_team']
        away_teams = week_df['away_team']

        currentH_elo = Elo_Scores[home_teams].to_numpy()
        currentA_elo = Elo_Scores[away_teams].to_numpy()
        expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
        expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

        # Move each rating by k times (actual score - expected score).
        home_delta = k * (week_df['alphaH'].to_numpy() - expectedH)
        away_delta = k * (week_df['alphaA'].to_numpy() - expectedA)
        Elo_Scores.loc[home_teams] = currentH_elo + home_delta
        Elo_Scores.loc[away_teams] = currentA_elo + away_delta

In [4]:
#Validation
# Per-league search for the draw threshold t. For each candidate t the validation
# weeks are replayed on a *copy* of the ratings, so the search itself never
# changes Elo_Scores.
#
# The table is initialised with floats (0.0): both columns later receive float
# values (thresholds such as 0.01, accuracies such as 0.49), and starting from
# integer columns would depend on silent int -> float upcasting, which recent
# pandas versions warn about or reject.
chosen_threshhold = pd.DataFrame(
    {'thresh': 0.0, 'maxAcc': 0.0},
    index=dataset['league'].unique()
)

for league, league_df in dataset.groupby('league'):
    print(f'============== Validating over league: {league} ==============')
    index_dict = tvt_indcs[league]
    season_week = league_df[['season', 'week']]
    for t in [0.01, 0.03, 0.1, 0.3]:
        tmp_Elo_Scores = Elo_Scores.copy()
        correct, total = 0, 0
        for idx in index_dict['eval']:
            week_df = league_df.loc[np.all(season_week == week_lists[league][idx], axis=1), :]
            currentH_elo = tmp_Elo_Scores[week_df['home_team']].to_numpy()
            currentA_elo = tmp_Elo_Scores[week_df['away_team']].to_numpy()
            expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
            expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

            # Predict a tie (0.5) when the expected scores differ by at most t,
            # otherwise a home win (1) or a home loss (0).
            out = (expectedH - expectedA)
            conditions = [
                (np.abs(out) <= t),
                (out > t),
                (out < -t)
            ]
            choices = [0.5, 1, 0]
            predictions = np.select(conditions, choices)
            result = week_df['alphaH'].to_numpy()
            correct += (predictions == result).sum()
            total += result.shape[0]

            #Updating
            tmp_Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
            tmp_Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))
        current_acc = correct / total
        print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Validation Accuracy: {correct / total: .3f}')
        # `>=` means that on equal accuracy the later (larger) threshold is kept.
        if current_acc >= chosen_threshhold.loc[league, 'maxAcc']:
            chosen_threshhold.loc[league, 'maxAcc'] = current_acc
            chosen_threshhold.loc[league, 'thresh'] = t




# Replay the validation weeks once more, this time writing the updates into the
# real Elo_Scores Series (the threshold search earlier in this cell only
# modified copies).
for league, league_df in dataset.groupby('league'):
    index_dict = tvt_indcs[league]
    season_week = league_df[['season', 'week']]
    for idx in index_dict['eval']:
        week_mask = np.all(season_week == week_lists[league][idx], axis=1)
        week_df = league_df.loc[week_mask, :]
        home_teams = week_df['home_team']
        away_teams = week_df['away_team']

        currentH_elo = Elo_Scores[home_teams].to_numpy()
        currentA_elo = Elo_Scores[away_teams].to_numpy()
        expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
        expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

        # Move each rating by k times (actual score - expected score).
        home_delta = k * (week_df['alphaH'].to_numpy() - expectedH)
        away_delta = k * (week_df['alphaA'].to_numpy() - expectedA)
        Elo_Scores.loc[home_teams] = currentH_elo + home_delta
        Elo_Scores.loc[away_teams] = currentA_elo + away_delta


print(f'{"#"*25} Testing {"#"*25}')

#Test
# Score the test weeks of every league with that league's chosen threshold.
# Ratings keep being updated after each predicted week, and the per-league
# tallies are accumulated into an overall accuracy.
t_correct, t_total = 0, 0
for league, league_df in dataset.groupby('league'):
    print(f'============== Testing over league: {league} ==============')
    index_dict = tvt_indcs[league]
    t = chosen_threshhold.loc[league, 'thresh']
    season_week = league_df[['season', 'week']]
    correct, total = 0, 0
    for idx in index_dict['test']:
        week_mask = np.all(season_week == week_lists[league][idx], axis=1)
        week_df = league_df.loc[week_mask, :]
        home_teams = week_df['home_team']
        away_teams = week_df['away_team']

        currentH_elo = Elo_Scores[home_teams].to_numpy()
        currentA_elo = Elo_Scores[away_teams].to_numpy()
        expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
        expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

        # Tie (0.5) when the expected scores differ by at most t, otherwise
        # home win (1) or home loss (0).
        out = expectedH - expectedA
        conditions = [np.abs(out) <= t, out > t, out < -t]
        choices = [0.5, 1, 0]
        predictions = np.select(conditions, choices)
        result = week_df['alphaH'].to_numpy()
        correct += (predictions == result).sum()
        total += len(result)

        # Move each rating by k times (actual score - expected score).
        home_delta = k * (week_df['alphaH'].to_numpy() - expectedH)
        away_delta = k * (week_df['alphaA'].to_numpy() - expectedA)
        Elo_Scores.loc[home_teams] = currentH_elo + home_delta
        Elo_Scores.loc[away_teams] = currentA_elo + away_delta
    current_acc = correct / total
    t_correct += correct
    t_total += total
    print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Test Accuracy: {correct / total: .3f}')
print(f'Overal Accuracy: {t_correct / t_total: .3f}')

Threshhold: 0.01 - (Correct, Total): (135, 273) - Validation Accuracy:  0.495
Threshhold: 0.03 - (Correct, Total): (133, 273) - Validation Accuracy:  0.487
Threshhold: 0.1 - (Correct, Total): (132, 273) - Validation Accuracy:  0.484
Threshhold: 0.3 - (Correct, Total): (108, 273) - Validation Accuracy:  0.396
Threshhold: 0.01 - (Correct, Total): (316, 605) - Validation Accuracy:  0.522
Threshhold: 0.03 - (Correct, Total): (318, 605) - Validation Accuracy:  0.526
Threshhold: 0.1 - (Correct, Total): (310, 605) - Validation Accuracy:  0.512
Threshhold: 0.3 - (Correct, Total): (287, 605) - Validation Accuracy:  0.474
Threshhold: 0.01 - (Correct, Total): (267, 590) - Validation Accuracy:  0.453
Threshhold: 0.03 - (Correct, Total): (271, 590) - Validation Accuracy:  0.459
Threshhold: 0.1 - (Correct, Total): (276, 590) - Validation Accuracy:  0.468
Threshhold: 0.3 - (Correct, Total): (247, 590) - Validation Accuracy:  0.419
Threshhold: 0.01 - (Correct, Total): (251, 484) - Validation Accuracy:

In [5]:
# #Validation
# # chosen_threshhold = pd.DataFrame(
# #     {'thresh': 0, 'maxAcc': 0},
# #     index=dataset['league'].unique()
# # )

# chosen_threshhold = {'thresh': 0, 'maxAcc': 0}

# for t in [0.01, 0.03, 0.1, 0.3]:
#     print(f'============== Validating over threshhold: {t} ==============')
#     correct, total = 0, 0
#     for league, league_df in dataset.groupby('league'):
#         index_dict = tvt_indcs[league]
#         tmp_Elo_Scores = Elo_Scores.copy()
#         for idx in index_dict['eval']:
#             week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#             currentH_elo = tmp_Elo_Scores[week_df['home_team']].to_numpy()
#             currentA_elo = tmp_Elo_Scores[week_df['away_team']].to_numpy()
#             expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#             expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

#             out = (expectedH - expectedA)
#             conditions = [
#                 (np.abs(out) <= t),
#                 (out > t),
#                 (out < -t)
#             ]
#             choices = [0.5, 1, 0]
#             predictions = np.select(conditions, choices)
#             result = week_df['alphaH'].to_numpy()
#             correct += (predictions == result).sum()
#             total += result.shape[0]

#             #Updating
#             tmp_Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#             tmp_Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))
#     current_acc = correct / total
#     print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Validation Accuracy: {correct / total: .3f}')
#     if current_acc >= chosen_threshhold['maxAcc']:
#         chosen_threshhold['maxAcc'] = current_acc
#         chosen_threshhold['thresh'] = t




# for league, league_df in dataset.groupby('league'):
#     index_dict = tvt_indcs[league]
#     for idx in index_dict['eval']:
#         week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#         currentH_elo = Elo_Scores[week_df['home_team']].to_numpy()
#         currentA_elo = Elo_Scores[week_df['away_team']].to_numpy()
#         expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#         expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))
        
#         #Updating
#         Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#         Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))


# print(f'{"#"*25} Testing {"#"*25}')


# #Test
# t_correct, t_total = 0, 0
# for league, league_df in dataset.groupby('league'):
#     print(f'============== Testing over league: {league} ==============')
#     index_dict = tvt_indcs[league]
#     t = chosen_threshhold['thresh']
#     correct, total = 0, 0
#     for idx in index_dict['test']:
#         week_df = league_df.loc[np.all(league_df[['season', 'week']] == week_lists[league][idx], axis=1), :]
#         currentH_elo = Elo_Scores[week_df['home_team']].to_numpy()
#         currentA_elo = Elo_Scores[week_df['away_team']].to_numpy()
#         expectedH = 1 / (1 + c**((currentA_elo - currentH_elo) / d))
#         expectedA = 1 / (1 + c**((currentH_elo - currentA_elo) / d))

#         out = (expectedH - expectedA)
#         conditions = [
#             (np.abs(out) <= t),
#             (out > t),
#             (out < -t)
#         ]
#         choices = [0.5, 1, 0]
#         predictions = np.select(conditions, choices)
#         result = week_df['alphaH'].to_numpy()
#         correct += (predictions == result).sum()
#         total += result.shape[0]

#         #Updating
#         Elo_Scores.loc[week_df['home_team']] = currentH_elo + (k * (week_df['alphaH'].to_numpy() - expectedH))
#         Elo_Scores.loc[week_df['away_team']] = currentA_elo + (k * (week_df['alphaA'].to_numpy() - expectedA))
#     current_acc = correct / total
#     t_correct += correct
#     t_total += total
#     print(f'Threshhold: {t} - (Correct, Total): {(correct, total)} - Test Accuracy: {correct / total: .3f}')
# print(f'Overal Accuracy: {t_correct / t_total: .3f}')

In [6]:
# Show, per league, the selected threshold ('thresh') and the validation
# accuracy recorded for it ('maxAcc').
chosen_threshhold

Unnamed: 0,thresh,maxAcc
Belgium Jupiler League,0.01,0.494505
England Premier League,0.03,0.52562
France Ligue 1,0.1,0.467797
Germany 1. Bundesliga,0.01,0.518595
Italy Serie A,0.1,0.5
Netherlands Eredivisie,0.01,0.477273
Portugal Liga ZON Sagres,0.03,0.510714
Scotland Premier League,0.03,0.498423
Spain LIGA BBVA,0.03,0.487179


In [7]:
# Show the current Elo rating of every team (Series indexed by team name).
Elo_Scores

AC Ajaccio                 -117.018918
AC Arles-Avignon           -139.381876
ADO Den Haag                 38.631501
AJ Auxerre                  -62.481813
AS Monaco                   173.380321
                               ...    
Wigan Athletic              -21.629226
Willem II                   -73.063492
Wolverhampton Wanderers    -149.312759
Xerez Club Deportivo        -62.237358
Évian Thonon Gaillard FC    -57.569419
Length: 254, dtype: float64