<a href="https://colab.research.google.com/github/DirkStulgies/mlwtSportsPrediction/blob/main/playground/teams_prediction_dirk.ipynb"
 target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages.
import os
import datetime
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorboard as tb

In [None]:
# Notebook configuration.

# --- Paths -----------------------------------------------------------------
# TensorBoard logs of this experiment series are collected in sub-folder '3'.
LOGS_PATH = os.path.join('../logs/teams_prediction_dirk', '3')
BASELINE_DATA_PATH = '../data/five_projects_soccer_match_data.csv'
RESULT_LOSS_ACC_FILE = './teams_prediction_dirk_loss_acc_3.txt'

# --- Column names of the data set -----------------------------------------
TEAM_COLUMNS = ['team1', 'team2']
RESULT_COLUMN = 'result_team1'
SCORE_TEAM1_COLUMN = 'score1'
SCORE_TEAM2_COLUMN = 'score2'
NORMALIZE_PERCENTAGE_COLUMNS = ['spi1', 'spi2']

# --- Class labels written to RESULT_COLUMN (from team1's point of view) ----
VALUE_HOME_LOST = 0
VALUE_HOME_DRAW = 1
VALUE_HOME_WON = 2

# --- Data selection and split ----------------------------------------------
# An empty LEAGUE string disables the league filter.
LEAGUE = 'German Bundesliga'
# Fraction of the rows that is held out for validation.
VALIDATION_SIZE = 0.2

# --- Training grid ---------------------------------------------------------
# Every optimizer is combined with every loss function.
LOSS_FUNCTIONS = [
    'categorical_hinge',
    'hinge',
    'huber',
    'kullback_leibler_divergence',
    'log_cosh',
    'mean_absolute_error',
    'mean_absolute_percentage_error',
    'mean_squared_error',
    'mean_squared_logarithmic_error',
    'poisson',
    'sparse_categorical_crossentropy',
    'squared_hinge',
]
OPTIMIZER_FUNCTIONS = [
    'Adadelta',
    'Adagrad',
    'Adam',
    'Adamax',
    'Ftrl',
    'Nadam',
    'RMSprop',
    'SGD',
]
METRICS = 'sparse_categorical_accuracy'
EPOCHS = 300
BATCH_SIZE = 32

In [None]:
# Model factory.
def defineModel(input_dim):
    """Build the (uncompiled) classifier for `input_dim` input features.

    The network consists of three ReLU hidden layers, each `2 * input_dim - 1`
    units wide and each followed by 10 % dropout, and a 3-unit softmax output
    layer.
    """
    hidden_units = 2 * input_dim - 1

    # The first hidden layer additionally declares the input width.
    stack = [
        tf.keras.layers.Dense(hidden_units, activation='relu', input_dim=input_dim),
        tf.keras.layers.Dropout(0.1),
    ]
    for _ in range(2):
        stack.append(tf.keras.layers.Dense(hidden_units, activation='relu'))
        stack.append(tf.keras.layers.Dropout(0.1))
    stack.append(tf.keras.layers.Dense(3, activation='softmax'))

    return tf.keras.models.Sequential(stack)

In [None]:
def runModel(name, columns, x_train, y_train, x_valid, y_valid, optimizer, loss, epochs, metrics):
    """Train one model configuration and append its final scores to the result file.

    Args:
        name: Identifier of the run. Used as the TensorBoard sub-directory
            below LOGS_PATH and as the first field of the result line.
        columns: Feature column names; only their count is used, as the
            input width of the model.
        x_train, y_train: Training features and labels passed to `model.fit`.
        x_valid, y_valid: Validation features and labels, used both during
            fitting and for the final evaluation.
        optimizer: Optimizer passed to `model.compile`.
        loss: Loss function passed to `model.compile`.
        epochs: Number of training epochs.
        metrics: The single metric to track; it is wrapped in a list for
            `model.compile`.

    Side effects:
        Writes TensorBoard logs and appends one tab-separated line
        (name, train loss, train metric, validation loss, validation metric)
        to RESULT_LOSS_ACC_FILE.
    """
    # Define callback function for writing data for tensorBoard
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=os.path.join(LOGS_PATH, name), histogram_freq=1)

    # Compile and run the model.
    model = defineModel(len(columns))
    model.compile(loss=loss, optimizer=optimizer, metrics=[metrics])
    history = model.fit(
        x=x_train,
        y=y_train,
        epochs=epochs,
        batch_size=BATCH_SIZE,
        validation_data=(x_valid, y_valid),
        callbacks=[tensorboard_callback],
        verbose=1
    )

    # Model evaluation.
    # Read the scores of the last epoch that was actually recorded. Indexing
    # with the global EPOCHS would ignore the `epochs` argument and fail (or
    # pick a stale epoch) whenever the two differ.
    # The history key is derived from the `metrics` argument rather than the
    # global METRICS; metric objects expose their key via `.name`.
    metric_key = getattr(metrics, 'name', metrics)
    train_loss = history.history['loss'][-1]
    train_acc = history.history[metric_key][-1]

    # One evaluation pass yields both the loss and the metric.
    evaluation = model.evaluate(x_valid, y_valid)
    test_loss = evaluation[0]
    test_acc = evaluation[1]

    # Save acc and loss results to file.
    fields = (name, train_loss, train_acc, test_loss, test_acc)
    with open(RESULT_LOSS_ACC_FILE, 'a') as rfile:
        rfile.write('\t'.join(str(field) for field in fields) + '\n')

In [None]:
# Set up tensorboard.
%load_ext tensorboard
logs = Path(LOGS_PATH)
logs.mkdir(mode=0o777, parents=True, exist_ok=True)
print('Log path set to ' + LOGS_PATH)

In [None]:
# Read the baseline match data and, if a league is configured, keep only its rows.
baseline_data = pd.read_csv(BASELINE_DATA_PATH, delimiter=',', decimal='.')
if LEAGUE != '':
    league_mask = baseline_data['league'] == LEAGUE
    baseline_data = baseline_data[league_mask]
baseline_data.head()

In [None]:
# Normalize columns with percentage values by dividing them by 100.
bdm = baseline_data.copy(deep=True)
for column in NORMALIZE_PERCENTAGE_COLUMNS:
    # Vectorized division instead of a per-row lambda; the dtype stays float64.
    bdm[column] = (bdm[column] / 100).astype(np.float64)

# Get a distinct list of all teams and create dummy columns.
teams = bdm[TEAM_COLUMNS[0]].unique()
# Request float dummies explicitly: the default dummy dtype depends on the
# pandas version (bool since pandas 2.0), and bool columns combined with the
# float SPI columns would not convert to a single numeric array.
bdm = pd.get_dummies(bdm, columns=TEAM_COLUMNS, dtype=np.float64)

# Collect the one-hot columns created above. get_dummies names them
# '<source column>_<value>', so match on that exact prefix. A plain substring
# test would also select unrelated columns whose name merely contains
# 'team1'/'team2' (e.g. the label column 'result_team1') and could list a
# column twice.
dummy_prefixes = tuple(team_column + '_' for team_column in TEAM_COLUMNS)
feature_column_names = [
    column_name for column_name in bdm.columns
    if column_name.startswith(dummy_prefixes)
]

# Add the result column: start with "draw" everywhere, then overwrite the
# rows where one side scored more goals.
# NOTE(review): rows with a missing score compare False in both directions
# and therefore keep the draw label - confirm the data has no such rows.
bdm[RESULT_COLUMN] = VALUE_HOME_DRAW
bdm.loc[bdm[SCORE_TEAM1_COLUMN] < bdm[SCORE_TEAM2_COLUMN], RESULT_COLUMN] = VALUE_HOME_LOST
bdm.loc[bdm[SCORE_TEAM1_COLUMN] > bdm[SCORE_TEAM2_COLUMN], RESULT_COLUMN] = VALUE_HOME_WON

bdm.head()

In [None]:
# Shuffle all rows, then hold out the first VALIDATION_SIZE share for validation
# and train on the remainder.
bdm_shuffled = bdm.sample(frac=1)
split_index = int(len(bdm) * VALIDATION_SIZE)
bdm_valid = bdm_shuffled.iloc[:split_index]
bdm_train = bdm_shuffled.iloc[split_index:]

print('Length training data:', len(bdm_train))
print('Length validation data:', len(bdm_valid))

# Feature sets for the three experiments: teams only, SPI only, teams plus SPI.
teams_and_spi_columns = feature_column_names + NORMALIZE_PERCENTAGE_COLUMNS

train_only_teams = bdm_train[feature_column_names]
valid_only_teams = bdm_valid[feature_column_names]

train_only_spi = bdm_train[NORMALIZE_PERCENTAGE_COLUMNS]
valid_only_spi = bdm_valid[NORMALIZE_PERCENTAGE_COLUMNS]

train_teams_and_spi = bdm_train[teams_and_spi_columns]
valid_teams_and_spi = bdm_valid[teams_and_spi_columns]

# Labels.
train_result = bdm_train[RESULT_COLUMN]
valid_result = bdm_valid[RESULT_COLUMN]

print('Trainings shape', train_result.shape)

In [None]:
# Experiment 1: one-hot encoded teams as the only features,
# trained once per optimizer/loss combination.
for optimizer in OPTIMIZER_FUNCTIONS:
    for loss in LOSS_FUNCTIONS:
        run_name = f'only_teams_{optimizer}_{loss}'
        runModel(
            run_name,
            feature_column_names,
            train_only_teams,
            train_result,
            valid_only_teams,
            valid_result,
            optimizer,
            loss,
            EPOCHS,
            METRICS,
        )

In [None]:
# Experiment 2: the normalized SPI columns as the only features,
# trained once per optimizer/loss combination.
for optimizer in OPTIMIZER_FUNCTIONS:
    for loss in LOSS_FUNCTIONS:
        run_name = f'only_spi_{optimizer}_{loss}'
        runModel(
            run_name,
            NORMALIZE_PERCENTAGE_COLUMNS,
            train_only_spi,
            train_result,
            valid_only_spi,
            valid_result,
            optimizer,
            loss,
            EPOCHS,
            METRICS,
        )

In [None]:
# Experiment 3: one-hot encoded teams together with the normalized SPI columns,
# trained once per optimizer/loss combination.
for optimizer in OPTIMIZER_FUNCTIONS:
    for loss in LOSS_FUNCTIONS:
        run_name = f'teams_and_spi_{optimizer}_{loss}'
        runModel(
            run_name,
            feature_column_names + NORMALIZE_PERCENTAGE_COLUMNS,
            train_teams_and_spi,
            train_result,
            valid_teams_and_spi,
            valid_result,
            optimizer,
            loss,
            EPOCHS,
            METRICS,
        )

In [None]:
# Open TensorBoard inline on the log directory of this experiment series.
%tensorboard --logdir $LOGS_PATH