<a href="https://colab.research.google.com/github/DirkStulgies/mlwtSportsPrediction/blob/main/playground/own_data_opt_loss_dirk.ipynb"
 target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import packages.
import os
from pathlib import Path
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorboard as tb

2022-01-08 14:53:04.114547: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-08 14:53:04.114606: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Notebook configuration: every tunable value is defined in this cell.

# Run identification and file locations.
TITLE = 'own_data'
DATA_PATH = '../data/own_data_preparation_dirk.csv'
# TensorBoard logs go into a sub-directory named after the run title.
LOGS_PATH = os.path.join('../logs/own_data_opt_loss_dirk', TITLE)

# Column roles in the prepared data set.
RESULT_COLUMN = 'result_team1'
DUMMY_COLUMNS = ['team1', 'team2']
# Numeric columns that get scaled; every feature exists once per team
# ("<feature>_team1" followed by "<feature>_team2").
NORMAILZE_COLUMNS = [
    f'{feature}_{team}'
    for feature in (
        'match_day',
        'average_age',
        'average_market_value_in_euro',
        'total_market_value_in_euro',
        'points_last_season',
        'points',
        'squard',
    )
    for team in ('team1', 'team2')
]

# Fraction of the rows held out for validation.
VALIDATION_SIZE = 0.2

# Keras compile / fit settings.
LOSS_FUNCTION = 'sparse_categorical_crossentropy'
OPTIMIZER_FUNCTION = 'Adam'
METRICS = 'sparse_categorical_accuracy'
EPOCHS = 1
BATCH_SIZE = 32

In [3]:
# Set up tensorboard.
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard
# Make sure the log directory exists before anything writes to it:
# parents=True creates missing parent directories, exist_ok=True makes a
# re-run of this cell a no-op, and the directory is requested with mode 0o777.
logs = Path(LOGS_PATH)
logs.mkdir(mode=0o777, parents=True, exist_ok=True)

In [4]:
# Load the data set and get the wanted columns.
# Fixed seed for the shuffle below, so that Restart & Run All always yields
# the same train/validation split.
SPLIT_SEED = 42

data = pd.read_csv(DATA_PATH, delimiter=',', decimal='.')
data = data.loc[:, DUMMY_COLUMNS + NORMAILZE_COLUMNS + [RESULT_COLUMN]]
data = data.dropna()

# Normalize each numeric column by dividing it by its maximum value.
# NOTE(review): the maxima are computed over the full data set, i.e. including
# the rows that later end up in the validation split.
for column in NORMAILZE_COLUMNS:
    max_value = data[column].max()
    # A column whose maximum is 0 is left untouched instead of being turned
    # into NaN/inf by a division by zero.
    if max_value != 0:
        data[column] = data[column] / max_value

# Create dummy columns for the teams.
# The dtype is pinned to 0/1 integers: newer pandas versions default to bool,
# which would turn the mixed float/bool frame into an object array on
# conversion to numpy.
data = pd.get_dummies(data, columns=DUMMY_COLUMNS, dtype=np.uint8)

# Shuffle reproducibly, then split: the first VALIDATION_SIZE share of the
# shuffled rows is the validation set, the remainder is the training set.
split_index = int(len(data) * VALIDATION_SIZE)
data = data.sample(frac=1, random_state=SPLIT_SEED)
data_train = data[split_index:]
data_valid = data[:split_index]
assert len(data_train) + len(data_valid) == len(data)

# Separate the features (par_*) from the target column (res_*).
par_train = data_train.loc[:, data_train.columns != RESULT_COLUMN]
res_train = data_train.loc[:, RESULT_COLUMN]
par_valid = data_valid.loc[:, data_valid.columns != RESULT_COLUMN]
res_valid = data_valid.loc[:, RESULT_COLUMN]

data.head()

Unnamed: 0,match_day_team1,match_day_team2,average_age_team1,average_age_team2,average_market_value_in_euro_team1,average_market_value_in_euro_team2,total_market_value_in_euro_team1,total_market_value_in_euro_team2,points_last_season_team1,points_last_season_team2,...,team2_Hertha Berlin,team2_Mainz,team2_RB Leipzig,team2_SC Freiburg,team2_SV Darmstadt 98,team2_Schalke 04,team2_TSG Hoffenheim,team2_VfB Stuttgart,team2_VfL Wolfsburg,team2_Werder Bremen
3464,0.117647,0.117647,0.897059,0.933824,0.111661,0.218373,0.14637,0.271023,0.090909,0.173077,...,0,0,0,0,0,0,0,0,0,0
2747,0.441176,0.441176,0.908088,0.908088,0.082277,0.807609,0.107795,0.916445,0.029545,0.071795,...,0,0,0,0,0,0,0,0,0,0
317,0.294118,0.294118,0.875,0.930147,0.035571,0.057841,0.040231,0.075713,0.045455,0.048718,...,0,0,0,0,0,0,0,0,0,0
236,0.205882,0.205882,0.856618,0.944853,0.068048,0.026044,0.093997,0.036008,0.081169,0.069597,...,0,0,0,0,1,0,0,0,0,0
3006,0.705882,0.705882,0.867647,0.882353,0.358491,0.085679,0.394141,0.103408,0.026042,0.019231,...,0,0,0,1,0,0,0,0,0,0


In [5]:
# Convert data frames to numpy arrays.
# np.asarray is used instead of .to_numpy() so that this cell can be re-run:
# after the first run the names are already bound to ndarrays, which have no
# .to_numpy() method, whereas np.asarray simply returns them unchanged.
par_train = np.asarray(par_train)
res_train = np.asarray(res_train)
par_valid = np.asarray(par_valid)
res_valid = np.asarray(res_valid)

# Disabled: reshape to (samples, 1, features); presumably only needed when a
# sequence layer (Conv1D/LSTM) comes first in the model.
#par_train = par_train.reshape(len(par_train), 1, par_train.shape[1])
#par_valid = par_valid.reshape(len(par_valid), 1, par_valid.shape[1])

# Show the array shapes as a sanity check on the split.
print(par_train.shape)
print(res_train.shape)
print(par_valid.shape)
print(res_valid.shape)

(5385, 60)
(5385,)
(1346, 60)
(1346,)


In [6]:
# Callback that writes the training logs for TensorBoard into LOGS_PATH.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=LOGS_PATH,
)

# Width of every hidden layer: twice the number of input features, minus one.
nodes = par_train.shape[1]*2-1

# Disabled layers, kept for reference:
#tf.keras.layers.Conv1D(nodes, kernel_size=5, strides=1, padding='causal', activation='relu'),
#tf.keras.layers.LSTM(nodes, return_sequences=True),
#tf.keras.layers.Dropout(0.1),

# Build the network: three identical Dense(relu) + Dropout(0.1) stages,
# followed by a 3-unit softmax output layer.
model = tf.keras.models.Sequential()
for stage in range(3):
    model.add(tf.keras.layers.Dense(nodes, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Compile the model with the configured loss, optimizer and metric.
model.compile(
    optimizer=OPTIMIZER_FUNCTION,
    loss=LOSS_FUNCTION,
    metrics=[METRICS],
)

# Train on the training arrays and evaluate on the validation arrays,
# logging to TensorBoard through the callback.
history = model.fit(
    par_train,
    res_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[tensorboard_callback],
    validation_data=(par_valid, res_valid),
)

2022-01-08 14:53:06.891415: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-01-08 14:53:06.891449: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (u50sj3d10xnl): /proc/driver/nvidia/version does not exist
2022-01-08 14:53:06.892069: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [8]:
# Start TensorBoard on localhost:6008, pointed at the directory in LOGS_PATH
# ($LOGS_PATH is filled in from the Python variable by IPython).
%tensorboard --logdir $LOGS_PATH --host localhost --port 6008