<a href="https://colab.research.google.com/github/DirkStulgies/mlwtSportsPrediction/blob/main/playground/own_data_column_selection.ipynb"
 target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import packages.
import os
import shutil
import signal
import subprocess
import tempfile
from pathlib import Path

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorboard as tb

2022-01-08 17:22:57.008858: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-08 17:22:57.008903: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Define parameters.
# Relative paths; they are resolved against the current working directory.
LOGS_PATH = '../logs'
DATA_PATH = '../data/own_data_preparation_dirk.csv'
# Column holding the label that the models are trained to predict.
RESULT_COLUMN = 'result_team1'
# Categorical columns that are expanded into one-hot (dummy) columns.
DUMMY_COLUMNS = ['team1', 'team2']
# Feature base names: the data cell reads '<name>_team1' and '<name>_team2' for each entry.
# NOTE: the spellings 'NORMAILZE' and 'squard' are referenced as-is by other cells and
# as column-name prefixes, so they cannot be corrected in this cell alone.
NORMAILZE_COLUMNS = ['points', 'squard',
    'average_age', 'average_market_value_in_euro', 'total_market_value_in_euro',
    'rank_last_season', 'points_last_season_all', 'points_last_season']
# Fraction of the rows that is held out for validation.
VALIDATION_SIZE = 0.2
# Settings handed to model.compile().
LOSS_FUNCTION = 'sparse_categorical_crossentropy'
OPTIMIZER_FUNCTION = 'Adam'
METRICS = 'sparse_categorical_accuracy'
# Settings handed to model.fit().
EPOCHS = 200
BATCH_SIZE = 32
# Name of this experiment; used as the sub-folder inside the log directory.
TITLE = 'column_selection'

# Text used to find running tensorboard processes in the process list.
TENSORBOARD_PROCESS = 'tensorboard'
# Host and port the tensorboard magic is started with.
TENSORBOARD_SERVER = 'localhost'
TENSORBOARD_PORT = 6008

# From here on LOGS_PATH points at the experiment-specific log folder.
LOGS_PATH = os.path.join(LOGS_PATH, TITLE)

In [3]:
# Set up tensorboard.
%load_ext tensorboard
logs = Path(LOGS_PATH)
logs.mkdir(mode=0o777, parents=True, exist_ok=True)

In [4]:
# Load the data set and drop rows with nan values.
data = pd.read_csv(DATA_PATH, delimiter=',', decimal='.')
data = data.dropna()

# Build every feature as the difference team1 - team2 and scale it by the
# maximum of that difference (vectorised; same arithmetic as a per-element division).
# NOTE: a column whose maximum difference is 0 would yield inf/nan values here.
for column in NORMAILZE_COLUMNS:
    difference = data[column + '_team1'] - data[column + '_team2']
    data[column] = difference / difference.max()

# Get the wanted columns and create dummy columns for the teams.
# The dummy dtype is pinned: pandas >= 2.0 defaults to bool dummies, which would turn
# the mixed float/bool feature matrix into an object array when calling to_numpy().
data = data.loc[:, DUMMY_COLUMNS + NORMAILZE_COLUMNS + [RESULT_COLUMN]]
data = pd.get_dummies(data, columns=DUMMY_COLUMNS, dtype='uint8')

# Shuffle with a fixed seed so the train/validation split is reproducible
# across kernel restarts, then split positionally.
SHUFFLE_SEED = 42
split_index = int(len(data) * VALIDATION_SIZE)
data = data.sample(frac=1, random_state=SHUFFLE_SEED)
data_train = data.iloc[split_index:]
data_valid = data.iloc[:split_index]

# Separate the features (par_*) from the label column (res_*).
feature_columns = [name for name in data.columns if name != RESULT_COLUMN]
par_train = data_train.loc[:, feature_columns]
res_train = data_train.loc[:, [RESULT_COLUMN]]
par_valid = data_valid.loc[:, feature_columns]
res_valid = data_valid.loc[:, [RESULT_COLUMN]]

# Names of the one-hot team columns; get_dummies names them '<source column>_<value>'.
dummy_prefixes = tuple(name + '_' for name in DUMMY_COLUMNS)
dummies = [key for key in par_train.columns if key.startswith(dummy_prefixes)]

data.head()

Unnamed: 0,points,squard,average_age,average_market_value_in_euro,total_market_value_in_euro,rank_last_season,points_last_season_all,points_last_season,result_team1,team1_1. FC Union Berlin,...,team2_Hertha Berlin,team2_Mainz,team2_RB Leipzig,team2_SC Freiburg,team2_SV Darmstadt 98,team2_Schalke 04,team2_TSG Hoffenheim,team2_VfB Stuttgart,team2_VfL Wolfsburg,team2_Werder Bremen
1632,-0.017857,0.352941,-0.294118,-0.092599,-0.067468,-0.066667,0.038462,0.002667,1,0,...,0,0,0,0,0,1,0,0,0,0
5647,0.107143,0.294118,-0.382353,0.496847,0.626242,-0.666667,0.538462,0.08,2,0,...,0,0,0,0,0,0,0,0,0,0
3536,-0.089286,-0.411765,0.382353,-0.485894,-0.698774,0.733333,-0.769231,-0.114286,1,0,...,0,0,0,0,0,0,0,0,0,0
3004,0.089286,0.529412,0.088235,-0.066047,-0.060184,0.333333,-0.115385,-0.005,2,0,...,0,0,0,0,0,0,1,0,0,0
2314,-0.107143,0.588235,-0.088235,-0.128112,-0.111073,0.2,-0.115385,-0.06,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Convert the label data frames to numpy arrays.
# np.asarray accepts both a DataFrame and an ndarray, so running this cell a
# second time is harmless (calling .to_numpy() on an ndarray would raise).
res_train = np.asarray(res_train)
res_valid = np.asarray(res_valid)

print(par_train.shape)
print(res_train.shape)
print(par_valid.shape)
print(res_valid.shape)

(4777, 54)
(4777, 1)
(1194, 54)
(1194, 1)


In [6]:
def defineModel(input_dim, optimizer, loss, num_classes=3, dropout_rate=0.2):
    """Build and compile a softmax classifier with a single hidden layer.

    Args:
        input_dim: Number of input features; must be at least 1.
        optimizer: Optimizer handed to ``model.compile``.
        loss: Loss handed to ``model.compile``.
        num_classes: Number of units of the softmax output layer.
            Defaults to 3, the value that was previously hard-coded.
        dropout_rate: Rate of the dropout layer between hidden and output layer.
            Defaults to 0.2, the value that was previously hard-coded.

    Returns:
        The compiled ``tf.keras`` Sequential model.

    Raises:
        ValueError: If ``input_dim`` is smaller than 1.
    """
    if input_dim < 1:
        raise ValueError('input_dim must be at least 1, got {}'.format(input_dim))

    # Define the model; the hidden layer has 2 * input_dim - 1 units.
    nodes = input_dim*2-1
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(nodes, activation='relu', input_dim=input_dim),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # Compile the model; the metric comes from the notebook-level METRICS setting.
    model.compile(loss=loss, optimizer=optimizer, metrics=[METRICS])

    return model


In [7]:
def runModel(model, x_train, x_valid, y_train, y_valid, title):
    """Fit ``model`` and write TensorBoard logs to ``LOGS_PATH/<title>``.

    The number of epochs and the batch size come from the notebook-level
    EPOCHS and BATCH_SIZE settings; ``(x_valid, y_valid)`` is used as
    validation data. Returns whatever ``model.fit`` returns.
    """
    # Every run gets its own sub-folder so the runs can be compared in TensorBoard.
    run_log_dir = os.path.join(LOGS_PATH, title)
    tb_writer = tf.keras.callbacks.TensorBoard(log_dir=run_log_dir, histogram_freq=1)

    fit_options = dict(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(x_valid, y_valid),
        callbacks=[tb_writer],
        verbose=1,
    )
    return model.fit(x=x_train, y=y_train, **fit_options)

In [8]:

# Single columns without teams.
for column in NORMAILZE_COLUMNS:
    # One model is built per column. Reset the global Keras state first so the
    # models of earlier iterations do not accumulate in memory.
    tf.keras.backend.clear_session()

    sel_train = par_train.loc[:, [column]].to_numpy()
    sel_valid = par_valid.loc[:, [column]].to_numpy()

    runModel(
        defineModel(sel_train.shape[1], OPTIMIZER_FUNCTION, LOSS_FUNCTION),
        sel_train, sel_valid, res_train, res_valid,
        column+'_without_teams',
    )



2022-01-08 17:22:59.434553: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-01-08 17:22:59.434586: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (u50sj3d10xnl): /proc/driver/nvidia/version does not exist
2022-01-08 17:22:59.435291: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [9]:
# Single columns with teams.
for column in NORMAILZE_COLUMNS:
    # One model is built per column. Reset the global Keras state first so the
    # models of earlier iterations do not accumulate in memory.
    tf.keras.backend.clear_session()

    sel_train = par_train.loc[:, dummies + [column]].to_numpy()
    sel_valid = par_valid.loc[:, dummies + [column]].to_numpy()

    runModel(
        defineModel(sel_train.shape[1], OPTIMIZER_FUNCTION, LOSS_FUNCTION),
        sel_train, sel_valid, res_train, res_valid,
        column+'_with_teams',
    )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [10]:
# Train on the one-hot team columns alone.
sel_train = par_train[dummies].to_numpy()
sel_valid = par_valid[dummies].to_numpy()

runModel(
    defineModel(sel_train.shape[1], OPTIMIZER_FUNCTION, LOSS_FUNCTION),
    sel_train, sel_valid,
    res_train, res_valid,
    'only_teams',
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7efcd80ba700>

In [11]:
# Train on the numeric difference features alone (no team columns).
sel_train = par_train[NORMAILZE_COLUMNS].to_numpy()
sel_valid = par_valid[NORMAILZE_COLUMNS].to_numpy()

runModel(
    model=defineModel(input_dim=sel_train.shape[1], optimizer=OPTIMIZER_FUNCTION, loss=LOSS_FUNCTION),
    x_train=sel_train,
    x_valid=sel_valid,
    y_train=res_train,
    y_valid=res_valid,
    title='only_not_teams',
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7efcd918cf40>

In [12]:
# Train on every feature column: difference features plus team dummies.
sel_train, sel_valid = par_train.to_numpy(), par_valid.to_numpy()

runModel(
    model=defineModel(input_dim=sel_train.shape[1], optimizer=OPTIMIZER_FUNCTION, loss=LOSS_FUNCTION),
    x_train=sel_train,
    x_valid=sel_valid,
    y_train=res_train,
    y_valid=res_valid,
    title='all_columns',
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7efcd0176b20>

In [13]:
# Kill the existing tensorboard process and delete the tensorflow temp folder. After this start a new tensorboard process.
try:    
    # Iterating through each instance of the process.
    for line in os.popen("ps ax | grep " + TENSORBOARD_PROCESS + " | grep -v grep"):
        fields = line.split()
            
        # Extracting Process ID from the output.
        pid = fields[0]
            
        # Terminating process.
        os.kill(int(pid), signal.SIGKILL)

    # Delete tensorboard temp folder.
    tb_temp_folder = os.path.join(tempfile.gettempdir(), '.tensorboard-info')
    os.system("rm -rf "+tb_temp_folder)
    print("Process Successfully terminated") 
except Exception as e:
    print(e)

%tensorboard --logdir $LOGS_PATH --host $TENSORBOARD_SERVER --port $TENSORBOARD_PORT

Process Successfully terminated
