# EDA + NN

## Importing libraries

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from sklearn.metrics import accuracy_score
import keras_tuner as kt
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, RMSprop, Nadam

## Dataset

In [74]:
df0 = pd.read_csv(r"ready_data/data_a0_encoded.csv")
df1 = pd.read_csv(r"ready_data/data_a1_encoded.csv")

In [75]:
df0.head()


Unnamed: 0,Time,Target,HomeTeam_enc,avg_goals_in_last5_home,avg_goals_conceded_last5_home,AwayTeam_enc,avg_goals_in_last5_away,avg_goals_conceded_last5_away,Year,Month,Dayofweek,Is_weekend,Season_of_year,Country_D,Country_E,Country_F,Country_G,Country_I,Country_N,Country_P,Country_SC,Country_SP,Country_T,Division_1,Division_2,Division_3
0,19,1,420,,,204,,,2019,7,4,0,2,True,False,False,False,False,False,False,False,False,False,False,True,False
1,19,0,101,,,444,,,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False
2,19,1,8,,,245,,,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False
3,19,1,201,,,196,,,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False
4,19,0,372,,,44,,,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False


In [76]:
df0.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,42593.0,16.264292,2.468946,10.0,15.0,15.0,19.0,22.0
Target,42593.0,0.499894,0.500006,0.0,0.0,0.0,1.0,1.0
HomeTeam_enc,42593.0,238.533069,137.843524,0.0,118.0,240.0,359.0,477.0
avg_goals_in_last5_home,42115.0,1.468535,0.681854,0.0,1.0,1.4,1.8,7.0
avg_goals_conceded_last5_home,42115.0,1.193677,0.574891,0.0,0.8,1.2,1.6,5.0
AwayTeam_enc,42593.0,238.612284,137.810745,0.0,119.0,240.0,359.0,477.0
avg_goals_in_last5_away,42115.0,1.196533,0.597631,0.0,0.8,1.2,1.6,5.2
avg_goals_conceded_last5_away,42115.0,1.464568,0.640763,0.0,1.0,1.4,1.8,7.0
Year,42593.0,2022.055619,1.763826,2019.0,2021.0,2022.0,2024.0,2025.0
Month,42593.0,6.442232,3.827781,1.0,3.0,6.0,10.0,12.0


In [77]:
df1.head()

Unnamed: 0,Time,Target,HomeTeam_enc,avg_goals_in_last5_home,avg_goals_conceded_last5_home,AwayTeam_enc,avg_goals_in_last5_away,avg_goals_conceded_last5_away,market_decisiveness,expected_total_goals,Norm_Ah_P_home,Norm_Ah_P_away,ah_imbalance,ah_market_confidence,Year,Month,Dayofweek,Is_weekend,Season_of_year,Country_D,Country_E,Country_F,Country_G,Country_I,Country_N,Country_P,Country_SC,Country_SP,Country_T,Division_1,Division_2,Division_3
0,19,0,307,,,435,1.4,2.0,0.09335,1.677022,0.523437,0.476563,0.048936,0.546448,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False
1,19,1,184,,,238,1.0,1.2,0.121554,1.958666,0.466146,0.533854,0.070854,0.558659,2019,7,4,0,2,False,False,False,False,False,False,False,False,False,False,True,False,False
2,19,0,372,,,44,1.4,2.2,0.11461,1.644545,0.480519,0.519481,0.040541,0.540541,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False
3,19,1,420,,,204,1.2,0.6,0.066234,1.890909,0.490909,0.509091,0.018896,0.529101,2019,7,4,0,2,True,False,False,False,False,False,False,False,False,False,False,True,False
4,19,1,201,,,196,0.8,0.8,0.070312,1.710794,0.488312,0.511688,0.024301,0.531915,2019,7,4,0,2,False,False,True,False,False,False,False,False,False,False,False,True,False


In [78]:
# data import
data_a0=pd.read_csv(r"ready_data/data_a0_encoded.csv")
data_a1=pd.read_csv(r"ready_data/data_a1_encoded.csv")

In [79]:
data_a0[data_a0.select_dtypes(include='bool').columns]=data_a0[data_a0.select_dtypes(include='bool').columns].astype(int)
data_a1[data_a1.select_dtypes(include='bool').columns]=data_a1[data_a1.select_dtypes(include='bool').columns].astype(int)
#check that all columns have only numerical values
non_numeric_cols0 = data_a0.select_dtypes(exclude=[np.number]).columns
non_numeric_cols1 = data_a1.select_dtypes(exclude=[np.number]).columns

assert len(non_numeric_cols0)==0
assert len(non_numeric_cols1)==0

#just a check to see that we are good to keep working with the data and it's in the form we want
assert type(data_a0)==pd.core.frame.DataFrame
assert type(data_a1)==pd.core.frame.DataFrame

#train test split, 80/20 ratio
#for A0
split_index_0 = int(0.8 * len(data_a0))

train0 = data_a0.iloc[:split_index_0]
test0  = data_a0.iloc[split_index_0:]

X_train_0, y_train_0 = train0.drop(columns='Target'), train0['Target']
X_test_0,  y_test_0  = test0.drop(columns='Target'),  test0['Target']

#for A1
split_index_1 = int(0.8 * len(data_a1))

train1 = data_a1.iloc[:split_index_1]
test1  = data_a1.iloc[split_index_1:]

X_train_1, y_train_1 = train1.drop(columns='Target'), train1['Target']
X_test_1,  y_test_1  = test1.drop(columns='Target'),  test1['Target']

In [80]:
#temporary solution to missing values
X_train_0 = X_train_0.dropna()
y_train_0 = y_train_0.loc[X_train_0.index]

X_test_0 = X_test_0.dropna()
y_test_0 = y_test_0.loc[X_test_0.index]

In [81]:
# Scaler
scaler = StandardScaler()
X_train_0 = scaler.fit_transform(X_train_0)
X_test_0 = scaler.transform(X_test_0)

In [82]:
X_train_1 = scaler.fit_transform(X_train_1)
X_test_1 = scaler.transform(X_test_1)

In [83]:
X_train_0.shape

(33510, 25)

In [84]:
NN_model0 = Sequential()

NN_model0.add(Dense(20, input_dim=X_train_0.shape[1], activation = "relu"))
NN_model0.add(Dense(1, activation='sigmoid'))


NN_model0.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [85]:
NN_model0.fit(X_train_0, y_train_0, epochs=10, batch_size=32)


Epoch 1/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 740us/step - accuracy: 0.5289 - loss: 0.7013
Epoch 2/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666us/step - accuracy: 0.5485 - loss: 0.6871
Epoch 3/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755us/step - accuracy: 0.5559 - loss: 0.6850
Epoch 4/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 728us/step - accuracy: 0.5587 - loss: 0.6844
Epoch 5/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 743us/step - accuracy: 0.5605 - loss: 0.6834
Epoch 6/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 815us/step - accuracy: 0.5612 - loss: 0.6832
Epoch 7/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746us/step - accuracy: 0.5624 - loss: 0.6829
Epoch 8/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 847us/step - accuracy: 0.5615 - loss: 0.6824
Epoch 9/

<keras.src.callbacks.history.History at 0x7b79bbdf9640>

In [86]:
nn_predictions = NN_model0.predict(X_test_0)
nn_predictions = (nn_predictions > 0.5).astype(int)
NN_accuracy0 =  accuracy_score(y_test_0, nn_predictions)
NN_accuracy0

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492us/step


0.5333961597361291

In [87]:
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                    activation='relu', input_shape=(X_train_0.shape[1],)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [88]:
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=100,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [89]:
tuner.search(X_train_0, y_train_0, epochs=50, validation_split=0.2)

Trial 42 Complete [00h 00m 04s]
val_accuracy: 0.522828996181488

Best val_accuracy So Far: 0.5465532541275024
Total elapsed time: 00h 02m 50s


In [90]:
best_model0 = tuner.get_best_models(num_models=1)[0]


best_model0.evaluate(X_test_0, y_test_0)

[1m 60/266[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 852us/step - accuracy: 0.5136 - loss: 0.6974 

  saveable.load_own_variables(weights_store.get(inner_path))


[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.5270 - loss: 0.6934


[0.6933702826499939, 0.5270349979400635]

In [91]:
best_model0.summary()

In [92]:
def build_model(hp):
    model = Sequential()

    # Tune how many hidden layers to use (1–4)
    num_layers = hp.Int("num_layers", min_value=1, max_value=4)

    # Input layer (first hidden layer)
    model.add(Dense(
        units=hp.Int("units_0", min_value=32, max_value=512, step=32),
        activation=hp.Choice("activation_0", ["relu", "tanh", "elu"]),
        input_shape=(X_train_0.shape[1],)
    ))

    # Loop for additional hidden layers
    for i in range(1, num_layers):
        model.add(Dense(
            units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
            activation=hp.Choice(f"activation_{i}", ["relu", "tanh", "elu"])
        ))

        # Add dropout (0.0–0.5)
        model.add(Dropout(
            hp.Float(f"dropout_{i}", 0.0, 0.5, step=0.1)
        ))

    # Output layer
    model.add(Dense(1, activation="sigmoid"))

    # Tune optimizer
    optimizer_choice = hp.Choice("optimizer", ["adam", "rmsprop", "nadam"])
    lr = hp.Float("learning_rate", 1e-5, 1e-2, sampling="log")

    if optimizer_choice == "adam":
        optimizer = Adam(learning_rate=lr)
    elif optimizer_choice == "rmsprop":
        optimizer = RMSprop(learning_rate=lr)
    else:
        optimizer = Nadam(learning_rate=lr)

    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model


In [93]:
tuner = kt.Hyperband(
    build_model,
    objective="val_accuracy",
    max_epochs=50,
    factor=3,
    directory="my_dir",
    project_name="improved_tuner"
)


In [94]:
from keras.callbacks import EarlyStopping

stop_early = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)


In [95]:
tuner.search(
    X_train_0, 
    y_train_0,
    validation_split=0.2,
    epochs=100,              
    callbacks=[stop_early],
    batch_size=kt.HyperParameters().Choice("batch_size", [16, 32, 64, 128])
)


Trial 90 Complete [00h 00m 15s]
val_accuracy: 0.5384959578514099

Best val_accuracy So Far: 0.5499850511550903
Total elapsed time: 00h 27m 57s


In [96]:
best_model0 = tuner.get_best_models(num_models=1)[0]
best_model0.summary()
best_model0.evaluate(X_test_0, y_test_0)


[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 791us/step - accuracy: 0.5316 - loss: 0.7073


[0.7073076963424683, 0.5316291451454163]

In [97]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
for hp_name, hp_value in best_hps.values.items():
    print(f"{hp_name}: {hp_value}")

Best hyperparameters:
num_layers: 2
units_0: 288
activation_0: tanh
optimizer: nadam
learning_rate: 0.004891602957779066
units_1: 320
activation_1: elu
dropout_1: 0.4
units_2: 480
activation_2: relu
dropout_2: 0.1
units_3: 480
activation_3: relu
dropout_3: 0.30000000000000004
tuner/epochs: 2
tuner/initial_epoch: 0
tuner/bracket: 3
tuner/round: 0


In [None]:
"""Best hyperparameters:
num_layers: 2
units_0: 288
activation_0: tanh
optimizer: nadam
learning_rate: 0.004891602957779066
units_1: 320
activation_1: elu
dropout_1: 0.4
units_2: 480
activation_2: relu
dropout_2: 0.1
units_3: 480
activation_3: relu
dropout_3: 0.30000000000000004"""

In [98]:
def build_model1(hp):
    model = Sequential()

    # Tune how many hidden layers to use (1–4)
    num_layers = hp.Int("num_layers", min_value=1, max_value=4)

    # Input layer (first hidden layer)
    model.add(Dense(
        units=hp.Int("units_0", min_value=32, max_value=512, step=32),
        activation=hp.Choice("activation_0", ["relu", "tanh", "elu"]),
        input_shape=(X_train_1.shape[1],)
    ))

    # Loop for additional hidden layers
    for i in range(1, num_layers):
        model.add(Dense(
            units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
            activation=hp.Choice(f"activation_{i}", ["relu", "tanh", "elu"])
        ))

        # Add dropout (0.0–0.5)
        model.add(Dropout(
            hp.Float(f"dropout_{i}", 0.0, 0.5, step=0.1)
        ))

    # Output layer
    model.add(Dense(1, activation="sigmoid"))

    # Tune optimizer
    optimizer_choice = hp.Choice("optimizer", ["adam", "rmsprop", "nadam"])
    lr = hp.Float("learning_rate", 1e-5, 1e-2, sampling="log")

    if optimizer_choice == "adam":
        optimizer = Adam(learning_rate=lr)
    elif optimizer_choice == "rmsprop":
        optimizer = RMSprop(learning_rate=lr)
    else:
        optimizer = Nadam(learning_rate=lr)

    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model


In [99]:
tuner1 = kt.Hyperband(
    build_model1,
    objective="val_accuracy",
    max_epochs=50,
    factor=3,
    directory="my_dir",
    project_name="improved_tuner1"
)

In [100]:
tuner1.search(
    X_train_1, 
    y_train_1,
    validation_split=0.2,
    epochs=100,              
    callbacks=[stop_early],
    batch_size=kt.HyperParameters().Choice("batch_size", [16, 32, 64, 128])
)

Trial 90 Complete [00h 00m 39s]
val_accuracy: 0.4726791977882385

Best val_accuracy So Far: 0.5273208022117615
Total elapsed time: 00h 25m 01s


In [101]:
best_hps = tuner1.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters:")
for hp_name, hp_value in best_hps.values.items():
    print(f"{hp_name}: {hp_value}")

Best hyperparameters:
num_layers: 4
units_0: 416
activation_0: elu
optimizer: adam
learning_rate: 0.005299246642585854
units_1: 512
activation_1: tanh
dropout_1: 0.0
units_2: 224
activation_2: relu
dropout_2: 0.0
units_3: 320
activation_3: tanh
dropout_3: 0.1
tuner/epochs: 2
tuner/initial_epoch: 0
tuner/bracket: 3
tuner/round: 0


In [None]:
"""Best hyperparameters:
num_layers: 4
units_0: 416
activation_0: elu
optimizer: adam
learning_rate: 0.005299246642585854
units_1: 512
activation_1: tanh
dropout_1: 0.0
units_2: 224
activation_2: relu
dropout_2: 0.0
units_3: 320
activation_3: tanh
dropout_3: 0.1"""