In [None]:
import pandas as pd
from lib.data_utils import *
from lib.model_utils import *
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import time

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import log_loss

import optuna

In [None]:
# Key used further down to pick this model's target columns (df_train_y[MODEL_TYPE])
# and its feature list (features_model[MODEL_TYPE]).
MODEL_TYPE = 'ACC'
# Base file name for the saved model and for the per-trial Optuna checkpoints.
MODEL_NAME = f'LEMv4_MODEL_{MODEL_TYPE}_TORCH'

In [None]:
# The model and the train / evaluate / predict helpers below are all given this device; it is pinned to the CPU.
device = torch.device("cpu")

# Loading and Preprocessing Data

In [None]:
# Build the train / optimization / test splits from the per-country Wyscout event CSVs.
# NOTE(review): Italy.csv is listed for both training and optimization — confirm the overlap is intended.
(
    df_train,
    df_train_y,
    df_optimization,
    df_optimization_y,
    df_test,
    df_test_y,
    complete_feature_set,
    features_model,
) = load_model_training_data_template(
    train_sets=[
        'data/wyscout/csv/events/Italy.csv',
        'data/wyscout/csv/events/Germany.csv',
        'data/wyscout/csv/events/France.csv',
    ],
    optimization_sets=[
        'data/wyscout/csv/events/Italy.csv',
    ],
    test_sets=[
        'data/wyscout/csv/events/Spain.csv',
        'data/wyscout/csv/events/England.csv',
    ],
)

In [None]:
# Show which target columns exist for the selected model type.
print(df_train_y[MODEL_TYPE].columns.tolist())

In [None]:
# Input columns for this model type, looked up in the per-model feature mapping
# returned by the data-loading call; printed for a quick visual check.
features = features_model[MODEL_TYPE]
print(features)

In [None]:
# Materialise every split as float numpy arrays: model inputs first, then targets.
X_train, x_optimization, X_test = (
    frame[features].astype(float).values
    for frame in (df_train, df_optimization, df_test)
)
Y_train, Y_optimization, Y_test = (
    targets[MODEL_TYPE].astype(float).values
    for targets in (df_train_y, df_optimization_y, df_test_y)
)

# Wrap the numpy arrays as float32 PyTorch tensors
X_train_tensor, X_optimization_tensor, X_test_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, x_optimization, X_test)
)
Y_train_tensor, Y_optimization_tensor, Y_test_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (Y_train, Y_optimization, Y_test)
)

# Pair each split's inputs with its targets
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
optimization_dataset = TensorDataset(X_optimization_tensor, Y_optimization_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Batch the splits; only the training loader shuffles, the other two keep row order
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1024)
optimization_dataloader = DataLoader(dataset=optimization_dataset, shuffle=False, batch_size=1024)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1024)

# Number of feature columns and of target columns; used to size the model
input_size, output_size = X_train.shape[1], Y_train.shape[1]

# Tuning Model

In [None]:
# Master switch: the Optuna search cell and the best-trial inspection cell only do work when this is True.
ENABLE_TUNING = False
# Forwarded to objective() as complexity_penalty.
# NOTE(review): presumably penalises larger architectures in the trial score — confirm against objective().
TUNNING_COMPLEXITY_PENALTY = 0.001
# Forwarded to objective() as train_test_split.
# NOTE(review): presumably the fraction of the optimization data used for fitting — confirm against objective().
TUNNING_TRAIN_TEST_SPLIT = 0.7
# Number of Optuna trials to run (n_trials of study.optimize).
TUNNING_N_TRIALS = 40

In [None]:
if ENABLE_TUNING:
    def _score_trial(trial):
        """Score one Optuna trial by delegating to objective() on the optimization tensors."""
        return objective(
            trial,
            X_optimization_tensor,
            Y_optimization_tensor,
            model_name=MODEL_NAME,
            train_test_split=TUNNING_TRAIN_TEST_SPLIT,
            complexity_penalty=TUNNING_COMPLEXITY_PENALTY,
        )

    # The study searches for the lowest objective value.
    study = optuna.create_study(direction="minimize")
    study.optimize(_score_trial, n_trials=TUNNING_N_TRIALS)

In [None]:
if ENABLE_TUNING:
    # Report the winning trial of the Optuna study created in the tuning cell.
    trial = study.best_trial
    print(trial.value, trial.params, trial.datetime_start, trial.datetime_complete)

    # The checkpoint is used directly as a model below, so it is a whole pickled model object
    # (presumably written by objective() during the study). Full unpickling is therefore requested
    # explicitly: recent PyTorch releases default to weights_only=True, which rejects such files.
    # map_location keeps the loaded tensors on the notebook's device.
    # NOTE(review): only load checkpoints produced by this project — unpickling can execute arbitrary code.
    model = torch.load(
        f'models/lem/optuna_trials/{MODEL_NAME}_{trial.number}.pt',
        map_location=device,
        weights_only=False,
    )

    # This metric is computed on the optimization loader, so it is labelled as such
    # (it was previously printed as a "Test" log loss).
    optimization_log_loss = evaluate_log_loss(model, optimization_dataloader, device)
    print(f'Optimization Log Loss: {optimization_log_loss:.4f}')

    # Left: histogram of every output column. Right: second output column only, on a log count axis.
    plt.rcParams["figure.figsize"] = (20, 5)
    all_outputs_ax = plt.subplot(1, 2, 1)
    probabilities = predict(model, X_optimization_tensor, device)
    all_outputs_ax.hist(probabilities, bins=50)
    all_outputs_ax.set(
        title='Best trial: predictions on optimization set (all outputs)',
        xlabel='Predicted value',
        ylabel='Count',
    )
    second_output_ax = plt.subplot(1, 2, 2)
    second_output_ax.hist(probabilities[:, 1], bins=50, color='C1')
    second_output_ax.set_yscale('log')
    second_output_ax.set(
        title='Best trial: output column 1 (log count)',
        xlabel='Predicted value',
        ylabel='Count',
    );

# Train model

In [None]:
# One hidden layer of 128 units; sized from the feature / target column counts computed earlier.
model = MultiLayerBinaryClassifier(input_size, [128], output_size, activation='sigmoid').to(device)
learning_rate = 0.0410
num_epochs = 100
patience = 3  # stop after this many consecutive epochs without a new best log loss
counter = 0   # epochs since the last improvement
best_val_loss = float('inf')  # any finite first-epoch loss counts as an improvement
best_state = None  # in-memory copy of the weights from the best epoch so far

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Referenced through the explicit `torch` import rather than a name leaked by a star import.
criterion = torch.nn.BCELoss()

# NOTE(review): early stopping and checkpoint selection are driven by the test loader, so the
# reported test metrics are optimistically biased — consider a separate validation split.
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    test_loss = evaluate(model, test_dataloader, criterion, device)
    test_log_loss = evaluate_log_loss(model, test_dataloader, device)
    print(f'Epoch: {epoch+1}/{num_epochs}.. Training loss: {train_loss:.4f}.. Test loss: {test_loss:.4f}.. Test Log Loss: {test_log_loss:.4f}')

    if test_log_loss < best_val_loss:
        best_val_loss = test_log_loss
        counter = 0
        # Clone the tensors: state_dict() entries share storage with the live parameters
        # and would otherwise keep changing as training continues.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        torch.save(model, f'models/lem/{MODEL_NAME}.pth')
    else:
        counter += 1
        if counter >= patience:
            break

# Without this, `model` would keep the weights of the last (non-improving) epoch while the file on
# disk holds the best one; restoring makes later cells analyse the same model that was saved.
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# Histograms of the model's predictions on the test tensors:
# left panel covers every output column, right panel only the second column on a log count axis.
plt.rcParams["figure.figsize"] = (10.6, 6.8)
left_panel = plt.subplot(1, 2, 1)
probabilities = predict(model, X_test_tensor, device)
left_panel.hist(probabilities, bins=25)
right_panel = plt.subplot(1, 2, 2)
right_panel.hist(probabilities[:, 1], bins=25)
right_panel.set_yscale('log');