In [None]:
import pandas as pd
from lib.data_utils import *
from lib.model_utils import *
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import time
import os

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from lib.simulator import *

In [None]:
# Device used by load_models() (`.to(device)`) and passed to train() in the
# fine-tuning loop. Hard-coded to CPU.
device = torch.device("cpu")

In [None]:
# Fine-tuning hyper-parameters, one entry per model head.
#   learning_rate -- step size handed to the Adam optimizer
#   num_epochs    -- upper bound on training epochs
#   patience      -- non-improving epochs tolerated before stopping early
PARAMETERS = dict(
    TYPE=dict(learning_rate=0.0001, num_epochs=25, patience=3),
    ACC=dict(learning_rate=0.0041, num_epochs=25, patience=3),
    DATA=dict(learning_rate=0.00063, num_epochs=25, patience=3),
)

In [None]:
def prepare_data(df_base_x, team=None, player=None, remove_player=None, is_home=True, regex=False):
    """Select the event rows used to fine-tune a team and/or player model.

    Reads the columns ``is_home_team``, ``match_name``, ``team_name`` and
    ``player_name`` of ``df_base_x``.

    Parameters
    ----------
    df_base_x : pandas.DataFrame
        Event-level feature frame.
    team : str, optional
        Team name. Selects every row (from either side) of the matches whose
        ``match_name`` contains ``team`` and in which ``team`` plays on the
        side given by ``is_home``.
    player : str, optional
        Player name. Selects the rows of that player on the requested side
        and appends them to the team selection (if any).
    remove_player : str, optional
        Player whose rows are dropped from the *team* selection. Ignored when
        ``team`` is None.
    is_home : bool, default True
        Keep home-side data when True, away-side data when False.
    regex : bool, default False
        Whether names are interpreted as regular expressions. Names are
        matched literally by default so that the ``.`` in e.g. ``'A. Barnes'``
        is not treated as a wildcard; pass ``regex=True`` to get pattern
        matching.

    Returns
    -------
    pandas.DataFrame
        Copies of the selected rows, team rows first and player rows after,
        keeping their original index labels.

    Raises
    ------
    ValueError
        If neither ``team`` nor ``player`` is given (nothing to select).
    """
    if team is None and player is None:
        raise ValueError('prepare_data needs at least one of `team` or `player`.')

    def _mentions(column, name):
        # na=False maps missing names to "no match" and keeps the mask a
        # plain boolean Series, so `~` and `&` below behave as logical ops.
        return column.str.contains(name, regex=regex, na=False)

    if is_home:
        side_selection = df_base_x.is_home_team
    else:
        side_selection = ~df_base_x.is_home_team

    data = []
    if team is not None:
        in_team_match = _mentions(df_base_x.match_name, team)
        by_team = _mentions(df_base_x.team_name, team)
        # Rows of `team` on the requested side, plus the opponent's rows from
        # those same matches (opponent is on the other side and is not `team`).
        team_mask = in_team_match & ((side_selection & by_team) | (~side_selection & ~by_team))
        if remove_player is not None:
            team_mask = team_mask & ~_mentions(df_base_x.player_name, remove_player)
        data.append(df_base_x[team_mask].copy())

    if player is not None:
        player_mask = _mentions(df_base_x.player_name, player) & side_selection
        data.append(df_base_x[player_mask].copy())

    return pd.concat(data)

def load_models(base_model=None):
    """Load the three models ('TYPE', 'ACC', 'DATA') in evaluation mode.

    Parameters
    ----------
    base_model : str, optional
        When None, the base checkpoints under ``models/lem/`` are loaded.
        Otherwise the checkpoints
        ``models/finetuning/team_representations/{base_model}_{TYPE|ACC|DATA}.pth``
        are loaded.

    Returns
    -------
    dict
        Maps 'TYPE', 'ACC' and 'DATA' to the loaded model, placed on the
        module-level ``device`` and switched to ``eval()`` mode.
    """
    if base_model is None:
        # NOTE(review): ACC points at a LEMv4 file while TYPE/DATA use LEMv3 --
        # confirm the version mix is intended.
        checkpoint_paths = {
            'TYPE': 'models/lem/LEMv3_MODEL_TYPE_TORCH.pth',
            'ACC': 'models/lem/LEMv4_MODEL_ACC_TORCH.pth',
            'DATA': 'models/lem/LEMv3_MODEL_DATA_TORCH.pth',
        }
    else:
        checkpoint_paths = {
            model_type: f'models/finetuning/team_representations/{base_model}_{model_type}.pth'
            for model_type in ('TYPE', 'ACC', 'DATA')
        }

    models = {}
    for model_type, path in checkpoint_paths.items():
        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        # map_location lets checkpoints saved on another device (e.g. a GPU)
        # be deserialized straight onto `device`.
        # NOTE(review): on PyTorch releases where torch.load defaults to
        # weights_only=True, these fully pickled modules need
        # weights_only=False -- confirm against the installed torch version.
        model = torch.load(path, map_location=device).to(device)
        model.eval()
        models[model_type] = model

    return models

def prepare_dataloader(df_selected_data, df_base_y, model_type, features):
    """Build a shuffled training DataLoader for one model head.

    Parameters
    ----------
    df_selected_data : pandas.DataFrame
        Selected input rows; the ``features`` columns are cast to float.
    df_base_y : mapping
        ``df_base_y[model_type]`` must support ``.loc`` with the index labels
        of ``df_selected_data`` and yield a 2-D target table.
    model_type : str
        Key into ``df_base_y``.
    features : list
        Input column names.

    Returns
    -------
    tuple
        ``(dataloader, input_size, output_size)`` where the sizes are the
        number of input and target columns.
    """
    inputs = df_selected_data[features].astype(float).values
    targets = df_base_y[model_type].loc[df_selected_data.index].astype(float).values

    # Batch size grows as ln(n_rows)^2 and is clamped to the range [32, 256].
    scaled_size = np.log(len(targets)) ** 2
    batch_size = int(max(min(scaled_size, 256), 32))

    dataset = TensorDataset(
        torch.tensor(inputs, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return loader, inputs.shape[1], targets.shape[1]

def check_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so the call
    is safe when the directory is created concurrently between the check and
    the creation.
    """
    os.makedirs(directory, exist_ok=True)

def check_if_pth_exists(model_name):
    """Return True when ``models/finetuning/<model_name>.pth`` exists."""
    checkpoint_path = 'models/finetuning/{}.pth'.format(model_name)
    return os.path.exists(checkpoint_path)

# Load data & models

## Data

In [None]:
# Load the train / optimization / test splits from the Wyscout event CSVs.
# NOTE(review): Italy.csv appears in both train_sets and optimization_sets --
# confirm this overlap is intended.
(
    df_train,
    df_train_y,
    df_optimization,
    df_optimization_y,
    df_test,
    df_test_y,
    complete_feature_set,
    features_model,
) = load_model_training_data_template(
    train_sets=[
        'data/wyscout/csv/events/Italy.csv',
        'data/wyscout/csv/events/Germany.csv',
        'data/wyscout/csv/events/France.csv',
    ],
    optimization_sets=[
        'data/wyscout/csv/events/Italy.csv',
    ],
    test_sets=[
        'data/wyscout/csv/events/Spain.csv',
        'data/wyscout/csv/events/England.csv',
    ],
)

# Tests

In [None]:
# Maps each team name to one of its players. The keys drive the per-team test
# configurations; the value is used as `remove_player` in the
# 'player_replacement' tests, i.e. the player whose rows are dropped from the
# team's data.
PL_TEAMS_REPLACEMENT = {
    'Arsenal': 'A. Iwobi', 
    'Leicester City': 'M. Albrighton', 
    'Manchester City': 'L. Sané', 
    'Brighton & Hove Albion': 'S. March', 
    'Burnley': 'A. Barnes', 
    'Chelsea': 'V. Moses', 
    'Crystal Palace': 'C. Benteke', 
    'Huddersfield Town': 'T. Ince', 
    'Everton': 'D. Calvert-Lewin', 
    'Stoke City': 'P. Crouch', 
    'Manchester United': 'R. Lukaku', 
    'West Ham United': 'M. Antonio', 
    'Tottenham Hotspur': 'Son Heung-Min', 
    'Newcastle United': 'Joselu', 
    'Swansea City': 'S. Clucas', 
    'Southampton': 'N. Redmond', 
    'Watford': 'A. Carrillo', 
    'Liverpool': 'S. Mané', 
    'West Bromwich Albion': 'J. Rodriguez', 
    'AFC Bournemouth': 'J. Ibe', 
    }

In [None]:
# Home Only Data
N_ITERATIONS = 10
TESTS = {}


def _queue_tests(test_type, team=None, player=None, remove_player=None):
    """Append N_ITERATIONS identical home-side configurations to TESTS.

    Each configuration is a fresh dict keyed by the next free integer id.
    """
    for _ in range(N_ITERATIONS):
        TESTS[len(TESTS)] = {
            'type': test_type,
            'team': team,
            'player': player,
            'remove_player': remove_player,
            'is_home': True,
            'base_model': None,
        }


# 1) Team-only representations.
for team in PL_TEAMS_REPLACEMENT:
    _queue_tests('team_representations', team=team)

# 2) For every team: add a star player to the team data, then swap the
#    team's listed player for that star player.
for team, replaced_player in PL_TEAMS_REPLACEMENT.items():
    for player in ('Cristiano Ronaldo', 'L. Messi'):
        _queue_tests('player_adding', team=team, player=player)
        _queue_tests('player_replacement', team=team, player=player, remove_player=replaced_player)

# 3) Player-only data.
for player in ['L. Messi', 'Cristiano Ronaldo', 'T. Kroos', 'Iago Aspas', 'Dani Parejo',
               'L. Suárez', 'A. Griezmann', 'Casemiro', 'Illarramendi', 'Sergio Ramos']:
    _queue_tests('player_only', player=player)

print(len(TESTS))

In [None]:
# Fine-tune one copy of each model (TYPE / ACC / DATA) per entry of TESTS.
# Checkpoints are written under models/finetuning/<type>/<description>/ and
# the per-test training summary is written to RESULTS_DIR/TESTS.csv.
MODEL_TYPES = ['TYPE', 'ACC', 'DATA']
RESULTS_DIR = 'res/training_process_data'
MIN_IMPROVEMENT = 0.00005  # smallest drop in training loss that counts as progress

# Make sure the summary CSV can be written before any training time is spent.
check_dir(RESULTS_DIR)

cached_selection = None
for test_id in tqdm(TESTS.keys()):
    test = TESTS[test_id]

    # Rebuild the dataloaders only when the data selection changes. The key is
    # an immutable tuple on purpose: comparing against the previous TESTS entry
    # never matches once that entry has been extended with the result fields
    # written at the end of this loop.
    selection = (test['team'], test['player'], test['remove_player'], test['is_home'])
    if selection != cached_selection:
        df_selected_data = prepare_data(
            df_test,
            team=test['team'],
            player=test['player'],
            remove_player=test['remove_player'],
            is_home=test['is_home'],
        )
        train_dataloader, input_size, output_size = {}, {}, {}
        for MODEL_TYPE in MODEL_TYPES:
            train_dataloader[MODEL_TYPE], input_size[MODEL_TYPE], output_size[MODEL_TYPE] = prepare_dataloader(
                df_selected_data, df_test_y, MODEL_TYPE, features_model[MODEL_TYPE]
            )
        cached_selection = selection

    # Directory name: "<team>_<player>_<remove_player>_<H|A>", with unset
    # parts and dots removed. It does not depend on the model type.
    home_sign = 'H' if test['is_home'] else 'A'
    test_description = str(test['team']) + '_' + str(test['player']) + '_' + str(test['remove_player'])
    test_description = test_description.replace('None_', '').replace('.', '').replace('_None', '')
    test_description += '_' + home_sign

    check_dir(f'models/finetuning/{test["type"]}/{test_description}')

    # A checkpoint on disk means this (test, model type) pair was already
    # handled by a previous run; skip it.
    checkpoint_names = {
        MODEL_TYPE: f'{test["type"]}/{test_description}/LEM_V343_{test_id}_{MODEL_TYPE}'
        for MODEL_TYPE in MODEL_TYPES
    }
    pending = [MODEL_TYPE for MODEL_TYPE in MODEL_TYPES if not check_if_pth_exists(checkpoint_names[MODEL_TYPE])]
    if not pending:
        continue

    # Load fresh base models only when there is something left to train.
    # NOTE(review): load_models() returns the models in eval() mode -- confirm
    # that train() switches them to training mode itself.
    models = load_models(base_model=test['base_model'])

    for MODEL_TYPE in pending:
        params = PARAMETERS[MODEL_TYPE]
        checkpoint_path = f'models/finetuning/{checkpoint_names[MODEL_TYPE]}.pth'

        optimizer = optim.Adam(models[MODEL_TYPE].parameters(), lr=params['learning_rate'])
        criterion = torch.nn.BCELoss()

        # Early stopping on the *training* loss (no validation split is used).
        counter, best_train_loss = 0, float('inf')
        last_epoch = None
        for epoch in range(params['num_epochs']):
            last_epoch = epoch
            train_loss = train(models[MODEL_TYPE], train_dataloader[MODEL_TYPE], criterion, optimizer, device)

            if train_loss < (best_train_loss - MIN_IMPROVEMENT):
                best_train_loss = train_loss
                counter = 0

                torch.save(models[MODEL_TYPE], checkpoint_path)
            else:
                counter += 1
                if counter >= params['patience']:
                    break

        TESTS[test_id][f'{MODEL_TYPE}_train_loss'] = best_train_loss
        # 0-based index of the last epoch that ran (not an epoch count).
        TESTS[test_id][f'{MODEL_TYPE}_epochs'] = last_epoch

        # Persist after every model so a crash loses at most one result.
        pd.DataFrame(TESTS).T.to_csv(f'{RESULTS_DIR}/TESTS.csv')