# Data

### Imports

In [1]:
import numpy as np
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from GA.utils import get_avg_fitness, entropy
from GA.algorithm import run_ga
from metrics.lin_reg import get_fitness_lin_reg, get_fitness_log_reg,get_columns
from visualizers.plotters import plot_evolution

### Load data

In [2]:
import pandas as pd

# Pickled source frames, in the order they are unpacked below:
# the base two-season frame, then the two soccermix variants.
pickle_paths = (
    'data/two_season_df.pickle',
    'data/two_season_df_soccermix_history.pickle',
    'data/two_season_df_soccermix_all.pickle',
)

(
    two_season,
    two_season_soccermix_history,
    two_season_soccermix_all,
) = map(pd.read_pickle, pickle_paths)

In [3]:
# Metric stems shared by both target families. Each stem yields one regression
# target (suffix `_next_season`) and one binary target (suffix `_increase`),
# so the two lists always stay aligned position by position.
_obv_metrics = [
    'player_season_obv_90',
    'player_season_obv_dribble_carry_90',
    'player_season_obv_pass_90',
    'player_season_obv_shot_90',
    'player_season_obv_defensive_action_90',
]

# Target columns used by the linear-regression search.
linear_target_columns = [f'{metric}_next_season' for metric in _obv_metrics]

# Target columns used by the logistic-regression search.
binary_target_columns = [f'{metric}_increase' for metric in _obv_metrics]

In [4]:
# The same transformation is applied, in place, to each of the three frames.
all_frames = (two_season, two_season_soccermix_history, two_season_soccermix_all)

# Step 1: derive the change in league market value (next season minus current).
for frame in all_frames:
    frame['league_market_value_diff'] = (
        frame['league_market_value_next_season'] - frame['league_market_value']
    )

# Step 2: remove the two source columns now that the difference is stored.
# NOTE: this mutates the frames in place, so re-running the cell without
# reloading the data raises because the source columns are already gone.
for frame in all_frames:
    frame.drop(columns=['league_market_value_next_season', 'league_market_value'], inplace=True)

In [5]:
# Columns removed from every dataset before feature selection.
# Original author's note: mostly keeper cols. The last entry also removes
# the league_market_value_diff column.
drop_cols = [
    'player_season_ot_shots_faced_ratio',
    'player_season_ot_shots_faced_90',
    'player_season_shots_faced_90',
    'player_season_np_xg_faced_90',
    'player_season_gsaa_ratio',
    'player_season_np_psxg_faced_90',
    'player_season_goals_faced_90',
    'player_season_clcaa',
    'player_season_gsaa_90',
    'player_season_npot_psxg_faced_90',
    'player_season_da_aggressive_distance',
    'player_season_xs_ratio',
    'league_market_value_diff',
]

# In-place drop on each frame; raises KeyError if any listed column is absent.
for frame in (two_season, two_season_soccermix_history, two_season_soccermix_all):
    frame.drop(columns=drop_cols, inplace=True)

In [6]:
# corr = two_season.iloc[0:136].corr()
# _corr = two_season.corr()

In [7]:
# [col for col in two_season.columns if '_op_' in col]

In [8]:
# aa = {}
# for col in _corr.columns:
#     for row in _corr.index:
#         if row == col:
#             continue
#         if f'{row}-{col}' in aa.keys():
#             continue
#         v =  _corr.loc[row, col]
#         if v > 0.95 or v < -0.95:
#             aa[f'{col}-{row}'] = v
# aa

In [9]:
# extra_cols_to_drop = [
#     'player_season_sp_key_passes_90',
#     'player_season_op_xgbuildup',
#     'player_season_xgbuildup',
#     'player_season_op_xgchain',
#     'player_season_average_x_pressure',
#     'player_season_touches_inside_box_90',
#     'player_season_op_xgchain_per_possession',
#     'player_season_pressured_long_balls_90',
#     'player_season_ps_pass_length',
#     'player_season_aggressive_actions_90',
#     'player_season_clearance_90',
#     'player_season_s_pass_length',
#     'player_season_op_xgbuildup_per_possession',
#     'player_season_npg_90',
#     'player_season_starting_appearances',
#     'player_season_total_dribbles_90',
#     'player_season_op_xgchain_90',
#     'player_season_op_xgbuildup_90',
#     'player_season_average_x_defensive_action',
    
# ]

In [10]:
# two_season.drop(extra_cols_to_drop, axis=1, inplace=True)
# two_season_soccermix_history.drop(extra_cols_to_drop, axis=1, inplace=True)
# two_season_soccermix_all.drop(extra_cols_to_drop, axis=1, inplace=True)

### Split dataset into features and targets

In [11]:
# Every target column (regression and classification) is excluded from the features.
target_columns = linear_target_columns + binary_target_columns

two_season_features = two_season.drop(columns=target_columns)
two_season_soccermix_history_features = two_season_soccermix_history.drop(columns=target_columns)
two_season_soccermix_all_features = two_season_soccermix_all.drop(columns=target_columns)

# Target series keyed by column name.
# NOTE(review): targets are read from `two_season` only and later paired with
# features from all three frames; this presumably relies on the three frames
# holding the same rows in the same order -- confirm.
linear_target_sets = {target: two_season[target] for target in linear_target_columns}
binary_target_sets = {target: two_season[target] for target in binary_target_columns}

# Feature frames keyed by dataset name.
feature_sets = dict(
    two_season=two_season_features,
    two_season_soccermix_history=two_season_soccermix_history_features,
    two_season_soccermix_all=two_season_soccermix_all_features,
)

In [12]:
features_path = 'data/feature_sets.pickle'
lin_targets_path = 'data/linear_target_sets.pickle'
bin_targets_path = 'data/binary_target_sets.pickle'

# (destination path, object to pickle) pairs, written in this order.
pickle_outputs = [
    (features_path, feature_sets),
    (lin_targets_path, linear_target_sets),
    (bin_targets_path, binary_target_sets),
]

# Make sure each destination directory exists before anything is written.
for out_path, _ in pickle_outputs:
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Serialise each object to its file.
for out_path, payload in pickle_outputs:
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

# Artificial task

### Imports

In [13]:
# import numpy as np
# from math import sin
# from GA.algorithm import run_ga
# from GA.utils import entropy
# from metrics.lin_reg import get_fitness
# from visualizers.plotters import plot_population_fitness

### Parameters

In [14]:
# population_size = 50
# generations = 10
# crossover_rate = 0.8
# mutation_rate = 0.1

### Fitness function

In [15]:
# # return sin fitness, positive fitness (fitness shifted to positive interval) and decimal value
# def fitness_function(bitstring):
#     # get real value of bitstring and scale to fit interval of [0, 128]
#     scaling_factor = 2**(-94)
#     value = int(bitstring, base=2) * scaling_factor
#     fitness = sin(value)
#     return fitness, fitness + 1, value

### SGA

In [16]:
# entropy_lst_artificial_task_without_crowding = []

# # define custom callback
# def callback(population, generation):
#     entropy_lst_artificial_task_without_crowding.append(entropy(population))
#     plot_population_fitness(population, [0, 128], additional_functions=[np.sin])

        

# print('Running genetic algorithm \n')

# run_ga(population_size, generations, mutation_rate, crossover_rate, fitness_function, crowding=False, epoch_callback=callback)


### With crowding

In [17]:
# entropy_lst_artificial_task_with_crowding = []


# def callback(population, generation):
#     entropy_lst_artificial_task_with_crowding.append(entropy(population))
#     plot_population_fitness(population, [0, 128], additional_functions=[np.sin])


# print('Running genetic algorithm \n')

# run_ga(population_size, generations, mutation_rate, crossover_rate, fitness_function, crowding=True, epoch_callback=callback)


### Compare crowding vs no crowding

In [18]:
# from visualizers.plotters import plot_and_compare_evolutions

# plot_and_compare_evolutions(entropy_lst_artificial_task_without_crowding, "SGA", entropy_lst_artificial_task_with_crowding, "Crowding", "Entropy")

# Real task

### Parameters

In [19]:
# Genetic-algorithm settings; each value is passed unchanged to run_ga below.
# NOTE(review): no random seed is set in the visible cells. If run_ga draws
# random numbers, results will differ between runs -- confirm and seed if so.
mutation_rate = 0.1
crossover_rate = 0.8
generations = 80
population_size = 100

In [20]:
# for target_key in linear_target_sets.keys():
#     for feature_key in feature_sets.keys():
#         features = feature_sets[feature_key]
#         targets = linear_target_sets[target_key]
#         print(get_fitness_lin_reg(features.to_numpy(), targets.to_numpy(), n_splits=5, include_mae=True))


In [21]:
# for target_key in binary_target_sets.keys():
#     for feature_key in feature_sets.keys():
#         features = feature_sets[feature_key]
#         targets = binary_target_sets[target_key]
#         print(get_fitness_log_reg(features.to_numpy(), targets.to_numpy(), n_splits=5))

### Linear regression

In [22]:
def _make_linear_fitness(features, targets):
    """Build the GA fitness callback for one (feature set, target) pair.

    The returned callable takes a bitstring, picks columns from ``features``
    with ``get_columns`` and scores them against ``targets`` with
    ``get_fitness_lin_reg`` (``n_splits=5``). It returns the 3-tuple
    ``(mse, positive_fitness, None)``.
    """
    def fitness_function(bitstring):
        columns = get_columns(features, bitstring)
        mse = get_fitness_lin_reg(columns, targets, n_splits=5)

        # a lower error should give a higher positive fitness, hence the inverse
        positive_fitness = 1 / ((mse**2) * 100)
        return mse, positive_fitness, None

    return fitness_function


# linear_results[target][feature set] -> genotype of the selected individual
linear_results = {}
counter = 0

for target_key, targets in linear_target_sets.items():
    linear_results[target_key] = {}
    for feature_key, features in feature_sets.items():
        fitness_function = _make_linear_fitness(features, targets)

        final_population = run_ga(
            len(features.columns),
            population_size,
            generations,
            mutation_rate,
            crossover_rate,
            fitness_function,
        )

        # Ascending sort: the individual with the smallest `.fitness` ends up first.
        # NOTE(review): presumably `.fitness` is the first value returned by
        # fitness_function (the error) -- confirm in GA.algorithm.
        final_population.sort(key=lambda individual: individual.fitness)
        linear_results[target_key][feature_key] = final_population[0].genotype

        # progress marker: one line per finished (target, feature set) run
        print(counter)
        counter += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [23]:
linear_results_dict_path = 'data/linear_results_dict.pickle'

# Ensure the destination directory is present (no error if it already exists).
linear_results_dir = os.path.dirname(linear_results_dict_path)
os.makedirs(linear_results_dir, exist_ok=True)

# Persist the genotypes selected by the linear-regression search.
with open(linear_results_dict_path, 'wb') as file:
    pickle.dump(linear_results, file)

### Logistic regression

In [24]:
def _make_logistic_fitness(features, targets):
    """Build the GA fitness callback for one (feature set, binary target) pair.

    The returned callable takes a bitstring, picks columns from ``features``
    with ``get_columns`` and scores them against ``targets`` with
    ``get_fitness_log_reg`` (``n_splits=5``). It returns the 3-tuple
    ``(score, positive_fitness, None)`` where ``positive_fitness`` is the
    squared score.
    """
    def fitness_function(bitstring):
        columns = get_columns(features, bitstring)
        score = get_fitness_log_reg(columns, targets, n_splits=5)
        positive_fitness = score**2
        return score, positive_fitness, None

    return fitness_function


# binary_results[target][feature set] -> genotype of the selected individual
binary_results = {}
counter = 0

for target_key, targets in binary_target_sets.items():
    binary_results[target_key] = {}
    for feature_key, features in feature_sets.items():
        fitness_function = _make_logistic_fitness(features, targets)

        final_population = run_ga(
            len(features.columns),
            population_size,
            generations,
            mutation_rate,
            crossover_rate,
            fitness_function,
        )

        # Descending sort: the individual with the largest `.fitness` ends up first.
        # NOTE(review): presumably `.fitness` is the first value returned by
        # fitness_function (the score) -- confirm in GA.algorithm.
        final_population.sort(key=lambda individual: individual.fitness, reverse=True)
        binary_results[target_key][feature_key] = final_population[0].genotype

        # progress marker: one line per finished (target, feature set) run
        print(counter)
        counter += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [25]:
binary_results_dict_path = 'data/binary_results_dict.pickle'

# Ensure the destination directory is present (no error if it already exists).
binary_results_dir = os.path.dirname(binary_results_dict_path)
os.makedirs(binary_results_dir, exist_ok=True)

# Persist the genotypes selected by the logistic-regression search.
with open(binary_results_dict_path, 'wb') as file:
    pickle.dump(binary_results, file)

### With crowding

In [26]:
# entropy_lst_real_task_with_crowding = []
# rmse_lst_with_crowding = []

# # define custom callback
# def callback(population, generation):
#     entropy_lst_real_task_with_crowding.append(entropy(population))
#     rmse = get_avg_fitness(population)
#     rmse_lst_with_crowding.append(rmse)
#     if generation % 10 == 0:
#         space = " " if generation == 0 else ""
#         print('Generation', space, str(generation) ,' | Population MSE ==>  ', rmse)

        

# print('MSE without feature selection: ', get_fitness(data, targets))

# print('\nRunning genetic algorithm \n')

# final_population = run_ga(population_size, generations, mutation_rate, crossover_rate, fitness_function, crowding=True, epoch_callback=callback)

# plot_evolution(rmse_lst_with_crowding, "MSE - Crowding")

### Compare crowding vs no crowding

In [27]:
# from visualizers.plotters import plot_and_compare_evolutions

# plot_and_compare_evolutions(rmse_lst_without_crowding, "SGA", rmse_lst_with_crowding, "Crowding", "RMSE")
# plot_and_compare_evolutions(entropy_lst_real_task_without_crowding, "SGA", entropy_lst_real_task_with_crowding, "Crowding", "Entropy")