In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from models import Dim
from datasets import fifa
import matplotlib.pyplot as plt
from math import floor
import numpy as np
from tqdm import tqdm

In [None]:
# Instantiate dim. red. framework
dim = Dim()

# Load dataset
fif = fifa()

X_train, X_test, y_train, y_test = train_test_split(fif.X, fif.y, test_size=0.1, random_state=33)
dim.col_names = fif.col_names
dim.X_train = X_train
dim.y_train = y_train
dim.X_test = X_test
dim.y_test = y_test

dim.new_dim = dict()

## Set Up

In [None]:
# Unplickle dim
dim.unpickle_dim('dim/02-11-12:42.pkl')

# Read scores
scores = pd.read_csv('scores/07-04-20:42.csv')

# Filter for the best configuration
best_sc = scores.loc[scores.sort_values('Dimensions').groupby('Dim. Technique')['Accuracy'].idxmax()]

# Transform rows to dictionary keys
best_sc = [tuple(row) for row in best_sc[['Dimensions', 'Dim. Technique', 'Dim. Params']].to_records(index=False)]

# List execution parameters to plot
best_sc = [('3Dim', 'KPCA', 'Linear'),
 ('5Dim', 'LLE', 'k=81-reg=0.001'),
 ('5Dim', 'LOL', ''),
 ('4Dim', 'LPP', 'k=5'),
 ('3Dim', 'PCA', ''),
 ('2Dim', 'SLMVP', 'Radial-Gammas=0.1')]

# Select only the executions with the parameters of 'best_sc'
newdict = {k: dim.new_dim[k] for k in best_sc}
dim.new_dim = newdict

# Get the variability 
weights = dim.get_weights().droplevel(3, axis=1)

# Format weights
weigts_c = []
for key in dim.new_dim.keys():
    if weigts_c != '':
        weigts_c = weigts_c + weights[key].tolist()
weigts_c = [x for x in weigts_c if x != '']

# Get the  correlations
corrs = dim.get_corr_table(num_dim=None, abs=False)

# Multiply each column by corresponding weigth
for i, col in enumerate(corrs.columns):
    corrs[col] *= weigts_c[i]

# Initialize dictionary to hold lists of tuples
result_dict = {}

# Group tuples by their second element
for tup in corrs.keys():

    if tup[1] not in result_dict:
        result_dict[tup[1]] = [(tup)]
    else:
        result_dict[tup[1]].append((tup))

# Convert dictionary values to lists
header_groups = [values for values in result_dict.values()]

corrs_avg = pd.DataFrame()
for header_g in header_groups:
    corrs_avg[header_g[0][1]] = corrs[header_g].abs().mean(axis=1)

corrs_avg.head()

## Plot

In [None]:
corrs['skill_set'] = corrs.index.to_series().str.split("_").str[0]
colors = corrs['skill_set'].map(
    {'attacking': '#1f77b4',
     'skill': '#2ca02c',
     'movement' : '#d62728',
     'power' : '#9467bd',
     'mentality' : '#ff7f0e',
     'defending' : '#7f7f7f'})

corrs.drop(columns=['skill_set'], inplace=True)

n_rows = 6
n_cols = 6

fig, ax = plt.subplots(
    n_rows, n_cols, figsize=(22, 45), sharey='row')

# Plot
pos = -1
for idx, key_dim in enumerate(list(corrs.keys())):
    values = corrs[key_dim]#.sort_values(ascending=True)
    categories = values.index
    if key_dim[-1] == 0:
        pos += 1
    ax[pos][key_dim[-1]+1].barh(categories, values, color=colors)
    #ax[floor(idx/n_cols)][idx % n_cols].set_title(key_dim)

# Plot avgs
for idx, key_dim in enumerate(list(corrs_avg.keys())):
    values = corrs_avg[key_dim]#.sort_values(ascending=True)
    categories = values.index
    ax[idx][0].barh(categories, values, color=colors)
    
# Add column labels
col_labels = ['Avg.', '1', '2', '3', '4', '5']
for ax_, col in zip(ax[0], col_labels):
    ax_.annotate(col, xy=(0.5, 1), xytext=(0, 20),
                xycoords='axes fraction', textcoords='offset points',
                size=30, ha='center', va='baseline')

# Add row lables
row_labels = [x[1] + '\n' + x[2] for x in corrs.keys().to_list()]
row_labels = [x for i, x in enumerate(row_labels) if x not in row_labels[:i]]
for ax_, row in zip(ax[:,0], row_labels):
    ax_.annotate(row, xy=(0, 0.5), xytext=(-ax_.yaxis.labelpad - 75, 0),
                xycoords=ax_.yaxis.label, textcoords='offset points',
                size=20, ha='center', va='center')
    
plt.subplots_adjust(hspace=0.0)  # Adjusting spacing between subplots

# Set the range of the x-axis for all plots
for ax_row in ax:
    # Get x-axis range for the correlations of the dim with the highest
    xlims = [x.get_xlim() for x in ax_row[1:]]
    # Remove x lims for the empty plots, tuples (0.0, 1.0)
    xlims = [x for x in xlims if x != (0.0, 1.0)]
    # Select the biggest xlim
    xlim = max(xlims, key=lambda t: abs(t[0] - t[1]))
    for ax_ in ax_row[1:]:
        ax_.set_xlim(xlim)

plt.tight_layout()
plt.show()


## Feature Selection #10

1. Calculates the weighted average of the components-features absolute correlations (previously calculated)
2. Orders the features by those average correlations in descending order
3. Trains ML classifiers that use the top 1, top 2, ..., top n features using AUROC as the performance metric
4. Creates plots like the following

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Use the calculated weighted average of the components-features absolute correlations
# Order the features by those average correlations in descending order
features = {}
for dim_t in corrs_avg.keys():
    features[dim_t] = corrs_avg[[dim_t]].sort_values(by=dim_t, ascending=False).index.to_list()

In [51]:
feats = ['attacking_crossing',
 'skill_ball_control']
indices = [fif.col_names.index(attr) for attr in feats]
indices

[0, 9]

In [61]:
X_train

array([[45, 28, 62, ..., 60, 60, 60],
       [36, 59, 57, ..., 22, 25, 23],
       [37, 25, 75, ..., 81, 77, 68],
       ...,
       [41, 33, 59, ..., 52, 62, 67],
       [31, 27, 71, ..., 68, 71, 64],
       [35, 22, 52, ..., 51, 50, 49]])

In [64]:
X_train[:,[0,9]].shape

(2, 29)

In [45]:
fif.col_names

['attacking_crossing',
 'attacking_finishing',
 'attacking_heading_accuracy',
 'attacking_short_passing',
 'attacking_volleys',
 'skill_dribbling',
 'skill_curve',
 'skill_fk_accuracy',
 'skill_long_passing',
 'skill_ball_control',
 'movement_acceleration',
 'movement_sprint_speed',
 'movement_agility',
 'movement_reactions',
 'movement_balance',
 'power_shot_power',
 'power_jumping',
 'power_stamina',
 'power_strength',
 'power_long_shots',
 'mentality_aggression',
 'mentality_interceptions',
 'mentality_positioning',
 'mentality_vision',
 'mentality_penalties',
 'mentality_composure',
 'defending_marking_awareness',
 'defending_standing_tackle',
 'defending_sliding_tackle',
 'position']

In [None]:
X_train[:,[0,9]]

In [66]:


indices = [fif.col_names.index(x) for x in features['KPCA'][:3]]
indices

[28, 27, 21]

In [70]:
# Instantiate dim. red. framework
dim = Dim()

# Load dataset
fif = fifa()

X_train, X_test, y_train, y_test = train_test_split(fif.X, fif.y, test_size=0.1, random_state=33)
dim.col_names = fif.col_names

y_train = y_train.ravel()
y_test = y_test.ravel()

# Create DataFrame
data = {
    'Features': [''],
    'SVM_Accuracy': [0],
    'XGBoost_Accuracy': [0]
}
results_df = pd.DataFrame(data)

for dim_t in features:
    for i in tqdm(range(1, 30)):

        if frozenset(features[dim_t][:i]) not in results_df['Features'].values:
            # Select feature subset
            indices = [fif.col_names.index(x) for x in features[dim_t][:i]]
            X_train_subset = X_train[:, indices]
            X_test_subset = X_test[:, indices]

            # Train ML classifier
            # SVM
            svm_pipe = Pipeline([('mms', MinMaxScaler()),
                                 ('svm', SVC())])
            params = [{'svm__C': [0.1, 1, 10],
                       'svm__kernel': ['linear', 'rbf', 'poly']}]
            gs_svm = GridSearchCV(svm_pipe,
                                  param_grid=params,
                                  scoring='accuracy',
                                  cv=5)
            gs_svm.fit(X_train_subset, y_train)
            # Calculate AUROC and other performance metrics
            svm_accuracy = accuracy_score(y_test, gs_svm.predict(X_test_subset))

            # XGBOOST            
            xgb_pipe = Pipeline([('mms', MinMaxScaler()), ('xgb', XGBClassifier())])
            params = [{'xgb__n_estimators': [5, 10, 20, 50, 100]}]
            gs_xgb = GridSearchCV(xgb_pipe,
                                  param_grid=params,
                                  scoring='accuracy',
                                  cv=5)
            gs_xgb.fit(X_train_subset, y_train)
            # Calculate AUROC and other performance metrics
            xgb_accuracy = accuracy_score(y_test, gs_xgb.predict(X_test_subset))
            # Save results
            results_df = results_df.append({'Features': frozenset(features[dim_t][:i]),
                                            'SVM_Accuracy': svm_accuracy,
                                            'XGBoost_Accuracy': xgb_accuracy}, ignore_index=True)

    # Export DataFrame to CSV
    results_df.to_csv('results.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  y = column_or_1d(y, warn=True)
  3%|▎         | 1/29 [03:33<1:39:33, 213.36s/it]


IndexError: index 28 is out of bounds for axis 1 with size 1