source: https://sofifa.com/

# Importy

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
import re
import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn import datasets, linear_model

# Wczytywanie danych i pre-processing

## Czyszczenie danych

In [None]:
# Load the player dataset from the local CSV file and print a summary of its
# columns, dtypes and non-null counts.
df = pd.read_csv('Fifa2022-Males.csv', low_memory=False)
df.info()

In [None]:
# Display every column name of the raw dataset as a plain list.
df.columns.tolist()

In [None]:
# Mapping from a detailed pitch position code to one of four coarse groups.
position_info = {
    "RF":"Forward",
    "CF":"Forward",
    "LF":"Forward",
    "RS":"Forward",
    "ST":"Forward",
    "LS":"Forward",
    "LW":"Forward",
    "RW":"Forward",
    "RDM":"Middle",
    "RCDM":"Middle",
    "CDM":"Middle",
    "LCDM":"Middle",
    "LDM":"Middle",
    "RWM":"Middle",
    "RM":"Middle",
    "RCM":"Middle",
    "CM":"Middle",
    "LCM":"Middle",
    "LM":"Middle",
    "LWM":"Middle",
    "RAM":"Middle",
    "RCAM":"Middle",
    "CAM":"Middle",
    "LCAM":"Middle",
    "LAM":"Middle",
    "SW":"Back",
    "RWB":"Back",
    "RB":"Back",
    "RCB":"Back",
    "CB":"Back",
    "LCB":"Back",
    "LB":"Back",
    "LWB":"Back",
    "GK":"Goalkeeper"
}


def _resolve_main_position(club_position, nation_position, player_positions):
    """Return the coarse position group for one player.

    The club position is used when it is a known code, otherwise the national
    team position. When neither is known, the comma-separated
    ``player_positions`` string is used: the group that occurs most often
    wins, and ties go to the group listed first. Codes missing from
    ``position_info`` are ignored.

    Returns ``None`` when no known code is available at all (including the
    case where ``player_positions`` is not a string, e.g. NaN).
    """
    for assigned in (club_position, nation_position):
        if assigned in position_info:
            return position_info[assigned]

    if not isinstance(player_positions, str):
        return None

    groups = [
        position_info[code]
        for code in player_positions.replace(" ", "").split(",")
        if code in position_info
    ]
    if not groups:
        return None
    # Iterating the list (not a set) makes tie-breaking deterministic:
    # max() keeps the first maximal element it encounters.
    return max(groups, key=groups.count)


# zip() walks the rows positionally, so this does not depend on the frame
# having a default 0..n-1 index.
main_position = [
    _resolve_main_position(club, nation, listed)
    for club, nation, listed in zip(
        df['club_position'], df['nation_position'], df['player_positions']
    )
]
df['player_position'] = main_position
df['player_position'].unique()

In [None]:
# Remove identifier, club/national-team bookkeeping and free-text columns
# that are not used in the rest of the analysis.
df = df.drop(
    columns=[
        # player identifiers
        'sofifa_id', 'long_name',
        'dob',  # date of birth
        # club bookkeeping
        'club_team_id', 'club_position', 'club_jersey_number',
        'club_loaned_from', 'club_joined', 'club_contract_valid_until',
        # national team bookkeeping
        'nationality_id', 'nation_team_id', 'nation_position',
        'nation_jersey_number',
        # categorical / free-text descriptors
        'work_rate', 'body_type', 'real_face', 'player_tags',
        'player_traits', 'preferred_foot',
    ]
)

In [None]:
def nan_columns(df: DataFrame, head: int = 10):
    """Print the columns with the most missing values.

    Counts the NA entries of every column of ``df``, orders the counts from
    largest to smallest and prints the first ``head`` of them.
    """
    missing_per_column = df.isna().sum()
    ranked = missing_per_column.sort_values(ascending=False)
    print(ranked.head(head))

In [None]:
# Show the 18 columns with the most missing values after the first column drop.
nan_columns(df, head=18)

In [None]:
# Rows without a market value are discarded, then the columns holding
# incompatible data (from 2 different versions, complementary NaNs) are removed.
df = df.dropna(subset=['value_eur']).drop(
    columns=[
        'pace', 'defending', 'dribbling', 'passing', 'shooting', 'physic',
        'goalkeeping_speed',
    ]
)
# Missing wing ratings are filled from the closest forward slot on the same side.
df['rw'] = df['rw'].fillna(df['rf'])
df['lw'] = df['lw'].fillna(df['lf'])
# A missing release clause is taken to mean that there is none.
df['release_clause_eur'] = df['release_clause_eur'].fillna(0)

# TODO: map 'players_positions' to 'players_position' (one of the four groups)

In [None]:
# Re-check the three columns with the most missing values after the cleanup.
nan_columns(df, head=3)

In [None]:
# Remove the columns that significantly influence the classifier's score.
df = df.drop(
    columns=['wage_eur', 'release_clause_eur', 'overall', 'potential']
)

In [None]:
# Column names grouped by theme. The 'score' group ('overall', 'potential')
# is intentionally absent from the grouping.
cols_dict = {
    'descriptive': [
        'short_name',
        'player_positions',
        'player_position',  # processed position (1 of 4)
        'club_name',
        'league_name',
        'nationality_name',
    ],
    'attacking': [
        'attacking_crossing',
        'attacking_finishing',
        'attacking_heading_accuracy',
        'attacking_short_passing',
        'attacking_volleys',
    ],
    'skill': [
        'skill_dribbling',
        'skill_curve',
        'skill_fk_accuracy',
        'skill_long_passing',
        'skill_ball_control',
    ],
    'movement': [
        'movement_acceleration',
        'movement_sprint_speed',
        'movement_agility',
        'movement_reactions',
        'movement_balance',
    ],
    'power': [
        'power_shot_power',
        'power_jumping',
        'power_stamina',
        'power_strength',
        'power_long_shots',
    ],
    'mentality': [
        'mentality_aggression',
        'mentality_interceptions',
        'mentality_positioning',
        'mentality_vision',
        'mentality_penalties',
        'mentality_composure',
    ],
    'defending': [
        'defending_marking_awareness',
        'defending_standing_tackle',
        'defending_sliding_tackle',
    ],
    'goalkeeping': [
        'goalkeeping_diving',
        'goalkeeping_handling',
        'goalkeeping_kicking',
        'goalkeeping_positioning',
        'goalkeeping_reflexes',
    ],
    # Listed line by line, following the arrangement of positions on the pitch.
    'position': [
        'ls', 'st', 'rs',
        'lw', 'lf', 'cf', 'rf', 'rw',
        'lam', 'cam', 'ram',
        'lm', 'lcm', 'cm', 'rcm', 'rm',
        'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
        'lb', 'lcb', 'cb', 'rcb', 'rb',
        'gk',
    ],
}
# Every grouped column, flattened into one list in group order.
cols_dict['features'] = [
    name for group in cols_dict.values() for name in group
]
cols_dict['all'] = df.columns.to_list()
columns = cols_dict['all']
# Columns present in the frame that belong to none of the groups above.
cols_dict['others'] = sorted(set(cols_dict['all']) - set(cols_dict['features']))
cols_dict['others']

In [None]:
def _parse_position_rating(datum):
    """Convert one position rating to an integer.

    String ratings such as ``"88+3"`` or ``"70-2"`` are a base value followed
    by signed modifiers; each term is read together with its sign, so a
    negative modifier is subtracted rather than added. Non-string values are
    passed through ``int()``.

    Raises ``ValueError`` when a string contains no number at all.
    """
    if isinstance(datum, str):
        terms = re.findall(r'[+-]?\d+', datum)
        if not terms:
            raise ValueError(f'cannot parse position rating: {datum!r}')
        return sum(int(term) for term in terms)
    return int(datum)


# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; Series.map works on old and new ones.
df[cols_dict['position']] = df[cols_dict['position']].apply(
    lambda column: column.map(_parse_position_rating))

In [None]:
# Summarise the cleaned frame (columns, dtypes, non-null counts).
df.info()

## Wizualizacja wyczyszczonych danych

In [None]:
# Summary statistics of the descriptive columns, without the first summary row.
df[cols_dict['descriptive']].describe().iloc[1:]

In [None]:
# Summary statistics of the ungrouped columns, without the first summary row.
df[cols_dict['others']].describe().iloc[1:]

In [None]:
def show_boxplot(df: DataFrame, columns: str):
    """Draw one box per column of the ``cols_dict`` group named ``columns``.

    Tick labels are the column names with the leading ``"<group>_"`` prefix
    removed. The figure width is derived from the total number of columns in
    ``df``. ``columns`` is also used as the plot title.

    Raises ``KeyError`` if ``columns`` is not a key of ``cols_dict``.
    """
    labels = None
    if columns in cols_dict:
        prefix = columns + '_'
        labels = [name.removeprefix(prefix) for name in cols_dict[columns]]

    # Margins and per-column width, in inches.
    left_in = 0.5
    right_in = 0.3
    per_column_in = 0.1
    fig_width = left_in + right_in + (df.shape[1] + 1) * per_column_in

    fig, ax = plt.subplots(figsize=(fig_width, 6))
    # subplots_adjust expects fractions of the figure size, hence the division.
    fig.subplots_adjust(
        left=left_in / fig_width,
        right=1 - right_in / fig_width,
        top=0.94,
        bottom=0.1,
    )
    ax.boxplot(df[cols_dict[columns]], labels=labels)
    plt.title(columns)
    plt.show()

In [None]:
print(cols_dict.keys())
# Plot every thematic group; the text group and the helper lists are skipped.
for key in cols_dict:
    if key in ('descriptive', 'features', 'all', 'others'):
        continue
    show_boxplot(df, key)

# Standaryzacja

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate frame for standardisation, without the descriptive columns.
# drop() returns a new frame, so `df` itself is left untouched.
df_stand = df.drop(columns=cols_dict['descriptive'])
df_stand.info()

In [None]:
# Standardise every column of df_stand with StandardScaler.
data_1 = df_stand.iloc[:, :]
scaler = StandardScaler()
data_stand = scaler.fit(data_1).transform(data_1)
data_stand

# Wyznaczanie składowych głównych

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Row labels for the PCA / factor-analysis loading tables.
# 'overall', 'potential', 'wage_eur' and 'release_clause_eur' are left out.
index_pca = [
    # general
    'value_eur', 'age', 'height_cm', 'weight_kg',
    'league_level', 'weak_foot', 'skill_moves', 'international_reputation',
    # attacking
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    # movement
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power
    'power_shot_power', 'power_jumping', 'power_stamina',
    'power_strength', 'power_long_shots',
    # mentality
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending
    'defending_marking_awareness', 'defending_standing_tackle',
    'defending_sliding_tackle',
    # goalkeeping
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes',
    # position ratings
    'ls', 'st', 'rs',
    'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb',
    'gk',
]

In [None]:
# Fit a full PCA (all components kept) on the standardised data.
pca = PCA()
pca_data = pca.fit_transform(data_stand)
n_components = len(pca.components_)
pca_cols = [f'PC{x+1}' for x in range(n_components)]
# Loadings table: one row per input variable, one column per component.
# Rows are labelled with df_stand.columns because data_stand was computed
# from df_stand, so the labels always match the real column order.
df_pca_componenets = pd.DataFrame(
    data=np.transpose(pca.components_),
    columns=pca_cols,
    index=df_stand.columns,
)
# Seven variables with the largest absolute loading on PC1.
df_pca_componenets.abs().sort_values(by='PC1', ascending=False).iloc[:7, :]

In [None]:
# Seven variables with the largest absolute loading on PC2.
df_pca_componenets.abs().sort_values(by='PC2', ascending=False).head(7)

In [None]:
# Seven variables with the largest absolute loading on PC3.
df_pca_componenets.abs().sort_values(by='PC3', ascending=False).head(7)

In [None]:
# Seven variables with the largest absolute loading on PC4.
df_pca_componenets.abs().sort_values(by='PC4', ascending=False).head(7)

# Kryterium wartości własnej

In [None]:
# Print the first ten component labels and, below them, the variance
# explained by each of those components.
print(pca_cols[:10])
print(pca.explained_variance_[:10])

# Kryterium części wyjaśnionej wariancji

In [None]:
# Print the first ten component labels and, below them, the running total of
# the explained-variance ratio up to each of those components.
print(pca_cols[:10])
print(np.cumsum(pca.explained_variance_ratio_)[:10])

# Kryterium minimalnego zasobu zmienności

In [None]:
pd.set_option('display.max_rows', None) # show all rows
pd.set_option('display.max_columns', None) # show all columns
# For every variable, the running sum of its squared loadings across the
# components (left to right).
pcaS = df_pca_componenets.pow(2).cumsum(axis=1)
pcaS

# Analiza czynnikowa

In [None]:
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import FactorAnalysis
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

# Wyliczenie współczynników FA

In [None]:
# Unrotated factor analysis on the standardised data.
fa = FactorAnalysis(random_state=1)
data_transformed = fa.fit_transform(data_stand)
n_components = len(fa.components_)
fa_cols = [f'F{x+1}' for x in range(n_components)]
# Loadings table: one row per input variable, one column per factor.
# Rows are labelled with df_stand.columns because data_stand was computed
# from df_stand, so the labels always match the real column order.
df_fa_componenets = pd.DataFrame(
    data=np.transpose(fa.components_),
    columns=fa_cols,
    index=df_stand.columns,
)
df_fa_componenets

# Rotacja varimax

In [None]:
# Factor analysis with varimax rotation on the standardised data.
fav = FactorAnalysis(random_state=1, rotation = 'varimax')
data_transformed = fav.fit_transform(data_stand)
n_components = len(fav.components_)
fav_cols = [f'F{x+1}' for x in range(n_components)]
# Loadings table: one row per input variable, one column per rotated factor.
# Rows are labelled with df_stand.columns because data_stand was computed
# from df_stand, so the labels always match the real column order.
df_fav_componenets = pd.DataFrame(
    data=np.transpose(fav.components_),
    columns=fav_cols,
    index=df_stand.columns,
)
df_fav_componenets

# Klasyfikacja

## Drzewa decyzyjne

In [None]:
# Display order of the classes in the confusion matrix.
positions = np.array(['Goalkeeper', 'Back', 'Middle', 'Forward'])
# Every non-descriptive column, kept in the frame's own column order so the
# feature order (and therefore the fitted tree) is the same on every run.
features = [
    column for column in cols_dict['all']
    if column not in cols_dict['descriptive']
]
X_train = df[features]
y_train = df['player_position']

# NOTE(review): the tree is fitted and evaluated on the same rows, so the
# scores below describe the fit, not generalisation.
clf = DecisionTreeClassifier(
    random_state = 42,
    max_depth = 3,
    criterion = 'gini'
)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)

cr = classification_report(
    y_train,
    y_pred,
    target_names=clf.classes_,
    zero_division=0
)
print(cr)

# `labels` fixes the row/column order of the matrix to `positions`.
cm = confusion_matrix(
    y_train,
    y_pred,
    labels=positions
)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=positions
)
disp.plot();

plt.figure(figsize=(80, 60))
# plot_tree expects class names in the classifier's own (sorted) class order,
# which is clf.classes_ and not the display order in `positions`.
plot_tree(
    clf,
    filled = True,
    rounded = True,
    label = 'all',
    feature_names = features,
    class_names = list(clf.classes_)
);

Bramkarze zostają bez problemu odseparowani od reszty jeszcze w pierwszym rozgałęzieniu.

Obrońcy z napastnikami również są prawie idealnie rozróżniani.

Pomiędzy pozycjami obrońca-pomocnik oraz pomocnik-napastnik nie ma ścisłej granicy.

In [None]:
# Fit a separate one-variable linear regression of value_eur on every
# non-descriptive column, plot each fitted line in a grid, and record the
# smallest and largest predicted value per column.
y = df['value_eur'].to_numpy()
y = y.reshape(y.size, 1)
# Keep the frame's own column order so the grid layout is the same on every run.
# NOTE(review): 'value_eur' itself is not a descriptive column, so the target
# is also regressed on itself here.
features = [
    column for column in cols_dict['all']
    if column not in cols_dict['descriptive']
]
x = df[features]
regr = linear_model.LinearRegression()
plots_per_row = 5
num_of_plots = len(x.columns)
num_of_rows = math.ceil(num_of_plots / plots_per_row)
# Column 0 holds the minimum prediction, column 1 the maximum.
min_max_values = np.empty((num_of_plots, 2))
# squeeze=False keeps `axis` two-dimensional even when the grid has one row.
figure, axis = plt.subplots(num_of_rows, plots_per_row, figsize=(50, 50),
                            squeeze=False)
for plot_index, column in enumerate(x):
    x_ = x[column].values
    x_ = x_.reshape(x_.size, 1)
    regr.fit(x_, y)
    predicted = regr.predict(x_)  # computed once, reused for plot and extrema

    row, col = divmod(plot_index, plots_per_row)
    axis[row, col].plot(x_, predicted, linewidth=3)
    axis[row, col].set_title(column)

    min_max_values[plot_index, 0] = predicted.min()
    min_max_values[plot_index, 1] = predicted.max()

min_max_values
