In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Read csv file

In [3]:
# Load the raw player table. The row number is promoted to an index named
# 'id'; the position one-hot aggregation further down groups on that name.
df = pd.read_csv('fifa_players.csv')
df['id'] = df.index
df = df.set_index('id')
df.head()

In [135]:
# Print column names, dtypes and non-null counts.
df.info()

In [136]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

## Remove irrelevant columns

In [137]:
# Names and birth date are dropped before any further processing.
df = df.drop(columns=['name', 'full_name', 'birth_date'])

## Missing values

In [138]:
# Count nulls per column, keep only the columns that have any, and plot
# them in ascending order.
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0].sort_values()
missing_values.plot.bar()

In [139]:
# Remove the national-team columns.
national_columns = [
    'national_team',
    'national_team_position',
    'national_rating',
    'national_jersey_number',
]
df = df.drop(columns=national_columns)

In [140]:
# Columns that still contain at least one null, with their null counts.
has_missing = df.isnull().any()
missing_cols = df.columns[has_missing]
df.loc[:, missing_cols].isnull().sum()

## Histogram

In [141]:
# One 30-bin histogram per numeric column, drawn on a single 20x20 figure.
df.hist(bins=30, figsize=(20,20))
# tight_layout keeps the subplot titles and tick labels from overlapping.
plt.tight_layout()
plt.show()

In [142]:
# Closer look at the height distribution.
heights = df['height_cm']
sns.histplot(heights, bins=30)
plt.title('Histogram of Height in cm')
plt.show()

In [143]:
# Rows whose height is 160 cm or less, showing only the height column.
is_short = df['height_cm'] <= 160
height_outliers = df.loc[is_short, ['height_cm']]
height_outliers

In [144]:
# (rows, columns) of the subset with height <= 160 cm.
df[df['height_cm'] <= 160].shape

In [145]:
# Treat heights of 160 cm or less as outliers and mark them missing; the
# imputation cell below fills 'height_cm' together with the other columns.
df.loc[df['height_cm'] <= 160, 'height_cm'] = np.nan

In [146]:
# Null count per column, largest first.
df.isnull().sum().sort_values(ascending=False)

## MissForest for handling height outliers

In [147]:
from missforest import MissForest

# Columns filled by the imputer: the height values blanked out above plus
# three monetary columns.
columns_to_impute = ['value_euro', 'height_cm', 'wage_euro', 'release_clause_euro']

# NOTE(review): no seed is passed here; presumably the imputed values differ
# between runs -- confirm against the missforest package and pin a seed if so.
imputer = MissForest()
imputed_values = imputer.fit_transform(df[columns_to_impute])
df[columns_to_impute] = imputed_values

# Sanity check: every imputed column should now report zero nulls.
df[columns_to_impute].isnull().sum()

In [148]:
# Height distribution after imputation.
df['height_cm'].hist(bins=30)

In [149]:
# Market value distribution after imputation.
df['value_euro'].hist(bins=30)

In [150]:
# Wage distribution after imputation.
df['wage_euro'].hist(bins=30)

## Check duplicated values

In [151]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

## Handle String values

In [152]:
# String-typed (object) columns and how many distinct values each one has.
object_frame = df.select_dtypes(include='object')
object_cols = object_frame.columns
df.loc[:, object_cols].nunique()

In [153]:
# List the distinct values of every string column, one blank line between them.
for col in object_cols:
    distinct_values = df[col].unique()
    print(f'{col}: {distinct_values}', end='\n\n')

In [154]:
# Keep only rows whose body type is one of the three generic labels; rows
# with any other label are removed.
valid_body_types = ['Lean', 'Normal', 'Stocky']
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells (e.g. encoding 'preferred_foot') do not
# raise SettingWithCopyWarning.
df = df[df['body_type'].isin(valid_body_types)].copy()
df['body_type'].unique()

In [155]:
# Row count per remaining body type.
df['body_type'].value_counts()

In [156]:
# Row count per preferred foot.
df['preferred_foot'].value_counts()

In [157]:
# Row count per international reputation level; the column is dropped in the next cell.
df['international_reputation(1-5)'].value_counts()

In [158]:
# Remove the international reputation column.
df = df.drop(columns=['international_reputation(1-5)'])

## preferred foot

In [159]:
from sklearn.preprocessing import LabelEncoder

# Replace the foot label with an integer code. LabelEncoder numbers the
# classes in sorted order; the mapping stays available on `le.classes_`.
le = LabelEncoder()
foot_codes = le.fit_transform(df['preferred_foot'])
df['preferred_foot'] = foot_codes

df['preferred_foot'].value_counts()

## body type

In [160]:
# One indicator column per body type (body_type_<label>); the original
# 'body_type' column is replaced by them.
df_encoded = pd.get_dummies(data=df, columns=['body_type'])
df_encoded.head()

In [161]:
# Continue with an independent copy of the body-type-encoded frame.
df = df_encoded.copy()

## positions

In [162]:
# String-typed columns that still need handling.
object_frame = df.select_dtypes(include='object')
object_cols = object_frame.columns
object_cols

In [163]:
# Turn the comma-separated position string into a list of position codes.
# NOTE(review): not idempotent -- presumably re-running this cell on the
# already-split column yields NaN; confirm before re-executing.
df['positions'] = df['positions'].str.split(',')
df['positions']

In [164]:
# One row per (player, position) pair, then one indicator column per position.
df_exploded = df.explode('positions')
df_encoded = pd.get_dummies(df_exploded, columns=['positions'])

# Collapse the exploded rows back to one row per player by summing the
# indicators within each 'id' (the name of the index).
one_hot_columns = [name for name in df_encoded.columns if name.startswith('positions_')]
position_indicators = df_encoded[one_hot_columns]
df_one_hot_sum = position_indicators.groupby(level='id').sum()

# Swap the list-valued 'positions' column for the aggregated indicators.
df_without_positions = df.drop(columns=['positions'])
df_final = df_without_positions.join(df_one_hot_sum)

df = df_final

In [165]:
import pandas as pd
# Lift the column display limit so every column of the widened frame is shown.
pd.options.display.max_columns = None
df.head()

## nationality

In [166]:
# String-typed columns left after the position encoding.
object_frame = df.select_dtypes(include='object')
object_cols = object_frame.columns
object_cols

In [167]:
# Number of distinct nationalities.
df['nationality'].nunique()

In [168]:
# The distinct nationality labels themselves.
df['nationality'].unique()

In [169]:
# Player count per nationality, most frequent first.
df['nationality'].value_counts()

# Create new data

In [170]:
# Persist the encoded frame. index=False means the 'id' index is not written.
df.to_csv('data.csv', index=False)

# Leakage Problem

In [171]:
import pandas as pd

# Reload the frame written above; it comes back with a fresh default index.
df = pd.read_csv('data.csv')

In [172]:
# Correlation of every remaining column with the target, strongest positive
# first. 'nationality' is left out of the correlation frame.
tmp = df.drop(columns='nationality')
rating_corr = tmp.corr()['overall_rating']
rating_corr.sort_values(ascending=False)

In [173]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the density of the target and of 'potential' on one figure.
plt.figure(figsize=(10, 6))
for column, label in (('overall_rating', "Overall Rating"), ('potential', "Potential")):
    sns.kdeplot(df[column], label=label, fill=True)
plt.legend()
plt.title("Distribution Comparison")
plt.show()

In [174]:
# Columns removed under the "Leakage Problem" heading.
leakage_columns = [
    'value_euro',
    'wage_euro',
    'release_clause_euro',
    'potential',
    'composure',
    'reactions',
]
df = df.drop(columns=leakage_columns)

## Handle outliers

In [175]:
# Candidate columns for outlier handling: everything stored as float64/int64.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numeric_columns = numeric_frame.columns

# Columns never checked for outliers: the position and body-type indicator
# columns, the target, and two further listed columns.
exclude_columns = [
    'positions_CAM', 'positions_CB', 'positions_CDM', 'positions_CF', 'positions_CM',
    'positions_GK', 'positions_LB', 'positions_LM', 'positions_LW', 'positions_LWB',
    'positions_RB', 'positions_RM', 'positions_RW', 'positions_RWB', 'positions_ST',
    'body_type_Lean', 'body_type_Normal', 'body_type_Stocky',
    'overall_rating', 'preferred_foot', 'weak_foot(1-5)',
]

In [176]:
from typing import Tuple
from sklearn.base import BaseEstimator, TransformerMixin


def find_boxplot_boundaries(col: pd.Series, whisker_coeff: float = 1.5) -> Tuple[float, float]:
    """Return the lower and upper whisker limits of a boxplot for ``col``.

    The limits are ``Q1 - whisker_coeff * IQR`` and ``Q3 + whisker_coeff * IQR``,
    where Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        col: pandas Series of values.
        whisker_coeff: multiplier applied to the interquartile range.

    Returns:
        A ``(lower, upper)`` tuple.
    """
    first_quartile = col.quantile(0.25)
    third_quartile = col.quantile(0.75)
    whisker_length = whisker_coeff * (third_quartile - first_quartile)
    return first_quartile - whisker_length, third_quartile + whisker_length


class BoxplotOutlierClipper(BaseEstimator, TransformerMixin):
    """Clip values to the boxplot whisker range learned during ``fit``.

    Args:
        whisker_coeff: multiplier of the interquartile range passed to
            ``find_boxplot_boundaries``.

    Attributes:
        lower: lower clipping bound; ``None`` until ``fit`` is called.
        upper: upper clipping bound; ``None`` until ``fit`` is called.
    """

    def __init__(self, whisker_coeff: float = 1.5):
        # scikit-learn's get_params()/clone()/repr read every constructor
        # argument back from an attribute of the same name, so the value
        # must be stored as `whisker_coeff`.
        self.whisker_coeff = whisker_coeff
        self.lower = None
        self.upper = None

    @property
    def whisker(self) -> float:
        """Alias of ``whisker_coeff``, kept for code that used the old name."""
        return self.whisker_coeff

    @whisker.setter
    def whisker(self, value: float) -> None:
        self.whisker_coeff = value

    def fit(self, X: pd.Series, y=None):
        """Learn the clipping bounds from ``X``.

        ``y`` is ignored; it is accepted so the transformer can be called as
        ``fit(X, y)``, e.g. from a scikit-learn Pipeline.

        Returns:
            self
        """
        self.lower, self.upper = find_boxplot_boundaries(X, self.whisker_coeff)
        return self

    def transform(self, X):
        """Return ``X`` with values limited to ``[lower, upper]``."""
        return X.clip(self.lower, self.upper)

In [177]:
# Collect every non-excluded numeric column that has at least one value
# outside its boxplot whisker range.
outlier_features = []

for column in numeric_columns:
    if column in exclude_columns:
        continue

    clipper = BoxplotOutlierClipper()
    clipper.fit(df[column])
    lower, upper = clipper.lower, clipper.upper

    values = df[column]
    outside_whiskers = (values < lower) | (values > upper)
    if outside_whiskers.any():
        outlier_features.append(column)

print("Features with outliers:", outlier_features)

In [178]:
# For each flagged feature draw its histogram (with KDE) next to its boxplot.
for column in outlier_features:
    if column in exclude_columns:
        continue

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Distribution plot
    sns.histplot(df[column], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {column}')

    # Boxplot
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')

    plt.show()

In [179]:
# Clip every flagged feature to its own whisker range; a fresh clipper is
# fitted per column so the bounds are column-specific.
for feature in outlier_features:
    clipper = BoxplotOutlierClipper()
    clipped_values = clipper.fit_transform(df[feature])
    df[feature] = clipped_values

df.head()

In [180]:
# Write the clipped frame (without the index).
# NOTE(review): this overwrites the same 'data.csv' that the "Leakage Problem"
# section reads. Re-running from that read cell afterwards loads this
# processed file, and the column drop there raises KeyError because the
# columns are already gone. Consider a separate output filename.
df.to_csv('data.csv', index=False)