In [2]:
import pandas as pd
import numpy as np
from analysis.utils import process_categorical

In [3]:
# Load the cleaned data set; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/df_clean.csv',
    index_col=0,
)

In [4]:
# Print the frame summary (index range, column names, non-null counts, dtypes)
# to see which columns contain missing values before any filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17796 entries, 0 to 18357
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   LeagueCountry             17796 non-null  object 
 1   League                    17796 non-null  object 
 2   NationalLeagueLevel       17796 non-null  object 
 3   Club                      17796 non-null  object 
 4   No                        17511 non-null  float64
 5   Name                      17796 non-null  object 
 6   Value                     17796 non-null  float64
 7   HighestValue              17796 non-null  float64
 8   Age                       17794 non-null  float64
 9   Height                    16717 non-null  float64
 10  Nationality               17796 non-null  object 
 11  Foot                      16124 non-null  object 
 12  Position                  17796 non-null  object 
 13  Consultancy               14524 non-null  object 
 14  Suppli

In [5]:
# Keep only rows whose Position is not 'Torwart' (German for goalkeeper).
is_not_torwart = df['Position'].ne('Torwart')
df = df[is_not_torwart]

In [6]:
# Columns carried forward into the exported model data set.
# 'Value' is kept as the last column.
features = [
    'LeagueCountry',
    'League',
    'NationalLeagueLevel',
    'Club',
    'Age',
    'Nationality',
    'Position',
    'PositionCategory',
    'Supplier',
    'ClubSince',
    'InternationalTeam',
    'ActiveInternational',
    'FormerInternational',
    'InternationalGames',
    'InternationalGoals',
    'Trending',
    'MinutesQuote',
    'FsGoalParticipationQuote',
    'Games',
    'PointsPerGame',
    'Goals',
    'Ins',
    'Cards',
    'Minutes',
    'FsAssists',
    'FsPenaltyGoals',
    'FsMinutesPerGoal',
    'Value',
]

# Restrict the frame to exactly these columns, in this order.
df = df.loc[:, features]

In [7]:
# Report how many NaN values each numeric column holds before cleaning.
numeric_columns = df.select_dtypes(include=[np.number])
nan_counts = numeric_columns.isna().sum()  # one count per numeric column
for col, n_missing in nan_counts.items():
    print(f"Column: {col}, NaN values: {n_missing}")

Column: Age, NaN values: 2
Column: ClubSince, NaN values: 25
Column: ActiveInternational, NaN values: 0
Column: FormerInternational, NaN values: 0
Column: InternationalGames, NaN values: 6209
Column: InternationalGoals, NaN values: 6209
Column: Trending, NaN values: 0
Column: MinutesQuote, NaN values: 1750
Column: FsGoalParticipationQuote, NaN values: 1750
Column: Games, NaN values: 3999
Column: PointsPerGame, NaN values: 4250
Column: Goals, NaN values: 3999
Column: Ins, NaN values: 3999
Column: Cards, NaN values: 3999
Column: Minutes, NaN values: 4290
Column: FsAssists, NaN values: 3999
Column: FsPenaltyGoals, NaN values: 3999
Column: FsMinutesPerGoal, NaN values: 9367
Column: Value, NaN values: 0


In [11]:
# Discard every row that has a NaN in at least one of the columns below.
# InternationalGames, InternationalGoals and FsMinutesPerGoal are deliberately
# not listed here, so rows missing only those values are kept.
df = df.dropna(
    subset=[
        'Age',
        'ClubSince',
        'MinutesQuote',
        'FsGoalParticipationQuote',
        'Games',
        'PointsPerGame',
        'Goals',
        'Ins',
        'Cards',
        'Minutes',
        'FsAssists',
        'FsPenaltyGoals',
    ],
    how='any',
)

In [12]:
# Replace the remaining NaN values in these three columns with 0.
# NOTE(review): a 0 in 'FsMinutesPerGoal' presumably marks players without a
# goal, but numerically it reads as the best possible ratio — confirm that
# downstream models are fine with this sentinel.
zero_fill_columns = ['InternationalGames', 'InternationalGoals', 'FsMinutesPerGoal']
df = df.fillna(dict.fromkeys(zero_fill_columns, 0))

In [13]:
# Re-run the NaN report to verify that no numeric column has missing values left.
numeric_columns = df.select_dtypes(include=[np.number])
nan_counts = numeric_columns.isna().sum()  # one count per numeric column
for col, n_missing in nan_counts.items():
    print(f"Column: {col}, NaN values: {n_missing}")

Column: Age, NaN values: 0
Column: ClubSince, NaN values: 0
Column: ActiveInternational, NaN values: 0
Column: FormerInternational, NaN values: 0
Column: InternationalGames, NaN values: 0
Column: InternationalGoals, NaN values: 0
Column: Trending, NaN values: 0
Column: MinutesQuote, NaN values: 0
Column: FsGoalParticipationQuote, NaN values: 0
Column: Games, NaN values: 0
Column: PointsPerGame, NaN values: 0
Column: Goals, NaN values: 0
Column: Ins, NaN values: 0
Column: Cards, NaN values: 0
Column: Minutes, NaN values: 0
Column: FsAssists, NaN values: 0
Column: FsPenaltyGoals, NaN values: 0
Column: FsMinutesPerGoal, NaN values: 0
Column: Value, NaN values: 0


In [14]:
# Print the frame summary again after dropping and filling: shows the remaining
# row count and which object columns (e.g. Supplier, InternationalTeam) still
# contain nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11471 entries, 5 to 18355
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   LeagueCountry             11471 non-null  object 
 1   League                    11471 non-null  object 
 2   NationalLeagueLevel       11471 non-null  object 
 3   Club                      11471 non-null  object 
 4   Age                       11471 non-null  float64
 5   Nationality               11471 non-null  object 
 6   Position                  11471 non-null  object 
 7   PositionCategory          11471 non-null  object 
 8   Supplier                  2127 non-null   object 
 9   ClubSince                 11471 non-null  float64
 10  InternationalTeam         6854 non-null   object 
 11  ActiveInternational       11471 non-null  int64  
 12  FormerInternational       11471 non-null  int64  
 13  InternationalGames        11471 non-null  float64
 14  Intern

In [15]:
# Run the project helper `process_categorical` over each categorical column,
# in a fixed order. The second tuple element holds the extra positional
# arguments for that column: empty means the helper's default is used,
# (5,) passes 5 as the third positional argument.
categorical_specs = [
    ('Supplier', ()),
    ('InternationalTeam', (5,)),
    ('League', ()),
    ('Club', ()),
    ('Nationality', ()),
    ('LeagueCountry', (5,)),
    ('NationalLeagueLevel', (5,)),
    ('PositionCategory', (5,)),
    ('Position', (5,)),
]
for column_name, extra_args in categorical_specs:
    df = process_categorical(df, column_name, *extra_args)

In [16]:
# First export: the processed frame before the value cap is applied.
df.to_csv('../data/df_extensive_model_fs_unfiltered.csv')

# Exclude outliers: keep only rows whose Value is at most 150,000,000.
within_value_cap = df['Value'].le(150_000_000)
df = df[within_value_cap]

# Second export: the same frame with the high-value rows removed.
df.to_csv('../data/df_extensive_model_fs.csv')