In [76]:
import pandas as pd
import numpy as np
from analysis.utils import process_categorical

In [77]:
# Read the input CSV; its first column becomes the row index.
clean_csv_path = '../data/df_clean.csv'
df = pd.read_csv(clean_csv_path, index_col=0)

In [78]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to see which columns contain missing values before any filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17803 entries, 0 to 18334
Data columns (total 45 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   LeagueCountry             17803 non-null  object 
 1   League                    17803 non-null  object 
 2   NationalLeagueLevel       17803 non-null  object 
 3   Club                      17803 non-null  object 
 4   No                        17494 non-null  float64
 5   Name                      17803 non-null  object 
 6   Value                     17803 non-null  float64
 7   HighestValue              17803 non-null  float64
 8   Age                       17800 non-null  float64
 9   Height                    16713 non-null  float64
 10  Nationality               17803 non-null  object 
 11  Foot                      16118 non-null  object 
 12  Position                  17803 non-null  object 
 13  Consultancy               14496 non-null  object 
 14  Suppli

In [79]:
# Keep only the rows whose Position is 'Torwart' (German for goalkeeper).
is_goalkeeper = df['Position'].eq('Torwart')
df = df.loc[is_goalkeeper]

In [80]:
# Columns carried forward into the goalkeeper data set, in output order.
features = [
    # object-dtype columns (encoded later with process_categorical)
    'LeagueCountry',
    'NationalLeagueLevel',
    'InternationalTeam',
    'League',
    'Club',
    # numeric columns
    'Age',
    'ClubSince',
    'ActiveInternational',
    'FormerInternational',
    'InternationalGames',
    'Trending',
    'Games',
    'PointsPerGame',
    'Goals',
    'Ins',
    'Cards',
    'Minutes',
    'TwCleanSheets',
    'TwGoalsConceded',
    'TwPenaltySavesQuote',
    'Value',  # presumably the modelling target -- TODO confirm
]

# Restrict the frame to exactly those columns (raises KeyError if one is absent).
df = df.loc[:, features]

In [81]:
# Report how many values are missing in each numeric column before cleaning.
numeric_columns = df.select_dtypes(include=[np.number])
nan_counts = numeric_columns.isna().sum()  # one count per numeric column
for col, n_missing in nan_counts.items():
    print(f"Column: {col}, NaN values: {n_missing}")

Column: Age, NaN values: 1
Column: ClubSince, NaN values: 10
Column: ActiveInternational, NaN values: 0
Column: FormerInternational, NaN values: 0
Column: InternationalGames, NaN values: 725
Column: Trending, NaN values: 0
Column: Games, NaN values: 568
Column: PointsPerGame, NaN values: 1016
Column: Goals, NaN values: 568
Column: Ins, NaN values: 568
Column: Cards, NaN values: 568
Column: Minutes, NaN values: 1018
Column: TwCleanSheets, NaN values: 568
Column: TwGoalsConceded, NaN values: 568
Column: TwPenaltySavesQuote, NaN values: 290
Column: Value, NaN values: 0


In [82]:
# Rows missing a value in any of these columns are removed (dropna's default
# is how='any'); the remaining columns are not considered here.
required_columns = [
    'Age',
    'ClubSince',
    'Games',
    'PointsPerGame',
    'Goals',
    'Ins',
    'Cards',
    'Minutes',
    'TwCleanSheets',
    'TwGoalsConceded',
    'TwPenaltySavesQuote',
]
df = df.dropna(subset=required_columns)

In [83]:
# Replace missing InternationalGames values with 0; other columns are untouched.
# NOTE(review): presumably NaN here means "no international appearances" -- confirm.
fill_defaults = {'InternationalGames': 0}
df = df.fillna(value=fill_defaults)

In [84]:
# Re-check the numeric columns after dropna/fillna: every count should now be 0.
numeric_columns = df.select_dtypes(include=[np.number])
remaining_nans = numeric_columns.isna().sum()  # indexed by column name
for col in numeric_columns.columns:
    print(f"Column: {col}, NaN values: {remaining_nans[col]}")

Column: Age, NaN values: 0
Column: ClubSince, NaN values: 0
Column: ActiveInternational, NaN values: 0
Column: FormerInternational, NaN values: 0
Column: InternationalGames, NaN values: 0
Column: Trending, NaN values: 0
Column: Games, NaN values: 0
Column: PointsPerGame, NaN values: 0
Column: Goals, NaN values: 0
Column: Ins, NaN values: 0
Column: Cards, NaN values: 0
Column: Minutes, NaN values: 0
Column: TwCleanSheets, NaN values: 0
Column: TwGoalsConceded, NaN values: 0
Column: TwPenaltySavesQuote, NaN values: 0
Column: Value, NaN values: 0


In [85]:
# Print the frame summary again to confirm the remaining row count, the
# selected columns and their dtypes after filtering and NaN handling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1007 entries, 0 to 18310
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   LeagueCountry        1007 non-null   object 
 1   NationalLeagueLevel  1007 non-null   object 
 2   InternationalTeam    742 non-null    object 
 3   League               1007 non-null   object 
 4   Club                 1007 non-null   object 
 5   Age                  1007 non-null   float64
 6   ClubSince            1007 non-null   float64
 7   ActiveInternational  1007 non-null   int64  
 8   FormerInternational  1007 non-null   int64  
 9   InternationalGames   1007 non-null   float64
 10  Trending             1007 non-null   int64  
 11  Games                1007 non-null   float64
 12  PointsPerGame        1007 non-null   float64
 13  Goals                1007 non-null   float64
 14  Ins                  1007 non-null   float64
 15  Cards                1007 non-null   

In [86]:
# Run process_categorical over each object-dtype column, in this order.
# Each entry is (column name, extra positional arguments). The meaning of the
# optional third argument (5) is defined in analysis.utils and is not visible
# here -- TODO document once confirmed.
categorical_specs = [
    ('InternationalTeam', (5,)),
    ('League', ()),
    ('Club', ()),
    ('LeagueCountry', (5,)),
    ('NationalLeagueLevel', (5,)),
]
for column, extra_args in categorical_specs:
    df = process_categorical(df, column, *extra_args)

In [87]:
# Write the processed goalkeeper frame to disk (the row index is written too).
output_csv_path = '../data/df_extensive_model_tw.csv'
df.to_csv(output_csv_path)