In [338]:
import pandas as pd
import numpy as np

In [339]:
def process_categorical(df, column, min_count):
    """One-hot encode ``column`` after folding rare categories into ``'other'``.

    Categories that occur fewer than ``min_count`` times are replaced by the
    literal label ``'other'`` before encoding. Missing values are not counted
    by ``value_counts`` and therefore also end up in ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    column : str
        Name of the categorical column to encode.
    min_count : int
        Minimum number of occurrences a category needs in order to keep its
        own indicator column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column``, followed by the indicator columns produced
        by ``pd.get_dummies`` with ``prefix=column`` (e.g. ``'<column>_other'``
        when at least one value was folded).

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    values = df[column]
    value_counts = values.value_counts()
    frequent_categories = value_counts[value_counts >= min_count].index

    # Build the collapsed labels as a standalone Series instead of writing a
    # temporary 'processed_<column>' column into ``df``: the previous approach
    # mutated the caller's frame and could trigger SettingWithCopyWarning when
    # ``df`` was itself a filtered slice.
    collapsed = values.where(values.isin(frequent_categories), 'other')

    dummies = pd.get_dummies(collapsed, prefix=column)
    return pd.concat([df.drop(columns=[column]), dummies], axis=1)

In [340]:
# Read the df_clean.csv dataset; the path is relative to the working directory.
clean_csv_path = '../data/df_clean.csv'
df = pd.read_csv(clean_csv_path)

In [341]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17803 entries, 0 to 17802
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   LeagueCountry             17803 non-null  object 
 1   League                    17803 non-null  object 
 2   NationalLeagueLevel       17803 non-null  object 
 3   Club                      17803 non-null  object 
 4   No                        17494 non-null  float64
 5   Name                      17803 non-null  object 
 6   Value                     17803 non-null  float64
 7   HighestValue              17803 non-null  float64
 8   Age                       17800 non-null  float64
 9   Height                    16713 non-null  float64
 10  Nationality               17803 non-null  object 
 11  Foot                      16118 non-null  object 
 12  Position                  17803 non-null  object 
 13  Consultancy               14496 non-null  object 
 14  Suppli

In [342]:
# Keep only rows whose 'Value' does not exceed 150 million.
# NOTE(review): presumably this trims extreme outliers — confirm the rationale.
value_cap = 150_000_000
df = df.loc[df['Value'] <= value_cap]

In [343]:
# Exclude rows whose position is 'Torwart' (German for goalkeeper).
is_torwart = df['Position'] == 'Torwart'
df = df.loc[~is_torwart]

In [344]:
# Columns carried forward into the model table, in output order.
# Possibly include Games
features = [
    'LeagueCountry',
    'League',
    'NationalLeagueLevel',
    'Club',
    'Age',
    'Nationality',
    'Position',
    'PositionCategory',
    'Consultancy',
    'Supplier',
    'ClubSince',
    'Injury',
    'InternationalTeam',
    'ActiveInternational',
    'FormerInternational',
    'InternationalGames',
    'InternationalGoals',
    'Trending',
    'Value',
]

# Restrict the frame to exactly those columns.
df = df[features]

In [345]:
# Discard every row that lacks a value in 'Age' or in 'ClubSince'.
required_columns = ['Age', 'ClubSince']
df = df.dropna(axis=0, how='any', subset=required_columns)

In [346]:
# Replace missing international game/goal counts with 0; other columns are
# left untouched because only these two keys are passed to fillna.
zero_fill = dict.fromkeys(['InternationalGames', 'InternationalGoals'], 0)
df = df.fillna(value=zero_fill)

In [347]:
# Re-check dtypes and non-null counts after filtering, column selection and filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15747 entries, 5 to 17802
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   LeagueCountry        15747 non-null  object 
 1   League               15747 non-null  object 
 2   NationalLeagueLevel  15747 non-null  object 
 3   Club                 15747 non-null  object 
 4   Age                  15747 non-null  float64
 5   Nationality          15747 non-null  object 
 6   Position             15747 non-null  object 
 7   PositionCategory     15747 non-null  object 
 8   Consultancy          12933 non-null  object 
 9   Supplier             2488 non-null   object 
 10  ClubSince            15747 non-null  float64
 11  Injury               15747 non-null  int64  
 12  InternationalTeam    10656 non-null  object 
 13  ActiveInternational  15747 non-null  int64  
 14  FormerInternational  15747 non-null  int64  
 15  InternationalGames   15747 non-null 

In [348]:
# Restrict the frame to its numeric-dtype columns.
numerical_columns = df.select_dtypes(include=[np.number])

# Count missing values for all numeric columns in one pass, then report
# one line per column in the frame's column order.
nan_counts = numerical_columns.isna().sum()
for col, n_missing in nan_counts.items():
    print(f"Column: {col}, NaN values: {n_missing}")

Column: Age, NaN values: 0
Column: ClubSince, NaN values: 0
Column: Injury, NaN values: 0
Column: ActiveInternational, NaN values: 0
Column: FormerInternational, NaN values: 0
Column: InternationalGames, NaN values: 0
Column: InternationalGoals, NaN values: 0
Column: Trending, NaN values: 0
Column: Value, NaN values: 0


In [349]:
# Encode every categorical feature with one shared loop instead of ten
# copy-pasted cells. Each entry is (column, min_count): categories seen fewer
# than min_count times are folded into 'other' by process_categorical.
# The order matches the original cell sequence, so the resulting column order
# is unchanged.
categorical_min_counts = [
    ('Supplier', 10),
    ('Consultancy', 10),
    ('InternationalTeam', 5),
    ('League', 10),
    ('Club', 10),
    ('LeagueCountry', 5),
    ('NationalLeagueLevel', 5),
    ('Nationality', 10),
    ('Position', 5),
    ('PositionCategory', 5),
]

for column, min_count in categorical_min_counts:
    df = process_categorical(df, column, min_count)

In [359]:
# Write the encoded table to disk; index=False keeps the row index out of the file.
output_csv_path = '../data/df_simple_model.csv'
df.to_csv(output_csv_path, index=False)