# Prepare data and convert data types

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the input frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle("./df_merge_club_transfer_data.pkl")
# Print every column's dtype to see which columns are still stored as text (object).
print(df.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                         object
AWAY_TEAM                               object
PLACE_AWAY_TEAM                         object
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                      object
HOME_PLAYERS_AVG_AGE                    object
HOME_LEGIONARIES_COUNT                  object
HOME_AVG_MARKET_VALUE                   object
HOME_TOTAL_MARKET_VALUE                 object
HOME_AVG_AGE_

In [3]:
# Missing values per column of the raw frame, followed by the total row count for comparison.
nan_count = df.isnull().sum()
print(nan_count, len(df), sep="\n")

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO       0
HOME_TOTAL_VALUE_LEAVING_MIO       0
HOME_EXPENSES_JOINING_MIO          0
HOME_REVENUE_LEAVING_MIO           0
A

## 1st step of data preparation

In [4]:
# Work on an independent copy so the raw frame `df` stays available for comparison.
df1 = df.copy(deep=True)
# Names of all columns that are still stored with the generic object dtype.
object_columns = df1.select_dtypes(include="object").columns
print(object_columns)

Index(['HOME_TEAM', 'PLACE_HOME_TEAM', 'AWAY_TEAM', 'PLACE_AWAY_TEAM',
       'RESULT', 'REFEREE', 'HOME_PLAYERS_COUNT', 'HOME_PLAYERS_AVG_AGE',
       'HOME_LEGIONARIES_COUNT', 'HOME_AVG_MARKET_VALUE',
       'HOME_TOTAL_MARKET_VALUE', 'HOME_AVG_AGE_LEAVING',
       'HOME_TOTAL_VALUE_JOINING_MIO', 'HOME_TOTAL_VALUE_LEAVING_MIO',
       'HOME_EXPENSES_JOINING_MIO', 'HOME_REVENUE_LEAVING_MIO',
       'AWAY_PLAYERS_COUNT', 'AWAY_PLAYERS_AVG_AGE', 'AWAY_LEGIONARIES_COUNT',
       'AWAY_AVG_MARKET_VALUE', 'AWAY_TOTAL_MARKET_VALUE',
       'AWAY_AVG_AGE_LEAVING', 'AWAY_TOTAL_VALUE_JOINING_MIO',
       'AWAY_TOTAL_VALUE_LEAVING_MIO', 'AWAY_EXPENSES_JOINING_MIO',
       'AWAY_REVENUE_LEAVING_MIO'],
      dtype='object')


In [5]:
# Show the first row of the object columns to inspect the raw string formats that need parsing.
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25,0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                3,40 Mio. €
HOME_TOTAL_MARKET_VALUE             98,48 Mio. €
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                    
HOME_TOTAL_VALUE_LEAVING_MIO                    
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25,6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                2,62 Mio. €
AWAY_TOTAL_MARKET_VA

In [6]:
# Columns whose numbers are stored as text (decimal comma such as "25,0", blank strings for missing values).
columns_to_convert = [
    "PLACE_HOME_TEAM", "PLACE_AWAY_TEAM",
    "HOME_PLAYERS_COUNT", "HOME_PLAYERS_AVG_AGE", "HOME_LEGIONARIES_COUNT",
    "HOME_AVG_AGE_LEAVING",
    "HOME_TOTAL_VALUE_JOINING_MIO", "HOME_TOTAL_VALUE_LEAVING_MIO",
    "HOME_EXPENSES_JOINING_MIO", "HOME_REVENUE_LEAVING_MIO",
    "AWAY_PLAYERS_COUNT", "AWAY_PLAYERS_AVG_AGE", "AWAY_LEGIONARIES_COUNT",
    "AWAY_AVG_AGE_LEAVING",
    "AWAY_TOTAL_VALUE_JOINING_MIO", "AWAY_TOTAL_VALUE_LEAVING_MIO",
    "AWAY_EXPENSES_JOINING_MIO", "AWAY_REVENUE_LEAVING_MIO",
]

for column in columns_to_convert:
    # Only object (text) columns need parsing; anything else is left untouched.
    if df1[column].dtype != 'object':
        continue
    # Swap the decimal comma for a decimal point so the text can be parsed as a number.
    with_decimal_point = df1[column].astype(str).str.replace(',', '.')
    # Text that still is not a valid number is coerced to NaN instead of raising.
    df1[column] = pd.to_numeric(with_decimal_point, errors='coerce')

In [7]:
# Re-count missing values per column after the numeric conversion:
# text that could not be parsed was coerced to NaN and now shows up here.
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO     748
HOME_TOTAL_VALUE_LEAVING_MIO    1224
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [8]:

# Pick one row whose AWAY_EXPENSES_JOINING_MIO became NaN in `df1` and print
# the raw value that the untouched frame `df` holds at that position.
# NOTE(review): despite its name, `first_nan_index` is the LARGEST index label
# among the NaN rows (`.index.max()`), not the first one — confirm the intent.
away_expenses_missing = df1['AWAY_EXPENSES_JOINING_MIO'].isna()
first_nan_index = df1.loc[away_expenses_missing].index.max()
print(first_nan_index)
value = df.loc[first_nan_index, 'AWAY_EXPENSES_JOINING_MIO']
print(value)

5503



In [9]:
# Collect the distinct away-team names of all rows where "AWAY_EXPENSES_JOINING_MIO" is not NaN.
# A boolean mask replaces the row-by-row iterrows() loop: same resulting set, no Python-level iteration.
# NOTE(review): this filters the raw frame `df`, where the column still holds text, so blank
# strings count as "not NaN" and the filter likely excludes nothing — confirm whether the
# converted frame `df1` was intended here.
has_away_expenses = df['AWAY_EXPENSES_JOINING_MIO'].notna()
unique_teams = set(df.loc[has_away_expenses, 'AWAY_TEAM'])

# The set now holds the unique team names where "AWAY_EXPENSES_JOINING_MIO" is not NaN.
print(unique_teams)

{'Hamburger SV', 'Fortuna Düsseldorf', 'VfL Wolfsburg', 'SC Freiburg', 'VfB Stuttgart', 'TSG 1899 Hoffenheim', 'Hannover 96', 'SV Werder Bremen', 'Hertha BSC', 'FC Augsburg', 'FC Hansa Rostock', 'VfL Bochum', '1.FC Nürnberg', 'Borussia Mönchengladbach', 'FC Schalke 04', 'Alemannia Aachen', 'Bayer 04 Leverkusen', 'SpVgg Greuther Fürth', '1.FC Union Berlin', 'Borussia Dortmund', 'Arminia Bielefeld', 'FC Ingolstadt 04', 'Karlsruher SC', 'RasenBallsport Leipzig', 'MSV Duisburg', 'Eintracht Braunschweig', 'SC Paderborn 07', '1.FSV Mainz 05', 'FC St. Pauli', 'Eintracht Frankfurt', '1.FC Köln', 'FC Energie Cottbus', 'SV Darmstadt 98', '1.FC Kaiserslautern', 'FC Bayern München'}


In [10]:
# Unit suffixes found in the market-value strings and the factor each one stands for.
# Checked in this order, so "Mio. €" and "Tsd. €" keep their original precedence.
_MARKET_VALUE_UNITS = (
    ("Mio. €", 1e6),  # millions
    ("Tsd. €", 1e3),  # thousands
    ("Mrd. €", 1e9),  # billions
)


def convert_market_value(value):
    """Convert a market-value string such as "3,40 Mio. €" to a float amount in euros.

    The decimal comma is replaced by a decimal point, the unit suffix
    ("Mio. €", "Tsd. €" or "Mrd. €") is stripped and the remaining number is
    multiplied by the matching factor.

    Parameters
    ----------
    value : str or None or float
        Raw cell content. ``None`` and NaN are treated as missing.

    Returns
    -------
    float or None
        The amount in euros, or ``None`` when the value is missing, carries no
        known unit suffix, or cannot be parsed as a number.
    """
    # Missing cells have nothing to parse; return None without printing one error line per cell.
    if value is None or (isinstance(value, float) and value != value):
        return None

    try:
        # Decimal comma -> decimal point so float() can parse the number.
        text = value.replace(",", ".")

        for unit, factor in _MARKET_VALUE_UNITS:
            if unit in text:
                # float() tolerates the whitespace left over after removing the unit.
                return float(text.replace(unit, "")) * factor

        # No known unit suffix: the value cannot be interpreted.
        return None

    except (AttributeError, TypeError, ValueError) as e:
        # Report the ORIGINAL input (not the partially rewritten text) so the
        # offending cell can be located in the data.
        print(f"Konvertierungsfehler: {e}, Wert: {value}")
        return None

In [11]:
# Convert the four market-value text columns (e.g. "3,40 Mio. €") to float euro amounts.
market_value_columns = [
    'HOME_AVG_MARKET_VALUE', 'HOME_TOTAL_MARKET_VALUE',
    'AWAY_AVG_MARKET_VALUE', 'AWAY_TOTAL_MARKET_VALUE',
]
df1 = df1.copy()
for market_column in market_value_columns:
    # Only parse columns that are still text. Re-running this cell on columns that are
    # already float would pass numbers to convert_market_value, which cannot parse them
    # and would replace every value with NaN.
    if df1[market_column].dtype == 'object':
        df1[market_column] = df1[market_column].apply(convert_market_value)
print(df1.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                          int64
AWAY_TEAM                               object
PLACE_AWAY_TEAM                          int64
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                       int64
HOME_PLAYERS_AVG_AGE                   float64
HOME_LEGIONARIES_COUNT                   int64
HOME_AVG_MARKET_VALUE                  float64
HOME_TOTAL_MARKET_VALUE                float64
HOME_AVG_AGE_

In [12]:
# Print the first row of the same columns again (the `object_columns` list computed earlier)
# to check the values after the conversions.
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25.0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                  3400000.0
HOME_TOTAL_MARKET_VALUE               98480000.0
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                 NaN
HOME_TOTAL_VALUE_LEAVING_MIO                 NaN
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25.6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                  2620000.0
AWAY_TOTAL_MARKET_VA

In [13]:
# Count missing values per column once more, now that the market-value columns are converted too.
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO     748
HOME_TOTAL_VALUE_LEAVING_MIO    1224
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [14]:
# Persist the result of this first preparation step as a pickle file in the current working directory.
df1.to_pickle("df_merge_club_transfer_data_prep1.pkl")