# Prepare data and convert data types

In [73]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [74]:
# Load the pickled DataFrame from the working directory and list the dtype of every column.
# NOTE(review): unpickling can execute arbitrary code — only read this file if it comes from a trusted source.
df = pd.read_pickle("./df_merge_club_transfer_data.pkl")
print(df.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                         object
AWAY_TEAM                               object
PLACE_AWAY_TEAM                         object
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                      object
HOME_PLAYERS_AVG_AGE                    object
HOME_LEGIONARIES_COUNT                  object
HOME_AVG_MARKET_VALUE                   object
HOME_TOTAL_MARKET_VALUE                 object
HOME_AVG_AGE_

In [75]:
# Count the missing values per column and print the total row count next to it for comparison.
nan_count = df.isna().sum()
print(nan_count)
print(len(df))

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO       0
HOME_TOTAL_VALUE_LEAVING_MIO       0
HOME_EXPENSES_JOINING_MIO          0
HOME_REVENUE_LEAVING_MIO           0
A

## 1st step of data preparation

In [76]:
# Work on a copy so the loaded frame `df` stays unchanged, then list the columns
# that are stored with the generic `object` dtype.
df1 = df.copy()
object_columns = df1.select_dtypes(include=['object']).columns
print(object_columns)

Index(['HOME_TEAM', 'PLACE_HOME_TEAM', 'AWAY_TEAM', 'PLACE_AWAY_TEAM',
       'RESULT', 'REFEREE', 'HOME_PLAYERS_COUNT', 'HOME_PLAYERS_AVG_AGE',
       'HOME_LEGIONARIES_COUNT', 'HOME_AVG_MARKET_VALUE',
       'HOME_TOTAL_MARKET_VALUE', 'HOME_AVG_AGE_LEAVING',
       'HOME_TOTAL_VALUE_JOINING_MIO', 'HOME_TOTAL_VALUE_LEAVING_MIO',
       'HOME_EXPENSES_JOINING_MIO', 'HOME_REVENUE_LEAVING_MIO',
       'AWAY_PLAYERS_COUNT', 'AWAY_PLAYERS_AVG_AGE', 'AWAY_LEGIONARIES_COUNT',
       'AWAY_AVG_MARKET_VALUE', 'AWAY_TOTAL_MARKET_VALUE',
       'AWAY_AVG_AGE_LEAVING', 'AWAY_TOTAL_VALUE_JOINING_MIO',
       'AWAY_TOTAL_VALUE_LEAVING_MIO', 'AWAY_EXPENSES_JOINING_MIO',
       'AWAY_REVENUE_LEAVING_MIO'],
      dtype='object')


In [77]:
# Show the first row of the object-typed columns to inspect how the values are formatted.
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25,0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                3,40 Mio. €
HOME_TOTAL_MARKET_VALUE             98,48 Mio. €
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                    
HOME_TOTAL_VALUE_LEAVING_MIO                    
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25,6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                2,62 Mio. €
AWAY_TOTAL_MARKET_VA

In [78]:
# Columns that are stored as text although they hold numbers; some of them use a
# decimal comma (e.g. "25,0").
columns_to_convert = [
    'PLACE_HOME_TEAM',
    'PLACE_AWAY_TEAM',
    'HOME_PLAYERS_COUNT',
    'HOME_PLAYERS_AVG_AGE',
    'HOME_LEGIONARIES_COUNT',
    'HOME_AVG_AGE_LEAVING',
    'HOME_EXPENSES_JOINING_MIO',
    'HOME_REVENUE_LEAVING_MIO',
    'AWAY_PLAYERS_COUNT',
    'AWAY_PLAYERS_AVG_AGE',
    'AWAY_LEGIONARIES_COUNT',
    'AWAY_AVG_AGE_LEAVING',
    'AWAY_EXPENSES_JOINING_MIO',
    'AWAY_REVENUE_LEAVING_MIO',
]
for column in columns_to_convert:
    # Only columns of dtype 'object' are processed; everything else is skipped.
    if df1[column].dtype != 'object':
        continue
    # Replace the decimal comma with a point, then parse; values that still
    # cannot be parsed become NaN (errors='coerce').
    df1[column] = pd.to_numeric(
        df1[column].astype(str).str.replace(',', '.'),
        errors='coerce',
    )

In [79]:
# Recount the missing values per column in the working copy `df1`.
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO       0
HOME_TOTAL_VALUE_LEAVING_MIO       0
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [80]:

# Pick one row in which AWAY_EXPENSES_JOINING_MIO is NaN in `df1` and show the
# value stored for that row in the originally loaded frame `df`.
# NOTE(review): despite the variable name, `.max()` selects the highest index
# label among the NaN rows, not the first one.
first_nan_index = df1.loc[df1['AWAY_EXPENSES_JOINING_MIO'].isna()].index.max()
print(first_nan_index)
value = df.loc[first_nan_index, 'AWAY_EXPENSES_JOINING_MIO']
print(value)

5503



In [81]:
# Collect the distinct away-team names of all rows in which
# AWAY_EXPENSES_JOINING_MIO is not NaN.
# A boolean mask selects the rows in one vectorised step instead of visiting
# every row with iterrows(), which is slow and builds a Series per row.
# NOTE(review): this reads the originally loaded frame `df`; if the goal was to
# find the teams whose value could not be converted to a number, presumably
# `df1` together with `.isna()` was intended — confirm.
unique_teams = set(df.loc[df['AWAY_EXPENSES_JOINING_MIO'].notna(), 'AWAY_TEAM'])

# The set now holds the unique team names where AWAY_EXPENSES_JOINING_MIO is not NaN.
print(unique_teams)

{'1.FSV Mainz 05', 'Hannover 96', 'Arminia Bielefeld', 'MSV Duisburg', 'VfL Wolfsburg', 'Borussia Mönchengladbach', 'FC Ingolstadt 04', 'Eintracht Braunschweig', 'TSG 1899 Hoffenheim', 'Eintracht Frankfurt', '1.FC Nürnberg', 'Hertha BSC', 'FC Hansa Rostock', 'Fortuna Düsseldorf', 'RasenBallsport Leipzig', '1.FC Kaiserslautern', 'SpVgg Greuther Fürth', 'SV Werder Bremen', 'VfL Bochum', 'Karlsruher SC', 'FC Augsburg', 'FC Schalke 04', 'SV Darmstadt 98', 'SC Paderborn 07', 'Bayer 04 Leverkusen', '1.FC Köln', 'VfB Stuttgart', '1.FC Union Berlin', 'Alemannia Aachen', 'SC Freiburg', 'FC Bayern München', 'FC Energie Cottbus', 'FC St. Pauli', 'Hamburger SV', 'Borussia Dortmund'}


In [82]:
def convert_market_value(value):
    """Convert a German-formatted market value such as "3,40 Mio. €" to euros.

    Parameters
    ----------
    value : str, number or None
        Text with a decimal comma followed by one of the unit suffixes
        "Mrd. €" (billion), "Mio. €" (million) or "Tsd. €" (thousand).
        A number is treated as an already converted amount.

    Returns
    -------
    float or None
        The amount in euros. None is returned for missing input (None/NaN),
        for text without a known unit suffix, and for text whose numeric part
        cannot be parsed (the latter is also reported on stdout).
    """
    import numbers  # local import so the function does not depend on other cells

    # Missing input: nothing to convert and nothing worth logging.
    if value is None:
        return None

    # Already numeric, e.g. when the conversion cell is run a second time on
    # converted data: pass the amount through instead of discarding it.
    if isinstance(value, numbers.Real) and not isinstance(value, bool):
        # NaN is the only number that is not equal to itself.
        return None if value != value else float(value)

    unit_factors = (("Mrd. €", 1e9), ("Mio. €", 1e6), ("Tsd. €", 1e3))
    try:
        # Replace the decimal comma with a point so float() can parse the number.
        normalized = value.replace(",", ".")
        for unit, factor in unit_factors:
            if unit in normalized:
                return float(normalized.replace(unit, "")) * factor
        # No known unit suffix: the value cannot be interpreted.
        return None
    except (AttributeError, TypeError, ValueError) as e:
        # Unsupported type or unparsable number: report it and signal failure.
        print(f"Konvertierungsfehler: {e}, Wert: {value}")
        return None

In [83]:
# Convert the four market-value columns from text such as "3,40 Mio. €" with
# convert_market_value, one column after the other, then print the dtypes again.
df1 = df1.copy()
for market_value_column in (
    'HOME_AVG_MARKET_VALUE',
    'HOME_TOTAL_MARKET_VALUE',
    'AWAY_AVG_MARKET_VALUE',
    'AWAY_TOTAL_MARKET_VALUE',
):
    df1[market_value_column] = df1[market_value_column].apply(convert_market_value)
print(df1.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                          int64
AWAY_TEAM                               object
PLACE_AWAY_TEAM                          int64
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                       int64
HOME_PLAYERS_AVG_AGE                   float64
HOME_LEGIONARIES_COUNT                   int64
HOME_AVG_MARKET_VALUE                  float64
HOME_TOTAL_MARKET_VALUE                float64
HOME_AVG_AGE_

In [84]:
# Show the first row again for the columns that were object-typed before the conversions.
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25.0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                  3400000.0
HOME_TOTAL_MARKET_VALUE               98480000.0
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                    
HOME_TOTAL_VALUE_LEAVING_MIO                    
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25.6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                  2620000.0
AWAY_TOTAL_MARKET_VA

In [85]:
# Recount the missing values per column after the market-value conversion.
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO       0
HOME_TOTAL_VALUE_LEAVING_MIO       0
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [86]:
# Persist the result of the first preparation step as a pickle in the working directory.
df1.to_pickle("df_merge_club_transfer_data_prep1.pkl")

## 2nd step of data preparation

In [112]:
# Start the second preparation step on a copy so `df1` keeps the state that was saved above.
df2 = df1.copy()

In [113]:
# Replace the home team names with integer codes and keep the fitted encoder
# so the codes can be mapped back to names later.
le_HOME_TEAM = LabelEncoder()
encoded_HOME_TEAM = le_HOME_TEAM.fit_transform(df2['HOME_TEAM'])
df2['HOME_TEAM'] = encoded_HOME_TEAM

# Code -> original label, in the order of the encoder's classes_.
label_mapping_HOME_TEAM = dict(enumerate(le_HOME_TEAM.classes_))
print("Label Mapping:", label_mapping_HOME_TEAM)

Label Mapping: {0: '1.FC Kaiserslautern', 1: '1.FC Köln', 2: '1.FC Nürnberg', 3: '1.FC Union Berlin', 4: '1.FSV Mainz 05', 5: 'Alemannia Aachen', 6: 'Arminia Bielefeld', 7: 'Bayer 04 Leverkusen', 8: 'Borussia Dortmund', 9: 'Borussia Mönchengladbach', 10: 'Eintracht Braunschweig', 11: 'Eintracht Frankfurt', 12: 'FC Augsburg', 13: 'FC Bayern München', 14: 'FC Energie Cottbus', 15: 'FC Hansa Rostock', 16: 'FC Ingolstadt 04', 17: 'FC Schalke 04', 18: 'FC St. Pauli', 19: 'Fortuna Düsseldorf', 20: 'Hamburger SV', 21: 'Hannover 96', 22: 'Hertha BSC', 23: 'Karlsruher SC', 24: 'MSV Duisburg', 25: 'RasenBallsport Leipzig', 26: 'SC Freiburg', 27: 'SC Paderborn 07', 28: 'SV Darmstadt 98', 29: 'SV Werder Bremen', 30: 'SpVgg Greuther Fürth', 31: 'TSG 1899 Hoffenheim', 32: 'VfB Stuttgart', 33: 'VfL Bochum', 34: 'VfL Wolfsburg'}


In [114]:
# Replace the referee names with integer codes and keep the fitted encoder
# so the codes can be mapped back to names later.
le_REFEREE = LabelEncoder()
encoded_REFEREE = le_REFEREE.fit_transform(df2['REFEREE'])
df2['REFEREE'] = encoded_REFEREE

# Code -> original label, in the order of the encoder's classes_.
label_mapping_REFEREE = dict(enumerate(le_REFEREE.classes_))
print("Label Mapping:", label_mapping_REFEREE)

Label Mapping: {0: 'Babak Rafati', 1: 'Bastian Dankert', 2: 'Benjamin Brand', 3: 'Benjamin Cortus', 4: 'Bibiana Steinhaus-Webb', 5: 'Christian Dingert', 6: 'Daniel Schlager', 7: 'Daniel Siebert', 8: 'Deniz Aytekin', 9: 'Dr. Arne Aarnink', 10: 'Dr. Felix Brych', 11: 'Dr. Franz-Xaver Wack', 12: 'Dr. Helmut Fleischer', 13: 'Dr. Jochen Drees', 14: 'Dr. Markus Merk', 15: 'Dr. Martin Thomsen', 16: 'Dr. Matthias Jöllenbeck', 17: 'Dr. Robert Kampka', 18: 'Dr. Robin Braun', 19: 'Felix Zwayer', 20: 'Florian Badstübner', 21: 'Florian Meyer', 22: 'Frank Willenborg', 23: 'Guido Winkmann', 24: 'Günter Perl', 25: 'Harm Osmers', 26: 'Herbert Fandel', 27: 'Hermann Albrecht', 28: 'Jörg Keßler', 29: 'Jürgen Jansen', 30: 'Knut Kircher', 31: 'Lutz Wagner', 32: 'Lutz-Michael Fröhlich', 33: 'Manuel Gräfe', 34: 'Marc Seemann', 35: 'Marco Fritz', 36: 'Markus Schmidt', 37: 'Markus Wingenbach', 38: 'Martin Petersen', 39: 'Michael Kempter', 40: 'Michael Weiner', 41: 'Patrick Ittrich', 42: 'Peter Gagelmann', 43: '

In [115]:
# Replace the match result labels with integer codes and keep the fitted
# encoder so the codes can be mapped back to labels later.
le_RESULT = LabelEncoder()
encoded_RESULT = le_RESULT.fit_transform(df2['RESULT'])
df2['RESULT'] = encoded_RESULT

# Code -> original label, in the order of the encoder's classes_.
label_mapping_RESULT = dict(enumerate(le_RESULT.classes_))
print("Label Mapping:", label_mapping_RESULT)

Label Mapping: {0: 'AWAY_WIN', 1: 'DRAW', 2: 'HOME_WIN'}


In [117]:
# Show the first row of `df2` for the formerly object-typed columns after label encoding.
# NOTE(review): the recorded output shows AWAY_TEAM as an integer, but no visible cell
# encodes that column (execution count 116 is missing) — presumably the encoding cell
# was deleted; confirm and restore it so the notebook reproduces on a fresh kernel.
print(df2[object_columns].iloc[0])

HOME_TEAM                                2
PLACE_HOME_TEAM                          3
AWAY_TEAM                               17
PLACE_AWAY_TEAM                          2
RESULT                                   2
REFEREE                                 47
HOME_PLAYERS_COUNT                      29
HOME_PLAYERS_AVG_AGE                  25.0
HOME_LEGIONARIES_COUNT                  14
HOME_AVG_MARKET_VALUE            3400000.0
HOME_TOTAL_MARKET_VALUE         98480000.0
HOME_AVG_AGE_LEAVING                  26.4
HOME_TOTAL_VALUE_JOINING_MIO              
HOME_TOTAL_VALUE_LEAVING_MIO              
HOME_EXPENSES_JOINING_MIO              9.0
HOME_REVENUE_LEAVING_MIO              6.45
AWAY_PLAYERS_COUNT                      33
AWAY_PLAYERS_AVG_AGE                  25.6
AWAY_LEGIONARIES_COUNT                  15
AWAY_AVG_MARKET_VALUE            2620000.0
AWAY_TOTAL_MARKET_VALUE         86330000.0
AWAY_AVG_AGE_LEAVING                  27.8
AWAY_TOTAL_VALUE_JOINING_MIO              
AWAY_TOTAL_

In [118]:
# Show the labels learned by the RESULT encoder; the position of a label is its integer code.
# The encoder is referenced by its actual name `le_RESULT`: a bare `le` is not defined by
# any cell in this notebook and raises NameError on a fresh kernel.
print("Classes:", le_RESULT.classes_)

Classes: ['AWAY_WIN' 'DRAW' 'HOME_WIN']
