# Prepare data and convert data types

In [65]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# OUTPUT_FILE_NAME = "df_merge_club_transfer_data_2010_prep1.pkl"
# OUTPUT_FILE_NAME = "df_merge_club_transfer_data_weather_prep1.pkl"
OUTPUT_FILE_NAME = "df_merge_club_transfer_data_with_leave_join_mio_prep1.pkl"
DROP_NAN = True

In [66]:
df = pd.read_pickle("./df_merge_club_transfer_data.pkl")
#df = pd.read_pickle("./df_merge_club_transfer_data_weather.pkl")

print(df.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                         object
AWAY_TEAM                               object
PLACE_AWAY_TEAM                         object
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                      object
HOME_PLAYERS_AVG_AGE                    object
HOME_LEGIONARIES_COUNT                  object
HOME_AVG_MARKET_VALUE                   object
HOME_TOTAL_MARKET_VALUE                 object
HOME_AVG_AGE_

In [67]:
nan_count = df.isna().sum()
print(nan_count)
print(len(df))

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO       0
HOME_TOTAL_VALUE_LEAVING_MIO       0
HOME_EXPENSES_JOINING_MIO          0
HOME_REVENUE_LEAVING_MIO           0
A

## 1st step of data preparation

In [68]:
df1 = df.copy()
object_columns = df1.select_dtypes(include=['object']).columns
print(object_columns)

Index(['HOME_TEAM', 'PLACE_HOME_TEAM', 'AWAY_TEAM', 'PLACE_AWAY_TEAM',
       'RESULT', 'REFEREE', 'HOME_PLAYERS_COUNT', 'HOME_PLAYERS_AVG_AGE',
       'HOME_LEGIONARIES_COUNT', 'HOME_AVG_MARKET_VALUE',
       'HOME_TOTAL_MARKET_VALUE', 'HOME_AVG_AGE_LEAVING',
       'HOME_TOTAL_VALUE_JOINING_MIO', 'HOME_TOTAL_VALUE_LEAVING_MIO',
       'HOME_EXPENSES_JOINING_MIO', 'HOME_REVENUE_LEAVING_MIO',
       'AWAY_PLAYERS_COUNT', 'AWAY_PLAYERS_AVG_AGE', 'AWAY_LEGIONARIES_COUNT',
       'AWAY_AVG_MARKET_VALUE', 'AWAY_TOTAL_MARKET_VALUE',
       'AWAY_AVG_AGE_LEAVING', 'AWAY_TOTAL_VALUE_JOINING_MIO',
       'AWAY_TOTAL_VALUE_LEAVING_MIO', 'AWAY_EXPENSES_JOINING_MIO',
       'AWAY_REVENUE_LEAVING_MIO'],
      dtype='object')


In [69]:
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25,0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                3,40 Mio. €
HOME_TOTAL_MARKET_VALUE             98,48 Mio. €
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                    
HOME_TOTAL_VALUE_LEAVING_MIO                    
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25,6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                2,62 Mio. €
AWAY_TOTAL_MARKET_VA

In [70]:
columns_to_convert = ['PLACE_HOME_TEAM','PLACE_AWAY_TEAM','HOME_PLAYERS_COUNT','HOME_PLAYERS_AVG_AGE','HOME_LEGIONARIES_COUNT',
                     'HOME_AVG_AGE_LEAVING', "HOME_TOTAL_VALUE_JOINING_MIO", "HOME_TOTAL_VALUE_LEAVING_MIO", 'HOME_EXPENSES_JOINING_MIO','HOME_REVENUE_LEAVING_MIO', 
                     'AWAY_PLAYERS_COUNT','AWAY_PLAYERS_AVG_AGE','AWAY_LEGIONARIES_COUNT','AWAY_AVG_AGE_LEAVING', "AWAY_TOTAL_VALUE_JOINING_MIO", "AWAY_TOTAL_VALUE_LEAVING_MIO",
                     'AWAY_EXPENSES_JOINING_MIO', 'AWAY_REVENUE_LEAVING_MIO']

# with weather
# columns_to_convert = ['PLACE_HOME_TEAM','PLACE_AWAY_TEAM','HOME_PLAYERS_COUNT','HOME_PLAYERS_AVG_AGE','HOME_LEGIONARIES_COUNT',
#                      'HOME_AVG_AGE_LEAVING', "HOME_TOTAL_VALUE_JOINING_MIO", "HOME_TOTAL_VALUE_LEAVING_MIO", 'HOME_EXPENSES_JOINING_MIO','HOME_REVENUE_LEAVING_MIO', 
#                      'AWAY_PLAYERS_COUNT','AWAY_PLAYERS_AVG_AGE','AWAY_LEGIONARIES_COUNT','AWAY_AVG_AGE_LEAVING', "AWAY_TOTAL_VALUE_JOINING_MIO", "AWAY_TOTAL_VALUE_LEAVING_MIO",
#                      'AWAY_EXPENSES_JOINING_MIO', 'AWAY_REVENUE_LEAVING_MIO', 'SNOW_HEIGHT_cm', 'RAIN_l/m2', 'SUNSHINE_DURATION_h', 'MIN_TEMP_C', 'MAX_TEMP_C']

for column in columns_to_convert:
    if df1[column].dtype == 'object':  # Nur Spalten vom Typ 'object' verarbeiten
        df1[column] = df1[column].astype(str).str.replace(',', '.')
        df1[column] = pd.to_numeric(df1[column], errors='coerce')

In [71]:
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO     748
HOME_TOTAL_VALUE_LEAVING_MIO    1224
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [72]:

first_nan_index = df1[df1['AWAY_EXPENSES_JOINING_MIO'].isna()].index.max()
print(first_nan_index)
value = df.loc[first_nan_index, 'AWAY_EXPENSES_JOINING_MIO']
print(df.loc[first_nan_index, 'AWAY_EXPENSES_JOINING_MIO'])

5503



In [73]:
# Ein Set für die Teamnamen erstellen
unique_teams = set()

# Durch alle Zeilen im DataFrame iterieren und die Teamnamen in das Set aufnehmen, falls "AWAY_EXPENSES_JOINING_MIO" einen bestimmten Wert hat
for index, row in df.iterrows():
    if pd.notna(row['AWAY_EXPENSES_JOINING_MIO']):  # pd.notna prüft, ob der Wert nicht NaN ist
        unique_teams.add(row['AWAY_TEAM'])

# Das Set enthält jetzt die einzigartigen Teamnamen, wo "AWAY_EXPENSES_JOINING_MIO" nicht NaN ist
print(unique_teams)

{'Arminia Bielefeld', '1.FC Köln', 'Hertha BSC', 'FC Energie Cottbus', 'FC St. Pauli', '1.FC Union Berlin', 'Alemannia Aachen', 'FC Augsburg', 'VfB Stuttgart', 'FC Hansa Rostock', 'Hamburger SV', 'Eintracht Braunschweig', 'RasenBallsport Leipzig', 'Fortuna Düsseldorf', 'FC Bayern München', 'FC Schalke 04', 'SV Darmstadt 98', 'Borussia Dortmund', 'SV Werder Bremen', 'SC Paderborn 07', 'VfL Bochum', 'Eintracht Frankfurt', 'Hannover 96', 'Karlsruher SC', 'TSG 1899 Hoffenheim', 'MSV Duisburg', 'VfL Wolfsburg', '1.FSV Mainz 05', 'SC Freiburg', 'SpVgg Greuther Fürth', '1.FC Kaiserslautern', 'FC Ingolstadt 04', 'Borussia Mönchengladbach', '1.FC Nürnberg', 'Bayer 04 Leverkusen'}


In [74]:
def convert_market_value(value):
    try:
        # Entferne Kommas und ersetze sie durch Punkte
        value = value.replace(",", ".")
        
        # Millionen
        if "Mio. €" in value:
            value = value.replace("Mio. €", "")
            return float(value) * 1e6  # Multipliziere mit 1 Million

        # Tausend
        elif "Tsd. €" in value:
            value = value.replace("Tsd. €", "")
            return float(value) * 1e3  # Multipliziere mit 1 Tausend

        else:
            return None  # Falls die Konvertierung fehlschlägt, gebe None zurück

    except Exception as e:
        print(f"Konvertierungsfehler: {e}, Wert: {value}")
        return None  # Falls die Konvertierung fehlschlägt, gebe None zurück

In [75]:
df1 = df1.copy()
df1['HOME_AVG_MARKET_VALUE'] = df1['HOME_AVG_MARKET_VALUE'].apply(convert_market_value)
df1['HOME_TOTAL_MARKET_VALUE'] = df1['HOME_TOTAL_MARKET_VALUE'].apply(convert_market_value)
df1['AWAY_AVG_MARKET_VALUE'] = df1['AWAY_AVG_MARKET_VALUE'].apply(convert_market_value)
df1['AWAY_TOTAL_MARKET_VALUE'] = df1['AWAY_TOTAL_MARKET_VALUE'].apply(convert_market_value)
print(df1.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                               object
PLACE_HOME_TEAM                          int64
AWAY_TEAM                               object
PLACE_AWAY_TEAM                          int64
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                  object
REFEREE                                 object
HOME_PLAYERS_COUNT                       int64
HOME_PLAYERS_AVG_AGE                   float64
HOME_LEGIONARIES_COUNT                   int64
HOME_AVG_MARKET_VALUE                  float64
HOME_TOTAL_MARKET_VALUE                float64
HOME_AVG_AGE_

In [76]:
print(df1[object_columns].iloc[0])

HOME_TEAM                       SV Werder Bremen
PLACE_HOME_TEAM                                3
AWAY_TEAM                          FC Schalke 04
PLACE_AWAY_TEAM                                2
RESULT                                  HOME_WIN
REFEREE                         Stefan Trautmann
HOME_PLAYERS_COUNT                            29
HOME_PLAYERS_AVG_AGE                        25.0
HOME_LEGIONARIES_COUNT                        14
HOME_AVG_MARKET_VALUE                  3400000.0
HOME_TOTAL_MARKET_VALUE               98480000.0
HOME_AVG_AGE_LEAVING                        26.4
HOME_TOTAL_VALUE_JOINING_MIO                 NaN
HOME_TOTAL_VALUE_LEAVING_MIO                 NaN
HOME_EXPENSES_JOINING_MIO                    9.0
HOME_REVENUE_LEAVING_MIO                    6.45
AWAY_PLAYERS_COUNT                            33
AWAY_PLAYERS_AVG_AGE                        25.6
AWAY_LEGIONARIES_COUNT                        15
AWAY_AVG_MARKET_VALUE                  2620000.0
AWAY_TOTAL_MARKET_VA

In [77]:
nan_count = df1.isna().sum()
print(nan_count)

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO     748
HOME_TOTAL_VALUE_LEAVING_MIO    1224
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [78]:
if DROP_NAN == True:
    all_columns = df1.columns.tolist()
    columns_to_exclude = ['WIN_PERC_HOME', 'REMIS_PERC', 'WIN_PERC_AWAY']
    columns_to_include = [col for col in all_columns if col not in columns_to_exclude]
    df1.dropna(subset=columns_to_include, inplace=True)
    df1.reset_index(drop=True)
    
nan_count = df1.isna().sum()
print(nan_count)

DATE                             0
WEEKDAY                          0
MONTH                            0
SEASON                           0
MATCHDAY                         0
HOME_TEAM                        0
PLACE_HOME_TEAM                  0
AWAY_TEAM                        0
PLACE_AWAY_TEAM                  0
WIN_PERC_HOME                   92
REMIS_PERC                      92
WIN_PERC_AWAY                   92
HOME_GOALS                       0
AWAY_GOALS                       0
RESULT                           0
REFEREE                          0
HOME_PLAYERS_COUNT               0
HOME_PLAYERS_AVG_AGE             0
HOME_LEGIONARIES_COUNT           0
HOME_AVG_MARKET_VALUE            0
HOME_TOTAL_MARKET_VALUE          0
HOME_AVG_AGE_JOINING             0
HOME_AVG_AGE_LEAVING             0
HOME_TOTAL_VALUE_JOINING_MIO     0
HOME_TOTAL_VALUE_LEAVING_MIO     0
HOME_EXPENSES_JOINING_MIO        0
HOME_REVENUE_LEAVING_MIO         0
AWAY_PLAYERS_COUNT               0
AWAY_PLAYERS_AVG_AGE

In [79]:
df1

Unnamed: 0,DATE,WEEKDAY,MONTH,SEASON,MATCHDAY,HOME_TEAM,PLACE_HOME_TEAM,AWAY_TEAM,PLACE_AWAY_TEAM,WIN_PERC_HOME,...,AWAY_PLAYERS_AVG_AGE,AWAY_LEGIONARIES_COUNT,AWAY_AVG_MARKET_VALUE,AWAY_TOTAL_MARKET_VALUE,AWAY_AVG_AGE_JOINING,AWAY_AVG_AGE_LEAVING,AWAY_TOTAL_VALUE_JOINING_MIO,AWAY_TOTAL_VALUE_LEAVING_MIO,AWAY_EXPENSES_JOINING_MIO,AWAY_REVENUE_LEAVING_MIO
930,2007-08-18,5,8,2007,2,Hertha BSC,14,VfB Stuttgart,8,,...,24.9,17,3110000.0,108850000.0,24.7,26.1,1.030,0.375,10.70,3.70
942,2007-08-25,5,8,2007,3,Arminia Bielefeld,4,Hertha BSC,7,,...,24.5,22,2250000.0,81000000.0,22.8,24.9,0.225,0.875,17.35,18.80
952,2007-09-02,6,9,2007,4,Hamburger SV,5,FC Bayern München,1,,...,25.7,14,8040000.0,249200000.0,23.6,27.4,15.200,6.250,93.20,36.15
994,2007-10-06,5,10,2007,9,Arminia Bielefeld,11,Hamburger SV,6,,...,24.1,24,3350000.0,120750000.0,20.4,24.6,6.730,0.300,12.25,8.91
1003,2007-10-20,5,10,2007,10,Hamburger SV,4,VfB Stuttgart,12,,...,24.9,17,3110000.0,108850000.0,24.7,26.1,1.030,0.375,10.70,3.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5809,2023-05-27,5,5,2022,34,Borussia Mönchengladbach,11,FC Augsburg,14,68.9,...,25.0,23,3150000.0,132280000.0,22.6,24.6,69.200,39.500,11.45,6.80
5810,2023-05-27,5,5,2022,34,Eintracht Frankfurt,8,SC Freiburg,5,31.4,...,26.0,11,6930000.0,200950000.0,23.3,23.1,59.750,92.150,15.90,21.33
5811,2023-05-27,5,5,2022,34,VfL Wolfsburg,7,Hertha BSC,18,93.6,...,25.4,21,2290000.0,94000000.0,23.7,24.4,84.830,80.780,8.55,24.25
5812,2023-05-27,5,5,2022,34,VfL Bochum,16,Bayer 04 Leverkusen,6,13.0,...,24.2,24,12930000.0,452550000.0,21.8,24.6,55.850,27.650,15.10,9.50


In [80]:
df1.to_pickle(OUTPUT_FILE_NAME)