# Modelling

In [31]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
# Raw merged club/transfer table. In the cells visible here it is used to record
# which columns are stored with dtype object and to print a row count.
# NOTE(review): unpickling can execute code embedded in the file - only load
# pickle files that were produced by a trusted step of this project.
df = pd.read_pickle("./df_merge_club_transfer_data.pkl")
object_columns = df.select_dtypes(include=['object']).columns

# Input of this notebook's preparation step (presumably the output of an
# earlier "prep1" step, judging by the file name - confirm against that notebook).
df1 = pd.read_pickle("./df_merge_club_transfer_data_prep1.pkl")

## Data preparation, step 2: label-encode teams, referee and result

In [3]:
# Work on a copy so that the column assignments in the following cells
# modify df2 only and leave df1 as loaded.
df2 = df1.copy()

In [4]:
# Collect every team name that occurs as home or away side.
# The names are read from df1 (the untouched input of this step) instead of df2:
# df2's team columns are overwritten with integer codes below, so fitting on df2
# would, on a second run of this cell, encode the codes themselves and replace
# the name lookup with a meaningless int -> int mapping.
combined_teams = pd.concat([df1['HOME_TEAM'], df1['AWAY_TEAM']])

# One shared encoder for both columns, so a club gets the same integer code
# whether it appears as home or as away team.
le_teams = LabelEncoder()
le_teams.fit(combined_teams)

# Write the integer codes into the working copy (df2 is a row-for-row copy of df1).
df2['HOME_TEAM'] = le_teams.transform(df1['HOME_TEAM'])
df2['AWAY_TEAM'] = le_teams.transform(df1['AWAY_TEAM'])

# Lookup table code -> team name for decoding.
label_mapping_TEAM = dict(enumerate(le_teams.classes_))

print("Label Mapping:", label_mapping_TEAM)

Label Mapping: {0: '1.FC Kaiserslautern', 1: '1.FC Köln', 2: '1.FC Nürnberg', 3: '1.FC Union Berlin', 4: '1.FSV Mainz 05', 5: 'Alemannia Aachen', 6: 'Arminia Bielefeld', 7: 'Bayer 04 Leverkusen', 8: 'Borussia Dortmund', 9: 'Borussia Mönchengladbach', 10: 'Eintracht Braunschweig', 11: 'Eintracht Frankfurt', 12: 'FC Augsburg', 13: 'FC Bayern München', 14: 'FC Energie Cottbus', 15: 'FC Hansa Rostock', 16: 'FC Ingolstadt 04', 17: 'FC Schalke 04', 18: 'FC St. Pauli', 19: 'Fortuna Düsseldorf', 20: 'Hamburger SV', 21: 'Hannover 96', 22: 'Hertha BSC', 23: 'Karlsruher SC', 24: 'MSV Duisburg', 25: 'RasenBallsport Leipzig', 26: 'SC Freiburg', 27: 'SC Paderborn 07', 28: 'SV Darmstadt 98', 29: 'SV Werder Bremen', 30: 'SpVgg Greuther Fürth', 31: 'TSG 1899 Hoffenheim', 32: 'VfB Stuttgart', 33: 'VfL Bochum', 34: 'VfL Wolfsburg'}


In [5]:
# Encode referee names as integer codes.
# The names are taken from df1 (untouched input) rather than from df2, because
# df2['REFEREE'] is overwritten with the codes right below; reading df2 would make
# a re-run of this cell encode the codes and lose the name lookup.
le_REFEREE = LabelEncoder()
encoded_REFEREE = le_REFEREE.fit_transform(df1['REFEREE'])
df2['REFEREE'] = encoded_REFEREE

# Lookup table code -> referee name for decoding.
label_mapping_REFEREE = dict(enumerate(le_REFEREE.classes_))
print("Label Mapping:", label_mapping_REFEREE)

Label Mapping: {0: 'Babak Rafati', 1: 'Bastian Dankert', 2: 'Benjamin Brand', 3: 'Benjamin Cortus', 4: 'Bibiana Steinhaus-Webb', 5: 'Christian Dingert', 6: 'Daniel Schlager', 7: 'Daniel Siebert', 8: 'Deniz Aytekin', 9: 'Dr. Arne Aarnink', 10: 'Dr. Felix Brych', 11: 'Dr. Franz-Xaver Wack', 12: 'Dr. Helmut Fleischer', 13: 'Dr. Jochen Drees', 14: 'Dr. Markus Merk', 15: 'Dr. Martin Thomsen', 16: 'Dr. Matthias Jöllenbeck', 17: 'Dr. Robert Kampka', 18: 'Dr. Robin Braun', 19: 'Felix Zwayer', 20: 'Florian Badstübner', 21: 'Florian Meyer', 22: 'Frank Willenborg', 23: 'Guido Winkmann', 24: 'Günter Perl', 25: 'Harm Osmers', 26: 'Herbert Fandel', 27: 'Hermann Albrecht', 28: 'Jörg Keßler', 29: 'Jürgen Jansen', 30: 'Knut Kircher', 31: 'Lutz Wagner', 32: 'Lutz-Michael Fröhlich', 33: 'Manuel Gräfe', 34: 'Marc Seemann', 35: 'Marco Fritz', 36: 'Markus Schmidt', 37: 'Markus Wingenbach', 38: 'Martin Petersen', 39: 'Michael Kempter', 40: 'Michael Weiner', 41: 'Patrick Ittrich', 42: 'Peter Gagelmann', 43: '

In [6]:
# Encode the match outcome (the prediction target) as integer codes.
# The labels are taken from df1 (untouched input) rather than from df2, because
# df2['RESULT'] is overwritten with the codes right below; reading df2 would make
# a re-run of this cell encode the codes and lose the outcome names.
le_RESULT = LabelEncoder()
encoded_RESULT = le_RESULT.fit_transform(df1['RESULT'])
df2['RESULT'] = encoded_RESULT

# Lookup table code -> outcome label; needed to read the per-class metrics
# printed by the model cells, which only show the integer codes.
label_mapping_RESULT = dict(enumerate(le_RESULT.classes_))
print("Label Mapping:", label_mapping_RESULT)

Label Mapping: {0: 'AWAY_WIN', 1: 'DRAW', 2: 'HOME_WIN'}


In [7]:
# Show the dtype of every df2 column to check the result of the encoding cells.
print(df2.dtypes)

DATE                            datetime64[ns]
WEEKDAY                                  int64
MONTH                                    int64
SEASON                                   int32
MATCHDAY                                 int64
HOME_TEAM                                int32
PLACE_HOME_TEAM                          int64
AWAY_TEAM                                int32
PLACE_AWAY_TEAM                          int64
WIN_PERC_HOME                          float64
REMIS_PERC                             float64
WIN_PERC_AWAY                          float64
HOME_GOALS                             float64
AWAY_GOALS                             float64
RESULT                                   int32
REFEREE                                  int32
HOME_PLAYERS_COUNT                       int64
HOME_PLAYERS_AVG_AGE                   float64
HOME_LEGIONARIES_COUNT                   int64
HOME_AVG_MARKET_VALUE                  float64
HOME_TOTAL_MARKET_VALUE                float64
HOME_AVG_AGE_

In [8]:
# object_columns was derived from the raw frame df, so this prints the first row
# of df2 restricted to the columns that had dtype object in df.
print(df2[object_columns].iloc[0])

HOME_TEAM                             29.00
PLACE_HOME_TEAM                        3.00
AWAY_TEAM                             17.00
PLACE_AWAY_TEAM                        2.00
RESULT                                 2.00
REFEREE                               47.00
HOME_PLAYERS_COUNT                    29.00
HOME_PLAYERS_AVG_AGE                  25.00
HOME_LEGIONARIES_COUNT                14.00
HOME_AVG_MARKET_VALUE            3400000.00
HOME_TOTAL_MARKET_VALUE         98480000.00
HOME_AVG_AGE_LEAVING                  26.40
HOME_TOTAL_VALUE_JOINING_MIO            NaN
HOME_TOTAL_VALUE_LEAVING_MIO            NaN
HOME_EXPENSES_JOINING_MIO              9.00
HOME_REVENUE_LEAVING_MIO               6.45
AWAY_PLAYERS_COUNT                    33.00
AWAY_PLAYERS_AVG_AGE                  25.60
AWAY_LEGIONARIES_COUNT                15.00
AWAY_AVG_MARKET_VALUE            2620000.00
AWAY_TOTAL_MARKET_VALUE         86330000.00
AWAY_AVG_AGE_LEAVING                  27.80
AWAY_TOTAL_VALUE_JOINING_MIO    

In [9]:
# Number of missing values per column of df2, followed by a row count for scale.
# NOTE(review): the row count is taken from the raw frame df, not from df2 -
# confirm both frames have the same number of rows, otherwise use len(df2).
nan_count = df2.isna().sum()
print(nan_count, len(df), sep="\n")

DATE                               0
WEEKDAY                            0
MONTH                              0
SEASON                             0
MATCHDAY                           0
HOME_TEAM                          0
PLACE_HOME_TEAM                    0
AWAY_TEAM                          0
PLACE_AWAY_TEAM                    0
WIN_PERC_HOME                   1530
REMIS_PERC                      1530
WIN_PERC_AWAY                   1530
HOME_GOALS                         0
AWAY_GOALS                         0
RESULT                             0
REFEREE                            0
HOME_PLAYERS_COUNT                 0
HOME_PLAYERS_AVG_AGE               0
HOME_LEGIONARIES_COUNT             0
HOME_AVG_MARKET_VALUE              0
HOME_TOTAL_MARKET_VALUE            0
HOME_AVG_AGE_JOINING               0
HOME_AVG_AGE_LEAVING               0
HOME_TOTAL_VALUE_JOINING_MIO     748
HOME_TOTAL_VALUE_LEAVING_MIO    1224
HOME_EXPENSES_JOINING_MIO         85
HOME_REVENUE_LEAVING_MIO         272
A

In [10]:
# Persist df2 (with the encoded columns) to a pickle file; the path is relative,
# so the file is written to the current working directory.
df2.to_pickle("df_merge_club_transfer_data_prep2.pkl")

## Modelling

In [43]:
# Per-team inputs, spelled out for the home side.
# Left out on purpose: "HOME_TOTAL_VALUE_JOINING_MIO" and
# "HOME_TOTAL_VALUE_LEAVING_MIO" (and their AWAY_ counterparts).
home_features = [
    "HOME_TEAM",
    "PLACE_HOME_TEAM",
    "HOME_PLAYERS_COUNT",
    "HOME_PLAYERS_AVG_AGE",
    "HOME_LEGIONARIES_COUNT",
    "HOME_AVG_MARKET_VALUE",
    "HOME_TOTAL_MARKET_VALUE",
    "HOME_AVG_AGE_JOINING",
    "HOME_AVG_AGE_LEAVING",
]

# The away side uses the same columns with HOME swapped for AWAY, in the same order.
away_features = [name.replace("HOME", "AWAY") for name in home_features]

# Final column order: home block, away block, then the match day.
selected_features = home_features + away_features + ["MATCHDAY"]

In [44]:
# Feature matrix (selected columns only) and target (encoded match result).
X = df2.loc[:, selected_features]
y = df2.loc[:, 'RESULT']

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Modell auswählen und trainieren
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Vorhersagen treffen
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Genauigkeit für jede Klasse berechnen
accuracy_per_class = np.diag(cm) / np.sum(cm, axis=1)

# Modell evaluieren
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

for i, acc in enumerate(accuracy_per_class):
    print(f"Accuracy for class {i}: {acc:.2f}")

Accuracy: 0.4892519346517627

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.46      0.47       349
           1       0.25      0.09      0.13       293
           2       0.53      0.74      0.62       521

    accuracy                           0.49      1163
   macro avg       0.42      0.43      0.40      1163
weighted avg       0.44      0.49      0.45      1163

Accuracy for class 0: 0.46
Accuracy for class 1: 0.09
Accuracy for class 2: 0.74


In [46]:
# Modell auswählen und trainieren
clf_xgb = XGBClassifier(objective='multi:softprob', random_state=42)
clf_xgb.fit(X_train, y_train)

# Vorhersagen treffen
y_pred = clf_xgb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Genauigkeit für jede Klasse berechnen
accuracy_per_class = np.diag(cm) / np.sum(cm, axis=1)

# Modell evaluieren
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

for i, acc in enumerate(accuracy_per_class):
    print(f"Accuracy for class {i}: {acc:.2f}")

Accuracy: 0.472055030094583

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.46      0.46       349
           1       0.25      0.16      0.19       293
           2       0.53      0.66      0.59       521

    accuracy                           0.47      1163
   macro avg       0.42      0.42      0.42      1163
weighted avg       0.44      0.47      0.45      1163

Accuracy for class 0: 0.46
Accuracy for class 1: 0.16
Accuracy for class 2: 0.66


In [47]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the k closest training rows.
# NOTE(review): the features are passed unscaled although their magnitudes differ
# by several orders (market values vs. match day); distance-based models are
# usually fitted on standardised inputs - consider adding a scaler.
k = 5  # number of neighbours
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against predictions (columns).
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Correctly predicted samples of each class divided by the number of true samples
# of that class (the same quantity the report lists as per-class recall).
accuracy_per_class = cm.diagonal() / cm.sum(axis=1)

# Overall accuracy, the full per-class report, then the per-class ratios.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

for i, acc in enumerate(accuracy_per_class):
    print(f"Accuracy for class {i}: {acc:.2f}")

Accuracy: 0.44969905417024936

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.52      0.45       349
           1       0.28      0.20      0.23       293
           2       0.56      0.55      0.55       521

    accuracy                           0.45      1163
   macro avg       0.41      0.42      0.41      1163
weighted avg       0.44      0.45      0.44      1163

Accuracy for class 0: 0.52
Accuracy for class 1: 0.20
Accuracy for class 2: 0.55


In [48]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings.
# NOTE(review): this model is documented for count-like, non-negative features;
# several inputs here are continuous (ages, market values) - confirm it is the
# intended naive Bayes variant.
mnb = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against predictions (columns).
y_pred = mnb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Correctly predicted samples of each class divided by the number of true samples
# of that class (the same quantity the report lists as per-class recall).
accuracy_per_class = cm.diagonal() / cm.sum(axis=1)

# Overall accuracy, the full per-class report, then the per-class ratios.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

for i, acc in enumerate(accuracy_per_class):
    print(f"Accuracy for class {i}: {acc:.2f}")

Accuracy: 0.47893379191745483

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.56      0.50       349
           1       0.25      0.18      0.21       293
           2       0.60      0.59      0.60       521

    accuracy                           0.48      1163
   macro avg       0.43      0.44      0.43      1163
weighted avg       0.47      0.48      0.47      1163

Accuracy for class 0: 0.56
Accuracy for class 1: 0.18
Accuracy for class 2: 0.59


In [49]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings.
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against predictions (columns).
y_pred = qda.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Correctly predicted samples of each class divided by the number of true samples
# of that class (the same quantity the report lists as per-class recall).
accuracy_per_class = cm.diagonal() / cm.sum(axis=1)

# Overall accuracy, the full per-class report, then the per-class ratios.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

for i, acc in enumerate(accuracy_per_class):
    print(f"Accuracy for class {i}: {acc:.2f}")

Accuracy: 0.47377472055030095

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.42      0.45       349
           1       0.29      0.30      0.29       293
           2       0.57      0.61      0.59       521

    accuracy                           0.47      1163
   macro avg       0.45      0.44      0.44      1163
weighted avg       0.47      0.47      0.47      1163

Accuracy for class 0: 0.42
Accuracy for class 1: 0.30
Accuracy for class 2: 0.61
