In [15]:
import helpers
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score,  balanced_accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Integer
from skopt.space import Categorical
from skopt.space import Real
import warnings
import json
# Install a global filter that ignores every DeprecationWarning from here on.
# NOTE(review): a blanket filter also hides deprecation notices emitted by the
# libraries used in this notebook — consider narrowing it with the `module=` or
# `message=` arguments so that upcoming breakage stays visible.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [16]:
# Load the pickled DataFrame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle("./df_transfer_data.pkl")

In [17]:
# Render the freshly loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,CLUB_NAME,AVG_AGE_JOINING,AVG_AGE_LEAVING,TOTAL_VALUE_JOINING_MIO,TOTAL_VALUE_LEAVING_MIO,EXPENSES_JOINING_MIO,REVENUE_LEAVING_MIO,season
0,1.FC Saarbrücken,22.8,25.2,,,,,1963
1,1.FC Kaiserslautern,22.8,24.2,,,0.06,,1963
2,1.FC Köln,18.0,24.3,,,,0.25,1963
3,1.FC Nürnberg,24.0,21.5,,,0.025,,1963
4,Borussia Dortmund,22.8,29.3,,,,0.115,1963
...,...,...,...,...,...,...,...,...
13,FC Augsburg,22.6,24.6,69.2,39.5,11.45,6.8,2022
14,VfB Stuttgart,22.8,22.8,72.75,99.9,14.99,52.6,2022
15,Hertha BSC,23.7,24.4,84.83,80.78,8.55,24.25,2022
16,FC Schalke 04,24.7,24.6,90.15,89.85,8.63,21.35,2022


In [18]:
columns_to_check = [
    "TOTAL_VALUE_JOINING_MIO",
    "TOTAL_VALUE_LEAVING_MIO",
]

# Order the rows by club and then by season, so that the first row kept per
# club below is that club's earliest qualifying season.
df = df.sort_values(by=["CLUB_NAME", "season"])

# True for rows in which none of the checked columns is NaN/None.
# NOTE(review): notna() counts empty strings as present. The printed result
# shows blank values for many clubs, so '' entries appear to be treated as
# "valid" here — confirm whether they should be converted to NaN first.
has_all_values = df[columns_to_check].notna().all(axis=1)

# Earliest qualifying row per club. GroupBy.head(1) keeps the CLUB_NAME column
# and the existing row order, so no groupby(...).apply(...) is needed; apply's
# handling of the grouping column is deprecated, and once that column is
# excluded the column selection in the print below would raise a KeyError.
result = (
    df[has_all_values]
    .groupby("CLUB_NAME")
    .head(1)
    .reset_index(drop=True)
)

print(result[["CLUB_NAME", "season"] + columns_to_check])

                          CLUB_NAME season TOTAL_VALUE_JOINING_MIO  \
0               1.FC Kaiserslautern   1963                           
1                         1.FC Köln   1963                           
2                     1.FC Nürnberg   1963                           
3                  1.FC Saarbrücken   1963                           
4                 1.FC Union Berlin   2019                   47.88   
5                    1.FSV Mainz 05   2004                           
6                  Alemannia Aachen   1967                           
7                 Arminia Bielefeld   1970                           
8               Bayer 04 Leverkusen   1979                           
9                Bayer 05 Uerdingen   1975                           
10                Borussia Dortmund   1963                           
11         Borussia Mönchengladbach   1965                           
12             Borussia Neunkirchen   1964                           
13           Eintrac

In [19]:
# Turn the 'season' column into numbers; entries that cannot be parsed become NaN.
result["season"] = pd.to_numeric(result["season"], errors="coerce")

# Names of the clubs whose row in `result` belongs to a season later than 2004.
clubs_to_remove = result.loc[result["season"].gt(2004), "CLUB_NAME"]

# Keep only those rows of the full frame that belong to none of these clubs.
df_filtered = df.loc[~df["CLUB_NAME"].isin(clubs_to_remove)]

In [20]:
# Render the frame that remains after the clubs in `clubs_to_remove` were excluded.
df_filtered

Unnamed: 0,CLUB_NAME,AVG_AGE_JOINING,AVG_AGE_LEAVING,TOTAL_VALUE_JOINING_MIO,TOTAL_VALUE_LEAVING_MIO,EXPENSES_JOINING_MIO,REVENUE_LEAVING_MIO,season
1,1.FC Kaiserslautern,22.8,24.2,,,0.06,,1963
0,1.FC Kaiserslautern,24.7,26.7,,,,,1964
9,1.FC Kaiserslautern,21.5,26.8,,,,,1965
9,1.FC Kaiserslautern,25.8,25.8,,,,,1966
9,1.FC Kaiserslautern,21.3,26.4,,,,,1967
...,...,...,...,...,...,...,...,...
3,VfL Wolfsburg,21.8,23.8,125.45,41.7,76.18,22.7,2021
11,VfL Wolfsburg,22.3,24.6,78.6,73.45,35.75,23.7,2022
1,Wuppertaler SV Borussia,22.7,26.3,,,,,1972
0,Wuppertaler SV Borussia,21.4,23.0,,,0.21,,1973


In [23]:
# Keep only the rows in which both total-value columns hold an actual entry:
# neither missing (NaN/None) nor an empty string.
value_columns = ["TOTAL_VALUE_JOINING_MIO", "TOTAL_VALUE_LEAVING_MIO"]
values_present = df_filtered[value_columns].notna().all(axis=1)
values_non_empty = df_filtered[value_columns].ne("").all(axis=1)
df_filtered = df_filtered.loc[values_present & values_non_empty]

In [24]:
# Render the frame again now that rows lacking either total value have been dropped.
df_filtered 

Unnamed: 0,CLUB_NAME,AVG_AGE_JOINING,AVG_AGE_LEAVING,TOTAL_VALUE_JOINING_MIO,TOTAL_VALUE_LEAVING_MIO,EXPENSES_JOINING_MIO,REVENUE_LEAVING_MIO,season
17,1.FC Kaiserslautern,23.0,23.8,0.8,0.625,4.6,0.1,2010
17,1.FC Kaiserslautern,21.9,25.8,15.33,0.65,6.63,4.45,2011
17,1.FC Köln,23.5,25.8,0.85,0.1,7.15,0.75,2008
12,1.FC Köln,21.6,26.7,0.725,0.2,11.5,0.9,2009
15,1.FC Köln,22.3,25.0,1.13,0.8,4.6,0.05,2010
...,...,...,...,...,...,...,...,...
15,VfL Wolfsburg,23.6,23.8,31.75,146.03,36.0,0.8,2018
5,VfL Wolfsburg,22.8,24.0,177.6,155.75,34.8,14.3,2019
6,VfL Wolfsburg,22.4,24.9,53.53,28.5,22.0,11.1,2020
3,VfL Wolfsburg,21.8,23.8,125.45,41.7,76.18,22.7,2021
