In [66]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [67]:
matches = pd.read_csv("matches.csv", index_col=0)

In [68]:
matches.head()

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
4,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
2,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
3,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel


In [69]:
matches.shape

(112, 13)

In [70]:
24 * 2 + 16 * 4 # 2 * 24 (die letzten 2 Jahre), 4 * 16 ()

112

In [71]:
# matches = matches[matches['rk'] <= 16]

In [72]:
# matches

In [73]:
matches['squad'].value_counts()

squad
Italy               6
Portugal            6
Spain               6
Germany             6
Czechia             6
Sweden              6
France              6
England             5
Croatia             5
Russia              5
Netherlands         5
Switzerland         4
Türkiye             4
Denmark             4
Poland              4
Romania             3
Ukraine             3
Greece              3
Belgium             3
Austria             3
Slovakia            2
Wales               2
Rep. of Ireland     2
Hungary             2
Yugoslavia          1
Norway              1
Bulgaria            1
Latvia              1
N. Macedonia        1
Albania             1
Northern Ireland    1
Iceland             1
Scotland            1
Finland             1
Slovenia            1
Name: count, dtype: int64

In [74]:
matches.dtypes

rk                   int64
squad               object
year                 int64
mp                 float64
w                  float64
d                  float64
l                  float64
gf                 float64
ga                 float64
gd                 float64
pts                float64
top team scorer     object
goalkeeper          object
dtype: object

In [75]:
# matches = matches.sample(frac=1) # shuffle dataframe using sample function

In [76]:
matches.head()

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
4,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
2,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
3,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel


In [77]:
matches.reset_index(drop=True)

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
2,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
3,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
4,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,12,Belgium,2000,3.0,1.0,0.0,2.0,2.0,5.0,-3.0,3.0,"Émile Mpenza, Bart Goor - 1",Filip De Wilde
108,13,Slovenia,2000,3.0,0.0,2.0,1.0,4.0,5.0,-1.0,2.0,Zlatko Zahovič - 3,Mladen Dabanovič
109,14,Sweden,2000,3.0,0.0,1.0,2.0,2.0,4.0,-2.0,1.0,"Johan Mjällby, Henrik Larsson - 1",Magnus Hedman
110,15,Germany,2000,3.0,0.0,1.0,2.0,1.0,5.0,-4.0,1.0,Mehmet Scholl - 1,Oliver Kahn


In [78]:
# Aggregate data for each team across all tournaments
team_stats = matches.groupby('squad').agg({
    'mp': 'sum',
    'w': 'sum',
    'd': 'sum',
    'l': 'sum',
    'gf': 'sum',
    'ga': 'sum',
    'gd': 'sum',
    'pts': 'sum',
    'year': 'count'  # Number of tournaments participated
}).reset_index()

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6


In [79]:
# Calculate additional features
team_stats['avg_pts_per_match'] = team_stats['pts'] / team_stats['mp']
team_stats['avg_gd_per_match'] = team_stats['gd'] / team_stats['mp']
team_stats['win_rate'] = team_stats['w'] / team_stats['mp']

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5


In [80]:
from collections import Counter

# Sort teams by average points per match and select top 4
top_4_teams0 = team_stats.nlargest(4, 'avg_pts_per_match')['squad'].tolist()
top_4_teams1 = team_stats.nlargest(4, 'avg_gd_per_match')['squad'].tolist()
top_4_teams2 = team_stats.nlargest(4, 'win_rate')['squad'].tolist()

# Find the intersection of the three lists
result_team = list(set(top_4_teams0) & set(top_4_teams1) & set(top_4_teams2))

# If result_team has fewer than 4 teams, add more teams based on frequency
if len(result_team) < 4:
    # Combine all teams from the three lists into one list
    all_top_teams = top_4_teams0 + top_4_teams1 + top_4_teams2
    # Count the frequency of each team
    team_frequency = Counter(all_top_teams)
    
    # Sort teams by frequency and then alphabetically
    sorted_teams = sorted(team_frequency.keys(), key=lambda x: (-team_frequency[x], x))
    
    # Add teams to result_team until we have at least 4 teams
    for team in sorted_teams:
        if team not in result_team:
            result_team.append(team)
        if len(result_team) >= 4:
            break
            
result_team

['Spain', 'Italy', 'Belgium', 'England']

In [81]:
# Create target variable
team_stats['is_Top_4'] = team_stats['squad'].isin(result_team)

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate,is_Top_4
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2,False
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385,True
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0,False
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889,False
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783,False
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125,False
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5,True
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5,False


In [82]:
# Select features for the model
features = ['mp', 'w', 'd', 'l', 'gf', 'ga', 'gd', 'pts', 'year', 'avg_pts_per_match', 'avg_gd_per_match', 'win_rate']
X = team_stats[features]
y = team_stats['is_Top_4']

In [125]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [127]:
# Apply PCA (PCA macht nicht viel Sinn, da Anzahl feature nicht sehr hoch ist)
# pca = PCA(0.95)  # Retain 95% of variance
# X_pca = pca.fit_transform(X_scaled)

# Print the number of components and explained variance ratio
# print(f"Number of components: {pca.n_components_}")
# print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

Number of components: 4
Explained variance ratio: [0.74817063 0.13140121 0.06676795 0.02800426]


In [128]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

In [129]:
# Train the model
clf = RandomForestClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

In [130]:
# Make predictions
y_pred = clf.predict(X_test)

In [131]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.7272727272727273
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.70      0.82        10
        True       0.25      1.00      0.40         1

    accuracy                           0.73        11
   macro avg       0.62      0.85      0.61        11
weighted avg       0.93      0.73      0.79        11



In [116]:
# Feature importance
feature_importance = pd.DataFrame({'feature': features, 'importance': clf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
              feature  importance
9   avg_pts_per_match    0.207541
10   avg_gd_per_match    0.187500
6                  gd    0.154584
11           win_rate    0.113371
1                   w    0.093968
4                  gf    0.070941
7                 pts    0.065482
2                   d    0.036737
0                  mp    0.034420
5                  ga    0.017120
8                year    0.014329
3                   l    0.004006


In [98]:
# Print top 4 teams
print("\nTop 4 Teams based on average points per match:")
print(result_team)


Top 4 Teams based on average points per match:
['Spain', 'Italy', 'Belgium', 'England']


In [99]:
# Predict probabilities for all teams
team_stats['is_top_4_probability'] = clf.predict_proba(X_scaled)[:, 1]

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate,is_Top_4,is_top_4_probability
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False,0.0
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2,False,0.0
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385,True,0.73
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0,False,0.0
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889,False,0.02
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783,False,0.03
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125,False,0.0
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5,True,0.69
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False,0.0
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5,False,0.55


In [100]:
# Print top 10 teams by model prediction
print("\nTop 10 Teams based on model prediction:")
print(team_stats.nlargest(10, 'is_top_4_probability')[['squad', 'is_top_4_probability']])


Top 10 Teams based on model prediction:
          squad  is_top_4_probability
28        Spain                  0.93
14        Italy                  0.89
21     Portugal                  0.82
2       Belgium                  0.73
7       England                  0.69
17  Netherlands                  0.57
9        France                  0.55
10      Germany                  0.11
5       Czechia                  0.03
4       Croatia                  0.02


In [101]:
# Analyze misclassifications
misclassified = team_stats[clf.predict(X_scaled) != team_stats['is_Top_4']] # team_stats['is_Top_4'] != team_stats['is_top_4_probability']
print("\nMisclassified Teams:")
print(misclassified[['squad', 'is_Top_4', 'is_top_4_probability']])


Misclassified Teams:
          squad  is_Top_4  is_top_4_probability
9        France     False                  0.55
17  Netherlands     False                  0.57
21     Portugal     False                  0.82


In [102]:
top_8_teams_based_on_model = team_stats.nlargest(10, 'is_top_4_probability')[['squad', 'is_top_4_probability']]
top_8_teams_based_on_model

Unnamed: 0,squad,is_top_4_probability
28,Spain,0.93
14,Italy,0.89
21,Portugal,0.82
2,Belgium,0.73
7,England,0.69
17,Netherlands,0.57
9,France,0.55
10,Germany,0.11
5,Czechia,0.03
4,Croatia,0.02


In [103]:
# Now we want to predict the winner of the next tournement 
# based on the player stats from WC 2022 and EM Qualification 2024 for the top_8_team based on the model

In [60]:
players = pd.read_csv("players.csv", index_col=0)
players

Unnamed: 0,min,squad,g+a_1,crdr,starts,pk,g-pk,ast,g-pk_1,crdy,mp,g+a-pk,gls_1,gls,player,g+a,ast_1
0,13,Rep. of Ireland,0.00,0,0,0,0,0,0.00,0,1,0.00,0.00,0,Aaron Connolly,0,0.00
1,513,Scotland,0.00,0,6,0,0,0,0.00,1,6,0.00,0.00,0,Aaron Hickey,0,0.00
2,360,Australia,0.0,0,4,0,0,0,0.0,1,4,0.0,0.0,0,Aaron Mooy,0,0.0
3,266,Wales,0.0,0,3,0,0,0,0.0,1,3,0.0,0.0,0,Aaron Ramsey,0,0.0
4,139,Andorra,0.00,0,1,0,0,0,0.00,2,4,0.00,0.00,0,Aaron Sánchez,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,180,Slovakia,0.50,0,2,0,1,0,0.50,0,2,0.50,0.50,1,Ľubomír Šatka,1,0.00
2094,74,Slovenia,0.00,0,1,0,0,0,0.00,0,2,0.00,0.00,0,Žan Celar,0,0.00
2095,867,Slovenia,0.31,0,10,0,1,2,0.10,0,10,0.31,0.10,1,Žan Karničnik,3,0.21
2096,282,Slovenia,0.64,0,2,0,2,0,0.64,0,8,0.64,0.64,2,Žan Vipotnik,2,0.00


In [139]:
top_8_teams = top_8_teams_based_on_model['squad'].tolist()
top_8_teams

['Spain',
 'Italy',
 'Portugal',
 'Belgium',
 'England',
 'Netherlands',
 'France',
 'Germany',
 'Czechia',
 'Croatia']

In [146]:
players = players[players['squad'].isin(top_8_teams)]
players.reset_index(drop=True)


Unnamed: 0,squad,g+a_1,crdr,starts,pk,g-pk,ast,g-pk_1,crdy,mp,g+a-pk,gls_1,gls,g+a,ast_1,player_score
0,Czechia,0.00,0,5.0,0,0.0,0.0,0.00,0,8.0,0.00,0.00,0,0.0,0.00,0.8
1,France,0.37,0,5.0,0,1.0,1.0,0.19,1,6.0,0.37,0.19,1,2.0,0.19,4.1
2,Spain,0.00,0,0.0,0,0.0,0.0,0.00,0,1.0,0.00,0.00,0,0.0,0.00,0.1
3,Spain,0.0,0,1.0,0,0.0,0.0,0.0,0,4.0,0.0,0.0,0,0.0,0.0,0.4
4,Italy,0.00,0,3.0,0,0.0,0.0,0.00,0,4.0,0.00,0.00,0,0.0,0.00,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,Germany,0.0,0,0.0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,0.1
344,Belgium,0.00,0,0.0,0,0.0,0.0,0.00,0,1.0,0.00,0.00,0,0.0,0.00,0.1
345,Spain,1.00,0,1.0,0,0.0,1.0,0.00,0,1.0,1.00,0.00,0,1.0,1.00,1.8
346,Spain,1.95,0,1.0,0,3.0,1.0,1.46,0,4.0,1.95,1.46,3,4.0,0.49,7.5


In [62]:
# Konvertieren Sie relevante Spalten in numerische Werte
numeric_columns = ['g+a', 'g-pk', 'ast', 'mp']
for col in numeric_columns:
    players[col] = pd.to_numeric(players[col], errors='coerce')

# 1. Berechnen Sie einen Gesamtwert für jeden Spieler
players['player_score'] = (
    players['g+a'].fillna(0) +  # Tore + Assists
    players['g-pk'].fillna(0) * 0.8 +  # Tore ohne Elfmeter (leicht geringer gewichtet)
    players['ast'].fillna(0) * 0.7 +  # Assists (etwas geringer gewichtet)
    players['mp'].fillna(0) * 0.1  # Gespielte Spiele (geringer gewichtet)
)

# 2. Aggregieren Sie die Werte für jedes Team
team_scores = players.groupby('squad')['player_score'].sum().sort_values(ascending=False)

# 3. Wählen Sie die beiden besten Teams aus
top_2_teams = team_scores.head(2) # Maybe use all 8 squads and then calculate how many squad_player are higher than others?

print("Die zwei besten Teams basierend auf den Spielerstatistiken:")
print(top_2_teams)

# Filtern Sie die Spielerdaten für diese beiden Teams
top_2_players = players[players['squad'].isin(top_2_teams.index)]

# Sortieren Sie die Spieler nach ihrem Score
top_2_players = top_2_players.sort_values('player_score', ascending=False)

print("\nDie Top-Spieler der beiden besten Teams:")
print(top_2_players[['squad', 'player', 'player_score', 'g+a', 'g-pk', 'ast', 'mp']].head(10))

Die zwei besten Teams basierend auf den Spielerstatistiken:
squad
Spain     75.9
France    67.2
Name: player_score, dtype: float64

Die Top-Spieler der beiden besten Teams:
       squad             player  player_score  g+a  g-pk  ast  mp
1119  France      Kylian Mbappé          16.9   10     6    2   7
963    Spain             Joselu          11.3    6     4    2   7
1534  France     Olivier Giroud           7.8    4     4    0   6
2077   Spain      Álvaro Morata           7.5    4     3    1   4
175   France  Antoine Griezmann           5.8    3     0    3   7
1879  France     Theo Hernández           5.8    3     1    2   6
1374   Spain    Mikel Oyarzabal           5.7    3     1    2   5
942   France    Jonathan Clauss           5.5    3     1    2   3
1553  France    Ousmane Dembélé           4.1    2     0    2   7
1373   Spain       Mikel Merino           4.1    2     1    1   6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players[col] = pd.to_numeric(players[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players['player_score'] = (


In [63]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Konvertieren Sie relevante Spalten in numerische Werte
numeric_columns = ['g+a', 'g-pk', 'ast', 'mp', 'starts', 'min']
for col in numeric_columns:
    players[col] = pd.to_numeric(players[col], errors='coerce')

# Füllen Sie NaN-Werte mit 0
players[numeric_columns] = players[numeric_columns].fillna(0)

# Gruppieren Sie die Daten nach Teams und berechnen Sie den Durchschnitt der Statistiken
team_stats = players.groupby('squad')[numeric_columns].mean()

# Erstellen Sie ein Klassenlabel basierend auf den Top-8-Teams
top_8_teams = set(top_8_teams_based_on_model['squad'])
team_stats['is_top_8'] = team_stats.index.isin(top_8_teams).astype(int)

# Teilen Sie die Daten in Trainings- und Testsets
X = team_stats[numeric_columns]
y = team_stats['is_top_8']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Standardisieren Sie die Daten
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Trainieren Sie den Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train_scaled, y_train)

# Berechnen Sie die Wahrscheinlichkeiten für alle Teams
team_probabilities = rf_classifier.predict_proba(scaler.transform(X))

# Erstellen Sie ein DataFrame mit den Wahrscheinlichkeiten
team_predictions = pd.DataFrame({
    'squad': team_stats.index,
    'probability': team_probabilities[:, 1] if team_probabilities.shape[1] > 1 else team_probabilities[:, 0]
})

# Sortieren Sie die Teams nach der Wahrscheinlichkeit
top_teams = team_predictions.sort_values('probability', ascending=False)

print("Top Teams basierend auf dem Random Forest Classifier:")
print(top_teams.head(8))

# Wählen Sie die besten 2 Teams aus
best_2_teams = top_teams.head(2)['squad'].tolist()

# Filtern Sie die Spielerdaten für diese beiden Teams
top_2_players = players[players['squad'].isin(best_2_teams)]

# Sortieren Sie die Spieler nach ihren Statistiken
top_2_players['player_score'] = (
    top_2_players['g+a'] +
    top_2_players['g-pk'] * 0.8 +
    top_2_players['ast'] * 0.7 +
    top_2_players['mp'] * 0.1
)

top_2_players = top_2_players.sort_values('player_score', ascending=False)

print("\nDie Top-Spieler der beiden besten Teams:")
print(top_2_players[['squad', 'player', 'player_score', 'g+a', 'g-pk', 'ast', 'mp']].head(10))

Top Teams basierend auf dem Random Forest Classifier:
         squad  probability
0      Belgium          1.0
1      Croatia          1.0
2      Czechia          1.0
3      England          1.0
4       France          1.0
5      Germany          1.0
6        Italy          1.0
7  Netherlands          1.0

Die Top-Spieler der beiden besten Teams:
        squad           player  player_score  g+a  g-pk  ast  mp
482   Belgium   Dodi Lukebakio           9.2    5     2    3   5
839   Croatia     Ivan Perišić           7.6    4     1    3   7
933   Belgium   Johan Bakayoko           5.9    3     1    2   7
1396  Croatia     Mislav Oršić           5.8    3     1    2   6
142   Croatia  Andrej Kramarić           4.3    2     2    0   7
1543  Belgium     Orel Mangala           4.2    2     0    2   8
213   Belgium    Arthur Theate           4.2    2     0    2   8
1264  Croatia     Marko Livaja           4.1    2     1    1   6
1181  Croatia    Luka Ivanušec           4.0    2     1    1   5
11

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players[col] = pd.to_numeric(players[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players[numeric_columns] = players[numeric_columns].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_2_players['player_score'] = (


In [147]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


# Überprüfen und Konvertieren der Datentypen
numeric_columns = players.select_dtypes(include=[np.number]).columns
players[numeric_columns] = players[numeric_columns].astype(float)

# Aggregiere Spielerdaten auf Teamebene
team_stats = players.groupby('squad').agg({
    col: 'sum' if col != 'squad' else 'first' for col in players.columns
}).reset_index()

# Definiere Features und Zielvariable
X = team_stats.drop(['squad', 'player_score'], axis=1)
y = team_stats['player_score']  # Wir nutzen player_score als Proxy für Teamstärke

# Skalieren der Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Modell trainieren
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Feature Importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("Top 10 wichtigste Features:")
print(feature_importance.head(10))

# Vorhersage für jedes Team
team_stats['predicted_score'] = model.predict(X_scaled)

# Sortiere Teams nach vorhergesagtem Score
sorted_teams = team_stats.sort_values('predicted_score', ascending=False)

print("\nVorhergesagte Turnier-Rangliste:")
for i, (index, row) in enumerate(sorted_teams.iterrows(), 1):
    print(f"{i}. {row['squad']}: {row['predicted_score']:.2f}")

print(f"\nVorhergesagter Turniersieger: {sorted_teams.iloc[0]['squad']}")

ValueError: cannot insert squad, already exists