In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the team table from matches.csv; the first CSV column is used as the row index.
matches = pd.read_csv("matches.csv", index_col=0)

In [3]:
# Preview the first rows of the loaded table.
matches.head()

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
4,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
2,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
3,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel


In [4]:
# (number of rows, number of columns)
matches.shape

(112, 13)

In [5]:
24 * 2 + 16 * 4 # row-count sanity check: 24 rows for each of the last 2 tournaments, plus presumably 16 rows for each of the 4 earlier ones

112

In [6]:
# matches = matches[matches['rk'] <= 16]

In [7]:
# matches

In [8]:
# Number of rows (tournament entries) per squad.
matches['squad'].value_counts()

squad
Italy               6
Portugal            6
Spain               6
Germany             6
Czechia             6
Sweden              6
France              6
England             5
Croatia             5
Russia              5
Netherlands         5
Switzerland         4
Türkiye             4
Denmark             4
Poland              4
Romania             3
Ukraine             3
Greece              3
Belgium             3
Austria             3
Slovakia            2
Wales               2
Rep. of Ireland     2
Hungary             2
Yugoslavia          1
Norway              1
Bulgaria            1
Latvia              1
N. Macedonia        1
Albania             1
Northern Ireland    1
Iceland             1
Scotland            1
Finland             1
Slovenia            1
Name: count, dtype: int64

In [9]:
# Column dtypes of the loaded table.
matches.dtypes

rk                   int64
squad               object
year                 int64
mp                 float64
w                  float64
d                  float64
l                  float64
gf                 float64
ga                 float64
gd                 float64
pts                float64
top team scorer     object
goalkeeper          object
dtype: object

In [10]:
# matches = matches.sample(frac=1) # shuffle dataframe using sample function

In [11]:
# Look at the first rows again (the shuffle in the preceding cell is commented out).
matches.head()

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
4,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
2,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
3,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel


In [12]:
# Renumber the rows 0..n-1. reset_index returns a new frame rather than
# modifying `matches`, so the result has to be assigned back to take effect.
matches = matches.reset_index(drop=True)

matches

Unnamed: 0,rk,squad,year,mp,w,d,l,gf,ga,gd,pts,top team scorer,goalkeeper
0,1,Italy,2021,7.0,5.0,2.0,0.0,13.0,4.0,9.0,17.0,"Lorenzo Insigne, Ciro Immobile... - 2",Gianluigi Donnarumma
1,2,England,2021,7.0,5.0,2.0,0.0,11.0,2.0,9.0,17.0,Harry Kane - 4,Jordan Pickford
2,3,Belgium,2021,5.0,4.0,0.0,1.0,9.0,3.0,6.0,12.0,Romelu Lukaku - 4,Thibaut Courtois
3,4,Spain,2021,6.0,2.0,4.0,0.0,13.0,6.0,7.0,10.0,Álvaro Morata - 3,Unai Simón
4,5,Denmark,2021,6.0,3.0,0.0,3.0,12.0,7.0,5.0,9.0,Kasper Dolberg - 3,Kasper Schmeichel
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,12,Belgium,2000,3.0,1.0,0.0,2.0,2.0,5.0,-3.0,3.0,"Émile Mpenza, Bart Goor - 1",Filip De Wilde
108,13,Slovenia,2000,3.0,0.0,2.0,1.0,4.0,5.0,-1.0,2.0,Zlatko Zahovič - 3,Mladen Dabanovič
109,14,Sweden,2000,3.0,0.0,1.0,2.0,2.0,4.0,-2.0,1.0,"Johan Mjällby, Henrik Larsson - 1",Magnus Hedman
110,15,Germany,2000,3.0,0.0,1.0,2.0,1.0,5.0,-4.0,1.0,Mehmet Scholl - 1,Oliver Kahn


In [13]:
# Collapse the per-tournament rows into a single row per squad.
# Match and goal tallies are summed; counting 'year' gives the number of
# tournament rows per squad (the resulting column keeps the name 'year').
summed_columns = ['mp', 'w', 'd', 'l', 'gf', 'ga', 'gd', 'pts']
aggregations = {column: 'sum' for column in summed_columns}
aggregations['year'] = 'count'

team_stats = matches.groupby('squad').agg(aggregations).reset_index()

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6


In [14]:
# Derive per-match rates by dividing aggregated totals by matches played ('mp').
rate_sources = {
    'avg_pts_per_match': 'pts',
    'avg_gd_per_match': 'gd',
    'win_rate': 'w',
}
for rate_column, total_column in rate_sources.items():
    team_stats[rate_column] = team_stats[total_column] / team_stats['mp']

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5


In [15]:
from collections import Counter

# Label definition: a squad is "top 4" by agreement between three per-match
# metrics rather than by any single one of them.
TOP_N = 4
ranking_metrics = ['avg_pts_per_match', 'avg_gd_per_match', 'win_rate']

# Leaders of each metric (same three lists as before, built from one expression).
top_4_teams0, top_4_teams1, top_4_teams2 = (
    team_stats.nlargest(TOP_N, metric)['squad'].tolist() for metric in ranking_metrics
)

# Count on how many of the three leaderboards each squad appears.
team_frequency = Counter(top_4_teams0 + top_4_teams1 + top_4_teams2)

# Order by number of leaderboards (descending), then alphabetically, and keep
# the first TOP_N. Squads present on all three boards sort first, so the
# selected squads are the same as "intersection, then fill up by frequency".
# Sorting instead of converting a set to a list makes the ORDER of result_team
# reproducible: iteration order of a set of strings can differ between kernel
# restarts because string hashing is randomised per process.
sorted_teams = sorted(team_frequency, key=lambda squad: (-team_frequency[squad], squad))
result_team = sorted_teams[:TOP_N]

result_team

['Italy', 'Spain', 'Belgium', 'England']

In [16]:
# Binary target: True for the squads listed in result_team, False otherwise.
top_4_mask = team_stats['squad'].isin(result_team)
team_stats['is_Top_4'] = top_4_mask

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate,is_Top_4
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2,False
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385,True
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0,False
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889,False
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783,False
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125,False
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5,True
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5,False


In [20]:
# Model inputs: aggregated totals, the tournament count ('year') and the
# per-match rates.
# NOTE(review): is_Top_4 was derived from the three rate columns listed here,
# so the model can read the label off its own inputs.
features = [
    'mp', 'w', 'd', 'l', 'gf', 'ga', 'gd', 'pts', 'year',
    'avg_pts_per_match', 'avg_gd_per_match', 'win_rate',
]
X = team_stats.loc[:, features]
y = team_stats.loc[:, 'is_Top_4']

In [21]:
# Scale the features
# NOTE(review): the scaler is fitted on all rows of X here, i.e. before the
# train/test split that follows, so held-out rows contribute to the scaling
# statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
# Apply PCA (disabled: PCA adds little here because the number of features is small)
# pca = PCA(0.95)  # Retain 95% of variance
# X_pca = pca.fit_transform(X_scaled)

# Print the number of components and explained variance ratio
# print(f"Number of components: {pca.n_components_}")
# print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

In [23]:
# Hold out 30% of the squads for evaluation; stratify=y keeps the share of
# top-4 squads similar in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [24]:
# Fit a random forest on the training part. class_weight='balanced' is set
# because only a few squads carry the positive label.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

In [25]:
# Make predictions
# Hard class predictions for the held-out squads only.
y_pred = clf.predict(X_test)

In [26]:
# Score the held-out predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.7272727272727273
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.70      0.82        10
        True       0.25      1.00      0.40         1

    accuracy                           0.73        11
   macro avg       0.62      0.85      0.61        11
weighted avg       0.93      0.73      0.79        11



In [27]:
# Feature importance
# Pair every feature name with the forest's importance score, most important first.
feature_importance = (
    pd.DataFrame({'feature': features, 'importance': clf.feature_importances_})
    .sort_values('importance', ascending=False)
)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
              feature  importance
9   avg_pts_per_match    0.207541
10   avg_gd_per_match    0.187500
6                  gd    0.154584
11           win_rate    0.113371
1                   w    0.093968
4                  gf    0.070941
7                 pts    0.065482
2                   d    0.036737
0                  mp    0.034420
5                  ga    0.017120
8                year    0.014329
3                   l    0.004006


In [28]:
# Print top 4 teams
# result_team is selected from three metrics (average points per match, average
# goal difference per match and win rate), so the label names all of them.
print("\nTop 4 Teams based on average points per match, average goal difference per match and win rate:")
print(result_team)


Top 4 Teams based on average points per match:
['Italy', 'Spain', 'Belgium', 'England']


In [29]:
# Predict probabilities for all teams
# Column 1 of predict_proba is taken as the probability of the positive class
# (assumes clf.classes_ is [False, True] — TODO confirm).
# X_scaled contains every squad, so rows the model was trained on are scored too.
team_stats['is_top_4_probability'] = clf.predict_proba(X_scaled)[:, 1]

team_stats

Unnamed: 0,squad,mp,w,d,l,gf,ga,gd,pts,year,avg_pts_per_match,avg_gd_per_match,win_rate,is_Top_4,is_top_4_probability
0,Albania,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False,0.0
1,Austria,10.0,2.0,2.0,6.0,7.0,12.0,-5.0,8.0,3,0.8,-0.5,0.2,False,0.0
2,Belgium,13.0,8.0,0.0,5.0,20.0,13.0,7.0,24.0,3,1.846154,0.538462,0.615385,True,0.73
3,Bulgaria,3.0,0.0,0.0,3.0,1.0,9.0,-8.0,0.0,1,0.0,-2.666667,0.0,False,0.0
4,Croatia,18.0,7.0,6.0,5.0,25.0,23.0,2.0,27.0,5,1.5,0.111111,0.388889,False,0.02
5,Czechia,23.0,10.0,2.0,11.0,29.0,29.0,0.0,32.0,6,1.391304,0.0,0.434783,False,0.03
6,Denmark,16.0,5.0,2.0,9.0,20.0,25.0,-5.0,17.0,4,1.0625,-0.3125,0.3125,False,0.0
7,England,22.0,11.0,7.0,4.0,35.0,21.0,14.0,40.0,5,1.818182,0.636364,0.5,True,0.69
8,Finland,3.0,1.0,0.0,2.0,1.0,3.0,-2.0,3.0,1,1.0,-0.666667,0.333333,False,0.0
9,France,28.0,14.0,7.0,7.0,44.0,34.0,10.0,49.0,6,1.75,0.357143,0.5,False,0.55


In [30]:
# Print top 10 teams by model prediction
top_10_by_model = team_stats.nlargest(10, 'is_top_4_probability')
print("\nTop 10 Teams based on model prediction:")
print(top_10_by_model[['squad', 'is_top_4_probability']])


Top 10 Teams based on model prediction:
          squad  is_top_4_probability
28        Spain                  0.93
14        Italy                  0.89
21     Portugal                  0.82
2       Belgium                  0.73
7       England                  0.69
17  Netherlands                  0.57
9        France                  0.55
10      Germany                  0.11
5       Czechia                  0.03
4       Croatia                  0.02


In [34]:
# Analyze misclassifications
# Rows whose hard prediction differs from the label. X_scaled holds every squad,
# so rows used for training are part of this comparison as well.
# Open question from the author: compare is_Top_4 with is_top_4_probability instead?
misclassified = team_stats[clf.predict(X_scaled) != team_stats['is_Top_4']]
print("\nMisclassified Teams:")
print(misclassified[['squad', 'is_Top_4', 'is_top_4_probability']])


Misclassified Teams:
          squad  is_Top_4  is_top_4_probability
9        France     False                  0.55
17  Netherlands     False                  0.57
21     Portugal     False                  0.82


In [35]:
# The eight squads with the highest predicted top-4 probability, best first.
top_8_teams_based_on_model = (
    team_stats
    .nlargest(8, 'is_top_4_probability')
    .loc[:, ['squad', 'is_top_4_probability']]
)
top_8_teams_based_on_model

Unnamed: 0,squad,is_top_4_probability
28,Spain,0.93
14,Italy,0.89
21,Portugal,0.82
2,Belgium,0.73
7,England,0.69
17,Netherlands,0.57
9,France,0.55
10,Germany,0.11


In [36]:
# Now we want to predict the winner of the next tournament
# based on the player stats from the 2022 World Cup and the Euro 2024 qualifiers, for the top 8 teams according to the model

In [38]:
# Load the per-player table from players.csv; the first CSV column is used as the row index.
players = pd.read_csv("players.csv", index_col=0)

players

Unnamed: 0,min,squad,g+a_1,crdr,starts,pk,g-pk,ast,g-pk_1,crdy,mp,g+a-pk,gls_1,gls,player,g+a,ast_1
0,13,Rep. of Ireland,0.00,0,0,0,0,0,0.00,0,1,0.00,0.00,0,Aaron Connolly,0,0.00
1,513,Scotland,0.00,0,6,0,0,0,0.00,1,6,0.00,0.00,0,Aaron Hickey,0,0.00
2,360,Australia,0.0,0,4,0,0,0,0.0,1,4,0.0,0.0,0,Aaron Mooy,0,0.0
3,266,Wales,0.0,0,3,0,0,0,0.0,1,3,0.0,0.0,0,Aaron Ramsey,0,0.0
4,139,Andorra,0.00,0,1,0,0,0,0.00,2,4,0.00,0.00,0,Aaron Sánchez,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,180,Slovakia,0.50,0,2,0,1,0,0.50,0,2,0.50,0.50,1,Ľubomír Šatka,1,0.00
2094,74,Slovenia,0.00,0,1,0,0,0,0.00,0,2,0.00,0.00,0,Žan Celar,0,0.00
2095,867,Slovenia,0.31,0,10,0,1,2,0.10,0,10,0.31,0.10,1,Žan Karničnik,3,0.21
2096,282,Slovenia,0.64,0,2,0,2,0,0.64,0,8,0.64,0.64,2,Žan Vipotnik,2,0.00


In [39]:
# Squad names of the eight shortlisted teams, in the order of the ranking above.
top_8_teams = list(top_8_teams_based_on_model['squad'])

top_8_teams

['Spain',
 'Italy',
 'Portugal',
 'Belgium',
 'England',
 'Netherlands',
 'France',
 'Germany']

In [126]:
# Keep only the players of the eight shortlisted squads.
# - .copy() makes the result an independent frame, so later column assignments
#   on `players` do not trigger pandas' SettingWithCopyWarning.
# - reset_index returns a new frame, so it is chained into the assignment;
#   called on its own line without assignment it would have no effect.
players = players[players['squad'].isin(top_8_teams)].copy().reset_index(drop=True)

players

Unnamed: 0,min,squad,g+a_1,crdr,starts,pk,g-pk,ast,g-pk_1,crdy,mp,g+a-pk,gls_1,gls,player,g+a,ast_1
0,483,France,0.37,0,5,0,1,1,0.19,1,6,0.37,0.19,1,Adrien Rabiot,2,0.19
1,45,Spain,0.00,0,0,0,0,0,0.00,0,1,0.00,0.00,0,Aleix García,0,0.00
2,126,Spain,0.00,0,1,0,0,0,0.0,0,4,0.00,0.00,0,Alejandro Balde,0,0.00
3,298,Italy,0.00,0,3,0,0,0,0.00,0,4,0.00,0.00,0,Alessandro Bastoni,0,0.00
4,90,Italy,0.00,0,1,0,0,0,0.00,1,1,0.00,0.00,0,Alessandro Buongiorno,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,1,Germany,0.00,0,0,0,0,0,0.0,0,1,0.00,0.00,0,Youssoufa Moukoko,0,0.00
283,7,Belgium,0.00,0,0,0,0,0,0.00,0,1,0.00,0.00,0,Zeno Debast,0,0.00
284,90,Spain,1.00,0,1,0,0,1,0.00,0,1,1.00,0.00,0,Álex Grimaldo,1,1.00
285,185,Spain,1.95,0,1,0,3,1,1.46,0,4,1.95,1.46,3,Álvaro Morata,4,0.49


In [127]:
# Number of player rows per remaining squad.
players['squad'].value_counts()

squad
Spain          48
Netherlands    41
Italy          40
Belgium        37
France         35
Portugal       34
England        32
Germany        20
Name: count, dtype: int64

In [128]:
#------------------------------------------------------------------------------------------#

In [129]:
# Player statistics that are converted to numbers and later aggregated per squad.
# Every column is listed exactly once: a repeated name would show up as a
# duplicated feature column when this list is reused to build the model inputs.
# NOTE(review): 'g-pk' used to appear twice; the second entry may have been
# meant as 'g-pk_1' — confirm.
numeric_columns = ['g+a', 'g+a_1', 'ast', 'ast_1', 'g-pk', 'crdy', 'mp', 'g+a-pk', 'gls_1', 'gls']

# errors='coerce' turns values that cannot be parsed into NaN.
# assign() returns a new frame, so nothing is written into a filtered slice
# of the original player table.
players = players.assign(
    **{col: pd.to_numeric(players[col], errors='coerce') for col in numeric_columns}
)

In [130]:
# Average every player statistic within each squad.
# The aggregation spec is built from numeric_columns instead of being typed out
# a second time; dict.fromkeys keeps the column order and collapses any
# repeated name, so each statistic is aggregated exactly once.
team_stats_based_on_players = players.groupby('squad').agg(dict.fromkeys(numeric_columns, 'mean'))

team_stats_based_on_players

Unnamed: 0_level_0,g+a,g+a_1,ast,ast_1,g-pk,crdy,mp,g+a-pk,gls_1,gls
squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Belgium,0.432432,0.136486,0.324324,0.093243,0.108108,0.243243,2.837838,0.136486,0.042973,0.108108
England,0.75,0.390313,0.34375,0.206875,0.375,0.125,3.15625,0.383437,0.183438,0.40625
France,0.914286,0.346571,0.4,0.110571,0.457143,0.228571,3.6,0.337714,0.236,0.514286
Germany,0.55,0.418,0.25,0.1585,0.25,0.15,2.4,0.394,0.2595,0.3
Italy,0.725,0.3655,0.325,0.15075,0.4,0.35,3.2,0.3655,0.21475,0.4
Netherlands,0.731707,0.385122,0.365854,0.181951,0.365854,0.414634,3.243902,0.385122,0.203415,0.365854
Portugal,0.882353,0.355,0.411765,0.133235,0.411765,0.205882,3.176471,0.338529,0.221765,0.470588
Spain,0.75,0.448958,0.333333,0.149167,0.395833,0.1875,2.8125,0.440208,0.299792,0.416667


In [131]:
# Count, for every squad, in how many statistic columns it holds the highest value:
# rank(...) == 1 marks the best squad(s) of a column, .sum(axis=1) counts those
# marks per row.
# - An existing 'better_count' column is left out of the ranking, so re-running
#   this cell does not rank the previous count as if it were a statistic.
# - method='min' gives every squad tied for the highest value rank 1; with the
#   default 'average' two tied leaders would both get 1.5 and neither would count.
# NOTE(review): every column is treated as higher-is-better, including 'crdy'
# and 'mp' — confirm that this is intended.
stat_columns = team_stats_based_on_players.drop(columns=['better_count'], errors='ignore')
team_stats_based_on_players['better_count'] = (
    stat_columns.rank(ascending=False, method='min') == 1
).sum(axis=1)

team_stats_based_on_players

Unnamed: 0_level_0,g+a,g+a_1,ast,ast_1,g-pk,crdy,mp,g+a-pk,gls_1,gls,better_count
squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Belgium,0.432432,0.136486,0.324324,0.093243,0.108108,0.243243,2.837838,0.136486,0.042973,0.108108,0
England,0.75,0.390313,0.34375,0.206875,0.375,0.125,3.15625,0.383437,0.183438,0.40625,1
France,0.914286,0.346571,0.4,0.110571,0.457143,0.228571,3.6,0.337714,0.236,0.514286,4
Germany,0.55,0.418,0.25,0.1585,0.25,0.15,2.4,0.394,0.2595,0.3,0
Italy,0.725,0.3655,0.325,0.15075,0.4,0.35,3.2,0.3655,0.21475,0.4,0
Netherlands,0.731707,0.385122,0.365854,0.181951,0.365854,0.414634,3.243902,0.385122,0.203415,0.365854,1
Portugal,0.882353,0.355,0.411765,0.133235,0.411765,0.205882,3.176471,0.338529,0.221765,0.470588,1
Spain,0.75,0.448958,0.333333,0.149167,0.395833,0.1875,2.8125,0.440208,0.299792,0.416667,3


In [132]:
# The two largest better_count values; the index of that Series carries the
# squad names, so converting the index to a list yields just the countries.
best_counts = team_stats_based_on_players['better_count'].nlargest(n=2)
top_2_teams = list(best_counts.index)

top_2_teams

['France', 'Spain']

In [133]:
# 'squad' is currently the index; turn it back into a regular column.
# NOTE(review): running this cell a second time would reset the new integer
# index as well and add an extra 'index' column.

team_stats_based_on_players = team_stats_based_on_players.reset_index()

team_stats_based_on_players

Unnamed: 0,squad,g+a,g+a_1,ast,ast_1,g-pk,crdy,mp,g+a-pk,gls_1,gls,better_count
0,Belgium,0.432432,0.136486,0.324324,0.093243,0.108108,0.243243,2.837838,0.136486,0.042973,0.108108,0
1,England,0.75,0.390313,0.34375,0.206875,0.375,0.125,3.15625,0.383437,0.183438,0.40625,1
2,France,0.914286,0.346571,0.4,0.110571,0.457143,0.228571,3.6,0.337714,0.236,0.514286,4
3,Germany,0.55,0.418,0.25,0.1585,0.25,0.15,2.4,0.394,0.2595,0.3,0
4,Italy,0.725,0.3655,0.325,0.15075,0.4,0.35,3.2,0.3655,0.21475,0.4,0
5,Netherlands,0.731707,0.385122,0.365854,0.181951,0.365854,0.414634,3.243902,0.385122,0.203415,0.365854,1
6,Portugal,0.882353,0.355,0.411765,0.133235,0.411765,0.205882,3.176471,0.338529,0.221765,0.470588,1
7,Spain,0.75,0.448958,0.333333,0.149167,0.395833,0.1875,2.8125,0.440208,0.299792,0.416667,3


In [134]:
# Binary target: True for the squads listed in top_2_teams, False otherwise.
top_2_mask = team_stats_based_on_players['squad'].isin(top_2_teams)
team_stats_based_on_players['is_Top_2'] = top_2_mask

team_stats_based_on_players

Unnamed: 0,squad,g+a,g+a_1,ast,ast_1,g-pk,crdy,mp,g+a-pk,gls_1,gls,better_count,is_Top_2
0,Belgium,0.432432,0.136486,0.324324,0.093243,0.108108,0.243243,2.837838,0.136486,0.042973,0.108108,0,False
1,England,0.75,0.390313,0.34375,0.206875,0.375,0.125,3.15625,0.383437,0.183438,0.40625,1,False
2,France,0.914286,0.346571,0.4,0.110571,0.457143,0.228571,3.6,0.337714,0.236,0.514286,4,True
3,Germany,0.55,0.418,0.25,0.1585,0.25,0.15,2.4,0.394,0.2595,0.3,0,False
4,Italy,0.725,0.3655,0.325,0.15075,0.4,0.35,3.2,0.3655,0.21475,0.4,0,False
5,Netherlands,0.731707,0.385122,0.365854,0.181951,0.365854,0.414634,3.243902,0.385122,0.203415,0.365854,1,False
6,Portugal,0.882353,0.355,0.411765,0.133235,0.411765,0.205882,3.176471,0.338529,0.221765,0.470588,1,False
7,Spain,0.75,0.448958,0.333333,0.149167,0.395833,0.1875,2.8125,0.440208,0.299792,0.416667,3,True


In [64]:
# for col in numeric_columns:
    # team_stats_based_on_players['high_scoring'] += (team_stats_based_on_players[col] > team_stats_based_on_players[col].mean()).astype(int)

# team_stats_based_on_players

In [141]:
# Model inputs: the aggregated player statistics plus the derived better_count.
# dict.fromkeys drops any repeated name in numeric_columns while keeping the
# order, so X never contains the same column twice.
# NOTE(review): is_Top_2 is defined from better_count, so this feature leaks
# the label into the inputs.
features = list(dict.fromkeys(numeric_columns)) + ['better_count']

# Define features and target variable
X = team_stats_based_on_players[features]
y = team_stats_based_on_players['is_Top_2']

In [142]:
# Scale the features
# NOTE(review): `scaler` and `X_scaled` replace the objects of the same name
# created for the first model; the scaler is again fitted on all rows before
# the split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [165]:
# Hold out 20% of the squads. stratify=y mirrors the split used for the first
# model and keeps the share of is_Top_2 squads as similar as the sample size
# allows; without it the held-out part can consist of negative rows only.
# NOTE(review): with only a handful of squads this evaluation stays very noisy.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [172]:
# Fit a random forest on the training part of the squad-level player statistics.
# NOTE(review): unlike the first classifier, no class_weight is set here.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [173]:
# Make predictions
# Hard class predictions for the held-out squads only; overwrites the y_pred
# of the first model.
y_pred = model.predict(X_test)

In [174]:
# Score the held-out predictions of the second model against the true labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

