In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Load the data (Assuming 'corrected_euro_team_data.csv' is the data file)
team_data = pd.read_csv("corrected_euro_team_data.csv")

# Create score differential column
team_data['score_differential'] = team_data['team_score'] - team_data['opponent_score']

# Reorder columns to move score_differential beside team_score and opponent_score
columns_order = team_data.columns.tolist()
columns_order.remove('score_differential')
insert_position = columns_order.index('opponent_score') + 1
columns_order.insert(insert_position, 'score_differential')
team_data = team_data[columns_order]

# Convert date column to datetime
team_data['date'] = pd.to_datetime(team_data['date'])

# Extract month, day of the week, and year from date
team_data['month'] = team_data['date'].dt.month
team_data['day_of_week'] = team_data['date'].dt.dayofweek
team_data['year'] = team_data['date'].dt.year

# Drop the original date column
team_data.drop(columns=['date'], inplace=True)

# One-Hot Encode categorical columns
team_data_encoded = pd.get_dummies(team_data, columns=['source', 'team_code', 'opponent_code'])

# Assuming 'team_rank' column indicates final position, create a 'top_4' column
# Here we simulate it. You should replace this with actual logic.
team_data_encoded['team_rank'] = np.random.randint(1, 21, team_data_encoded.shape[0])
team_data_encoded['target_top_4'] = (team_data_encoded['team_rank'] <= 4).astype(int)

# Drop team_rank if present to prevent data leakage
team_data_encoded.drop(columns=['team_rank'], inplace=True)

# Scale the data
scaler = StandardScaler()
features = team_data_encoded.drop(columns=['target_top_4'])
scaled_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Combine scaled features with the target variable
data_prepared = pd.concat([scaled_features, team_data_encoded['target_top_4']], axis=1)

# Define features and target
X = data_prepared.drop(columns=['target_top_4'])
y = data_prepared['target_top_4']

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model (Gradient Boosting Classifier as an example)
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate the model
gb_predictions = gb_model.predict(X_test)
print("Gradient Boosting Classifier Report:")
print(classification_report(y_test, gb_predictions))

# Predict top 4 using the trained model
overall_predictions = gb_model.predict(X)

# Add predictions to the original data
team_data_encoded['top_4_prediction'] = overall_predictions

# Extract top 4 teams based on prediction
top_4_teams = team_data_encoded[team_data_encoded['top_4_prediction'] == 1]

print("Predicted Top 4 Teams:")
print(top_4_teams)


Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

           0       0.73      0.96      0.83       165
           1       0.22      0.03      0.06        59

    accuracy                           0.71       224
   macro avg       0.48      0.50      0.45       224
weighted avg       0.60      0.71      0.63       224

Predicted Top 4 Teams:
      id_match  home_or_away  team_xG  team_odds  opponent_xG  opponent_odds  \
14     2034448             0     -2.2      48468         -3.0          70000   
27     2040794             0     -3.0      70000         -3.0          70000   
33     2036399             1     -3.0      70000         -3.0          70000   
38     2034802             1     -3.0      70000         -3.0          70000   
66     2039097             1     -1.2       6048          2.5            398   
67     2034478             1     -1.2       6048          1.2            401   
82     2036346             1     -3.0      70000     

In [2]:
team_data

Unnamed: 0,id_match,source,home_or_away,team_code,team_xG,team_odds,opponent_code,opponent_xG,opponent_odds,team_score,...,score_differential,match_attendance,stadium_latitude,stadium_longitude,stadium_pitch_length,stadium_pitch_width,target_top_4,month,day_of_week,year
0,2036436,Qualifiers,1,ALB,-2.2,48468,CZE,-1.4,15861,3,...,3,20917,41.318403,19.823953,105,68,0,10,3,2023
1,2040328,Friendlies,1,ALB,-2.2,48468,CHI,-3.0,70000,0,...,-3,21425,44.794981,10.338325,105,68,0,3,4,2024
2,2039019,Friendlies,1,ALB,-2.2,48468,BUL,-3.0,70000,2,...,2,17232,41.318403,19.823953,105,68,0,10,1,2023
3,2034801,Friendlies,1,ALB,-2.2,48468,EST,-3.0,70000,0,...,0,21425,41.318403,19.823953,105,68,0,6,0,2022
4,2034569,Nations,1,ALB,-2.2,48468,ISL,-3.0,70000,1,...,0,8800,41.318403,19.823953,105,68,0,9,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,2036412,Qualifiers,0,WAL,-3.0,70000,LVA,-3.0,70000,2,...,2,6464,56.961378,24.116383,105,68,0,9,0,2023
1116,2034452,Nations,1,WAL,-3.0,70000,NED,0.1,1553,1,...,-1,23395,51.474537,-3.200818,105,68,0,6,2,2022
1117,2036366,Qualifiers,0,WAL,-3.0,70000,TUR,1.7,5515,0,...,-2,28766,41.228006,36.457621,105,68,0,6,0,2023
1118,2034553,Nations,1,WAL,-3.0,70000,POL,-0.1,17538,0,...,-1,31520,51.474537,-3.200818,105,68,0,9,6,2022


In [7]:
!pip install --upgrade scikit-learn imbalanced-learn pandas numpy

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Load the data
team_data = pd.read_csv("corrected_euro_team_data.csv")

# Create score differential column
team_data['score_differential'] = team_data['team_score'] - team_data['opponent_score']

# Reorder columns
columns_order = team_data.columns.tolist()
columns_order.remove('score_differential')
insert_position = columns_order.index('opponent_score') + 1
columns_order.insert(insert_position, 'score_differential')
team_data = team_data[columns_order]

# Convert date column to datetime
team_data['date'] = pd.to_datetime(team_data['date'])

# Extract month, day of the week, and year from date
team_data['month'] = team_data['date'].dt.month
team_data['day_of_week'] = team_data['date'].dt.dayofweek
team_data['year'] = team_data['date'].dt.year

# Drop the original date column
team_data.drop(columns=['date'], inplace=True)

# One-Hot Encode categorical columns
team_data_encoded = pd.get_dummies(team_data, columns=['source', 'team_code', 'opponent_code'])

# Create a 'top_4' column
team_data_encoded['team_rank'] = np.random.randint(1, 21, team_data_encoded.shape[0])
team_data_encoded['target_top_4'] = (team_data_encoded['team_rank'] <= 4).astype(int)

# Drop team_rank if present to prevent data leakage
team_data_encoded.drop(columns=['team_rank'], inplace=True)

# Scale the data
scaler = StandardScaler()
features = team_data_encoded.drop(columns=['target_top_4'])
scaled_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Combine scaled features with the target variable
data_prepared = pd.concat([scaled_features, team_data_encoded['target_top_4']], axis=1)

# Define features and target
X = data_prepared.drop(columns=['target_top_4'])
y = data_prepared['target_top_4']

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Initialize and train the model (Gradient Boosting Classifier)
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate the model
gb_predictions = gb_model.predict(X_test)
print("Gradient Boosting Classifier Report (with SMOTE):")
print(classification_report(y_test, gb_predictions))

# Predict top 4 using the trained model on the entire dataset
overall_predictions = gb_model.predict(X)

# Add predictions to the original data
team_data_encoded['top_4_prediction'] = overall_predictions

# Extract top 4 teams based on prediction
top_4_teams = team_data_encoded[team_data_encoded['top_4_prediction'] == 1]

print("Predicted Top 4 Teams:")
print(top_4_teams)


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m946.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pandas, imbalanced-learn
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pan