In [None]:
import pandas as pd
from glob import glob
import os
from imblearn.under_sampling import RandomUnderSampler

# train test split
from sklearn.model_selection import train_test_split

from supervised.automl import AutoML
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline



### List files

In [None]:
# Collect every processed CSV below data/processed.
# glob() returns paths in an OS-dependent order, so sort them: the row order of
# `df` feeds the seeded train/test split later on and must be stable.
list_files = sorted(glob("../data/processed/**/*.csv", recursive=True))
if not list_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        "No CSV files found below ../data/processed - check the working directory."
    )

# Read each file into its own DataFrame, keyed by its path
dfs = {file: pd.read_csv(file) for file in list_files}
# Stack the per-file frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfs.values(), ignore_index=True)

# The gameday token is the text between "_gd_" and the next "_" in the path
# (e.g. "..._gd_01_..." -> "01"). Paths without the marker are skipped instead
# of raising IndexError.
gameday = [
    file.split("_gd_")[1].split("_")[0] for file in list_files if "_gd_" in file
]

print(f"Number of unique gameday: {len(set(gameday))}")
print(set(gameday))


# Event types ordered from most to least frequent; reused by later plots
event_type_order = df["event_type"].value_counts().index

# Bar chart of event counts in that order
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x="event_type", order=event_type_order)
plt.title(f"Event Type Distribution for {len(df)} events")
plt.xlabel("Event Type")
plt.ylabel("Count")
# light grid on both axes
plt.grid(True, which="both", alpha=0.2)
plt.xticks(rotation=45)
plt.show()

# Same counts as text
print(df['event_type'].value_counts())

### Number of rows without sync point

In [None]:
# An event has no sync point when its throw timestamp is missing
missing_sync = df["event_time_throw"].isna()
print(f'Number of events with no sync point: {missing_sync.sum()}')

df_no_sync = df.loc[missing_sync]

# Event-type counts for the unsynced events, in the global frequency order
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_no_sync, x="event_type", order=event_type_order, ax=ax)
ax.set_title(f"Event Type Distribution for {len(df_no_sync)} events with no sync point")
ax.set_xlabel("Event Type")
ax.set_ylabel("Count")
# light grid on both axes
ax.grid(True, which="both", alpha=0.2)
plt.xticks(rotation=45)
plt.show()





In [None]:

# Label every event by whether a throw timestamp (sync point) is available
df["sync_status"] = df["event_time_throw"].notna().map({True: 'With Sync', False: 'Without Sync'})

# Stacked event counts per home team, split by sync status.
# histplot(multiple="stack") really stacks the two groups; countplot with
# dodge=False only draws them on top of each other, hiding the shorter bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x="name_team_home",
    hue="sync_status",
    multiple="stack",
    palette="Set2",
    ax=ax,
)

# The x axis shows home teams, so label it as such
ax.set_title("Events per Home Team (Stacked by Sync Status)")
ax.set_xlabel("Home Team")
ax.set_ylabel("Count")

# light grid on both axes
ax.grid(True, which="both", alpha=0.2)

# Team names are long; rotate them so they stay readable
plt.xticks(rotation=90)

plt.show()

In [None]:
# Keep only the events that carry a throw timestamp (i.e. have a sync point)
has_sync = df["event_time_throw"].notna()
df_sync = df[has_sync]

# Event-type counts for the synced events, in the global frequency order
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_sync, x="event_type", order=event_type_order, ax=ax)
ax.set_title(f"Event Type Distribution for {len(df_sync)} events with sync point")
ax.set_xlabel("Event Type")
ax.set_ylabel("Count")
# light grid on both axes
ax.grid(True, which="both", alpha=0.2)
plt.xticks(rotation=45)
plt.show()


# Distribution over teams

### How many matches have the teams played at home?

In [None]:
# plt and sns are already imported in the notebook's first cell; the duplicate
# imports that used to sit here were removed.

# Number of distinct matches per home team, most home matches first
df_grouped = (
    df.groupby("name_team_home")["match_id"]
    .nunique()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# Bar chart of home matches per team
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_grouped, x="name_team_home", y="count", ax=ax)

ax.set_title("Number of Home Matches by Team")
ax.set_xlabel("Team")
ax.set_ylabel("Number of Home Matches")

# Team names are long; rotate them so they stay readable
plt.xticks(rotation=90)

# Horizontal gridlines only
ax.grid(True, which="both", axis='y', alpha=0.2)

plt.show()


In [None]:
# Teams ordered by their total number of synced events (most events first)
team_order = df_sync['name_team_home'].value_counts().index

# Count events per (team, event type) and put the rows in team_order.
# The previous histplot call never used team_order, so teams were not sorted.
events_per_team = pd.crosstab(
    df_sync["name_team_home"], df_sync["event_type"]
).reindex(team_order, fill_value=0)

# Stacked bars: one bar per team, one segment per event type
fig, ax = plt.subplots(figsize=(10, 6))
events_per_team.plot(kind="bar", stacked=True, width=0.9, ax=ax)

# Average number of events per team
average_event_count = df_sync.groupby('name_team_home').size().mean()

# Horizontal reference line at that average
ax.axhline(average_event_count, color='red', linestyle='--', label=f'Average: {average_event_count:.1f}')

ax.set_title(f"Event Type Distribution for {len(df_sync)} events with sync point")
ax.set_xlabel("Team")
ax.set_ylabel("Count")

# light grid on both axes
ax.grid(True, which="both", alpha=0.2)

# Team names are long; rotate them so they stay readable
plt.xticks(rotation=90)

# Legend covers the event types and the average line
ax.legend()

plt.show()


In [None]:
# Model inputs. Names must match the columns of `df` exactly.
features = [
    "distance_player_to_goal",  # distance between player and the goal
    "distance_player_to_goalkeeper",  # distance between player and goalkeeper
    "distance_goalkeeper_to_goal",  # distance between goalkeeper and goal
    "angle_player_to_goal",  # angle of the player relative to the goal
    # "angle_ball_to_goal",
    # "speed_ball",  # speed of the ball when thrown
    "speed_player",  # speed of the player
    # NOTE(review): "nearst" is spelled as in the original; presumably it matches the CSV column - confirm
    "distance_player_to_nearst_opponent",  # distance to the nearest defender
    "distance_player_to_nearest_teammate",  # distance to the nearest teammate
    "num_opponents_between_player_and_goal",  # number of defenders between player and goal
    "num_opponents_close_to_player",  # number of defenders close to the player
    "efficiency_shots_team",  # efficiency of the team in scoring goals
    "efficiency_shots_player",  # efficiency of the player in scoring goals
    "efficiency_goalkeeper",  # efficiency of the goalkeeper in saving goals
    # "home_advantage",  # home advantage
]

target_column = "event_type"  # column the binary target is derived from


# Independent copy so that adding columns below does not touch `df`
df_features = df[features].copy()

# Binary target: 1 for a "score_change" event, 0 for everything else
# (missing event types compare unequal and therefore become 0)
df_features["target"] = df[target_column].eq("score_change").astype(int)

# Keep the raw label next to the binary target for inspection
df_features["target_as_string"] = df[target_column]
    


### Filtering

In [None]:

# Row count, followed by the number of missing values in every column
n_rows = len(df_features)
print(n_rows)
print('Before dropping NaNs\n')
print(f'Total number of rows: {n_rows}\n')
print(df_features.isna().sum())


# Analysis

### Histogram of feature values

In [None]:
# One histogram panel per column that DataFrame.hist can plot
feature_hist_axes = df_features.hist(figsize=(20, 20))
plt.show()

In [None]:
# --- NaN count per feature -------------------------------------------------
# Number of missing values per column, keeping only columns that have any,
# largest first
nan_counts = df_features.isnull().sum()
nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x=nan_counts.index, y=nan_counts.values, palette='viridis')

# Feature names are long; rotate them so they stay readable
plt.xticks(rotation=90)

plt.xlabel("Features", fontsize=12)
plt.ylabel("Number of NaN Values", fontsize=12)
plt.title("Number of NaN Values per Feature", fontsize=16)

plt.tight_layout()
plt.show()

# --- NaN count per home team -----------------------------------------------
# df_features has no team column, so group its NaN flags by the team column of
# `df`; both frames share the same index because df_features was copied from df.
# Summing the boolean flags counts missing cells (summing the raw feature
# values, as before, would not).
nan_per_team = (
    df_features[nan_counts.index]
    .isnull()
    .groupby(df["name_team_home"])
    .sum()
    .sum(axis=1)  # total missing cells over all affected features
    .sort_values(ascending=False)
    .rename("nan_count")
    .reset_index()
)

plt.figure(figsize=(12, 6))
sns.barplot(data=nan_per_team, x="name_team_home", y="nan_count")

plt.title("Number of NaN Values by Team")
plt.xlabel("Team")
plt.ylabel("Number of NaN Values")

# Team names are long; rotate them so they stay readable
plt.xticks(rotation=90)
plt.show()



### Correlation between features and target

In [None]:
# Pairwise correlations of the features and the binary target
# (target is 1 for "score_change" events, 0 otherwise).
# The string label column is dropped before correlating.
numeric_features = df_features.drop(columns="target_as_string")
corr = numeric_features.corr()

# Hide the upper triangle (including the diagonal): it mirrors the lower one
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure that the heatmap is drawn into
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap centred on zero correlation
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr, **heatmap_options)

plt.show()

# Correlation of every column with the target, strongest positive first
print(corr["target"].sort_values(ascending=False))


###  Feature Importance with a Tree-Based Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Use the explicit feature list rather than "all columns except the targets":
# a later cell adds an "xG" column to df_features, and deriving the columns
# from the frame would feed that prediction back in as a feature on re-run.
feature_columns = pd.Index(features)

# X, y and feature_columns are reused by the following analysis cells
X = df_features[feature_columns]
y = df_features["target"]

# NOTE(review): X is used as-is. If the NaN counts printed earlier are non-zero,
# estimators without NaN support will raise here - confirm.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Impurity-based importances, labelled with the feature names
feature_importances = pd.Series(model.feature_importances_, index=feature_columns)

# Horizontal bar chart of the importances
feature_importances.sort_values(ascending=False).plot(kind="barh", figsize=(12, 6))
plt.title("Feature Importances from Random Forest")
plt.show()


### Is there mutual information between the features and the target?

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the binary target.
# The estimator adds random noise internally, so seed it (same seed as the
# rest of the notebook) to get identical scores on every run.
mi = mutual_info_classif(X, y, random_state=42)

# Label the scores with the feature names and plot them
mi_series = pd.Series(mi, index=feature_columns)
mi_series.sort_values(ascending=False).plot(kind="barh", figsize=(12, 6))
plt.title("Mutual Information Scores")
plt.show()


### Variance Threshold (Detecting Low-Variance Features)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a selector that flags features whose variance is at most 0.1
selector = VarianceThreshold(threshold=0.1)  # Choose an appropriate threshold
selector.fit(X)

# Boolean mask over X.columns: True = feature passes the threshold
kept_mask = selector.get_support()

remaining_features = X.columns[kept_mask]
print(f"Features retained after variance threshold: {list(remaining_features)}")
# and the ones that did not pass
print(f"Features dropped after variance threshold: {list(X.columns[~kept_mask])}")


### Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# variance_inflation_factor regresses each column on all the others without
# adding an intercept itself. Without a constant column the result is the
# uncentered VIF, which is inflated for features whose mean is far from zero.
# Prepend a constant and skip it when reporting.
X_design = add_constant(X, has_constant="add")  # "const" becomes column 0
design_values = X_design.values

# One VIF per feature (columns 1..n of the design matrix)
vif_data = pd.DataFrame(
    {
        "Feature": feature_columns,
        "VIF": [
            variance_inflation_factor(design_values, column_idx)
            for column_idx in range(1, design_values.shape[1])
        ],
    }
)

# Highest VIF (strongest collinearity) first
print(vif_data.sort_values(by="VIF", ascending=False))


### PCA projection of the features (colored by target)

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its first two principal components
# NOTE(review): X is passed in unscaled and unimputed - confirm that is intended
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
pc1, pc2 = pca_result[:, 0], pca_result[:, 1]

# Scatter of the two components, coloured by the binary target
plt.scatter(pc1, pc2, c=y, cmap='viridis')
plt.title("PCA of Features")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()


# Training

### Preparing data

In [None]:

# Model inputs (feature columns) and the binary target
X, y = df_features[features], df_features["target"]

# Class balance of the target
print(y.value_counts())




### Apply RandomUndersampling

In [None]:
# NOTE(review): random undersampling is currently disabled; the next cell
# passes X and y through unchanged as X_res / y_res.
# sampling_strategy = 1
# rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X_res, y_res = rus.fit_resample(X, y)

# # print distribution of target column
# print(y_res.value_counts())

In [None]:
# No resampling: the "resampled" names simply alias the full data
X_res, y_res = X, y


### Train-Test-Splits

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Paths and algorithms for automl

In [None]:
# Directory the AutoML run writes its results to
path_out = "../data/ml_stuff/automl_8"

# Model families the AutoML search may use
algorithms = ["Xgboost", "CatBoost", "Random Forest"]

### Train automl

In [None]:
# Configure the AutoML search.
# The algorithm list comes from the `algorithms` variable defined in the
# configuration cell above, so there is one place to change it. The
# commented-out duplicate of this call was removed.
automl = AutoML(
    results_path=path_out,
    algorithms=algorithms,
    total_time_limit=5 * 60,  # seconds
    explain_level=2,
    mode="Explain",
    random_state=42,
)

### Fit

In [None]:
# Fit the configured AutoML search on the training split only;
# X_test / y_test stay untouched until the evaluation cell.
automl.fit(X_train, y_train)

### Test

In [None]:
from sklearn.metrics import confusion_matrix

# Hard class labels for the threshold-based metrics
y_pred = automl.predict(X_test)
# Positive-class scores for the ranking metric; column 1 is the same column
# the xG cell further down reads from predict_proba
y_score = automl.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the predicted labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC measures ranking quality and needs continuous scores. Feeding it
# hard 0/1 labels collapses the curve to a single operating point.
roc_auc = roc_auc_score(y_test, y_score)

print(f"\nAccuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")


# Confusion matrix: rows = true labels, columns = predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


### Now apply model on complete dataframe

In [None]:
# Score every event with the fitted model and store the result as "xG".
# Note: X is the full feature frame, so it contains the rows the model was
# trained on; xG for those rows is an in-sample prediction.
X = df_features[features]
y = df_features["target"]

# Probability of the positive class (column 1) for each event. The earlier
# extra automl.predict(X) call was removed because its result was overwritten
# immediately.
y_pred = automl.predict_proba(X)[:, 1]

# Store xG next to the features
df_features["xG"] = y_pred

# Frequency of each distinct xG value
print(df_features["xG"].value_counts())

# Copy xG and the binary target onto the full event frame (aligned on index)
df["xG"] = df_features["xG"]
df["target"] = df_features["target"]

### Plot positions and xG

In [None]:

# Left axis: stacked xG histogram; right axis: per-class density curves
fig, ax1 = plt.subplots(figsize=(10, 6))

# Counts per xG bin, stacked by target (score change yes/no)
sns.histplot(
    data=df,
    x="xG",
    bins=50,
    hue="target",
    multiple="stack",
    palette="Set1",
    stat="count",
    ax=ax1,
)

ax1.set_title("xG Distribution (Stacked by Score Change) with Density Lines")
ax1.set_xlabel("xG")
ax1.set_ylabel("Count")
ax1.grid(True, which="both", alpha=0.2)

# Second y-axis sharing the same x-axis, used for the densities
ax2 = ax1.twinx()

# One KDE curve per target value: (target value, line colour, legend label)
density_curves = [
    (0, "red", "Density (No Score Change)"),
    (1, "blue", "Density (Score Change)"),
]
for target_value, line_color, legend_label in density_curves:
    sns.kdeplot(
        data=df[df["target"] == target_value],
        x="xG",
        color=line_color,
        label=legend_label,
        ax=ax2,
        lw=2,
    )

ax2.set_ylabel("Density")

# Legend for the density curves only
ax2.legend(loc="upper left")

plt.show()

In [None]:
# NOTE(review): df_plot is created here but the plot below reads from df directly
df_plot = df.copy()

# Throw positions drawn over the pitch image, coloured by xG
plt.figure(figsize=(10, 6))

# Background image, stretched to x in [0, 40] and y in [0, 20]
pitch_image = plt.imread("../assets/handballfeld.png")
pitch_extent = [0, 40, 0, 20]
plt.imshow(pitch_image, extent=pitch_extent)

player_x = df["pos_x_player"]
player_y = df["pos_y_player"]
plt.scatter(player_x, player_y, c=df["xG"], cmap="coolwarm")
plt.colorbar()
plt.title("xG of Throws")
plt.xlabel("x-coordinate")
plt.ylabel("y-coordinate")
plt.show()

# First rows of the event frame, including the new xG column
display(df.head())


In [None]:
# Per player: summed xG, summed goals (target), and the number of rows behind each.
# Named aggregation ties every output column to its source column and function.
# The previous agg(['sum', 'count']) produced the order
# (xG sum, xG count, target sum, target count) but was relabelled as
# (xG_sum, goal_sum, xG_count, goal_count), so "goal_sum" actually held the
# xG count.
df_grouped = (
    df.groupby("name_player")
    .agg(
        xG_sum=("xG", "sum"),
        goal_sum=("target", "sum"),
        xG_count=("xG", "count"),
        goal_count=("target", "count"),
    )
    .reset_index()
)

# Order by number of recorded throws (rows with a target), fewest first
df_grouped = df_grouped.sort_values(by="goal_count", ascending=True)
# Keep only players with more than 30 recorded throws
df_grouped = df_grouped[df_grouped["goal_count"] > 30]

# Quick look at the result
display(df_grouped.head())

# Summed xG against summed goals, one point per player (raw sums, not normalized)
plt.figure(figsize=(10, 10))
plt.scatter(df_grouped["xG_sum"], df_grouped["goal_sum"], color='blue', alpha=0.7)
plt.title("xG Sum vs Goal Sum per Player")
plt.xlabel("Total xG")
plt.ylabel("Total Goals")
# Both axes start at 0 and share the same fixed range
plt.xlim(0, 70)
plt.ylim(0.0, 70)
plt.grid(True)
plt.show()


### xG by team: which teams are currently performing best?

In [None]:

# Total xG and total goals (target) per home team, highest total xG first.
# The sort has to happen before melting: the bar order follows the row order
# of the melted frame, and the previous version sorted only after melting.
# NOTE(review): grouping by name_team_home presumably attributes every event
# of a home match to the home team, including away-team throws - confirm.
df_team_stats = (
    df.groupby("name_team_home")
    .agg(
        total_xG=("xG", "sum"),
        total_goals=("target", "sum"),
    )
    .reset_index()
    .sort_values(by="total_xG", ascending=False)
)

# Long format: one row per (team, metric), as needed for the hue split
df_melted = pd.melt(
    df_team_stats,
    id_vars=["name_team_home"],
    value_vars=["total_xG", "total_goals"],
    var_name="Metric",
    value_name="Value",
)

# xG and goals side by side for each team
plt.figure(figsize=(12, 6))
sns.barplot(data=df_melted, x="name_team_home", y="Value", hue="Metric", palette="Blues_d")

plt.title("Total xG and Actual Goals by Home Team", fontsize=16)
plt.xlabel("Team", fontsize=12)
plt.ylabel("Total Value", fontsize=12)

# Team names are long; rotate them so they stay readable
plt.xticks(rotation=90)

# Horizontal gridlines only
plt.grid(True, which="both", axis="y", alpha=0.3)

plt.show()

In [None]:
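# NOTE(review): this whole cell is a disabled draft of a per-event normalisation; nothing in it runs.
# # Assuming 'num_events' represents the number of matches or events for each team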
# df_team_stats["num_events"] = df["name_team_home"].value_counts()

# display(df_team_stats)
# # transform df_team_stats["num_events"] to int
# df_team_stats["num_events"] = df_team_stats["num_events"].astype(int)

# df_team_stats["xG_per_event"] = df_team_stats["total_xG"] / df_team_stats["num_events"]
# df_team_stats["goals_per_event"] = df_team_stats["total_goals"] / df_team_stats["num_events"]

# # Melt the DataFrame to a long format for easier plotting
# df_melted = pd.melt(df_team_stats, id_vars=["name_team_home"], value_vars=["xG_per_event", "goals_per_event"],
#                     var_name="Metric", value_name="Value")

# # Plot normalized xG vs normalized goals
# plt.figure(figsize=(12, 6))
# sns.scatterplot(data=df_team_stats, x="xG_per_event", y="goals_per_event", hue="name_team_home", s=100)

# # Add title and labels
# plt.title("Normalized xG vs Actual Goals by Home Team (Per Event)", fontsize=16)
# plt.xlabel("Normalized xG (Per Event)", fontsize=12)
# plt.ylabel("Normalized Goals (Per Event)", fontsize=12)

# # Add 1:1 line
# plt.plot([0, df_team_stats["xG_per_event"].max()], [0, df_team_stats["goals_per_event"].max()], color="red", linestyle="--", linewidth=2)

# # Adjust axis limits based on normalized values
# plt.xlim(0, df_team_stats["xG_per_event"].max() + 0.1)
# plt.ylim(0, df_team_stats["goals_per_event"].max() + 0.1)

# # Add gridlines
# plt.grid(True, which="both", alpha=0.3)

# # Show the plot
# plt.show()


In [None]:
# One point per home team: total xG (x) against total goals (y)
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df_team_stats,
    x="total_xG",
    y="total_goals",
    hue="name_team_home",
    s=100,
    ax=ax,
)

ax.set_title("Total xG vs Actual Goals by Home Team", fontsize=16)
ax.set_xlabel("Total xG", fontsize=12)
ax.set_ylabel("Total Goals", fontsize=12)

# 1:1 reference line: points above it scored more goals than their xG
ax.plot([0, 300], [0, 300], color="red", linestyle="--", linewidth=2)

# Show only the 150-300 window on both axes
ax.set_xlim(150, 300)
ax.set_ylim(150, 300)

# light grid on both axes
ax.grid(True, which="both", alpha=0.3)

plt.show()
