# Building an Interpretable Expected Goals (xG) Model
* Author: Oliver Mueller
* Date: 2024-02-15

In [None]:
# Install packages that are not already installed on Colab
#!pip install statsbombpy

In [None]:
import warnings
warnings.filterwarnings("ignore")

import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from statsbombpy import sb
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, auc, RocCurveDisplay
from sklearn.metrics import classification_report
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Load data

Using the `statsbombpy` package, we will query the StatsBomb API for all events of the Bundesliga season 2015/16.  

In [None]:
# Query StatsBomb for all events of the 2015/16 men's Bundesliga season.
# split=True is passed so the result can be looked up per event type
# (it is inspected with .keys() and indexed with "shots" in the following cells).
competition_filter = dict(
    country="Germany",
    division="1. Bundesliga",
    season="2015/2016",
    gender="male",
)
grouped_events = sb.competition_events(**competition_filter, split=True)

What types of events do we have?

In [None]:
# List the event categories returned by the split query.
grouped_events.keys()

Extract and explore the shots.

In [None]:
# Keep only the shot events; everything below works on this table.
shots = grouped_events["shots"]

In [None]:
# Size of the shots table as (rows, columns).
shots.shape

In [None]:
# Preview the first few shots.
shots.head()

In [None]:
# List the attributes recorded for each shot.
shots.columns

In [None]:
# Inspect the first shot in full.
shots.iloc[0]

In [None]:
# Show the "shot_freeze_frame" entry stored for the first shot.
shots.iloc[0]["shot_freeze_frame"]

In [None]:
# Count how often each shot outcome occurs; "Goal" is used as the positive class further down.
shots["shot_outcome"].value_counts()

Save the shots object to disk, or load a previously pickled copy of it from GitHub.

In [None]:
# Optional: persist the downloaded shots locally for later sessions.
#shots.to_pickle("data/shots.pkl")

# Replace `shots` with a pickled snapshot hosted on GitHub.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a source you trust.
SHOTS_PICKLE_URL = "https://github.com/olivermueller/vhbprodok_datascience/raw/main/football_shots/data/shots.pkl"
shots = pd.read_pickle(SHOTS_PICKLE_URL)

## Train classifier

Construct the label.

In [None]:
# Binary target: 1 when the shot outcome is "Goal", 0 for every other outcome.
shots["is_goal"] = (shots["shot_outcome"] == "Goal").astype("int64")

Engineer some features.

In [None]:
# Unpack the first two entries of each location into separate float columns.
shots["location_x"] = [float(position[0]) for position in shots["location"]]
shots["location_y"] = [float(position[1]) for position in shots["location"]]

In [None]:
# Euclidean distance from each shot location to (120, 40), the midpoint between
# the two post coordinates used for the angle feature.
goal_centre = [120.0, 40.0]
shots["distance_to_goal"] = [math.dist(position, goal_centre) for position in shots["location"]]

In [None]:
def angle_between_points(P, Q1, Q2):
    """Return the angle at P, in degrees, between the rays P->Q1 and P->Q2.

    Parameters
    ----------
    P, Q1, Q2 : sequence of numbers
        2-D points given as (x, y); only the first two entries are read.

    Returns
    -------
    float
        Angle in the range [0, 180]. When P coincides with Q1 or Q2 the
        angle is undefined; 0.0 is returned in that case instead of failing.
    """
    # Vectors from P to Q1 and Q2
    vector_PQ1 = (Q1[0] - P[0], Q1[1] - P[1])
    vector_PQ2 = (Q2[0] - P[0], Q2[1] - P[1])

    # Degenerate case: a zero-length vector has no direction, so there is no angle.
    if vector_PQ1 == (0, 0) or vector_PQ2 == (0, 0):
        return 0.0

    # Dot product (proportional to the cosine) and 2-D cross product
    # (proportional to the sine) of the two vectors
    dot_product = vector_PQ1[0] * vector_PQ2[0] + vector_PQ1[1] * vector_PQ2[1]
    cross_product = vector_PQ1[0] * vector_PQ2[1] - vector_PQ1[1] * vector_PQ2[0]

    # atan2(|cross|, dot) yields the same angle as acos(dot / (|PQ1| * |PQ2|))
    # but needs no division and cannot leave acos's [-1, 1] domain through
    # floating-point rounding.
    angle_radians = math.atan2(abs(cross_product), dot_product)

    # Convert angle to degrees
    return math.degrees(angle_radians)

In [None]:
# Angle (in degrees) under which the segment between the two post coordinates,
# (120, 36) and (120, 44), is seen from each shot location.
goal_posts = ([120.0, 36.0], [120.0, 44.0])
shots["angle_to_goal"] = [
    angle_between_points(position, goal_posts[0], goal_posts[1])
    for position in shots["location"]
]

In [None]:
# Look at the first shot again, now including the engineered columns.
shots.iloc[0]

Make a train-test split.

In [None]:
# Hold out 20% of the shots for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    shots,
    test_size=0.2,
    random_state=42,
)

Train a logistic regression model.

In [None]:
# Logistic regression of the goal label on shot position, distance and angle,
# fitted on the training split only. `model_logit` holds the fitted result.
xg_formula = "is_goal ~ location_x + location_y + distance_to_goal + angle_to_goal"
model_logit = smf.logit(formula=xg_formula, data=train).fit()

In [None]:
# Print the summary of the fitted model as plain text.
print(model_logit.summary())

Evaluate the model.

In [None]:
# Model predictions for the held-out test split.
pred_proba = model_logit.predict(test)

In [None]:
# ROC AUC of the test-set predictions against the true goal labels.
roc_auc_score(test["is_goal"], pred_proba)

## Visualize model

We create a 2D grid of x/y pitch coordinates from (0, 0) to (120, 80). We also calculate distance and angle to the goal for each cell.

In [None]:
# Enumerate every integer pitch coordinate: x in 0..120 varies fastest, y in 0..80 slowest.
grid_x, grid_y = np.meshgrid(np.arange(0, 121, 1), np.arange(0, 81, 1))
simulated_positions = pd.DataFrame(
    {"location_x": grid_x.ravel(), "location_y": grid_y.ravel()}
)


def _grid_point(row):
    """Return the [x, y] coordinate stored in one grid row."""
    return [row["location_x"], row["location_y"]]


# Same two engineered features as for the real shots: distance to (120, 40)
# and the angle between the post coordinates (120, 36) and (120, 44).
simulated_positions["distance_to_goal"] = simulated_positions.apply(
    lambda row: math.dist(_grid_point(row), [120.0, 40.0]), axis=1
)
simulated_positions["angle_to_goal"] = simulated_positions.apply(
    lambda row: angle_between_points(_grid_point(row), [120.0, 36.0], [120.0, 44.0]), axis=1
)

In [None]:
# First rows of the simulated grid.
simulated_positions.head()

In [None]:
# Last rows of the simulated grid.
simulated_positions.tail()

In [None]:
# Spot-check the grid row at x = 109, y = 40.
simulated_positions[(simulated_positions["location_x"] == 109) & (simulated_positions["location_y"] == 40)]

We predict the probability of a goal for each simulated position on the grid.

In [None]:
# Score every simulated grid position with the fitted model and store the result as xG.
# A dedicated variable is used so that `pred_proba` keeps the test-set predictions;
# overwriting it would break the ROC AUC cell if that cell were re-run afterwards
# (the grid and the test split have different lengths).
grid_pred_proba = model_logit.predict(simulated_positions)
simulated_positions["xG"] = grid_pred_proba

In [None]:
# Preview the grid with the new xG column.
simulated_positions.head()

The highlight is a visualization of our xG model.

In [None]:
# Draw one marker per simulated grid position, coloured by its predicted xG.
# An explicit figure/axes pair is used so the title and labels attach to this plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=simulated_positions, x="location_x", y="location_y", hue="xG", ax=ax)
ax.set(
    title="Expected goals (xG) by shot location",
    xlabel="Pitch x-coordinate (goal at x = 120)",
    ylabel="Pitch y-coordinate",
)
plt.show()
