# xG CNN - All Messi's shots in La Liga 2020/2021

### By Matteo Matteotti \& Hadi Sotudeh

This notebook loads the pretrained xG-CNN model and uses it to compute the expected goals (xG) for every open-play shot taken by Lionel Messi in his last season at Barcelona (La Liga 2020/2021). 🐐

In [1]:
from statsbombpy import sb
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import math
import pandas as pd
import torch
from torch.utils.data import DataLoader
from src import load_model, load_heatmap_dataset


Create a dictionary of shots, which will later be turned into heatmaps using numpy.

In [2]:
# shots collected so far, keyed by '<match_id>_<event index>'
dict_shots = {}

# every match of Messi's last season in Barcelona (2020/2021)
all_matches = sb.matches(competition_id=11, season_id=90)['match_id'].tolist()

# the only event columns needed to rebuild a shot later on
shot_columns = ['index', 'location', 'shot_freeze_frame', 'shot_outcome']

# scan the games one by one
for match_id in tqdm(all_matches, total = len(all_matches), desc = 'Scanning games, looking for shots.'):
    # all the shots of this game
    df_shots = sb.events(match_id, split=True)['shots']
    # keep the open-play shots by Messi (player 5503) that come with a freeze frame
    has_freeze_frame = df_shots['shot_freeze_frame'].notna()
    is_open_play = df_shots['shot_type'] == 'Open Play'
    is_messi = df_shots['player_id'] == 5503
    df_shots = df_shots.loc[has_freeze_frame & is_open_play & is_messi, shot_columns].reset_index(drop = True)
    # turn every remaining shot into a plain dictionary
    for _, shot in df_shots.iterrows():
        # freeze-frame entries come first, keyed by their position in the frame ...
        shot_dict = {}
        for position, frame_entry in enumerate(shot['shot_freeze_frame']):
            shot_dict[str(position)] = frame_entry
        # ... followed by the shot metadata
        shot_dict.update({
            'ball': shot['location'],
            'outcome': shot["shot_outcome"].lower(),
            'match_id': match_id,
            'index': shot['index'],
        })
        # the event index makes the name unique within the game
        shot_name = f'{match_id}_{shot["index"]}'
        dict_shots[shot_name] = shot_dict


Scanning games, looking for shots.: 100%|██████████| 35/35 [00:28<00:00,  1.25it/s]


Turn the dictionary into heatmaps.

In [3]:
# create the array of shots: one (2, 30, 40) heatmap per shot
# channel 0 counts the opponents in each cell, channel 1 marks the cell of the ball;
# every cell covers 2x2 units of the event coordinates (locations are divided by 2)
shots = np.zeros((len(dict_shots), 2, 30, 40))
# keep track of the labels (1 = goal, 0 = no goal)
labels = []
# keys of a shot dictionary that hold metadata rather than a freeze-frame player
metadata_keys = ('ball', 'outcome', 'match_id', 'index')
# iterate over every shot
for i, (shot_name, shot) in enumerate(tqdm(dict_shots.items())):
    # create the heatmap of the shot over the whole pitch (60 x 40 cells)
    shot_frame = np.zeros((2, 60, 40))
    # add the ball location, after having discretised it (clamped to the last cell on the far edges)
    ball_discrete_x = min(math.floor(shot['ball'][0] / 2), 59)
    ball_discrete_y = min(math.floor(shot['ball'][1] / 2), 39)
    shot_frame[1, ball_discrete_x, ball_discrete_y] = 1
    # iterate over every entry of the shot
    for player_id, player in shot.items():
        # skip the metadata entries and the teammates: only opponents are drawn
        if player_id in metadata_keys or player['teammate']:
            continue
        # discretise the location of the opponent
        discrete_x = min(math.floor(player['location'][0] / 2), 59)
        discrete_y = min(math.floor(player['location'][1] / 2), 39)
        # draw the opponent only if they are level with or beyond the ball along x,
        # or at most one cell behind it and within one cell of it along y
        if (discrete_x >= ball_discrete_x) or (discrete_x >= ball_discrete_x - 1 and abs(discrete_y - ball_discrete_y) <= 1):
            shot_frame[0, discrete_x, discrete_y] += 1

    # all players have been added, crop out the defensive half (keep x cells 30..59)
    cropped_shot_frame = shot_frame[:, 30:, :]
    # add the raw (unsmoothed) heatmap to the global array of shots
    # NOTE(review): smoothing is presumably applied later by the dataset loader (its g_filter argument) - confirm
    shots[i] = cropped_shot_frame
    # add the label from the recorded outcome; shot names are '<match_id>_<index>', so they
    # carry no goal / non-goal information and must not be used to derive the label
    labels.append(1 if shot['outcome'] == 'goal' else 0)

np.save('data/messi_shots.npy', shots)
np.save('data/messi_labels.npy', labels)

100%|██████████| 150/150 [00:00<00:00, 30655.64it/s]


Load the model

In [4]:
# device used for the model weights and the input batches: 'cpu', or 'cuda' when a GPU is available
device = "cpu"

In [5]:
# read the pretrained weights, mapping the stored tensors onto the selected device
model_dict = torch.load('trained_models/xg_cnn.pth', map_location=torch.device(device))
# build the network and restore the weights into it
xg_cnn = load_model()
xg_cnn.load_state_dict(model_dict)
# move the model itself onto the selected device: map_location only affects the loaded tensors,
# and load_state_dict copies them into the parameters wherever the model already lives, so
# without this step inputs sent to another device would not match the model (no-op on cpu)
xg_cnn.to(device)
# put the model in evaluation mode (kept as the last expression so the architecture is displayed)
xg_cnn.eval()

XGCNN(
  (conv_layer_1): Sequential(
    (0): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (pooling_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_layer_2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (pooling_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_layer_3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (pooling_3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_layer_4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, mom

Load the dataset into torch and apply the model

In [6]:
# wrap the saved heatmaps and labels into a dataset (no augmentation, g_filter set to 1.25)
dataset = load_heatmap_dataset(
    data_path='data/messi_shots.npy',
    labels_path='data/messi_labels.npy',
    augmentation=False,
    g_filter=1.25,
)
# batches are served in the original order (no shuffling), so predictions line up with the shots
shots_loader = DataLoader(dataset, shuffle=False, batch_size=100)


In [7]:
# one 1-D tensor of xG values per batch, in the order the loader serves them
xGs = []

# inference only: no gradients are needed
with torch.no_grad():
    # the labels are not needed to compute the xG; they are bound to a throwaway name so that
    # the `labels` list built together with the heatmaps is not overwritten by a batch tensor
    for images, _batch_labels in tqdm(shots_loader, total = len(shots_loader)):
        # send to device
        images = images.to(device)
        # compute the xg
        outputs = xg_cnn(images)
        # flatten to 1-D before storing: squeeze() would turn a batch holding a single shot
        # into a 0-d tensor, which torch.cat refuses to concatenate
        xGs.append(outputs.reshape(-1))

100%|██████████| 2/2 [00:00<00:00, 26.61it/s]


In [8]:
# one xG value per shot, in the same order as the shots of the dictionary
xg_values = torch.cat(xGs).tolist()
# shot names look like '<match_id>_<index>': split them back into their two parts (kept as strings)
name_parts = pd.Series(list(dict_shots.keys())).str.split('_', expand = True)
# assemble the final table, one row per shot
df_shots = pd.DataFrame({
    'match_id' : name_parts[0],
    'index' : name_parts[1],
    'xG' : xg_values
})
df_shots

Unnamed: 0,match_id,index,xG
0,3773386,1749,0.082750
1,3773386,2003,0.052889
2,3773386,2146,0.027046
3,3773386,2685,0.165181
4,3773386,3727,0.028766
...,...,...,...
145,3764440,3114,0.391903
146,3773689,800,0.030031
147,3773689,2494,0.099656
148,3773477,3401,0.086078
