# **Data Preprocessing**

This section covers importing the necessary libraries, reading in the data, and preprocessing it: penalty shots are filtered out (only non-penalty shots are kept) and the `freeze_frame` column is parsed back into lists of dictionaries.

# **Data Visualization - Freeze Frame Plotting**

This section introduces a function to visualize the freeze frames using `mplsoccer`. It includes a plot for random samples and specific shots from the dataset.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import VerticalPitch
import ast
import seaborn as sns
import scipy.ndimage as scn
import os

# Set the base directory to the project's root.
# NOTE: this is the *parent* of the current working directory, so the paths
# below only resolve when the notebook is run from a direct sub-folder of the
# project root; otherwise pd.read_csv raises FileNotFoundError.
BASE_DIR = os.path.dirname(os.getcwd())

# Path to the data directory
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Path to the shots_df.csv file
SHOTS_DF_PATH = os.path.join(DATA_DIR, 'shots_df.csv')

# Load the data
shots_df = pd.read_csv(SHOTS_DF_PATH)

# Keep only non-penalty shots. The explicit copy makes the result an
# independent frame, so the column assignments below do not write to a slice
# of the loaded frame (which triggers pandas' SettingWithCopyWarning).
shots_df = shots_df[shots_df['shot_type'] != 'Penalty'].copy()

# Replace missing freeze frames with the string '[]' so that they parse to an
# empty list in the next step
shots_df['freeze_frame'] = shots_df['freeze_frame'].fillna('[]')

# Convert the freeze_frame column from its string form back to a list of dictionaries
shots_df['freeze_frame'] = shots_df['freeze_frame'].apply(ast.literal_eval)

# Keep only rows whose freeze frame contains an opposing (non-teammate) goalkeeper
def has_non_teammate_goalkeeper(freeze_frame):
    """Return True if the freeze frame contains a goalkeeper who is not a teammate.

    Each entry of ``freeze_frame`` is a dict; a missing 'keeper' or 'teammate'
    key is treated as False. An empty freeze frame yields False.
    """
    return any(
        entry.get('keeper', False) and not entry.get('teammate', False)
        for entry in freeze_frame
    )

# Keep only shots whose freeze frame includes an opposing goalkeeper. The
# explicit copy makes shots_df an independent frame, so the columns added to
# it later are not written to a slice (avoids SettingWithCopyWarning).
shots_df = shots_df[shots_df['freeze_frame'].apply(has_non_teammate_goalkeeper)].copy()

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\callu\\data\\shots_df.csv'

In [None]:
# Function to draw a freeze frame using 'mplsoccer'
def draw_freeze_frame(shot, plot_angle=False):
    """Draw the freeze frame of one shot on a vertical StatsBomb half pitch.

    Parameters
    ----------
    shot : pandas.Series or mapping
        A single shot record. ``shot['freeze_frame']`` is iterated as a list
        of player dicts, each with a ``'location'`` pair and optional boolean
        ``'actor'``, ``'keeper'`` and ``'teammate'`` flags (a missing flag is
        treated as False). The descriptive fields read for the title are
        'player', 'possession_team', 'competition', 'home_team', 'away_team',
        'match_date', 'period', 'timestamp', 'shot_statsbomb_xg',
        'shot_outcome', 'shot_type', 'shot_body_part' and 'shot_technique'.
        When ``plot_angle`` is true, 'x_shot' and 'y_shot' are read as well.
    plot_angle : bool, optional
        If True, shade the angle between the shot location and the goal.

    Returns
    -------
    The figure created by ``VerticalPitch.draw``.
    """
    # 'mplsoccer' plots the pitch
    p = VerticalPitch(pitch_color='#aabb97', line_color='white',
              stripe_color='#c2d59d', stripe=True, pitch_type='statsbomb', axis = True, label = True, half = True)
    # Draw the figure
    fig, ax = p.draw(figsize=(20, 12))

    # Loop through each player in the shot freeze frame
    for player in shot['freeze_frame']:
        # Flags are read with .get so that an entry lacking one of them is
        # treated as False instead of raising KeyError
        is_actor = player.get('actor', False)
        is_keeper = player.get('keeper', False)
        is_teammate = player.get('teammate', False)

        if is_actor:
            # Shooter
            color = 'black'
        elif is_keeper and not is_teammate:
            # Opposing goalkeeper
            color = 'orange'
        elif is_teammate:
            # Attacking teammates
            color = 'green'
        else:
            # All remaining players (defenders)
            color = 'red'
        # Plot the player
        p.scatter(x=player['location'][0], y=player['location'][1], ax=ax, c=color, s=100)

    # Plot a shaded area depicting the angle to the goal if plot_angle is True
    if plot_angle:
        p.goal_angle(shot['x_shot'], shot['y_shot'], ax=ax, alpha=0.2, zorder=1.1,
                     color='black', goal='right')

    # Extract shot details for the title
    player_name = shot['player']
    team_name = shot['possession_team']
    competition_name = shot['competition']
    home_team = shot['home_team']
    away_team = shot['away_team']
    match_date = shot['match_date']
    half = shot['period']
    # Shown after the "Minute:" label, but holds the record's 'timestamp' value
    minute = shot['timestamp']
    xg = shot['shot_statsbomb_xg']
    shot_outcome = shot['shot_outcome']
    # The opponent is whichever of the two teams is not in possession
    opponent = away_team if home_team == team_name else home_team
    build_up = shot['shot_type']
    body_part = shot['shot_body_part']
    technique = shot['shot_technique']

    # Construct the title string using the shot details
    title_str = (
        f"{player_name} ({team_name}) vs {opponent}\n"
        f"Half: {half}, Minute: {minute}\n"
        f"{competition_name} {match_date}\n"
        f"{build_up}, {technique} technique with {body_part}\n"
        f"SBxG: {xg}, Outcome: {shot_outcome}"
    )

    # Set the title of the plot
    ax.set_title(title_str, fontsize=12)
    return fig

# Select up to 5 rows from shots_df at random. Sampling without replacement
# cannot draw more rows than exist, so cap the request at the frame's length
# instead of letting sample() raise ValueError on a small frame.
random_shots = shots_df.sample(n=min(5, len(shots_df)))

# Draw the freeze frame for each random shot
for index, row in random_shots.iterrows():
    fig = draw_freeze_frame(row, plot_angle=False)
    plt.show()

# **Competition Summary Statistics**

Here, we summarize the data by competition and year, including the number of matches, shots, and goals.

# **Categorical Variables Visualization**

This section visualizes the proportion of goals by various shot characteristics such as shot technique, body part used, and whether the shot was first-time.

In [None]:
# Derive the calendar year of every shot's match from its match_date
shots_df['match_year'] = pd.to_datetime(shots_df['match_date']).dt.year

# Every summary below is grouped by the same competition/year pair
group_keys = ['competition', 'match_year']

# Matches: one row per distinct (match, competition, year), then count per group
unique_matches = shots_df[['match_id', 'competition', 'match_year']].drop_duplicates()
matches_per_competition_year = unique_matches.groupby(group_keys).size()

# Shots: every remaining row of shots_df is one shot
shots_per_competition_year = shots_df.groupby(group_keys).size()

# Goals: only the rows flagged goal == 1
goal_rows = shots_df[shots_df['goal'] == 1]
goals_per_competition_year = goal_rows.groupby(group_keys).size()

# Align the three counts on their shared index; a group missing from one of
# the series (e.g. no goals) becomes 0 rather than NaN
competition_summary_statistics = pd.DataFrame({
    'Matches': matches_per_competition_year,
    'Shots': shots_per_competition_year,
    'Goals': goals_per_competition_year
})
competition_summary_statistics = competition_summary_statistics.fillna(0).astype(int)

In [None]:
# Define the columns to include in the visualization
columns_to_plot = ['shot_body_part', 'shot_first_time', 'shot_technique', 'shot_type']

# Create a 2x2 subplot with title
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Proportion of Goals by Shot Characteristics', fontsize=16)

# Flatten the axes array for easier indexing
axes = axes.flatten()

# Loop through each column
for i, column in enumerate(columns_to_plot):
    # Number of shots and of goals for each value of the column
    grouped = shots_df.groupby(column).agg(
        goals=('goal', 'sum'),
        total=('goal', 'count')
    )
    grouped['proportion'] = grouped['goals'] / grouped['total']

    # Sort the grouped DataFrame by number of shots in descending order
    grouped = grouped.sort_values(by='total', ascending=False)

    # Plot the bar chart
    bars = axes[i].bar(grouped.index, grouped['proportion'], color='skyblue', edgecolor = 'black')

    # Annotate each bar with its sample size, just above the bar
    for bar, sample_size in zip(bars, grouped['total']):
        axes[i].text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                     f'n={int(sample_size)}', ha='center', va='bottom', fontsize=10)

    # Set the title of each subplot
    axes[i].set_title(f'Proportion of Goals by {column.replace("_", " ").title()}', fontsize=12)

    # Wrap the long technique names onto two lines to avoid overlapping text.
    # The labels are built from the plotted categories themselves and the tick
    # positions are pinned first: string categories sit at 0..n-1 in the order
    # they were passed to bar(), and set_xticklabels should only be used once
    # the ticks are fixed.
    if column == 'shot_technique':
        labels = [
            str(category).replace('Overhead Kick', 'Overhead\nKick').replace('Diving Header', 'Diving\nHeader')
            for category in grouped.index
        ]
        axes[i].set_xticks(list(range(len(labels))))
        axes[i].set_xticklabels(labels)

    # Special handling for 'shot_first_time' to ensure True/False labels rather than 0 and 1
    if column == 'shot_first_time':
        axes[i].set_xticks([0, 1])
        axes[i].set_xticklabels(['False', 'True'])

# Adjust layout to reduce white space between plots
plt.tight_layout(rect=[0, 0, 1, 0.96], pad=2.5, w_pad=5.0, h_pad=5.0)
plt.show()

# **Shot Coordinates and Analysis**

This section calculates the shot distance and angle, then visualizes the distribution of these metrics.

# **Shot Location Heatmaps**

This section creates a heatmap of shot locations for all shots and for shots from the UEFA Euro 2024 competition.

# **Individual Shot Freeze Frames**

This section displays specific freeze frames by shot ID.

In [None]:
# Get distance of shot from byline: 120 is the x-coordinate used for the
# goalposts in calculate_shot_angle_fc, so this is the distance to the goal
# line measured along the x-axis only (not the straight-line distance to goal).
shots_df['shot_distance_fc'] = 120 - shots_df['x_shot']

# Function that calculates shot angle
def calculate_shot_angle_fc(row):
    """Return the angle, in degrees, between the two goalposts as seen from the shot.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide the shot location under the keys 'x_shot' and 'y_shot'.

    Returns
    -------
    float
        Angle at the shot location between the lines to the posts at
        (120, 36) and (120, 44), in the range [0, 180]. A shot located exactly
        on a post has no defined angle; 0.0 is returned for it rather than NaN.
    """
    # Fixed points A and B representing the goalposts
    post_a = np.array([120, 36])
    post_b = np.array([120, 44])

    # Point C, the position of the shot
    shot_xy = np.array([row['x_shot'], row['y_shot']])

    # Vectors from the shot to each post
    to_a = post_a - shot_xy
    to_b = post_b - shot_xy

    # dot = |CA||CB|cos(angle) and |cross| = |CA||CB|sin(angle), so arctan2
    # recovers the angle without dividing by the vector lengths. Unlike
    # arccos(dot / (|CA||CB|)), this is defined when a length is zero and
    # needs no clipping against floating-point overshoot.
    dot_product = np.dot(to_a, to_b)
    cross_product = to_a[0] * to_b[1] - to_a[1] * to_b[0]
    angle_radians = np.arctan2(abs(cross_product), dot_product)

    # Convert the angle to degrees
    return np.degrees(angle_radians)

# Compute the goal angle for every shot and store it as 'shot_angle_fc'
shots_df['shot_angle_fc'] = shots_df.apply(calculate_shot_angle_fc, axis=1)

# One histogram per metric: (column, x-axis label, figure title)
histogram_specs = [
    ('shot_distance_fc', 'Shot Distance',
     'Histogram of Shots by Distance with Kernel Density Estimate'),
    ('shot_angle_fc', 'Shot Angle (degrees)',
     'Histogram of Shots by Angle with Kernel Density Estimate'),
]

for metric, x_label, chart_title in histogram_specs:
    plt.figure(figsize=(10, 6))

    metric_values = shots_df[metric]
    # 30 evenly spaced edges across the observed range, shared by both layers
    bins = np.linspace(metric_values.min(), metric_values.max(), 30)

    # Bottom layer: every shot, with a KDE curve. The goals layer is drawn
    # opaquely on top, so the red left visible is the non-goal part of each bin.
    sns.histplot(
        metric_values,
        bins=bins,
        color='red',
        kde=True,
        alpha=1,
        label='Non-Goals'
    )

    # Recolour the first line on the axes (the KDE curve) in blue
    plt.gca().lines[0].set_color('blue')

    # Top layer: goals only, same bins, no KDE
    sns.histplot(
        metric_values[shots_df['goal'] == 1],
        bins=bins,
        color='green',
        kde=False,
        alpha=1,
        label='Goals'
    )

    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.title(chart_title)
    plt.legend()
    plt.show()

# Compare the share of shots that became goals on either side of an angle
# threshold.
# NOTE(review): the threshold applied is 20 degrees, although the variable
# names below (kept unchanged so anything referring to them still works) say
# 60 - confirm which value is intended.
angle_threshold_deg = 20

is_goal = shots_df['goal'] == 1
# Two explicit comparisons (rather than a negation) so that shots with a
# missing angle fall into neither group
below_threshold = shots_df['shot_angle_fc'] < angle_threshold_deg
at_or_above_threshold = shots_df['shot_angle_fc'] >= angle_threshold_deg

less_than_60 = shots_df[is_goal & below_threshold]
greater_than_60 = shots_df[is_goal & at_or_above_threshold]
total_less_than_60 = int(below_threshold.sum())
total_greater_than_60 = int(at_or_above_threshold.sum())

# Report NaN instead of raising ZeroDivisionError when a side has no shots
less_than_60_prop = (
    len(less_than_60) / total_less_than_60 if total_less_than_60 else float('nan')
)
greater_than_60_prop = (
    len(greater_than_60) / total_greater_than_60 if total_greater_than_60 else float('nan')
)

less_than_60_prop, greater_than_60_prop

In [None]:
# Vertical StatsBomb half pitch used as the canvas for the shot heatmap
pitch = VerticalPitch(
    pitch_type='statsbomb',
    half=True,
    pitch_color='#aabb97',
    stripe=True,
    stripe_color='#c2d59d',
    line_color='white',
    line_zorder=2,
    axis=True,
    label=True,
)
fig, ax = pitch.draw(figsize=(20, 12))

# Count the shots falling in each cell of a 50 x 50 grid
bin_statistic = pitch.bin_statistic(
    shots_df.x_shot,
    shots_df.y_shot,
    statistic='count',
    bins=(50, 50),
)

# Smooth the raw counts with a Gaussian filter (sigma = 1)
smoothed_counts = scn.gaussian_filter(bin_statistic['statistic'], 1)
bin_statistic['statistic'] = smoothed_counts

# Draw the heatmap from the smoothed counts
pcm = pitch.heatmap(bin_statistic, ax=ax, cmap='hot', edgecolors='#22312b')

# Add the colorbar, with a black outline and black tick labels
cbar = fig.colorbar(pcm, ax=ax, shrink=0.6)
cbar.outline.set_edgecolor('black')
colorbar_tick_labels = plt.getp(cbar.ax.axes, 'yticklabels')
ticks = plt.setp(colorbar_tick_labels, color='black')

# Convert match date to datetime form.
# NOTE: this overwrites the column in place, so code that runs afterwards sees
# Timestamps in shots_df['match_date'] rather than the original values.
shots_df['match_date'] = pd.to_datetime(shots_df['match_date'])
# As the visualization is too crowded when all shots are used, use Euro 2024 competition only
eur_shots_df = shots_df[(shots_df['competition'] == 'Europe - UEFA Euro') & (shots_df['match_date'] >= '2024-01-01')]

# Plot each shot colored by its outcome
pitch = VerticalPitch(pitch_color='#aabb97', line_color='white', line_zorder = 2,
          stripe_color='#c2d59d', stripe=True, pitch_type='statsbomb', axis = True, label = True, half = True)
# draw
fig, ax = pitch.draw(figsize=(20, 12))

# Draw all shots in a single scatter call with one colour per point, instead
# of creating a separate scatter artist for every row. Rows keep their
# original order, and a truthy 'goal' value maps to dark green as before.
if not eur_shots_df.empty:
    point_colors = [
        'darkgreen' if scored else 'red'
        for scored in eur_shots_df['goal']
    ]
    pitch.scatter(eur_shots_df['x_shot'], eur_shots_df['y_shot'], s=40, color=point_colors,
                  edgecolors='black', ax=ax, alpha=0.5, zorder=3)

# Add a legend for goals and no goals
goal_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='darkgreen', markersize=10, label='Goal', linestyle='None')
no_goal_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='No Goal', linestyle='None')
ax.legend(handles=[goal_patch, no_goal_patch], loc='lower right', frameon=True)

# Show the plot
plt.show()

In [None]:
# ID of the shot whose freeze frame should be displayed
specific_id = '28cd796c-dc0c-45d3-a08a-5d405158e03b'

# Rows of shots_df whose 'id' equals the requested ID
specific_shot = shots_df.loc[shots_df['id'] == specific_id]

if specific_shot.empty:
    # Nothing matched: report it instead of plotting
    print(f"No shot found with ID {specific_id}")
else:
    # Use the first matching row and draw it with the goal angle shaded
    row = specific_shot.iloc[0]
    fig = draw_freeze_frame(row, plot_angle=True)
    plt.show()
    
# Repeat this block for other specific shot IDs as needed