# Feature Engineering for xG-NextGen Project

This notebook engineers advanced features from cleaned soccer shot data to improve xG modeling.

## Import Libraries

Load required Python packages and modules.

In [None]:
# Install the third-party packages for this notebook via a pip shell escape.
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# releases than the ones these outputs were produced with — consider pinning.
!pip install pandas numpy xgboost scikit-learn shap matplotlib



In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
import os

## Load Cleaned Data

Import processed shots and freeze frame data.

In [None]:
# Colab-only step: attach Google Drive so the processed CSVs become readable.
from google.colab import drive
drive.mount('/content/drive')

# Folder holding the cleaned inputs, and the two files read from it.
processed_dir = '/content/drive/MyDrive/xG-NextGen/data/processed/'
shots_path, freeze_path = (
    os.path.join(processed_dir, file_name)
    for file_name in ('shots.csv', 'freeze_frames.csv')
)

# Read both tables into memory: one row per shot, and one row per
# freeze-frame player record.
print("Loading cleaned data...")
shots_df, freeze_df = pd.read_csv(shots_path), pd.read_csv(freeze_path)

# Report (rows, columns) for a quick sanity check of the load.
print(f"Shots data: {shots_df.shape}")
print(f"Freeze frames: {freeze_df.shape}")

Mounted at /content/drive
Loading cleaned data...
Shots data: (87111, 14)
Freeze frames: (1110839, 6)


## Time-Based Feature

Extract minute of match from timestamp.

In [None]:
print("Engineering time-based features...")

# Parse the raw timestamp strings once and keep the datetime accessor handy.
# NOTE(review): if the timestamp is a per-period clock (restarts at 00:00 after
# half-time) 'minute' will restart too — confirm against the cleaning step.
match_clock = pd.to_datetime(shots_df['timestamp']).dt

# Elapsed minutes as a float: hours and whole minutes plus the seconds fraction.
# Sub-second precision is not included because only .second is read.
shots_df['minute'] = (
    match_clock.hour * 60
    + match_clock.minute
    + match_clock.second / 60
)

# The raw timestamp is not needed once 'minute' exists.
shots_df = shots_df.drop(columns='timestamp')

Engineering time-based features...


## Geometric Features

Calculate shot distance and angle relative to goal.

In [None]:
print("Calculating geometric features...")

# Centre of the goal mouth in shot coordinates (x runs 0-120, y runs 0-80).
GOAL_CENTER = (120, 40)
# Goal-mouth width expressed in the SAME units as x/y. On a 120 x 80 pitch grid
# the posts sit at y = 36 and y = 44, i.e. the regulation 7.32 m (8 yd) goal is
# 8 coordinate units wide. Using 7.32 here mixed metres into a non-metric grid
# and slightly understated every shot angle.
GOAL_WIDTH  = 8.0


def compute_distance(x, y):
    """Euclidean distance from (x, y) to the centre of the goal.

    Works element-wise, so ``x`` and ``y`` may be scalars, NumPy arrays or
    pandas Series of equal length; the result has the matching shape.
    """
    return np.sqrt((x - GOAL_CENTER[0])**2 + (y - GOAL_CENTER[1])**2)


# Pass whole columns instead of applying row by row: same values, one
# vectorised pass instead of a Python-level loop over every shot.
shots_df['distance'] = compute_distance(shots_df['x'], shots_df['y'])

def calculate_angle(row):
    """Angle in degrees subtended by the goal mouth at the shot location.

    Parameters
    ----------
    row : mapping indexable by ``'x'`` and ``'y'``
        Either a single shot (e.g. one DataFrame row) or a whole DataFrame;
        the computation is element-wise, so both work.

    Returns
    -------
    float or numpy.ndarray
        Opening angle in [0, 180] degrees. A shot taken exactly from a post
        position (zero distance to that post) yields 90 degrees, as before.
        Missing coordinates yield NaN so the caller can decide how to fill
        them (previously they silently became 90 degrees).
    """
    x, y = row['x'], row['y']
    goal_x, goal_y = GOAL_CENTER
    half_width = GOAL_WIDTH / 2

    # Side lengths of the triangle shooter / right post / left post.
    a = np.sqrt((x - goal_x)**2 + (y - (goal_y + half_width))**2)
    b = np.sqrt((x - goal_x)**2 + (y - (goal_y - half_width))**2)
    c = GOAL_WIDTH

    # Law of cosines; fall back to cos = 0 where a side has zero length so we
    # never divide by zero.
    denom = 2 * a * b
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_val = np.where((a > 0) & (b > 0), (a*a + b*b - c*c) / denom, 0.0)

    # Keep missing inputs visible as NaN rather than letting the fallback
    # above turn them into a 90-degree angle.
    cos_val = np.where(np.isnan(denom), np.nan, cos_val)

    # Guard against rounding pushing the cosine just outside [-1, 1].
    cos_val = np.clip(cos_val, -1.0, 1.0)

    return np.degrees(np.arccos(cos_val))


# One vectorised call over the whole frame instead of a row-wise apply.
shots_df['angle'] = calculate_angle(shots_df)
# Shots without usable coordinates get an angle of 0. No clamp for negative
# values is needed: arccos only returns angles between 0 and 180 degrees.
shots_df['angle'] = shots_df['angle'].fillna(0)

Calculating geometric features...


## Defensive Pressure Features

Compute defensive pressure metrics from freeze frame data.

In [None]:
print("Calculating defensive pressure features (vectorized)…")

# Radius around the shooter in which an opponent counts as "close", in the
# same units as the x/y coordinates.
# NOTE(review): feature and variable names say "5m" — confirm the coordinate
# units really are metres.
PRESSURE_RADIUS = 5
# Goalkeeper distance used when a shot has no opposing goalkeeper record.
GK_DISTANCE_DEFAULT = 50.0

# 1. Shot locations come from the in-memory frame, which still carries x/y at
#    this point; re-reading shots.csv from disk is unnecessary.
loc_df = shots_df[['shot_id', 'x', 'y']]

# 2. Attach the shot location to every freeze-frame record. Both frames have
#    x/y columns, so suffixes tell player and shot coordinates apart.
merged_df = freeze_df.merge(
    loc_df,
    on='shot_id',
    suffixes=('_player', '_shot')
)

# 3. Distance from each recorded player to the shot location
merged_df['dist'] = np.hypot(
    merged_df['x_player'] - merged_df['x_shot'],
    merged_df['y_player'] - merged_df['y_shot']
)

# 4. Opponents split into outfield defenders and goalkeepers
is_defender = (~merged_df['teammate']) & (~merged_df['goalkeeper'])
is_keeper   = (~merged_df['teammate']) & ( merged_df['goalkeeper'])

# 5. Number of defenders inside the pressure radius, per shot
defs_within_5m = merged_df[is_defender & (merged_df['dist'] <= PRESSURE_RADIUS)]
defender_counts = defs_within_5m.groupby('shot_id').size()

# 6. Distance to the nearest opposing goalkeeper, per shot
gks = merged_df[is_keeper]
gk_min_dist = gks.groupby('shot_id')['dist'].min()

# 7. Map results back; shots with no matching records get 0 defenders and the
#    default goalkeeper distance.
shots_df['defenders_in_5m'] = shots_df['shot_id'].map(defender_counts).fillna(0).astype(int)
shots_df['gk_distance'] = shots_df['shot_id'].map(gk_min_dist).fillna(GK_DISTANCE_DEFAULT)

print("Done — vectorized defensive pressure features created.")

Calculating defensive pressure features (vectorized)…
Done — vectorized defensive pressure features created.


## Angular Defensive Pressure Features

In [None]:
print("Calculating angular defensive pressure (vectorized)…")

# Tunables: how close (in x/y units) and how far off the shot line (degrees)
# a defender may be and still count as blocking pressure.
max_defender_dist = 5
max_cone_deg = 20
# Reuse the goal position defined with the geometric features instead of
# repeating the literal coordinates.
goal_x, goal_y = GOAL_CENTER

# 1) Rebuild merged_df: join every freeze-frame point to its shot x/y
merged_df = freeze_df.merge(
    shots_df[['shot_id', 'x', 'y']],
    on='shot_id',
    suffixes=('_player', '_shot')
)

# Offset of each recorded player from the shooter, and its length
merged_df['dx']   = merged_df['x_player'] - merged_df['x_shot']
merged_df['dy']   = merged_df['y_player'] - merged_df['y_shot']
merged_df['dist'] = np.hypot(merged_df['dx'], merged_df['dy'])

# Opposing outfield players within range. dist > 0 drops records sitting
# exactly on the shot location, whose direction is undefined.
mask = (
    (~merged_df['teammate']) &
    (~merged_df['goalkeeper']) &
    (merged_df['dist'] > 0) &
    (merged_df['dist'] <= max_defender_dist)
)
df_def = merged_df[mask].copy()

# Direction shooter -> goal centre and shooter -> defender (radians)
df_def['goal_ang'] = np.arctan2(goal_y - df_def['y_shot'], goal_x - df_def['x_shot'])
df_def['def_ang']  = np.arctan2(df_def['dy'], df_def['dx'])

# Smallest absolute difference between the two directions, in degrees.
# The raw difference can reach almost 360, so fold it back into [0, 180].
df_def['ang_diff'] = np.abs(np.degrees(df_def['def_ang'] - df_def['goal_ang']))
df_def['ang_diff'] = np.where(df_def['ang_diff'] > 180,
                              360 - df_def['ang_diff'],
                              df_def['ang_diff'])

# Keep defenders inside the cone around the shot line. The explicit copy
# makes the column assignment below write to an independent frame rather than
# to a filtered slice (which triggers SettingWithCopyWarning).
df_def = df_def[df_def['ang_diff'] <= max_cone_deg].copy()

# Closer defenders contribute more: weight decays as exp(-distance)
df_def['pressure_contrib'] = np.exp(-df_def['dist'])

# 2) Aggregate per shot
pressure_series = df_def.groupby('shot_id')['pressure_contrib'].sum()

# 3) Map back onto shots_df; shots with no qualifying defender get 0 pressure
shots_df['angular_pressure'] = shots_df['shot_id'].map(pressure_series).fillna(0.0)

print("Done—angular_pressure added.")

Calculating angular defensive pressure (vectorized)…
Done—angular_pressure added.


## Assist Type Encoding

In [None]:
print("One-hot encoding assist types…")

# Shots without a recorded assist get the explicit category 'None'.
assist_filled = shots_df['assist_type'].fillna('None')

# Indicator columns named assist_<category>; the first category is dropped so
# the remaining indicators are not perfectly collinear.
assist_dums = pd.get_dummies(assist_filled, prefix='assist', drop_first=True)

# Replace the raw column with its indicators, appended on the right.
shots_df = pd.concat(
    [shots_df.drop(columns='assist_type'), assist_dums],
    axis=1,
)

One-hot encoding assist types…


## Pre-Shot Sequence

In [None]:
# No transformation happens in this cell: it only logs that n_prev_passes is
# kept as-is.
print("Pre-shot sequence (n_prev_passes) already numeric—no transform needed.")

Pre-shot sequence (n_prev_passes) already numeric—no transform needed.


## Categorical Feature Encoding

Encode categorical variables using one-hot encoding.

In [None]:
print("Encoding categorical features...")

# Columns to expand into indicator variables
cat_cols = ['body_part', 'shot_type', 'position']

# Missing categories become the explicit label 'Unknown' before encoding.
filled_cats = shots_df[cat_cols].fillna('Unknown')

# One indicator block per column, prefixed with the column name; the first
# category of each column is dropped.
encoded_blocks = [
    pd.get_dummies(filled_cats[name], prefix=name, drop_first=True)
    for name in cat_cols
]

# Swap the raw categorical columns for their indicator blocks in one concat,
# keeping the blocks in cat_cols order on the right-hand side.
shots_df = pd.concat(
    [shots_df.drop(columns=cat_cols), *encoded_blocks],
    axis=1,
)

Encoding categorical features...


## Contextual Features

Incorporate match context features.

In [None]:
print("Adding contextual features...")

shots_df = (
    shots_df
    .assign(
        # Cast the home/away flag to an integer
        is_home=shots_df['is_home'].astype(int),
        # Magnitude of the score margin, regardless of who is ahead
        abs_goal_diff=shots_df['goal_difference'].abs(),
    )
    # Raw coordinates are removed here; the geometric and pressure features
    # built from them must already exist.
    .drop(columns=['x', 'y'])
)

Adding contextual features...


## Final Feature Set

Prepare and save final feature set.

In [None]:
print("Saving final feature set...")

# Identifier and outcome columns that must not be used as model inputs. The
# target 'goal' is excluded here and re-attached as the last column below.
non_feature_cols = ['shot_id', 'match_id', 'shot_outcome', 'goal', 'freeze_frame']
feature_cols = [col for col in shots_df.columns if col not in non_feature_cols]

# Build the output as a new frame (features first, target last). Using assign
# avoids writing into a slice of shots_df, which raised
# SettingWithCopyWarning when the target was added afterwards.
features_df = shots_df[feature_cols].assign(goal=shots_df['goal'])

# Save to CSV next to the other processed files
features_path = os.path.join(processed_dir, 'features.csv')
features_df.to_csv(features_path, index=False)

print(f"Saved engineered features to {features_path}")
print(f"Final feature set shape: {features_df.shape}")

Saving final feature set...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['goal'] = shots_df['goal']


Saved engineered features to /content/drive/MyDrive/xG-NextGen/data/processed/features.csv
Final feature set shape: (87111, 18)


## Feature Summary

Display summary of engineered features.

In [None]:
print("\nEngineered features summary:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
features_df.info()

print("\nNumerical features summary:")
print(features_df.describe())

# Share of each target value, to show the class balance
print("\nTarget distribution:")
print(features_df['goal'].value_counts(normalize=True))


Engineered features summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87111 entries, 0 to 87110
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   goal_difference       87111 non-null  int64  
 1   is_home               87111 non-null  int64  
 2   n_prev_passes         87111 non-null  int64  
 3   minute                87111 non-null  float64
 4   distance              87111 non-null  float64
 5   angle                 87111 non-null  float64
 6   angular_pressure      87111 non-null  float64
 7   body_part_Left Foot   87111 non-null  bool   
 8   body_part_Other       87111 non-null  bool   
 9   body_part_Right Foot  87111 non-null  bool   
 10  shot_type_Free Kick   87111 non-null  bool   
 11  shot_type_Kick Off    87111 non-null  bool   
 12  shot_type_Open Play   87111 non-null  bool   
 13  shot_type_Penalty     87111 non-null  bool   
 14  abs_goal_diff         87111 non-null  in