# DeepShot: Feature Engineering

In [None]:
# ##HIDE##
import pandas as pd
import numpy as np
from pathlib import Path

# Locations used throughout the notebook: standardized inputs are read from
# `processed_dir`; engineered feature tables are written to `features_dir`.
processed_dir = Path('../data/processed')
features_dir = processed_dir / 'features'

# Create both folders up front; already-existing folders are left untouched.
processed_dir.mkdir(parents=True, exist_ok=True)
features_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the three standardized tables from the processed-data folder.
shots = pd.read_csv(processed_dir / 'standardized_shots.csv')
player = pd.read_csv(processed_dir / 'standardized_player.csv')
team = pd.read_csv(processed_dir / 'standardized_team.csv')

# Alternative (upper-case) spellings and the canonical name each maps to.
# Several spellings may share one target; the first spelling present wins.
column_mappings = {
    'PLAYER_NAME': 'player_name',
    'TEAM_NAME': 'team_name',
    'MINUTES_LEFT': 'mins_left',
    'SECONDS_LEFT': 'secs_left',
    'PERIOD': 'quarter',
    'QUARTER': 'quarter',
    'MARGIN': 'score_margin',
    'SHOT_MADE_FLAG': 'shot_made',
    'SHOT_MADE': 'shot_made'
}

# Work out which renames apply, tracking the column set as it would look
# after each rename so a target that is already taken is never reused.
known_columns = set(shots.columns)
pending_renames = {}
for source_name, target_name in column_mappings.items():
    if source_name in known_columns and target_name not in known_columns:
        pending_renames[source_name] = target_name
        known_columns.discard(source_name)
        known_columns.add(target_name)

# Apply every selected rename in a single pass.
shots.rename(columns=pending_renames, inplace=True)

In [4]:
# 1. Spatial Features
# Distance is the Euclidean norm of (loc_x, loc_y); the angle is measured
# from the positive y-axis, so it is 0 when loc_x == 0 and loc_y > 0.
loc_x = shots['loc_x']
loc_y = shots['loc_y']
shots['shot_distance'] = np.sqrt(loc_x.pow(2) + loc_y.pow(2))
shots['shot_angle'] = np.arctan2(loc_x, loc_y) * 180 / np.pi

# Court zones: consecutive half-open distance bands [lower, upper).
# NOTE(review): one 23.75 cutoff is used at every angle. If the coordinates
# are in feet, shorter corner threes would land in 'Long Mid-Range' --
# confirm the units and the intended zone definition.
distance = shots['shot_distance']
zones = ['Restricted Area', 'Paint', 'Mid-Range', 'Long Mid-Range', 'Three-Point']
zone_edges = [4, 8, 16, 23.75]

conditions = [distance < zone_edges[0]]
conditions += [
    (distance < upper) & (distance >= lower)
    for lower, upper in zip(zone_edges[:-1], zone_edges[1:])
]
conditions.append(distance >= zone_edges[-1])

# Rows matching no band (e.g. a missing distance) are labelled 'Unknown'.
shots['court_zone'] = np.select(conditions, zones, default='Unknown')

# Flag three-point attempts taken more than 45 degrees off the y-axis.
is_three = shots['court_zone'] == 'Three-Point'
wide_angle = shots['shot_angle'].abs() > 45
shots['corner_three'] = (is_three & wide_angle).astype(int)

In [5]:
# 2. Game Context Features
# Fall back to the upper-case spelling of a clock column when only that
# spelling is present.
clock_columns = ['mins_left', 'secs_left', 'quarter']
for col in clock_columns:
    upper_name = col.upper()
    if col not in shots.columns and upper_name in shots.columns:
        shots[col] = shots[upper_name]

# Time features are only derived when all three clock columns exist.
if set(clock_columns).issubset(shots.columns):
    shots['time_remaining_seconds'] = shots['mins_left'] * 60 + shots['secs_left']
    # NOTE(review): a missing quarter fails the `<= 4` test and is therefore
    # labelled 'Overtime' -- confirm that is intended.
    shots['period_type'] = np.where(shots['quarter'].le(4), 'Regulation', 'Overtime')
    late_period = shots['quarter'].eq(4) | shots['period_type'].eq('Overtime')
    under_two_minutes = shots['time_remaining_seconds'].lt(120)
    shots['end_of_period'] = (under_two_minutes & late_period).astype(int)

# Score situation is only derived when a score margin is available.
if 'score_margin' in shots.columns:
    margin = shots['score_margin']
    # Seven mutually exclusive margin bands, from largest deficit to largest lead.
    conditions = [
        margin.lt(-15),
        margin.ge(-15) & margin.lt(-5),
        margin.ge(-5) & margin.lt(0),
        margin.eq(0),
        margin.gt(0) & margin.le(5),
        margin.gt(5) & margin.le(15),
        margin.gt(15)
    ]
    values = ['Large Deficit', 'Moderate Deficit', 'Small Deficit', 'Tied',
              'Small Lead', 'Moderate Lead', 'Large Lead']
    # Rows matching no band (e.g. a missing margin) are labelled 'Unknown'.
    shots['score_situation'] = np.select(conditions, values, default='Unknown')

    if 'time_remaining_seconds' in shots.columns:
        # Clutch: margin within 5, under 300 seconds left, 4th quarter or overtime.
        close_game = margin.abs().le(5)
        under_five_minutes = shots['time_remaining_seconds'].lt(300)
        fourth_or_overtime = shots['quarter'].eq(4) | shots['period_type'].eq('Overtime')
        shots['clutch_situation'] = (close_game & under_five_minutes & fourth_or_overtime).astype(int)

In [6]:
# 3. Historical Shot Features
# Adds `prior_pct`: the shooter's field-goal percentage from the same court
# zone in the previous season (0.5 when no previous-season data exists).

# Fall back to the upper-case spelling of a column when only that is present.
for col in ['player_name', 'court_zone', 'season', 'shot_made']:
    if col not in shots.columns and col.upper() in shots.columns:
        shots[col] = shots[col.upper()]

# Attempts and makes per player / court zone / season.
player_zone_season = shots.groupby(['player_name', 'court_zone', 'season']).agg(
    shots=('shot_made', 'count'),
    makes=('shot_made', 'sum')
).reset_index()

# Shooting percentage; groups with zero counted attempts (0 / 0) get 0.5.
player_zone_season['shooting_pct'] = player_zone_season['makes'] / player_zone_season['shots']
player_zone_season['shooting_pct'] = player_zone_season['shooting_pct'].fillna(0.5)
# The season for which these numbers count as "previous season" stats.
# Assumes `season` is numeric -- TODO confirm against the standardized data.
player_zone_season['prior_season'] = player_zone_season['season'] + 1

# Build the lookup with its final column names *before* merging. The previous
# version relied on suffixes=('', '_prior') to produce 'shooting_pct_prior',
# but merge suffixes are only applied to overlapping column names. `shots`
# has no 'shooting_pct' column, so that name never appeared and every row
# silently received the 0.5 fallback.
prior_lookup = player_zone_season[
    ['player_name', 'court_zone', 'prior_season', 'shooting_pct']
].rename(columns={'prior_season': 'season', 'shooting_pct': 'prior_pct'})

# Drop any stale `prior_pct` so the cell can be re-run without a name clash.
shots_with_prior = shots.drop(columns=['prior_pct'], errors='ignore').merge(
    prior_lookup,
    on=['player_name', 'court_zone', 'season'],
    how='left'
)

# Shots with no matching previous-season row fall back to 0.5.
shots_with_prior['prior_pct'] = shots_with_prior['prior_pct'].fillna(0.5)

shots = shots_with_prior

In [7]:
# 4. Player Performance Features
# Short stat abbreviations and the descriptive names used below.
player_column_mapping = {
    'pts': 'points',
    'fga': 'field_goal_attempts',
    'fta': 'free_throw_attempts',
    'tov': 'turnovers',
    'mp': 'minutes'
}

# Rename only abbreviations that are present and whose descriptive name is
# not already taken.
applicable_renames = {
    old_name: new_name
    for old_name, new_name in player_column_mapping.items()
    if old_name in player.columns and new_name not in player.columns
}
player.rename(columns=applicable_renames, inplace=True)

# Any stat that is still missing is created as an all-zero column.
required_stats = ['points', 'field_goal_attempts', 'free_throw_attempts', 'turnovers', 'minutes']
for stat in required_stats:
    if stat not in player.columns:
        player[stat] = 0

# Both metrics share the weighted attempts term FGA + 0.44 * FTA.
weighted_attempts = player['field_goal_attempts'] + 0.44 * player['free_throw_attempts']

# points / (2 * weighted attempts); undefined ratios (inf / NaN) become 0.
true_shooting_raw = player['points'] / (2 * weighted_attempts)
player['true_shooting'] = true_shooting_raw.replace([np.inf, -np.inf], np.nan).fillna(0)

# (weighted attempts + turnovers) / minutes; undefined ratios become 0.
usage_raw = (weighted_attempts + player['turnovers']) / player['minutes']
player['usage_rate'] = usage_raw.replace([np.inf, -np.inf], np.nan).fillna(0)

# Experience needs both the player identifier and the season.
if not ('player' in player.columns and 'season' in player.columns):
    player['experience'] = 0
    player['career_stage'] = 'Unknown'
else:
    # Earliest season seen for each player in this table.
    player_first_season = (
        player.groupby('player')['season']
        .min()
        .reset_index()
        .rename(columns={'season': 'first_season'})
    )
    player = player.merge(player_first_season, on='player', how='left')
    player['experience'] = player['season'] - player['first_season']

    # Right-closed experience bins: (-1, 2], (2, 5], (5, 9], (9, 100].
    bins = [-1, 2, 5, 9, 100]
    labels = ['Rookie (0-2)', 'Early Career (3-5)', 'Prime (6-9)', 'Veteran (10+)']
    player['career_stage'] = pd.cut(player['experience'], bins=bins, labels=labels, right=True)

In [8]:
# 5. Team Features
# Short column names and the descriptive names used below.
team_column_mapping = {
    'win': 'wins',
    'loss': 'losses',
    'pts_per_game': 'points_per_game',
    'pts_against_per_game': 'points_allowed_per_game',
    'fg3a': 'three_point_attempts',
    'fga': 'field_goal_attempts'
}

# Rename only short names that are present and whose descriptive name is
# not already taken.
applicable_renames = {
    old_name: new_name
    for old_name, new_name in team_column_mapping.items()
    if old_name in team.columns and new_name not in team.columns
}
team.rename(columns=applicable_renames, inplace=True)

# Fallback value for each required column that is absent (pace -> 100,
# everything else -> 0).
team_defaults = {
    'wins': 0,
    'losses': 0,
    'points_per_game': 0,
    'points_allowed_per_game': 0,
    'pace': 100,
    'three_point_attempts': 0,
    'field_goal_attempts': 0,
}
for col, fallback in team_defaults.items():
    if col not in team.columns:
        team[col] = fallback

# Win percentage; undefined ratios (no games recorded) become 0.5.
games_played = team['wins'] + team['losses']
win_pct_raw = team['wins'] / games_played
team['win_pct'] = win_pct_raw.replace([np.inf, -np.inf], np.nan).fillna(0.5)

# Scale per-game scoring by 100 / pace.
pace_factor = 100 / team['pace']
team['offensive_rating'] = team['points_per_game'] * pace_factor
team['defensive_rating'] = team['points_allowed_per_game'] * pace_factor
team['net_rating'] = team['offensive_rating'] - team['defensive_rating']

# Share of field-goal attempts that are threes; undefined ratios become 0.25.
three_rate_raw = team['three_point_attempts'] / team['field_goal_attempts']
team['three_point_rate'] = three_rate_raw.replace([np.inf, -np.inf], np.nan).fillna(0.25)

# Playing style: each team is compared with the median of this table.
pace_median = team['pace'].median()
is_fast = team['pace'] > pace_median
team['pace_style'] = np.where(is_fast, 'Fast', 'Slow')

three_pt_median = team['three_point_rate'].median()
is_three_heavy = team['three_point_rate'] > three_pt_median
team['shooting_style'] = np.where(is_three_heavy, 'Three-Heavy', 'Inside')

# Combined label such as 'Fast-Inside'.
team['playing_style'] = team['pace_style'] + '-' + team['shooting_style']

In [9]:
# 6. Merge Features
# Attach player-season and team-season features to every shot. Each merge is
# skipped (the frame is passed through unchanged) when its key columns are
# missing.

# Keep only the feature columns that actually exist.
player_cols = [col for col in ['player', 'season', 'true_shooting', 'usage_rate', 'experience', 'career_stage']
               if col in player.columns]

team_cols = [col for col in ['team', 'season', 'win_pct', 'offensive_rating', 'defensive_rating', 'playing_style']
             if col in team.columns]

# Merge player features
if 'player_name' in shots.columns and 'player' in player.columns and 'season' in player.columns:
    # A left merge repeats a shot once per matching right-hand row, so the
    # lookup is reduced to one row per (player, season) first. This keeps the
    # number of shot rows unchanged.
    # NOTE(review): the first row per key is kept; if the player table has
    # several rows per season, confirm that the first is the one wanted.
    player_lookup = player[player_cols].drop_duplicates(subset=['player', 'season'])
    shots_with_player = shots.merge(
        player_lookup,
        left_on=['player_name', 'season'],
        right_on=['player', 'season'],
        how='left'
    )
    # The right-hand key duplicates `player_name`; drop it.
    if 'player' in shots_with_player.columns:
        shots_with_player.drop('player', axis=1, inplace=True)
else:
    shots_with_player = shots

# Merge team features
if 'team_name' in shots_with_player.columns and 'team' in team.columns and 'season' in team.columns:
    # Same safeguard as above: one row per (team, season) in the lookup.
    team_lookup = team[team_cols].drop_duplicates(subset=['team', 'season'])
    final_shots = shots_with_player.merge(
        team_lookup,
        left_on=['team_name', 'season'],
        right_on=['team', 'season'],
        how='left'
    )
    # The right-hand key duplicates `team_name`; drop it.
    if 'team' in final_shots.columns:
        final_shots.drop('team', axis=1, inplace=True)
else:
    final_shots = shots_with_player

In [10]:
# 7. Save Features
# Write each table to the features folder as CSV, without the index column.
outputs = {
    'shots_with_features.csv': final_shots,
    'player_features.csv': player,
    'team_features.csv': team,
}
for filename, frame in outputs.items():
    frame.to_csv(features_dir / filename, index=False)