# DeepShot: Game Context Model

In this notebook, we'll build a sequence model to predict shot success based on game context and momentum.

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

# Set visualization style
sns.set_theme(style='whitegrid')
plt.rcParams['figure.figsize'] = [10, 6]

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# dropout masks and batch shuffling are repeatable on Restart & Run All
SEED = 42
keras.utils.set_random_seed(SEED)

# Create directories
processed_dir = Path('../data/processed')
features_dir = processed_dir / 'features'
models_dir = Path('../models')
game_context_dir = models_dir / 'game_context'

# parents=True / exist_ok=True make this cell safe to re-run
for directory in [processed_dir, features_dir, models_dir, game_context_dir]:
    directory.mkdir(parents=True, exist_ok=True)

## Data Preparation

In [56]:
# Read the shot table that already carries the engineered features
shots = pd.read_csv(features_dir / 'shots_with_features.csv')
print(f"Loaded {len(shots)} shots")

# Report (but do not fail on) required columns the file does not provide
required_columns = ['game_id', 'team_id', 'quarter', 'shot_made']
missing_columns = [col for col in required_columns if col not in shots.columns]
print(
    f"Warning: Missing required columns: {missing_columns}"
    if missing_columns
    else "All required columns are present."
)

# Order shots by game, team and quarter, then by descending time remaining
sort_keys = ['game_id', 'team_id', 'quarter']
if not set(sort_keys).issubset(shots.columns):
    print(f"Warning: Missing columns for proper sorting. Using original order.")
    sorted_shots = shots.copy()
else:
    # 'time_remaining' wins when both clock columns exist; None when neither does
    time_col = next(
        (name for name in ('time_remaining', 'time_remaining_seconds') if name in shots.columns),
        None,
    )

    if time_col is None:
        sorted_shots = shots.sort_values(sort_keys, ascending=[True, True, True])
        print(f"Sorted shots by game, team, and quarter (no time column available)")
    else:
        sorted_shots = shots.sort_values(sort_keys + [time_col],
                                         ascending=[True, True, True, False])
        print(f"Sorted shots by game, team, quarter, and time using {time_col}")

Loaded 4650091 shots


In [57]:
# Create game context features on a copy so the sorted frame stays untouched
context_shots = sorted_shots.copy()

max_time = 4 * 12 * 60  # 4 quarters, 12 minutes, 60 seconds
phase_bins = [0, 0.25, 0.5, 0.75, 0.95, 1.0]
phase_labels = ['1st_quarter', '2nd_quarter', '3rd_quarter', '4th_quarter_early', '4th_quarter_clutch']

# Create normalized time feature (0 = start of regulation, 1 = end of regulation)
has_quarter = 'quarter' in context_shots.columns
if has_quarter and 'time_remaining_seconds' in context_shots.columns:
    # NOTE(review): this divides time_remaining_seconds by the full regulation length,
    # i.e. it assumes the column counts down over the whole game. If it is actually the
    # clock within the current quarter, every shot lands in [0.75, 1.0] - confirm the schema.
    context_shots['normalized_time'] = 1 - (context_shots['time_remaining_seconds'] / max_time)
    print("Created normalized time feature using time_remaining_seconds")
    has_time_feature = True
elif has_quarter and 'time_remaining' in context_shots.columns:
    # Fallback: rebuild seconds left in regulation from the quarter number plus the quarter clock
    seconds_left = (4 - context_shots['quarter']) * 12 * 60 + context_shots['time_remaining']
    context_shots['normalized_time'] = 1 - (seconds_left / max_time)
    print("Created normalized time feature using quarter and time_remaining")
    has_time_feature = True
else:
    print("Warning: Cannot create normalized time feature due to missing columns")
    context_shots['normalized_time'] = 0.5
    context_shots['game_phase'] = 'unknown'
    has_time_feature = False

# Create game phase feature (shared by both time branches instead of being duplicated)
if has_time_feature:
    # include_lowest=True keeps a shot at normalized_time == 0 in the first bin;
    # with the default right-closed bins it would be labelled NaN.
    # Values outside [0, 1] (e.g. quarter > 4 in the fallback formula) still map to NaN.
    context_shots['game_phase'] = pd.cut(
        context_shots['normalized_time'],
        bins=phase_bins,
        labels=phase_labels,
        include_lowest=True,
    )
    print("Created game phase feature")

# Create score margin feature if needed
if 'score_margin' in context_shots.columns:
    # Use existing score_margin column
    print("Using existing score margin feature")
elif all(col in context_shots.columns for col in ['score_home', 'score_away', 'is_home_team']):
    # Margin from the shooting team's point of view: home - away for the home team,
    # away - home otherwise
    context_shots['score_margin'] = np.where(
        context_shots['is_home_team'],
        context_shots['score_home'] - context_shots['score_away'],
        context_shots['score_away'] - context_shots['score_home']
    )
    print("Created score margin feature")
else:
    print("Warning: Cannot create score margin feature due to missing columns")
    context_shots['score_margin'] = 0

Created normalized time feature using time_remaining_seconds
Created game phase feature


In [58]:
# Rolling "momentum" features for one chronologically ordered group of shots
def calculate_momentum_features(group, window_size=5):
    """Return a copy of ``group`` with three rolling momentum columns appended.

    Args:
        group: DataFrame of shots; rows are taken in their existing order.
            ``shot_made`` and ``score_margin`` are used when present.
        window_size: Number of trailing rows (current row included) in each
            rolling window; shorter windows at the start are allowed.

    Returns:
        A new DataFrame with these columns added, in this order:
        ``recent_success_rate`` (rolling mean of ``shot_made``, or 0.5 when the
        column is absent), ``score_margin_change`` (row-to-row difference of
        ``score_margin``, first row 0) and ``recent_margin_change`` (rolling sum
        of those differences). Both margin columns are 0 when ``score_margin``
        is absent. Remaining NaNs in the three columns are replaced with 0.
    """
    result = group.copy()

    def _trailing(series):
        # min_periods=1 lets the first rows use however many shots exist so far
        return series.rolling(window=window_size, min_periods=1)

    # Shooting form over the trailing window
    if 'shot_made' not in result.columns:
        result['recent_success_rate'] = 0.5
    else:
        result['recent_success_rate'] = _trailing(result['shot_made']).mean()

    # Movement of the score margin over the trailing window
    if 'score_margin' not in result.columns:
        result['score_margin_change'] = 0
        result['recent_margin_change'] = 0
    else:
        margin_delta = result['score_margin'].diff().fillna(0)
        result['score_margin_change'] = margin_delta
        result['recent_margin_change'] = _trailing(margin_delta).sum()

    # All three columns exist at this point; blank out whatever is still NaN
    for name in ('recent_success_rate', 'score_margin_change', 'recent_margin_change'):
        result[name] = result[name].fillna(0)

    return result

# Apply momentum calculations to each game and team
if 'game_id' in context_shots.columns and 'team_id' in context_shots.columns:
    print("Calculating momentum features...")
    # Iterate the groups explicitly rather than using groupby(...).apply(...):
    # pandas 2.2 deprecates passing the grouping columns to the applied function
    # and pandas 3.0 excludes them, which would drop game_id / team_id from the
    # result and break the per-game sequence building and train/test split below.
    group_frames = [
        calculate_momentum_features(group)
        for _, group in context_shots.groupby(['game_id', 'team_id'])
    ]
    if group_frames:
        # Groups come back in sorted key order; ignore_index gives a fresh 0..n-1 index
        momentum_shots = pd.concat(group_frames, ignore_index=True)
    else:
        # No groups (e.g. empty input): still emit the momentum columns
        momentum_shots = calculate_momentum_features(context_shots).reset_index(drop=True)
    print("Generated momentum features")
else:
    print("Warning: Cannot calculate momentum features due to missing game_id or team_id")
    # Neutral placeholders so downstream feature selection still finds the columns
    momentum_shots = context_shots.copy()
    momentum_shots['recent_success_rate'] = 0.5
    momentum_shots['score_margin_change'] = 0
    momentum_shots['recent_margin_change'] = 0



## Creating Sequence Data

In [59]:
# Define sequence length
sequence_length = 5  # Number of previous shots to consider

# Function to create sequences
def create_sequences(group, seq_length=sequence_length):
    """Slice one group of shots into fixed-length windows with next-shot labels.

    For every row position ``i >= seq_length`` the window is the ``seq_length``
    rows immediately before ``i`` and the label is ``shot_made`` of row ``i``.

    Args:
        group: DataFrame of shots, used in its existing row order; must contain
            a ``shot_made`` column when it has more than ``seq_length`` rows.
        seq_length: Number of preceding rows in each window (must be >= 1).

    Returns:
        ``(sequences, targets)`` where ``sequences`` is a list of DataFrame
        slices and ``targets`` the matching list of ``shot_made`` values, or
        ``(None, None)`` when the group has ``seq_length`` rows or fewer.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        # A zero or negative length would yield empty or wrapped-around windows
        raise ValueError(f"seq_length must be at least 1, got {seq_length}")

    n_shots = len(group)

    # Skip groups with fewer shots than sequence length
    if n_shots <= seq_length:
        return None, None

    # Read the labels once from the column. Indexing each row with
    # group.iloc[i]['shot_made'] builds a full row Series per shot and upcasts
    # the label to the row's common dtype.
    outcomes = group['shot_made'].tolist()

    sequences = [group.iloc[end - seq_length:end] for end in range(seq_length, n_shots)]
    targets = outcomes[seq_length:]

    return sequences, targets

# Build windows separately for every (game, team) pair so no window mixes groups
all_sequences = []
all_targets = []

if not {'game_id', 'team_id'}.issubset(momentum_shots.columns):
    print("Warning: Cannot create sequences due to missing game_id or team_id")
else:
    print("Creating sequences...")

    for _, team_game_shots in momentum_shots.groupby(['game_id', 'team_id']):
        windows, outcomes = create_sequences(team_game_shots)
        if windows is None:
            # Group was too short to yield even one window
            continue
        all_sequences.extend(windows)
        all_targets.extend(outcomes)

    print(f"Created {len(all_sequences)} sequences with length {sequence_length}")



In [60]:
# Select relevant features for the sequence model
sequence_features = [
    'normalized_time',
    'score_margin',
    'recent_success_rate',
    'recent_margin_change',
    'shot_distance'  # Including spatial feature for better integration
]

# Check which features are available
available_features = [col for col in sequence_features if col in momentum_shots.columns]
print(f"Available features: {available_features}")

# Extract features from sequences
if all_sequences and available_features:
    # Stack every window's feature rows: (n_sequences, rows_per_window, n_features)
    X_sequences = np.array([seq[available_features].values for seq in all_sequences])
    y_targets = np.array(all_targets)

    print(f"X_sequences shape: {X_sequences.shape}")
    print(f"y_targets shape: {y_targets.shape}")

    # Check for NaN values
    nan_count = np.isnan(X_sequences).sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} NaN values in sequences. Replacing with 0.")
        X_sequences = np.nan_to_num(X_sequences)

    # Split into training and testing sets - ensuring no data leakage between games
    if 'game_id' in momentum_shots.columns:
        unique_games = momentum_shots['game_id'].unique()
        train_games, test_games = train_test_split(unique_games, test_size=0.2, random_state=42)

        # Hash lookup per sequence; `game_id in train_games` on the NumPy array
        # would rescan every training game id for each of the sequences
        train_game_ids = set(train_games.tolist())

        train_indices = []
        test_indices = []

        for i, seq in enumerate(all_sequences):
            game_id = seq['game_id'].iloc[0]  # every row of a window shares one game_id
            if game_id in train_game_ids:
                train_indices.append(i)
            else:
                test_indices.append(i)

        # Split data using the indices
        X_train = X_sequences[train_indices]
        y_train = y_targets[train_indices]
        X_test = X_sequences[test_indices]
        y_test = y_targets[test_indices]
    else:
        # Fall back to random split if game_id is not available
        X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_targets, test_size=0.2, random_state=42)

    print(f"Training set: {X_train.shape[0]} sequences")
    print(f"Testing set: {X_test.shape[0]} sequences")

    # Normalize features: StandardScaler works on 2D input, so flatten the
    # (sequence, row) axes, scale per feature, then restore the 3D shape
    X_train_reshaped = X_train.reshape(-1, X_train.shape[2])
    X_test_reshaped = X_test.reshape(-1, X_test.shape[2])

    # Fit on training rows only; the test rows reuse the training statistics
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    # Reshape back to 3D
    X_train = X_train_scaled.reshape(X_train.shape)
    X_test = X_test_scaled.reshape(X_test.shape)

    print("Features normalized using StandardScaler")
else:
    print("No sequences available for feature extraction")
    # Define empty splits explicitly so later cells cannot silently train or
    # evaluate on X_train / X_test left over from an earlier run in this kernel;
    # their `len(...) > 0` guards then evaluate to False
    X_train = np.empty((0, sequence_length, len(available_features)))
    X_test = np.empty((0, sequence_length, len(available_features)))
    y_train = np.empty((0,))
    y_test = np.empty((0,))

Available features: ['normalized_time', 'score_margin', 'recent_success_rate', 'recent_margin_change', 'shot_distance']
No sequences available for feature extraction


## Building the Sequence Model

In [61]:
# Define an improved LSTM model
def create_lstm_model(input_shape):
    model = keras.Sequential([
        # Input layer
        layers.Input(shape=input_shape),
        
        # Bidirectional LSTM layer
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        
        # Second LSTM layer
        layers.Bidirectional(layers.LSTM(32)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        
        # Dense layers
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        
        # Output layer
        layers.Dense(1, activation='sigmoid')
    ])
    return model

# Build and compile the network only when the split produced training sequences
has_training_data = 'X_train' in locals() and len(X_train) > 0

if not has_training_data:
    print("No training data available")
else:
    # One sample is (rows per window, features per row)
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = create_lstm_model(input_shape)

    adam = keras.optimizers.Adam(learning_rate=0.001)
    # Later cells read the evaluation results by position, so keep this order
    tracked_metrics = ['accuracy', keras.metrics.AUC(), keras.metrics.Precision(), keras.metrics.Recall()]

    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)

    model.summary()

No training data available


In [62]:
# Training callbacks; only created when the previous cell built a model
if 'model' in locals():
    best_model_path = str(game_context_dir / 'game_context_model_best.keras')

    # Stop after 10 epochs without a better val_loss and roll back to the best weights
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Multiply the learning rate by 0.2 after 5 stagnant epochs, never below 1e-4
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

    # Write the model to disk whenever val_loss improves
    model_checkpoint = callbacks.ModelCheckpoint(
        filepath=best_model_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    )

In [63]:
# Fit the model and chart its learning curves
can_train = 'model' in locals() and 'X_train' in locals() and len(X_train) > 0

if not can_train:
    print("No model or training data available")
else:
    # validation_split=0.2 holds out part of the training arrays for validation
    history = model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stopping, reduce_lr, model_checkpoint],
        verbose=1
    )

    # Loss on the left panel, accuracy on the right
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_title('Model Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
    accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    accuracy_ax.set_title('Model Accuracy')
    accuracy_ax.set_xlabel('Epoch')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.legend()

    fig.tight_layout()
    plt.show()

No model or training data available


## Evaluating the Model

In [64]:
# Score the trained model on the held-out games, plot diagnostics, then save it
can_evaluate = 'model' in locals() and 'X_test' in locals() and len(X_test) > 0

if not can_evaluate:
    print("No model or test data available")
else:
    # evaluate() returns the loss followed by the metrics in compile() order
    test_results = model.evaluate(X_test, y_test, verbose=1)
    test_loss, test_accuracy, test_auc, test_precision, test_recall = test_results[:5]
    print(f"\nTest Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")

    # Probabilities, then hard labels at the 0.5 threshold
    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob.ravel() > 0.5).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_fig.tight_layout()
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic')
    roc_ax.legend(loc="lower right")
    roc_fig.tight_layout()
    plt.show()

    # Persist the final model next to the best checkpoint
    final_model_path = game_context_dir / 'game_context_model.keras'
    model.save(final_model_path)
    print(f"Model saved to {final_model_path}")

No model or test data available


## Key Insights

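The game context modeling in this notebook is designed to test the hypotheses below. Note that in the saved run above no sequences were produced, so the model was neither trained nor evaluated; treat these points as expectations still to be confirmed rather than measured results: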

1. **Temporal patterns significantly impact shot success**:
   - Shot success rates decrease in clutch situations (last 5% of game time)
   - Fourth quarter shots have different patterns than earlier quarters

2. **Score context matters**:
   - Teams shoot better when leading than when trailing
   - Large score differentials affect shot selection and success

3. **Momentum is predictive**:
   - Recent team success rate correlates with current shot success
   - Rapid score changes (runs) affect shooting performance

4. **Sequence modeling improves prediction**:
   - Looking at sequences of shots provides more context than individual shots
   - LSTM networks can capture temporal dependencies in shot sequences

5. **Feature importance**:
   - Score margin is the most important game context feature
   - Recent team success rate is the most important momentum feature
   - Game phase (especially clutch time) is highly predictive

These insights can help teams better understand how game context affects shooting performance and make more informed decisions in different game situations. In the next notebook, we'll build an integrated model that combines spatial, player, and game context features.