# Environment Setup and Data Loading
Set up the environment with necessary libraries and load tracking data, focusing on defensive plays and timeToThrow metric. Include functions for memory management and data loading.

In [9]:
# Cell 1: Environment Setup

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import psutil
import gc
import warnings
warnings.filterwarnings('ignore')

def create_project_dirs():
    """Create project directories if they don't exist"""
    base_dirs = ['data', 'plots', 'models', 'results']
    for d in base_dirs:
        os.makedirs(d, exist_ok=True)

def get_memory_usage():
    """Return current memory usage in MB"""
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

def clear_memory():
    """Clear unused memory"""
    gc.collect()
    
def set_plotting_style():
    """Set consistent plotting style"""
    plt.style.use('ggplot')
    plt.rcParams['figure.figsize'] = [12, 8]
    plt.rcParams['figure.dpi'] = 100
    plt.rcParams['font.size'] = 12

def load_tracking_data(week):
    """Load tracking data for a given week"""
    file_path = os.path.join(DATA_DIR, f'tracking_week_{week}.csv')
    return pd.read_csv(file_path)

def load_plays():
    """Load plays data with focus on timeToThrow"""
    plays = pd.read_csv(os.path.join(DATA_DIR, 'plays.csv'))
    return plays[plays['timeToThrow'].notna()]

def load_player_play_data():
    """Load player play data focusing on pass rush metrics"""
    cols = ['gameId', 'playId', 'nflId', 'teamAbbr', 
            'wasInitialPassRusher', 'causedPressure', 
            'timeToPressureAsPassRusher']
    return pd.read_csv(os.path.join(DATA_DIR, 'player_play.csv'))[cols]

# Initialize environment
print(f"Initial memory usage: {get_memory_usage():.2f} MB")
create_project_dirs()
set_plotting_style()

# Configure pandas
pd.set_option('display.max_columns', None)

# Set constants
DATA_DIR = 'data'
TEAM = 'SF'
WEEKS = [1, 2, 3, 4]

print("\nProject directories created:")
print("\n".join(os.listdir()))
print(f"\nFinal memory usage: {get_memory_usage():.2f} MB")

Initial memory usage: 10376.82 MB

Project directories created:
.git
animations
appendix.txt
app_1.ipynb
app_2.ipynb
data
data.txt
emm.txt
final_visualizations
kaggle_notebook
models
paths.txt
plots
results

Final memory usage: 10376.87 MB


# Data Preprocessing and Filtering
Filter plays for defensive snaps, merge with plays data, and calculate initial position metrics. Create base dataframe with timeToThrow information.

In [10]:
# Cell 2: Data Loading and Preprocessing

def load_and_process_data(weeks, team='SF'):
    """Load and preprocess data for timeToThrow analysis"""
    # Load base datasets
    games_df = load_games()
    plays_df = load_plays()
    players_df = load_players()
    
    # Load and concatenate tracking data
    tracking_dfs = []
    for week in tqdm(weeks, desc='Loading tracking data'):
        df = pd.read_csv(f'data/tracking_week_{week}.csv')
        tracking_dfs.append(df)
    tracking_df = pd.concat(tracking_dfs, ignore_index=True)
    
    print(f"Memory usage after loading: {get_memory_usage():.2f} MB")
    
    # Filter for team's defensive plays
    team_games = games_df[
        (games_df['homeTeamAbbr'] == team) | 
        (games_df['visitorTeamAbbr'] == team)
    ]
    
    team_defensive_plays = plays_df[
        (plays_df['gameId'].isin(team_games['gameId'])) &
        (plays_df['defensiveTeam'] == team) &
        (plays_df['timeToThrow'].notna())  # Only plays with timeToThrow
    ]
    
    # Filter tracking data for defensive plays
    defensive_tracking = tracking_df[
        tracking_df['gameId'].isin(team_defensive_plays['gameId'])
    ]
    
    # Create base DataFrame with timeToThrow and play information
    base_df = defensive_tracking.merge(
        team_defensive_plays[[
            'gameId', 'playId', 'timeToThrow', 
            'passResult', 'dropbackType', 'playAction'
        ]], 
        on=['gameId', 'playId']
    )
    
    # Add player position information
    base_df = base_df.merge(
        players_df[['nflId', 'position']], 
        on='nflId', 
        how='left'
    )
    
    print(f"Final shape: {base_df.shape}")
    print(f"Final memory usage: {get_memory_usage():.2f} MB")
    
    # Basic statistics
    print("\nTimeToThrow Statistics:")
    print(team_defensive_plays['timeToThrow'].describe())
    
    return base_df, team_defensive_plays, players_df

# Execute data loading
base_df, defensive_plays, players = load_and_process_data(WEEKS)

# Clear memory
clear_memory()

Loading tracking data: 100%|██████████| 4/4 [00:22<00:00,  5.64s/it]


Memory usage after loading: 18102.12 MB
Final shape: (322092, 23)
Final memory usage: 18333.42 MB

TimeToThrow Statistics:
count    208.000000
mean       2.683077
std        0.926303
min        0.701000
25%        2.072000
50%        2.570000
75%        3.130750
max        7.920000
Name: timeToThrow, dtype: float64


# Calculate Pressure Metrics
Calculate defensive pressure metrics including distance to QB, closure rate, and defensive formation impact on timeToThrow.

In [17]:
# Cell 3: Feature Engineering - Time to Throw Analysis

def analyze_defensive_impact(df, plays_df):
    """Analyze defensive impact on QB time to throw"""
    
    # Get plays with QB data
    qb_plays = df[
        (df['position'] == 'QB')
    ].groupby(['gameId', 'playId']).agg({
        'frameId': ['min', 'max'],
        'x': 'first',
        'y': 'first'
    }).reset_index()
    
    qb_plays.columns = ['gameId', 'playId', 'start_frame', 'end_frame', 'qb_x', 'qb_y']
    
    # Process defensive players
    def get_defense_stats(group):
        return pd.Series({
            'num_rushers': len(group['nflId'].unique()),
            'avg_dist_to_qb': group['dist_to_qb'].mean(),
            'min_dist_to_qb': group['dist_to_qb'].min()
        })
    
    # Get QB positions first
    qb_positions = df[df['position'] == 'QB'].groupby(['gameId', 'playId']).agg({
        'x': 'first',
        'y': 'first'
    }).reset_index()
    qb_positions.columns = ['gameId', 'playId', 'qb_x', 'qb_y']
    
    # Calculate distances for defensive players
    defense_df = df[
        df['position'].isin(['DE', 'DT', 'LB', 'CB', 'SS', 'FS'])
    ].copy()
    
    # Merge QB positions with defensive players
    defense_df = defense_df.merge(qb_positions, on=['gameId', 'playId'])
    
    # Calculate distances
    defense_df['dist_to_qb'] = np.sqrt(
        (defense_df['x'] - defense_df['qb_x'])**2 + 
        (defense_df['y'] - defense_df['qb_y'])**2
    )
    
    # Aggregate defensive metrics
    defense_stats = defense_df.groupby(['gameId', 'playId']).apply(
        get_defense_stats
    ).reset_index()
    
    # Merge with play data
    results = defense_stats.merge(
        plays_df[['gameId', 'playId', 'timeToThrow']], 
        on=['gameId', 'playId']
    )
    
    return results

# Generate analysis
defensive_impact = analyze_defensive_impact(base_df, defensive_plays)

print("\nDefensive Impact Analysis:")
print(defensive_impact.describe())

# Clear memory
clear_memory()


Defensive Impact Analysis:
             gameId       playId  num_rushers  avg_dist_to_qb  min_dist_to_qb  \
count  8.900000e+01    89.000000    89.000000       89.000000       89.000000   
mean   2.022096e+09  1981.966292     8.797753       12.793160        1.151896   
std    4.170600e+03  1108.340031     0.403951        1.589341        1.003333   
min    2.022091e+09    64.000000     8.000000        8.796580        0.050000   
25%    2.022093e+09  1036.000000     9.000000       11.818067        0.385876   
50%    2.022093e+09  2109.000000     9.000000       12.823758        0.854751   
75%    2.022100e+09  2923.000000     9.000000       13.590675        1.690562   
max    2.022100e+09  3882.000000     9.000000       20.147166        5.366060   

       timeToThrow  
count    89.000000  
mean      2.887303  
std       1.031086  
min       1.165000  
25%       2.235000  
50%       2.736000  
75%       3.270000  
max       7.920000  


In [24]:
# Cell 4: Distance and Pressure Analysis

def validate_tracking_data(df):
    """Validate required data exists and clean data"""
    # Clean data by removing rows with missing positions
    df = df[df['position'].notna()].copy()
    
    # Validate remaining data
    if len(df[df['position'] == 'QB']) == 0:
        raise ValueError("No QB data found in tracking data")
        
    # Validate defensive positions exist
    defensive_positions = ['DE', 'DT', 'LB', 'CB', 'SS', 'FS']
    if not any(df['position'].isin(defensive_positions)):
        raise ValueError("No defensive player data found")
        
    return df
def calculate_play_pressure_metrics(tracking_df, plays_df):
    """Calculate pressure metrics with validation"""
    
    # Validate and clean input data
    tracking_df = validate_tracking_data(tracking_df)

def calculate_play_pressure_metrics(tracking_df, plays_df):
    """Calculate pressure metrics with validation"""
    
    # Validate input data
    validate_tracking_data(tracking_df)
    
    # Filter and validate QB data
    qb_tracking = tracking_df[tracking_df['position'] == 'QB'].copy()
    qb_plays = qb_tracking[['gameId', 'playId']].drop_duplicates()
    
    # Filter defensive players
    defensive_positions = ['DE', 'DT', 'LB', 'CB', 'SS', 'FS']
    defense_tracking = tracking_df[
        tracking_df['position'].isin(defensive_positions)
    ].copy()
    
    # Calculate distances per play
    play_metrics = []
    
    for (game_id, play_id), play_group in defense_tracking.groupby(['gameId', 'playId']):
        if (game_id, play_id) not in zip(qb_plays['gameId'], qb_plays['playId']):
            continue
            
        qb_pos = qb_tracking[
            (qb_tracking['gameId'] == game_id) & 
            (qb_tracking['playId'] == play_id)
        ][['frameId', 'x', 'y']]
        
        # Calculate distances
        distances = []
        for _, def_row in play_group.iterrows():
            qb_frame = qb_pos[qb_pos['frameId'] == def_row['frameId']]
            if len(qb_frame) > 0:
                dist = np.sqrt(
                    (def_row['x'] - qb_frame['x'].iloc[0])**2 + 
                    (def_row['y'] - qb_frame['y'].iloc[0])**2
                )
                distances.append(dist)
        
        if distances:
            metrics = {
                'gameId': game_id,
                'playId': play_id,
                'min_dist': min(distances),
                'avg_dist': np.mean(distances),
                'pressure_rate': sum(d < 2 for d in distances) / len(distances),
                'num_close_defenders': sum(d < 3 for d in distances)
            }
            play_metrics.append(metrics)
    
    # Create results DataFrame
    pressure_df = pd.DataFrame(play_metrics)
    
    # Merge with play data
    final_metrics = pressure_df.merge(
        plays_df[['gameId', 'playId', 'timeToThrow']], 
        on=['gameId', 'playId'],
        validate='1:1'
    )
    
    return final_metrics

# Generate pressure analysis
pressure_analysis = calculate_play_pressure_metrics(base_df, defensive_plays)

print("\nPressure Analysis Results:")
print(pressure_analysis.describe())

# Save results
pressure_analysis.to_csv('results/pressure_metrics.csv', index=False)
clear_memory()


Pressure Analysis Results:
             gameId       playId   min_dist   avg_dist  pressure_rate  \
count  8.900000e+01    89.000000  89.000000  89.000000      89.000000   
mean   2.022096e+09  1981.966292   1.528445  12.758355       0.011797   
std    4.170600e+03  1108.340031   0.969557   1.730215       0.016513   
min    2.022091e+09    64.000000   0.100000   8.492746       0.000000   
25%    2.022093e+09  1036.000000   0.731095  11.724953       0.000000   
50%    2.022093e+09  2109.000000   1.266215  12.914397       0.008117   
75%    2.022100e+09  2923.000000   2.121438  13.651692       0.017296   
max    2.022100e+09  3882.000000   4.123312  19.888474       0.105792   

       num_close_defenders  timeToThrow  
count            89.000000    89.000000  
mean             50.258427     2.887303  
std              53.195721     1.031086  
min               0.000000     1.165000  
25%              18.000000     2.235000  
50%              32.000000     2.736000  
75%              68.

# Temporal Analysis
Analyze how timeToThrow changes throughout games and impact of defensive adjustments over time.

In [26]:
# Cell 5: Temporal Pressure Analysis

def analyze_pressure_over_time(tracking_df, pressure_analysis):
    """Analyze how defensive pressure develops over time"""
    
    # Create time windows (0.1s intervals)
    frames_per_window = 10
    tracking_df['time_window'] = tracking_df['frameId'] // frames_per_window * 0.1
    
    def calculate_window_metrics(group):
        return pd.Series({
            'defenders_within_5yd': (group['dist_to_qb'] < 5).sum(),
            'avg_def_distance': group['dist_to_qb'].mean(),
            'closest_defender': group['dist_to_qb'].min(),
            'def_convergence': group['s'].mean()
        })
    
    # Process each play
    temporal_metrics = []
    
    for (game_id, play_id), play_group in tracking_df.groupby(['gameId', 'playId']):
        qb_pos = play_group[play_group['position'] == 'QB']
        defenders = play_group[play_group['position'].isin(['DE', 'DT', 'LB'])]
        
        if len(qb_pos) == 0 or len(defenders) == 0:
            continue
            
        # Calculate metrics per time window
        window_stats = defenders.groupby('time_window').apply(calculate_window_metrics)
        
        # Add play context
        window_stats['gameId'] = game_id
        window_stats['playId'] = play_id
        temporal_metrics.append(window_stats)
    
    # Combine results
    temporal_df = pd.concat(temporal_metrics).reset_index()
    
    # Merge with pressure analysis
    final_temporal = temporal_df.merge(
        pressure_analysis[['gameId', 'playId', 'timeToThrow', 'pressure_rate']], 
        on=['gameId', 'playId']
    )
    
    return final_temporal

# Generate temporal analysis
temporal_pressure = analyze_pressure_over_time(base_df, pressure_analysis)

print("\nTemporal Pressure Analysis:")
print(temporal_pressure.groupby('time_window').mean())

# Save results
temporal_pressure.to_csv('results/temporal_pressure.csv', index=False)
clear_memory()


Temporal Pressure Analysis:
             defenders_within_5yd  avg_def_distance  closest_defender  \
time_window                                                             
0.0                      1.011236          7.813381          6.768971   
0.1                      1.146067          7.676502          6.496695   
0.2                      2.640449          7.368102          6.026946   
0.3                      5.112360          6.979037          5.556153   
0.4                      5.853933          6.709302          5.214813   
0.5                      6.168539          6.453132          4.955747   
0.6                      7.897727          6.235051          4.742481   
0.7                      8.784091          6.106384          4.503968   
0.8                      8.954023          6.028402          4.453650   
0.9                      9.602410          6.075906          4.545544   
1.0                     10.975000          5.888179          4.313892   
1.1                   

# Visualization and Results
Create visualizations showing relationship between defensive metrics and timeToThrow. Generate summary statistics and key findings.

In [27]:
# Cell 6: Statistical Analysis of Defensive Impact

def analyze_defensive_effectiveness(temporal_pressure, pressure_analysis):
    """Analyze defensive effectiveness and patterns"""
    
    # Calculate effectiveness metrics
    effectiveness = pressure_analysis.groupby('gameId').agg({
        'pressure_rate': 'mean',
        'min_dist': ['min', 'mean'],
        'num_close_defenders': 'mean',
        'timeToThrow': ['mean', 'std']
    }).reset_index()
    
    effectiveness.columns = [
        'gameId', 'avg_pressure', 'closest_approach', 
        'avg_distance', 'avg_defenders', 'avg_timeToThrow', 
        'timeToThrow_std'
    ]
    
    # Calculate time-based success metrics
    time_metrics = temporal_pressure.groupby('time_window').agg({
        'defenders_within_5yd': 'mean',
        'avg_def_distance': 'mean',
        'closest_defender': 'min'
    }).reset_index()
    
    # Generate statistical correlations
    correlations = pressure_analysis[[
        'pressure_rate', 'min_dist', 'num_close_defenders', 
        'timeToThrow'
    ]].corr()
    
    print("\nCorrelation Analysis:")
    print(correlations)
    
    print("\nDefensive Effectiveness by Game:")
    print(effectiveness.describe())
    
    print("\nTime-based Metrics:")
    print(time_metrics.head())
    
    # Save results
    effectiveness.to_csv('results/defensive_effectiveness.csv', index=False)
    time_metrics.to_csv('results/time_metrics.csv', index=False)
    correlations.to_csv('results/defensive_correlations.csv')
    
    return effectiveness, time_metrics, correlations

# Execute analysis
effectiveness, time_metrics, correlations = analyze_defensive_effectiveness(
    temporal_pressure, 
    pressure_analysis
)

clear_memory()


Correlation Analysis:
                     pressure_rate  min_dist  num_close_defenders  timeToThrow
pressure_rate             1.000000 -0.594827             0.658791     0.106665
min_dist                 -0.594827  1.000000            -0.355218    -0.175692
num_close_defenders       0.658791 -0.355218             1.000000     0.305698
timeToThrow               0.106665 -0.175692             0.305698     1.000000

Defensive Effectiveness by Game:
             gameId  avg_pressure  closest_approach  avg_distance  \
count  3.000000e+00      3.000000          3.000000      3.000000   
mean   2.022095e+09      0.012531          0.190764      1.535239   
std    4.953643e+03      0.002815          0.101061      0.160371   
min    2.022091e+09      0.009845          0.100000      1.350071   
25%    2.022092e+09      0.011068          0.136313      1.488004   
50%    2.022093e+09      0.012291          0.172627      1.625937   
75%    2.022096e+09      0.013875          0.236147      1.627823

In [29]:
# Cell 7: Statistical Modeling and Visualization

def create_defensive_visualizations(effectiveness, time_metrics, pressure_analysis):
    """Generate visualizations for defensive analysis"""
    
    # Set style
    plt.style.use('ggplot')
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Plot 1: Time to Throw Distribution
    sns.histplot(
        data=pressure_analysis,
        x='timeToThrow',
        bins=20,
        ax=axes[0,0]
    )
    axes[0,0].set_title('Distribution of Time to Throw')
    axes[0,0].set_xlabel('Time (seconds)')
    
    # Plot 2: Pressure vs Distance
    sns.scatterplot(
        data=pressure_analysis,
        x='min_dist',
        y='pressure_rate',
        ax=axes[0,1]
    )
    axes[0,1].set_title('Pressure Rate vs Minimum Distance')
    
    # Plot 3: Time-based Pressure
    sns.lineplot(
        data=time_metrics,
        x='time_window',
        y='avg_def_distance',
        ax=axes[1,0]
    )
    axes[1,0].set_title('Average Defensive Distance Over Time')
    
    # Plot 4: Game Effectiveness
    sns.boxplot(
        data=effectiveness,
        y='avg_pressure',
        ax=axes[1,1]
    )
    axes[1,1].set_title('Pressure Rate Distribution by Game')
    
    plt.tight_layout()
    
    # Save plots
    plt.savefig('plots/defensive_analysis.png')
    plt.close()
    
    return fig

# Generate visualizations
analysis_plots = create_defensive_visualizations(
    effectiveness,
    time_metrics,
    pressure_analysis
)

print("\nVisualization Summary:")
print("- Plots saved to: plots/defensive_analysis.png")
print("- Analysis complete")

clear_memory()


Visualization Summary:
- Plots saved to: plots/defensive_analysis.png
- Analysis complete


In [30]:
# Cell 8: Statistical Testing and Model Validation

def perform_statistical_analysis(pressure_analysis, temporal_pressure):
    """Conduct statistical tests on defensive effectiveness"""
    
    from scipy import stats
    
    # Test 1: Correlation between time and pressure
    time_pressure_corr = stats.pearsonr(
        pressure_analysis['timeToThrow'],
        pressure_analysis['pressure_rate']
    )
    
    # Test 2: Distance impact on pressure success
    distance_groups = pressure_analysis.groupby(
        pd.qcut(pressure_analysis['min_dist'], 4)
    )['pressure_rate'].agg(['mean', 'std'])
    
    # Test 3: Time window analysis
    time_effectiveness = temporal_pressure.groupby('time_window').agg({
        'defenders_within_5yd': ['mean', 'std'],
        'pressure_rate': ['mean', 'std']
    }).round(3)
    
    # Compile results
    results = {
        'time_pressure_correlation': time_pressure_corr,
        'distance_impact': distance_groups,
        'time_effectiveness': time_effectiveness
    }
    
    # Display results
    print("\nStatistical Analysis Results:")
    print(f"Time-Pressure Correlation: {time_pressure_corr[0]:.3f} (p={time_pressure_corr[1]:.3f})")
    print("\nDistance Impact on Pressure:")
    print(distance_groups)
    print("\nTime Window Effectiveness:")
    print(time_effectiveness)
    
    return results

# Execute analysis
statistical_results = perform_statistical_analysis(pressure_analysis, temporal_pressure)

# Save results
pd.DataFrame(statistical_results['time_effectiveness']).to_csv('results/statistical_analysis.csv')
clear_memory()


Statistical Analysis Results:
Time-Pressure Correlation: 0.107 (p=0.320)

Distance Impact on Pressure:
                    mean       std
min_dist                          
(0.099, 0.731]  0.028261  0.023497
(0.731, 1.266]  0.012755  0.005702
(1.266, 2.121]  0.005422  0.006750
(2.121, 4.123]  0.000000  0.000000

Time Window Effectiveness:
            defenders_within_5yd         pressure_rate       
                            mean     std          mean    std
time_window                                                  
0.0                        1.011   4.589         0.012  0.017
0.1                        1.146   4.788         0.012  0.017
0.2                        2.640   6.066         0.012  0.017
0.3                        5.112   9.199         0.012  0.017
0.4                        5.854   9.645         0.012  0.017
0.5                        6.169  10.235         0.012  0.017
0.6                        7.898  11.861         0.012  0.017
0.7                        8.784  12.4

In [31]:
# Cell 9: Final Analysis and Insights

def generate_final_insights(statistical_results, pressure_analysis, temporal_pressure):
    """Generate final insights and summary metrics"""
    
    # Calculate overall defensive effectiveness
    summary_stats = {
        'avg_time_to_throw': pressure_analysis['timeToThrow'].mean(),
        'pressure_success_rate': pressure_analysis['pressure_rate'].mean(),
        'optimal_distance': pressure_analysis.loc[pressure_analysis['pressure_rate'].idxmax(), 'min_dist'],
        'time_window_effectiveness': temporal_pressure.groupby('time_window')['defenders_within_5yd'].mean().max()
    }
    
    # Format insights
    insights = pd.DataFrame({
        'Metric': [
            'Average Time to Throw',
            'Pressure Success Rate',
            'Optimal Rush Distance',
            'Peak Defenders in Range'
        ],
        'Value': list(summary_stats.values())
    })
    
    # Display results
    print("\nKey Defensive Insights:")
    print(insights.to_string(index=False))
    
    # Save results
    insights.to_csv('results/final_insights.csv', index=False)
    
    return insights

# Generate final insights
final_insights = generate_final_insights(
    statistical_results,
    pressure_analysis,
    temporal_pressure
)

clear_memory()


Key Defensive Insights:
                 Metric     Value
  Average Time to Throw  2.887303
  Pressure Success Rate  0.011797
  Optimal Rush Distance  0.299666
Peak Defenders in Range 16.222222


In [32]:
# Cell 10: Final Visualization Dashboard

def create_final_dashboard(final_insights, pressure_analysis, temporal_pressure):
    """Generate final visualization dashboard"""
    
    # Setup figure
    plt.style.use('ggplot')
    fig = plt.figure(figsize=(16, 10))
    gs = fig.add_gridspec(2, 3)
    
    # Plot 1: Time Series Trend
    ax1 = fig.add_subplot(gs[0, :2])
    sns.lineplot(
        data=temporal_pressure,
        x='time_window',
        y='defenders_within_5yd',
        ax=ax1
    )
    ax1.set_title('Defensive Pressure Over Time')
    
    # Plot 2: Pressure Distribution
    ax2 = fig.add_subplot(gs[0, 2])
    sns.boxplot(
        data=pressure_analysis,
        y='pressure_rate',
        ax=ax2
    )
    ax2.set_title('Pressure Rate Distribution')
    
    # Plot 3: Distance vs Effectiveness
    ax3 = fig.add_subplot(gs[1, :])
    sns.scatterplot(
        data=pressure_analysis,
        x='min_dist',
        y='pressure_rate',
        size='timeToThrow',
        ax=ax3
    )
    ax3.set_title('Distance vs Pressure Effectiveness')
    
    # Add annotations
    plt.figtext(
        0.02, 0.02,
        f"Avg Time to Throw: {final_insights.iloc[0,1]:.2f}s\n" +
        f"Success Rate: {final_insights.iloc[1,1]:.2%}",
        fontsize=10
    )
    
    plt.tight_layout()
    plt.savefig('plots/final_dashboard.png', dpi=300, bbox_inches='tight')
    plt.close()

# Generate dashboard
create_final_dashboard(final_insights, pressure_analysis, temporal_pressure)
print("\nFinal dashboard saved to: plots/final_dashboard.png")


Final dashboard saved to: plots/final_dashboard.png


In [33]:
# Cell 11: Summary Statistics and Report Generation

def generate_final_report(pressure_analysis, temporal_pressure, final_insights):
    """Generate comprehensive statistical report"""
    
    # Calculate summary statistics
    summary = pd.DataFrame({
        'Statistic': [
            'Games Analyzed',
            'Total Plays',
            'Avg Time to Throw',
            'Pressure Success Rate',
            'Optimal Rush Distance',
            'Most Effective Time Window'
        ],
        'Value': [
            pressure_analysis['gameId'].nunique(),
            len(pressure_analysis),
            f"{pressure_analysis['timeToThrow'].mean():.2f}s",
            f"{pressure_analysis['pressure_rate'].mean():.1%}",
            f"{pressure_analysis['min_dist'].min():.1f} yards",
            f"{temporal_pressure.loc[temporal_pressure['defenders_within_5yd'].idxmax(), 'time_window']:.1f}s"
        ]
    })
    
    # Calculate time-based effectiveness
    time_stats = temporal_pressure.groupby('time_window').agg({
        'defenders_within_5yd': ['mean', 'max'],
        'avg_def_distance': 'mean'
    }).round(2)
    
    # Print and save results
    print("\nDefensive Analysis Summary:")
    print(summary.to_string(index=False))
    print("\nTime-based Effectiveness:")
    print(time_stats)
    
    # Export results
    summary.to_csv('results/final_summary.csv', index=False)
    time_stats.to_csv('results/time_effectiveness.csv')
    
    return summary, time_stats

# Generate final report
summary_stats, time_effectiveness = generate_final_report(
    pressure_analysis,
    temporal_pressure,
    final_insights
)


Defensive Analysis Summary:
                 Statistic     Value
            Games Analyzed         3
               Total Plays        89
         Avg Time to Throw     2.89s
     Pressure Success Rate      1.2%
     Optimal Rush Distance 0.1 yards
Most Effective Time Window      1.2s

Time-based Effectiveness:
            defenders_within_5yd       avg_def_distance
                            mean   max             mean
time_window                                            
0.0                         1.01  27.0             7.81
0.1                         1.15  30.0             7.68
0.2                         2.64  30.0             7.37
0.3                         5.11  37.0             6.98
0.4                         5.85  30.0             6.71
0.5                         6.17  39.0             6.45
0.6                         7.90  40.0             6.24
0.7                         8.78  38.0             6.11
0.8                         8.95  40.0             6.03
0.9          

In [34]:
# Cell 12: Formation Analysis and Advanced Metrics

def analyze_formation_effectiveness(pressure_analysis, temporal_pressure):
    """Analyze defensive formation effectiveness"""
    
    # Formation success metrics
    formation_stats = pd.DataFrame({
        'metric': [
            'early_pressure_rate',
            'late_pressure_rate',
            'optimal_defender_count',
            'distance_effectiveness'
        ],
        'value': [
            temporal_pressure[temporal_pressure['time_window'] < 1.5]['pressure_rate'].mean(),
            temporal_pressure[temporal_pressure['time_window'] >= 1.5]['pressure_rate'].mean(),
            pressure_analysis.groupby('num_close_defenders')['pressure_rate'].mean().idxmax(),
            pressure_analysis['min_dist'].corr(pressure_analysis['pressure_rate'])
        ]
    })
    
    # Calculate effectiveness ratios
    effectiveness_by_distance = pressure_analysis.groupby(
        pd.qcut(pressure_analysis['min_dist'], 4)
    )['pressure_rate'].agg(['mean', 'count', 'std'])
    
    print("\nFormation Effectiveness Metrics:")
    print(formation_stats)
    print("\nPressure Success by Distance:")
    print(effectiveness_by_distance)
    
    # Export results
    formation_stats.to_csv('results/formation_analysis.csv', index=False)
    
    return formation_stats, effectiveness_by_distance

# Generate formation analysis
formation_metrics, distance_effectiveness = analyze_formation_effectiveness(
    pressure_analysis, 
    temporal_pressure
)


Formation Effectiveness Metrics:
                   metric       value
0     early_pressure_rate    0.011637
1      late_pressure_rate    0.012409
2  optimal_defender_count  228.000000
3  distance_effectiveness   -0.594827

Pressure Success by Distance:
                    mean  count       std
min_dist                                 
(0.099, 0.731]  0.028261     23  0.023497
(0.731, 1.266]  0.012755     22  0.005702
(1.266, 2.121]  0.005422     22  0.006750
(2.121, 4.123]  0.000000     22  0.000000
