# NBA Possession Pattern Discovery Analysis

**Date Range:** Oct 21, 2025 - Jan 1, 2026  
**Data Source:** team_game_logs (nba_data.db)  
**Approach:** Rule-based pattern discovery (no ML)

## Research Questions

1. **What wins games?** - Opportunity differential → win rate correlation
2. **Scoring environments** - Identify FT-driven, rebound-heavy, grind, shootout patterns
3. **Efficiency overrides** - When conversion rate matters more than possession volume
4. **Opponent context** - How opponent defensive pressure alters outcomes
5. **Data-first archetypes** - Cluster teams by turnover rate (TO%), offensive rebound rate (OREB%), and free-throw rate (FTr)
6. **Prop environments** - High assist/rebound/scoring conditions (player-agnostic)
7. **Failure analysis** - Games where possession patterns broke down

---

## Cell 1: Setup & Data Loading

In [None]:
# Setup
import sys
sys.path.append('/Users/malcolmlittle/NBA OVER UNDER SW')

from api.utils.possession_dataset_builder import build_possession_dataset
from api.utils.possession_metrics import *
from api.utils.pattern_analyzer import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration: print every column, cap printed rows at 100, and
# use seaborn's 'whitegrid' theme for the plots in later cells.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)
sns.set_style('whitegrid')

# Build the team-game possession dataset that every later cell reads as `df`.
print('Loading possession dataset...')
# NOTE(review): start_date is 2024-10-21 although the stated date range
# begins Oct 21, 2025. The original author's note says the builder converts
# it to 2025-10-21 -- confirm against build_possession_dataset.
dataset_args = {
    'season': '2025-26',
    'start_date': '2024-10-21',
    'end_date': '2026-01-01',
    'output_format': 'dataframe',
}
df = build_possession_dataset(**dataset_args)

# Quick sanity summary of what was loaded.
record_count = len(df)
print(f'Total team-game records: {record_count}')
# Halving the row count presumably reflects two team rows per game -- verify.
print(f'Total games: {record_count // 2}')
game_dates = df["game_date"]
print(f'Date range: {game_dates.min()} to {game_dates.max()}')
print(f'\nColumns: {len(df.columns)}')

# Last expression of the cell: renders the first rows as the cell output.
df.head()

## Cell 2: Q1 - What Wins Games? (Opportunity Differential)

In [None]:
# Q1: Analyze opportunity_diff → win rate
#
# Runs the opportunity-differential analysis on `df`, prints the correlation,
# the per-bucket win rates and the number of failure games, draws a scatter
# plus a per-bucket bar chart, and writes the failure games to CSV.
import os

print('=== Q1: What Wins Games? ===')

q1_results = analyze_opportunity_differential_patterns(df)

print(f"\nCorrelation (opportunity_diff vs game_win): {q1_results['correlation']}")
print("\nWin Rate by Opportunity Differential Bucket:")
for bucket, stats in q1_results['win_rate_by_bucket'].items():
    print(f"  {bucket}: {stats['win_rate']:.1%} ({stats['games']} games)")

print(f"\nFailure Games (won opportunity but lost): {len(q1_results['failure_games'])}")

# Visualization: two panels side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Scatter: opportunity_diff vs game_win (one point per row of df).
ax1.scatter(df['opportunity_diff'], df['game_win'], alpha=0.3, s=20)
ax1.axvline(0, color='red', linestyle='--', alpha=0.5, label='Zero opportunity diff')
ax1.set_xlabel('Opportunity Differential')
ax1.set_ylabel('Game Win (0 or 1)')
ax1.set_title('Opportunity Differential vs Game Outcome')
ax1.legend()

# Bar: win rate by bucket, in the order the analysis returned the buckets.
buckets = list(q1_results['win_rate_by_bucket'].keys())
win_rates = [q1_results['win_rate_by_bucket'][b]['win_rate'] for b in buckets]
ax2.bar(buckets, win_rates, color='steelblue')
ax2.set_xlabel('Opportunity Differential Bucket (Q1=lowest, Q5=highest)')
ax2.set_ylabel('Win Rate')
ax2.set_title('Win Rate by Opportunity Differential Quintile')
ax2.axhline(0.5, color='red', linestyle='--', alpha=0.5, label='50% baseline')
ax2.legend()

plt.tight_layout()
plt.show()

# Save findings
# DataFrame.to_csv does not create missing parent directories, so make sure
# the findings folder exists before writing; otherwise the cell fails at the
# very end on a fresh checkout.
findings_dir = '/Users/malcolmlittle/NBA OVER UNDER SW/analysis/outputs/findings'
os.makedirs(findings_dir, exist_ok=True)
pd.DataFrame(q1_results['failure_games']).to_csv(
    os.path.join(findings_dir, 'q1_failure_games.csv'),
    index=False
)
print("\nFailure games saved to q1_failure_games.csv")

## Cell 3: Q2 - Scoring Environments

In [None]:
# Q2: Identify scoring environments
#
# Prints count / average PPP / win rate for every environment returned by the
# analysis, then plots the environment frequencies and their average PPP.
print('=== Q2: Scoring Environments ===')

q2_results = analyze_scoring_environments(df)

print("\nEnvironment Distribution:")
env_names = []
env_counts = []
env_avg_ppps = []
for env_name, env_stats in q2_results.items():
    print(f"  {env_name}: {env_stats['count']} games | Avg PPP: {env_stats['avg_ppp']:.3f} | Win Rate: {env_stats['win_rate']:.1%}")
    # Collect the plotting series in the same pass, preserving dict order.
    env_names.append(env_name)
    env_counts.append(env_stats['count'])
    env_avg_ppps.append(env_stats['avg_ppp'])

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, ppp_ax = axes

# Left panel: share of games per environment.
pie_ax.pie(env_counts, labels=env_names, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Scoring Environment Distribution')

# Right panel: average PPP per environment, with the mean of df['ppp'] as a
# dashed reference line.
overall_ppp = df['ppp'].mean()
ppp_ax.barh(env_names, env_avg_ppps, color='coral')
ppp_ax.set_xlabel('Average PPP')
ppp_ax.set_title('Scoring Efficiency by Environment')
ppp_ax.axvline(overall_ppp, color='red', linestyle='--', alpha=0.5, label='Overall avg')
ppp_ax.legend()

plt.tight_layout()
plt.show()

## Cell 4: Q3 - Efficiency Overrides

In [None]:
# Q3: When conversion matters more than volume
#
# Prints the override-game summary returned by the analysis and, when there
# are override games, plots them and writes them to CSV.
import os

print('=== Q3: Efficiency Overrides ===')

q3_results = analyze_efficiency_overrides(df)

print(f"\nOverride Games (lost opportunity_diff but won): {q3_results['total_override_games']}")
print(f"Average PPP Advantage: {q3_results['avg_ppp_advantage']:.3f}")
print(f"Average Conversion Score: {q3_results['avg_conversion_score']:.1f}")

# Visualization (only when there is at least one override game to show)
if q3_results['override_games']:
    override_df = pd.DataFrame(q3_results['override_games'])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Scatter: opportunity_diff vs ppp_advantage, with zero reference lines
    # on both axes.
    ax1.scatter(override_df['opportunity_diff'], override_df['ppp_advantage'], s=50, color='green', alpha=0.6)
    ax1.set_xlabel('Opportunity Differential (negative = deficit)')
    ax1.set_ylabel('PPP Advantage')
    ax1.set_title('Efficiency Overcoming Opportunity Deficit')
    ax1.axhline(0, color='red', linestyle='--', alpha=0.5)
    ax1.axvline(0, color='red', linestyle='--', alpha=0.5)

    # Histogram: conversion scores, with the sample mean marked.
    ax2.hist(override_df['conversion_score'], bins=15, color='purple', alpha=0.7, edgecolor='black')
    ax2.set_xlabel('Conversion Score')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Conversion Scores in Override Games')
    ax2.axvline(override_df['conversion_score'].mean(), color='red', linestyle='--', label='Mean')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    # Save findings
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the findings folder exists before writing.
    findings_dir = '/Users/malcolmlittle/NBA OVER UNDER SW/analysis/outputs/findings'
    os.makedirs(findings_dir, exist_ok=True)
    override_df.to_csv(
        os.path.join(findings_dir, 'q3_override_games.csv'),
        index=False
    )
    print("\nOverride games saved to q3_override_games.csv")
else:
    print("\nNo override games to visualize")

## Cell 5: Q4 - Opponent Context Effects

In [None]:
# Q4: Opponent defensive pressure
#
# Prints the two pressure-effect values and the per-bucket pace matchup
# stats returned by the analysis, then plots opponent TO% against team TO%
# and the win rate per pace bucket.
print('=== Q4: Opponent Context Effects ===')

q4_results = analyze_opponent_context_effects(df)

to_effect = q4_results['to_pressure_effect']
print(f"\nTO Pressure Effect (opp_TO_pct → team_TO_pct corr): {to_effect:.3f}")
print(f"OREB Pressure Effect (opp_OREB_pct → team_OREB_pct corr): {q4_results['oreb_pressure_effect']:.3f}")

pace_bins = q4_results['pace_matchup_bins']
print("\nPace Matchup Bins:")
for pace_label, pace_stats in pace_bins.items():
    print(f"  {pace_label}: Win Rate {pace_stats['win_rate']:.1%} | Avg PPP {pace_stats['avg_ppp']:.3f} ({pace_stats['games']} games)")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pressure_ax, pace_ax = axes

# Left panel: opp_TO_pct vs TO_pct for every row of df; the title carries
# the TO pressure effect value reported above.
pressure_ax.scatter(df['opp_TO_pct'], df['TO_pct'], alpha=0.3, s=20, color='orange')
pressure_ax.set_xlabel('Opponent TO% Allowed')
pressure_ax.set_ylabel('Team TO%')
pressure_ax.set_title(f'Defensive Pressure on Turnovers (r={to_effect:.3f})')

# Right panel: win rate per pace bucket, with a 0.5 reference line.
pace_labels = list(pace_bins)
pace_win_rates = [bin_stats['win_rate'] for bin_stats in pace_bins.values()]
pace_ax.bar(pace_labels, pace_win_rates, color='teal')
pace_ax.set_xlabel('Pace Matchup Bucket')
pace_ax.set_ylabel('Win Rate')
pace_ax.set_title('Win Rate by Pace Differential')
pace_ax.axhline(0.5, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## Cell 6: Q5 - Possession Archetypes

In [None]:
# Q5: Cluster teams by TO%, OREB%, FTr ONLY
#
# Prints the archetype distribution and a sample of the per-team table
# returned by the clustering helper, plots the first ten archetypes, and
# writes the per-team table to CSV.
import os

print('=== Q5: Possession Archetypes (TO%, OREB%, FTr) ===')

q5_results = cluster_by_possession_behavior(df)

# NOTE(review): the "Top 5" / "Top 10" slices below take the first entries in
# the dict's own order -- presumably already sorted by count descending;
# confirm against cluster_by_possession_behavior.
archetype_items = list(q5_results['archetype_distribution'].items())

print(f"\nTotal Archetypes: {len(archetype_items)}")
print("\nTop 5 Archetypes:")
for archetype, count in archetype_items[:5]:
    print(f"  {archetype}: {count} teams")

# Display team percentiles
team_percentiles_df = pd.DataFrame(q5_results['team_percentiles'])
print("\nTeam Percentiles (sample):")
print(team_percentiles_df[['team_id', 'archetype', 'avg_TO_pct', 'avg_OREB_pct', 'avg_FTr', 'games']].head(10))

# Visualization
fig, ax = plt.subplots(figsize=(10, 6))

# Bar chart: archetype distribution (first ten entries)
top_10_archetypes = archetype_items[:10]
ax.barh(
    [name for name, _ in top_10_archetypes],
    [team_count for _, team_count in top_10_archetypes],
    color='navy'
)
ax.set_xlabel('Number of Teams')
ax.set_ylabel('Archetype')
ax.set_title('Top 10 Possession Archetypes (TO%/OREB%/FTr)')

plt.tight_layout()
plt.show()

# Save findings
# DataFrame.to_csv does not create missing parent directories, so make sure
# the findings folder exists before writing.
findings_dir = '/Users/malcolmlittle/NBA OVER UNDER SW/analysis/outputs/findings'
os.makedirs(findings_dir, exist_ok=True)
team_percentiles_df.to_csv(
    os.path.join(findings_dir, 'q5_team_archetypes.csv'),
    index=False
)
print("\nTeam archetypes saved to q5_team_archetypes.csv")

## Cell 7: Q6 - Prop Environments

In [None]:
# Q6: High assist/rebound/scoring prop environments
#
# Prints how many games fall into each prop environment returned by the
# helper, plots those counts, and writes the multi-prop games to CSV when
# there are any.
import os

print('=== Q6: Prop Environments ===')

q6_results = identify_prop_environments(df)

# Count each category once and reuse the numbers for printing and plotting.
high_scoring_count = len(q6_results['high_scoring_games'])
high_assist_count = len(q6_results['high_assist_games'])
high_rebound_count = len(q6_results['high_rebound_games'])
multi_prop_count = len(q6_results['multi_prop_games'])

print(f"\nHigh Scoring Games: {high_scoring_count}")
print(f"High Assist Games: {high_assist_count}")
print(f"High Rebound Games: {high_rebound_count}")
print(f"Multi-Prop Games (2+ tags): {multi_prop_count}")

# Visualization
prop_counts = [
    high_scoring_count,
    high_assist_count,
    high_rebound_count,
    multi_prop_count,
]
prop_labels = ['High Scoring', 'High Assists', 'High Rebounds', 'Multi-Prop']

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(prop_labels, prop_counts, color=['red', 'blue', 'green', 'purple'])
ax.set_ylabel('Number of Games')
ax.set_title('Prop Environment Distribution')
# The bar call above already labels the ticks with prop_labels, so only the
# rotation needs setting. Matplotlib discourages set_xticklabels() when tick
# positions have not been fixed, so rotate through tick_params instead.
ax.tick_params(axis='x', labelrotation=15)

plt.tight_layout()
plt.show()

# Save findings
if q6_results['multi_prop_games']:
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the findings folder exists before writing.
    findings_dir = '/Users/malcolmlittle/NBA OVER UNDER SW/analysis/outputs/findings'
    os.makedirs(findings_dir, exist_ok=True)
    pd.DataFrame(q6_results['multi_prop_games']).to_csv(
        os.path.join(findings_dir, 'q6_multi_prop_games.csv'),
        index=False
    )
    print("\nMulti-prop games saved to q6_multi_prop_games.csv")

## Cell 8: Q7 - Failure Analysis

In [None]:
# Q7: Games where possession patterns broke down
#
# Prints the failure summary returned by the analysis and, when there are
# failure games, plots them and writes them to CSV. Ends with the
# notebook-wide completion banner.
import os

print('=== Q7: Failure Analysis ===')

q7_results = analyze_failure_cases(df)
common_patterns = q7_results['common_patterns']

print(f"\nTotal Failure Games: {q7_results['total_failures']}")
print(f"Average Failure Severity: {q7_results['avg_failure_severity']:.1f}")
print("\nCommon Patterns in Failures:")
print(f"  Environment Distribution: {common_patterns['environment_distribution']}")
print(f"  Average Pace: {common_patterns['avg_pace']:.1f}")
print(f"  Avg Opponent PPP Advantage: {common_patterns['avg_opp_ppp_advantage']:.3f}")

# Visualization (only when there is at least one failure game to show)
if q7_results['failure_games']:
    failure_df = pd.DataFrame(q7_results['failure_games'])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Scatter: opportunity_diff vs failure_severity
    ax1.scatter(failure_df['opportunity_diff'], failure_df['failure_severity'],
                s=80, color='red', alpha=0.6, edgecolor='black')
    ax1.set_xlabel('Opportunity Differential (should have won)')
    ax1.set_ylabel('Failure Severity Score')
    ax1.set_title('Failure Game Severity')

    # Histogram: conversion scores in failures, with a marker line at 60
    # (labelled as the threshold in the legend).
    ax2.hist(failure_df['conversion_score'], bins=12, color='orange', alpha=0.7, edgecolor='black')
    ax2.set_xlabel('Conversion Score')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Conversion Scores in Failure Games')
    ax2.axvline(60, color='red', linestyle='--', label='Threshold (60)')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    # Save findings
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the findings folder exists before writing.
    findings_dir = '/Users/malcolmlittle/NBA OVER UNDER SW/analysis/outputs/findings'
    os.makedirs(findings_dir, exist_ok=True)
    failure_df.to_csv(
        os.path.join(findings_dir, 'q7_failure_games.csv'),
        index=False
    )
    print("\nFailure games saved to q7_failure_games.csv")
else:
    print("\nNo failure games to visualize")

# Summary
print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)
print("\nAll findings saved to /analysis/outputs/findings/")