In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

import nfl_data_py as nfl

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

In [3]:
print("Available functions:")
print([func for func in dir(nfl) if not func.startswith('_')])

Available functions:
['HTTPError', 'Iterable', 'ThreadPoolExecutor', 'appdirs', 'as_completed', 'cache_pbp', 'clean_nfl_data', 'datetime', 'import_combine_data', 'import_contracts', 'import_depth_charts', 'import_draft_picks', 'import_draft_values', 'import_ftn_data', 'import_ids', 'import_injuries', 'import_ngs_data', 'import_officials', 'import_pbp_data', 'import_players', 'import_qbr', 'import_sc_lines', 'import_schedules', 'import_seasonal_data', 'import_seasonal_pfr', 'import_seasonal_rosters', 'import_snap_counts', 'import_team_desc', 'import_weekly_data', 'import_weekly_pfr', 'import_weekly_rosters', 'import_win_totals', 'logging', 'name', 'numpy', 'os', 'pandas', 'see_pbp_cols', 'see_weekly_cols', 'warn']


In [4]:
# Explore the weekly data 
print("Available columns in weekly data:")
weekly_cols = nfl.see_weekly_cols()
print(weekly_cols)

Available columns in weekly data:
Index(['player_id', 'player_name', 'player_display_name', 'position',
       'position_group', 'headshot_url', 'recent_team', 'season', 'week',
       'season_type', 'opponent_team', 'completions', 'attempts',
       'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards',
       'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards',
       'passing_yards_after_catch', 'passing_first_downs', 'passing_epa',
       'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards',
       'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost',
       'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions',
       'receptions', 'targets', 'receiving_yards', 'receiving_tds',
       'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
       'wopr', 's

In [5]:
print("\nLoading 2023 weekly data...")
weekly_2023 = nfl.import_weekly_data([2023])

print(f"\nDataset shape: {weekly_2023.shape}")
print(f"Columns: {len(weekly_2023.columns)}")
print(f"Date range: {weekly_2023['week'].min()} to {weekly_2023['week'].max()}")

# Show first few rows
print("\nFirst 5 rows:")
print(weekly_2023.head())


Loading 2023 weekly data...
Downcasting floats.

Dataset shape: (5653, 53)
Columns: 53
Date range: 1 to 22

First 5 rows:
    player_id player_name player_display_name position position_group  \
0  00-0023459   A.Rodgers       Aaron Rodgers       QB             QB   
1  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
2  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
3  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
4  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   

                                        headshot_url recent_team  season  \
0  https://static.www.nfl.com/image/upload/f_auto...         NYJ    2023   
1  https://static.www.nfl.com/image/private/f_aut...         CHI    2023   
2  https://static.www.nfl.com/image/private/f_aut...         CHI    2023   
3  https://static.www.nfl.com/image/private/f_aut...         CHI    2023   
4  https://static.www.nfl.com/image/private/f_aut...      

In [6]:
print("Dataset info:")
print(f"Shape: {weekly_2023.shape}")
print(f"Unique players: {weekly_2023['player_name'].nunique()}")
print(f"Unique weeks: {sorted(weekly_2023['week'].unique())}")


Dataset info:
Shape: (5653, 53)
Unique players: 574
Unique weeks: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [7]:
# Step 3: Examine the data structure and key fantasy stats
print("Dataset info:")
print(f"Shape: {weekly_2023.shape}")
print(f"Unique players: {weekly_2023['player_name'].nunique()}")
print(f"Unique weeks: {sorted(weekly_2023['week'].unique())}")

# Look at positions
print("\nPosition breakdown:")
print(weekly_2023['position'].value_counts())

Dataset info:
Shape: (5653, 53)
Unique players: 574
Unique weeks: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

Position breakdown:
WR     2322
RB     1403
TE     1122
QB      690
FB       82
P        12
CB        4
T         3
OLB       3
ILB       3
SS        3
FS        2
DT        1
G         1
MLB       1
DE        1
Name: position, dtype: int64


In [8]:
# Check for any missing fantasy points data
print(f"\nMissing fantasy_points_ppr: {weekly_2023['fantasy_points_ppr'].isna().sum()}")


Missing fantasy_points_ppr: 0


In [9]:
# Let's look at a sample of high-scoring players
print("\nTop 10 highest PPR scores in 2023:")
top_games = weekly_2023.nlargest(10, 'fantasy_points_ppr')[['player_name', 'position', 'week', 'opponent_team', 'fantasy_points_ppr']]
print(top_games)


Top 10 highest PPR scores in 2023:
      player_name position  week opponent_team  fantasy_points_ppr
3587      J.Chase       WR     5           ARI           52.200001
443      A.Cooper       WR    16           HOU           51.500000
5384     D.Achane       RB     3           DEN           51.299999
1895      D.Moore       WR     5           WAS           49.000000
917   C.McCaffrey       RB     4           ARI           48.700001
163       K.Allen       WR     3           MIN           45.459999
526     R.Mostert       RB     3           DEN           45.200001
773        T.Hill       WR     1           LAC           44.500000
4708       B.Hall       RB    16           WAS           43.099998
369       D.Adams       WR     3           PIT           42.200001


In [11]:
print("\nSample player data (first player with >10 games):")
player_counts = weekly_2023['player_name'].value_counts()
sample_player = player_counts[player_counts >= 10].index[0]
sample_data = weekly_2023[weekly_2023['player_name'] == sample_player][['week', 'opponent_team', 'fantasy_points_ppr', 'targets', 'receptions', 'receiving_yards']]
print(f"\n{sample_player}'s 2023 season:")
print(sample_data.head(10))


Sample player data (first player with >10 games):

J.Williams's 2023 season:
      week opponent_team  fantasy_points_ppr  targets  receptions  \
755     15            LA                -0.2        1           0   
1416     1           TEN                 7.2        2           2   
1417     2           CAR                 2.9        0           0   
1418     7           JAX                 1.4        0           0   
1419     8           IND                 4.8        1           1   
1420     9           CHI                 3.2        3           2   
1421    10           MIN                 2.0        1           1   
1422    12           ATL                 3.0        2           2   
1423    13           DET                 2.6        1           1   
1424    14           CAR                 4.3        1           0   

      receiving_yards  
755               0.0  
1416              7.0  
1417              0.0  
1418              0.0  
1419              8.0  
1420              

In [40]:
# Step 4: Filter to fantasy-relevant positions and check data quality

# Filter to main fantasy positions
fantasy_positions = ['QB', 'RB', 'WR', 'TE','K']
fantasy_data = weekly_2023[weekly_2023['position'].isin(fantasy_positions)].copy()

print(f"Filtered dataset shape: {fantasy_data.shape}")
print(f"Removed {weekly_2023.shape[0] - fantasy_data.shape[0]} non-fantasy position records")

Filtered dataset shape: (5537, 53)
Removed 116 non-fantasy position records


In [41]:
key_columns = ['fantasy_points_ppr', 'targets', 'carries', 'attempts', 'week', 'opponent_team']
print("\nMissing values in key columns:")
for col in key_columns:
    missing = fantasy_data[col].isna().sum()
    print(f"{col}: {missing} ({missing/len(fantasy_data)*100:.1f}%)")


Missing values in key columns:
fantasy_points_ppr: 0 (0.0%)
targets: 0 (0.0%)
carries: 0 (0.0%)
attempts: 0 (0.0%)
week: 0 (0.0%)
opponent_team: 0 (0.0%)


In [42]:
# Look at fantasy points distribution by position
print("\nFantasy points distribution by position:")
fantasy_summary = fantasy_data.groupby('position')['fantasy_points_ppr'].agg(['count', 'mean', 'std', 'min', 'max'])
print(fantasy_summary.round(2))


Fantasy points distribution by position:
          count   mean   std   min        max
position                                     
QB          690  13.37  8.91 -2.86  40.799999
RB         1403   8.64  7.81 -1.10  51.299999
TE         1122   6.20  5.92 -0.50  37.299999
WR         2322   8.25  7.81 -1.10  52.200001


In [43]:
# Import schedules (has game-level data)
schedules_2023 = nfl.import_schedules([2023])
print(f"Schedules columns: {schedules_2023.columns.tolist()}")

Schedules columns: ['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday', 'gametime', 'away_team', 'away_score', 'home_team', 'home_score', 'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest', 'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds', 'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game', 'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id', 'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee', 'stadium_id', 'stadium']


In [44]:
print("\nSample schedule data:")
print(schedules_2023[['week', 'away_team', 'home_team', 'away_score', 'home_score']].head())


Sample schedule data:
      week away_team home_team  away_score  home_score
6421     1       DET        KC        21.0        20.0
6422     1       CAR       ATL        10.0        24.0
6423     1       HOU       BAL         9.0        25.0
6424     1       CIN       CLE         3.0        24.0
6425     1       JAX       IND        31.0        21.0


In [45]:
# Check if there are any play-by-play functions that might have defensive stats
print("\nChecking play-by-play data capabilities...")
print("Available PBP columns:")
pbp_cols = nfl.see_pbp_cols()
print(f"Number of PBP columns: {len(pbp_cols)}")


Checking play-by-play data capabilities...
Available PBP columns:
Number of PBP columns: 372


In [46]:
# Look for defensive-related columns in PBP data
defensive_related = [col for col in pbp_cols if any(word in col.lower() for word in ['sack', 'int', 'fumble', 'safety', 'block', 'defense', 'tackle'])]
print(f"\nDefensive-related PBP columns ({len(defensive_related)}):")
for col in defensive_related[:20]:  # Show first 20 to avoid overwhelming output
    print(f"  {col}")


Defensive-related PBP columns (93):
  extra_point_result
  two_point_conv_result
  opp_safety_prob
  safety_prob
  extra_point_prob
  two_point_conversion_prob
  punt_blocked
  interception
  fumble_forced
  fumble_not_forced
  fumble_out_of_bounds
  solo_tackle
  safety
  tackled_for_loss
  fumble_lost
  sack
  extra_point_attempt
  two_point_attempt
  fumble
  assist_tackle


In [47]:
# Let's see what game-level data is available in schedules
print("\nGame-level data available in schedules:")
game_cols = [col for col in schedules_2023.columns if any(word in col.lower() for word in ['score', 'yard', 'time', 'down', 'turnover'])]
print(game_cols)


Game-level data available in schedules:
['gametime', 'away_score', 'home_score', 'overtime']


In [48]:
# Define defensive scoring settings
defense_scoring = {
    'defense_td': 6,
    'sacks': 1,
    'interceptions': 2,
    'fumble_recovery': 2,
    'safety': 2,
    'forced_fumble': 1,
    'blocked_kick': 2
}

# Points allowed scoring tiers
def points_allowed_score(points_allowed):
    """Calculate fantasy points based on points allowed"""
    if pd.isna(points_allowed):
        return 0
    elif points_allowed == 0:
        return 10
    elif 1 <= points_allowed <= 6:
        return 7
    elif 7 <= points_allowed <= 13:
        return 4
    elif 14 <= points_allowed <= 20:
        return 1
    elif 21 <= points_allowed <= 27:  
        return -1
    elif 28 <= points_allowed <= 34:
        return -1
    elif points_allowed >= 35:
        return -4
    else:
        return 0

# Load 2023 data and we'll filter to week 1 after loading
pbp_sample = nfl.import_pbp_data([2023])
print(f"PBP full shape: {pbp_sample.shape}")

# Filter to just week 1 for exploration
pbp_sample = pbp_sample[pbp_sample['week'] == 1]
print(f"PBP week 1 shape: {pbp_sample.shape}")

2023 done.
Downcasting floats.
PBP full shape: (49665, 391)
PBP week 1 shape: (2816, 391)


In [21]:
# Look for defensive stat columns
defensive_cols = [col for col in pbp_sample.columns if any(word in col.lower() 
                 for word in ['sack', 'int', 'fumble', 'safety', 'block', 'defense', 'tackle', 'td'])]
print(f"\nDefensive columns found ({len(defensive_cols)}):")
for col in defensive_cols:
    print(f"  {col}")


Defensive columns found (104):
  extra_point_result
  two_point_conv_result
  td_team
  td_player_name
  td_player_id
  opp_safety_prob
  opp_td_prob
  safety_prob
  td_prob
  extra_point_prob
  two_point_conversion_prob
  punt_blocked
  interception
  fumble_forced
  fumble_not_forced
  fumble_out_of_bounds
  solo_tackle
  safety
  tackled_for_loss
  fumble_lost
  own_kickoff_recovery_td
  sack
  extra_point_attempt
  two_point_attempt
  fumble
  assist_tackle
  lateral_sack_player_id
  lateral_sack_player_name
  interception_player_id
  interception_player_name
  lateral_interception_player_id
  lateral_interception_player_name
  blocked_player_id
  blocked_player_name
  tackle_for_loss_1_player_id
  tackle_for_loss_1_player_name
  tackle_for_loss_2_player_id
  tackle_for_loss_2_player_name
  forced_fumble_player_1_team
  forced_fumble_player_1_player_id
  forced_fumble_player_1_player_name
  forced_fumble_player_2_team
  forced_fumble_player_2_player_id
  forced_fumble_player_2_pla

In [22]:
# Load schedules to get points allowed
schedules_2023 = nfl.import_schedules([2023])
print(f"\nSchedule data shape: {schedules_2023.shape}")
print("\nSchedule columns:")
print(schedules_2023.columns.tolist())


Schedule data shape: (285, 46)

Schedule columns:
['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday', 'gametime', 'away_team', 'away_score', 'home_team', 'home_score', 'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest', 'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds', 'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game', 'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id', 'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee', 'stadium_id', 'stadium']


In [23]:
# Check what game-level stats are available
print("\nSample game data:")
game_sample = schedules_2023[['week', 'away_team', 'home_team', 'away_score', 'home_score']].head()
print(game_sample)


Sample game data:
      week away_team home_team  away_score  home_score
6421     1       DET        KC        21.0        20.0
6422     1       CAR       ATL        10.0        24.0
6423     1       HOU       BAL         9.0        25.0
6424     1       CIN       CLE         3.0        24.0
6425     1       JAX       IND        31.0        21.0


In [24]:
# Step 8: Aggregate team defense stats by week

def create_team_defense_stats(pbp_data, schedules_data):
    """
    Aggregate defensive stats by team and week from play-by-play data
    """
    
    # Filter out rows where defteam is null
    pbp_clean = pbp_data[pbp_data['defteam'].notna()].copy()
    
    # Aggregate defensive stats by team and week
    defense_stats = pbp_clean.groupby(['season', 'week', 'defteam']).agg({
        'sack': 'sum',
        'interception': 'sum', 
        'fumble_forced': 'sum',
        'safety': 'sum',
        'punt_blocked': 'sum',
        # We'll need to handle fumble recoveries and defensive TDs separately
    }).reset_index()
    
    # Handle fumble recoveries (need to check if defending team recovered)
    fumble_recoveries = pbp_clean[pbp_clean['fumble_recovery_1_team'].notna()].groupby(['season', 'week', 'fumble_recovery_1_team']).size().reset_index(name='fumble_recovery')
    fumble_recoveries = fumble_recoveries.rename(columns={'fumble_recovery_1_team': 'defteam'})
    
    # Handle defensive TDs (where td_team equals defteam)
    def_tds = pbp_clean[pbp_clean['td_team'].notna()].copy()
    def_tds['is_def_td'] = def_tds['td_team'] == def_tds['defteam']
    defensive_tds = def_tds[def_tds['is_def_td']].groupby(['season', 'week', 'defteam']).size().reset_index(name='defense_td')
    
    # Merge all defensive stats
    defense_stats = defense_stats.merge(fumble_recoveries, on=['season', 'week', 'defteam'], how='left')
    defense_stats = defense_stats.merge(defensive_tds, on=['season', 'week', 'defteam'], how='left')
    
    # Fill NaN values with 0
    stat_columns = ['sack', 'interception', 'fumble_forced', 'safety', 'punt_blocked', 'fumble_recovery', 'defense_td']
    defense_stats[stat_columns] = defense_stats[stat_columns].fillna(0)
    
    # Add points allowed from schedule data
    # Reshape schedule data to get points allowed by each team
    schedule_away = schedules_data[['season', 'week', 'away_team', 'home_score']].copy()
    schedule_away.columns = ['season', 'week', 'team', 'points_allowed']
    
    schedule_home = schedules_data[['season', 'week', 'home_team', 'away_score']].copy()
    schedule_home.columns = ['season', 'week', 'team', 'points_allowed']
    
    points_allowed = pd.concat([schedule_away, schedule_home], ignore_index=True)
    
    # Merge with defense stats
    defense_stats = defense_stats.merge(
        points_allowed, 
        left_on=['season', 'week', 'defteam'], 
        right_on=['season', 'week', 'team'], 
        how='left'
    ).drop('team', axis=1)
    
    return defense_stats

In [25]:
# Test the function with our sample data
print("Creating team defense stats...")
defense_week1 = create_team_defense_stats(pbp_sample, schedules_2023)

print(f"\nDefense stats shape: {defense_week1.shape}")
print("\nSample defense stats:")
print(defense_week1.head())


Creating team defense stats...

Defense stats shape: (32, 11)

Sample defense stats:
   season  week defteam  sack  interception  fumble_forced  safety  \
0    2023     1     ARI   6.0           1.0            2.0     0.0   
1    2023     1     ATL   2.0           2.0            1.0     0.0   
2    2023     1     BAL   5.0           0.0            1.0     0.0   
3    2023     1     BUF   3.0           1.0            1.0     0.0   
4    2023     1     CAR   4.0           0.0            1.0     0.0   

   punt_blocked  fumble_recovery  defense_td  points_allowed  
0           0.0              3.0         1.0            20.0  
1           0.0              2.0         0.0            10.0  
2           0.0              2.0         0.0             9.0  
3           0.0              1.0         0.0            22.0  
4           0.0              1.0         0.0            24.0  


In [26]:
print("\nDefense stats summary:")
stat_cols = ['sack', 'interception', 'fumble_forced', 'fumble_recovery', 'defense_td', 'safety', 'punt_blocked']
print(defense_week1[stat_cols].describe())


Defense stats summary:
            sack  interception  fumble_forced  fumble_recovery  defense_td  \
count  32.000000      32.00000      32.000000        32.000000   32.000000   
mean    2.687500       0.78125       0.937500         1.187500    0.250000   
std     1.821688       0.87009       0.913607         1.029798    0.508001   
min     0.000000       0.00000       0.000000         0.000000    0.000000   
25%     1.750000       0.00000       0.000000         0.750000    0.000000   
50%     3.000000       1.00000       1.000000         1.000000    0.000000   
75%     4.000000       1.00000       1.250000         2.000000    0.000000   
max     7.000000       3.00000       3.000000         4.000000    2.000000   

       safety  punt_blocked  
count    32.0          32.0  
mean      0.0           0.0  
std       0.0           0.0  
min       0.0           0.0  
25%       0.0           0.0  
50%       0.0           0.0  
75%       0.0           0.0  
max       0.0           0.0  


In [27]:
# Step 9: Calculate team defense fantasy points

def calculate_defense_fantasy_points(defense_df):
    """
    Calculate fantasy points for team defenses based on custom scoring
    """
    df = defense_df.copy()
    
    # Apply points allowed scoring
    df['points_allowed_score'] = df['points_allowed'].apply(points_allowed_score)
    
    # Calculate total fantasy points
    df['defense_fantasy_points'] = (
        df['defense_td'] * defense_scoring['defense_td'] +
        df['sack'] * defense_scoring['sacks'] +
        df['interception'] * defense_scoring['interceptions'] +
        df['fumble_recovery'] * defense_scoring['fumble_recovery'] +
        df['safety'] * defense_scoring['safety'] +
        df['fumble_forced'] * defense_scoring['forced_fumble'] +
        df['punt_blocked'] * defense_scoring['blocked_kick'] +
        df['points_allowed_score']
    )
    
    return df

# Apply fantasy scoring to our defense stats
defense_week1_scored = calculate_defense_fantasy_points(defense_week1)

print("Team Defense Fantasy Points (Week 1 sample):")
print(defense_week1_scored[['defteam', 'points_allowed', 'sack', 'interception', 'fumble_recovery', 'defense_fantasy_points']].sort_values('defense_fantasy_points', ascending=False))

# Now let's create the full season defense stats
print("\nCreating full season defense stats (this will take a moment)...")

# Load full PBP data for 2023
pbp_2023 = nfl.import_pbp_data([2023])
print(f"Full PBP data loaded: {pbp_2023.shape}")

# Create full season defense stats
defense_2023 = create_team_defense_stats(pbp_2023, schedules_2023)
defense_2023_scored = calculate_defense_fantasy_points(defense_2023)

print(f"\nFull season defense stats: {defense_2023_scored.shape}")
print(f"Weeks covered: {sorted(defense_2023_scored['week'].unique())}")

# Show top defensive performances
print("\nTop 10 defensive performances of 2023:")
top_def = defense_2023_scored.nlargest(10, 'defense_fantasy_points')[
    ['week', 'defteam', 'points_allowed', 'sack', 'interception', 'fumble_recovery', 'defense_td', 'defense_fantasy_points']
]
print(top_def)

# Season averages by team
print("\nSeason averages by team:")
team_averages = defense_2023_scored.groupby('defteam')['defense_fantasy_points'].agg(['mean', 'std', 'min', 'max']).sort_values('mean', ascending=False)
print(team_averages.round(2).head(10))

# Save for later use
print("\nDefense data ready for modeling!")

Team Defense Fantasy Points (Week 1 sample):
   defteam  points_allowed  sack  interception  fumble_recovery  \
8      DAL             0.0   7.0           2.0              2.0   
0      ARI            20.0   6.0           1.0              3.0   
11      GB            20.0   4.0           1.0              3.0   
24     NYJ            16.0   5.0           3.0              0.0   
28      SF             7.0   5.0           2.0              2.0   
1      ATL            10.0   2.0           2.0              2.0   
2      BAL             9.0   5.0           0.0              2.0   
25     PHI            20.0   2.0           1.0              1.0   
14     JAX            21.0   4.0           1.0              3.0   
22      NO            15.0   3.0           3.0              1.0   
30     TEN            16.0   4.0           1.0              1.0   
7      CLE             3.0   2.0           0.0              1.0   
10     DET            20.0   0.0           1.0              1.0   
13     IND       

In [28]:
scoring_settings = {
    # Passing
    'passing_yards': 0.04,  # 1 point per 25 yards (25 * 0.04 = 1)
    'passing_tds': 4,       # 4 points per TD (vs standard 4 or 6)
    'interceptions': -1,    # -1 points per INT
    'passing_2pt_conversions': 2,
    
    # Rushing
    'rushing_yards': 0.1,   # 1 point per 10 yards
    'rushing_tds': 6,       # 6 points per TD
    'rushing_2pt_conversions': 2,
    
    # Receiving
    'receiving_yards': 0.1, # 1 point per 10 yards
    'receiving_tds': 6,     # 6 points per TD
    'receptions': 1,        # 1 point per reception (PPR)
    'receiving_2pt_conversions': 2,
    
    # Fumbles
    'rushing_fumbles_lost': -2,
    'receiving_fumbles_lost': -2,
    'sack_fumbles_lost': -2,
    
    # Special teams
    'special_teams_tds': 6,

    # team defense
    
}


In [29]:
# Step 10: Explore kicking data and setup scoring

# First, let's see what kicking-related columns we have in our weekly data
print("Kicking-related columns in weekly data:")
kicking_cols = [col for col in fantasy_data.columns if any(word in col.lower() 
               for word in ['kick', 'field', 'extra', 'fg', 'xp', 'pat'])]
print(kicking_cols)

Kicking-related columns in weekly data:
[]


In [30]:
# Check if we have kickers in our position data
print("\nKicker data in weekly stats:")
kicker_data = fantasy_data[fantasy_data['position'] == 'K'] if 'K' in fantasy_data['position'].values else pd.DataFrame()
print(f"Kicker records found: {len(kicker_data)}")


Kicker data in weekly stats:
Kicker records found: 0


In [31]:
# Let's also check play-by-play data for kicking events
print("\nKicking-related columns in play-by-play data:")
pbp_kicking_cols = [col for col in pbp_sample.columns if any(word in col.lower() 
                   for word in ['kick', 'field', 'extra', 'fg', 'xp', 'pat'])]
print(pbp_kicking_cols)


Kicking-related columns in play-by-play data:
['side_of_field', 'field_goal_result', 'kick_distance', 'extra_point_result', 'opp_fg_prob', 'fg_prob', 'extra_point_prob', 'kickoff_inside_twenty', 'kickoff_in_endzone', 'kickoff_out_of_bounds', 'kickoff_downed', 'kickoff_fair_catch', 'own_kickoff_recovery', 'own_kickoff_recovery_td', 'extra_point_attempt', 'field_goal_attempt', 'kickoff_attempt', 'kickoff_returner_player_name', 'kickoff_returner_player_id', 'lateral_kickoff_returner_player_id', 'lateral_kickoff_returner_player_name', 'kicker_player_name', 'kicker_player_id', 'own_kickoff_recovery_player_id', 'own_kickoff_recovery_player_name', 'defensive_extra_point_attempt', 'defensive_extra_point_conv', 'home_opening_kickoff', 'xpass']


In [32]:
# Look at some sample kicking plays
print("\nSample field goal attempts:")
fg_attempts = pbp_sample[pbp_sample['field_goal_attempt'] == 1]
if len(fg_attempts) > 0:
    print(f"Found {len(fg_attempts)} field goal attempts in week 1")
    print(fg_attempts[['kicker_player_name', 'kick_distance', 'field_goal_result']].head())
else:
    print("No field goal attempts found in sample")


Sample field goal attempts:
Found 66 field goal attempts in week 1
    kicker_player_name  kick_distance field_goal_result
36            M.Prater           28.0              made
46            M.Prater           54.0              made
89              J.Slye           30.0              made
102           M.Prater           37.0              made
156             J.Slye           33.0              made


In [33]:
print("\nSample extra point attempts:")
xp_attempts = pbp_sample[pbp_sample['extra_point_attempt'] == 1]
if len(xp_attempts) > 0:
    print(f"Found {len(xp_attempts)} extra point attempts in week 1")
    print(xp_attempts[['kicker_player_name', 'extra_point_result']].head())
else:
    print("No extra point attempts found in sample")


Sample extra point attempts:
Found 63 extra point attempts in week 1
    kicker_player_name extra_point_result
26              J.Slye               good
80            M.Prater               good
135             J.Slye               good
232             T.Bass               good
305         G.Zuerlein               good


In [34]:
# Check if we need to go back to the original weekly data to include kickers
print("\nChecking original weekly data for kickers...")
weekly_2023_kickers = weekly_2023[weekly_2023['position'] == 'K']
print(f"Kickers in original data: {len(weekly_2023_kickers)}")


Checking original weekly data for kickers...
Kickers in original data: 0


In [35]:
if len(weekly_2023_kickers) > 0:
    print("\nSample kicker weekly stats:")
    print(weekly_2023_kickers[['player_name', 'week', 'fantasy_points', 'fantasy_points_ppr']].head())

In [36]:
# Step 11: Build kicker fantasy scoring system

# Define kicking scoring settings
kicking_scoring = {
    'pat_made': 1,      # Extra point made
    'pat_missed': -1,   # Extra point missed
    'fg_missed': -1,    # Field goal missed
}

def fg_distance_points(distance):
    """Calculate points based on field goal distance"""
    if pd.isna(distance):
        return 0
    elif distance <= 19:
        return 3
    elif 20 <= distance <= 29:
        return 3
    elif 30 <= distance <= 39:
        return 3
    elif 40 <= distance <= 49:
        return 4
    elif 50 <= distance <= 59:
        return 5
    elif distance >= 60:
        return 6
    else:
        return 0

def create_kicker_stats(pbp_data):
    """
    Aggregate kicker stats by player and week from play-by-play data
    """
    
    # Field goal attempts
    fg_data = pbp_data[pbp_data['field_goal_attempt'] == 1].copy()
    fg_data = fg_data[fg_data['kicker_player_name'].notna()]
    
    # Extra point attempts  
    xp_data = pbp_data[pbp_data['extra_point_attempt'] == 1].copy()
    xp_data = xp_data[xp_data['kicker_player_name'].notna()]
    
    # Aggregate field goals
    fg_stats = fg_data.groupby(['season', 'week', 'kicker_player_name']).agg({
        'field_goal_result': lambda x: (x == 'made').sum(),  # FG made
        'kick_distance': 'count'  # Total FG attempts
    }).reset_index()
    fg_stats.columns = ['season', 'week', 'player_name', 'fg_made', 'fg_attempts']
    fg_stats['fg_missed'] = fg_stats['fg_attempts'] - fg_stats['fg_made']
    
    # Calculate FG points by distance
    fg_distance_stats = []
    for _, row in fg_data.iterrows():
        if row['field_goal_result'] == 'made':
            points = fg_distance_points(row['kick_distance'])
            fg_distance_stats.append({
                'season': row['season'],
                'week': row['week'],
                'player_name': row['kicker_player_name'],
                'fg_points': points
            })
    
    fg_points_df = pd.DataFrame(fg_distance_stats)
    if len(fg_points_df) > 0:
        fg_points_agg = fg_points_df.groupby(['season', 'week', 'player_name'])['fg_points'].sum().reset_index()
    else:
        fg_points_agg = pd.DataFrame(columns=['season', 'week', 'player_name', 'fg_points'])
    
    # Aggregate extra points
    xp_stats = xp_data.groupby(['season', 'week', 'kicker_player_name']).agg({
        'extra_point_result': lambda x: (x == 'good').sum(),  # XP made
        'extra_point_attempt': 'count'  # Total XP attempts
    }).reset_index()
    xp_stats.columns = ['season', 'week', 'player_name', 'xp_made', 'xp_attempts']
    xp_stats['xp_missed'] = xp_stats['xp_attempts'] - xp_stats['xp_made']
    
    # Merge all kicker stats
    kicker_stats = fg_stats.merge(xp_stats, on=['season', 'week', 'player_name'], how='outer')
    kicker_stats = kicker_stats.merge(fg_points_agg, on=['season', 'week', 'player_name'], how='left')
    
    # Fill NaN values with 0
    stat_columns = ['fg_made', 'fg_attempts', 'fg_missed', 'xp_made', 'xp_attempts', 'xp_missed', 'fg_points']
    kicker_stats[stat_columns] = kicker_stats[stat_columns].fillna(0)
    
    return kicker_stats

# Test with week 1 sample
print("Creating kicker stats for week 1...")
kicker_week1 = create_kicker_stats(pbp_sample)

# Let's also check what the actual field goal and extra point results look like
print("\nField goal results breakdown:")
if len(pbp_sample[pbp_sample['field_goal_attempt'] == 1]) > 0:
    fg_results = pbp_sample[pbp_sample['field_goal_attempt'] == 1]['field_goal_result'].value_counts()
    print(fg_results)

print("\nExtra point results breakdown:")
if len(pbp_sample[pbp_sample['extra_point_attempt'] == 1]) > 0:
    xp_results = pbp_sample[pbp_sample['extra_point_attempt'] == 1]['extra_point_result'].value_counts()
    print(xp_results)

print(f"Kicker stats shape: {kicker_week1.shape}")
print("\nSample kicker stats:")
print(kicker_week1.head())

# Calculate fantasy points
def calculate_kicker_fantasy_points(kicker_df):
    """Calculate fantasy points for kickers"""
    df = kicker_df.copy()
    
    df['kicker_fantasy_points'] = (
        df['fg_points'] +  # Distance-based FG points
        df['xp_made'] * kicking_scoring['pat_made'] +
        df['xp_missed'] * kicking_scoring['pat_missed'] +
        df['fg_missed'] * kicking_scoring['fg_missed']
    )
    
    return df

kicker_week1_scored = calculate_kicker_fantasy_points(kicker_week1)

print("\nKicker Fantasy Points (Week 1 sample):")
print(kicker_week1_scored[['player_name', 'fg_made', 'fg_missed', 'xp_made', 'xp_missed', 'kicker_fantasy_points']].sort_values('kicker_fantasy_points', ascending=False))

Creating kicker stats for week 1...

Field goal results breakdown:
made       59
missed      5
blocked     2
Name: field_goal_result, dtype: int64

Extra point results breakdown:
good      59
failed     4
Name: extra_point_result, dtype: int64
Kicker stats shape: (32, 10)

Sample kicker stats:
   season  week player_name  fg_made  fg_attempts  fg_missed  xp_made  \
0    2023     1   A.Carlson      1.0          1.0        0.0      5.0   
1    2023     1    B.Aubrey      2.0          2.0        0.0      4.0   
2    2023     1     B.Grupe      3.0          3.0        0.0      1.0   
3    2023     1     B.Maher      3.0          5.0        2.0      3.0   
4    2023     1   B.McManus      1.0          1.0        0.0      4.0   

   xp_attempts  xp_missed  fg_points  
0          5.0        0.0        5.0  
1          5.0        1.0        6.0  
2          1.0        0.0       11.0  
3          3.0        0.0       12.0  
4          4.0        0.0        4.0  

Kicker Fantasy Points (Week 1 s

In [37]:
# Step 11 (Revised): Build kicker fantasy scoring system

# First, let's examine the actual values in the kicking columns
print("Examining field goal attempts in week 1:")
fg_attempts = pbp_sample[pbp_sample['field_goal_attempt'] == 1]
print(f"Total FG attempts: {len(fg_attempts)}")

if len(fg_attempts) > 0:
    print("\nField goal result values:")
    print(fg_attempts['field_goal_result'].value_counts())
    
    print("\nSample field goal data:")
    print(fg_attempts[['kicker_player_name', 'kick_distance', 'field_goal_result']].head(10))

print("\nExamining extra point attempts in week 1:")
xp_attempts = pbp_sample[pbp_sample['extra_point_attempt'] == 1]
print(f"Total XP attempts: {len(xp_attempts)}")

if len(xp_attempts) > 0:
    print("\nExtra point result values:")
    print(xp_attempts['extra_point_result'].value_counts())
    
    print("\nSample extra point data:")
    print(xp_attempts[['kicker_player_name', 'extra_point_result']].head(10))

# Check for any null values in key columns
print("\nChecking for missing data:")
print(f"FG attempts with missing kicker: {fg_attempts['kicker_player_name'].isna().sum()}")
print(f"FG attempts with missing distance: {fg_attempts['kick_distance'].isna().sum()}")
print(f"FG attempts with missing result: {fg_attempts['field_goal_result'].isna().sum()}")

print(f"XP attempts with missing kicker: {xp_attempts['kicker_player_name'].isna().sum()}")
print(f"XP attempts with missing result: {xp_attempts['extra_point_result'].isna().sum()}")

Examining field goal attempts in week 1:
Total FG attempts: 66

Field goal result values:
made       59
missed      5
blocked     2
Name: field_goal_result, dtype: int64

Sample field goal data:
    kicker_player_name  kick_distance field_goal_result
36            M.Prater           28.0              made
46            M.Prater           54.0              made
89              J.Slye           30.0              made
102           M.Prater           37.0              made
156             J.Slye           33.0              made
201             T.Bass           40.0              made
219         G.Zuerlein           26.0              made
248             T.Bass           34.0              made
264         G.Zuerlein           43.0              made
317         G.Zuerlein           30.0              made

Examining extra point attempts in week 1:
Total XP attempts: 63

Extra point result values:
good      59
failed     4
Name: extra_point_result, dtype: int64

Sample extra point data:
    k

In [None]:
# Step 12: Build correct kicker scoring system

def fg_distance_points(distance):
    """Calculate points based on field goal distance"""
    if pd.isna(distance):
        return 0
    elif distance <= 19:
        return 3
    elif 20 <= distance <= 29:
        return 3
    elif 30 <= distance <= 39:
        return 3
    elif 40 <= distance <= 49:
        return 4
    elif 50 <= distance <= 59:
        return 5
    elif distance >= 60:
        return 6
    else:
        return 0

def create_kicker_stats_correct(pbp_data):
    """
    Aggregate kicker stats by player and week from play-by-play data
    Using correct field goal and extra point result values
    """
    
    # Field goal attempts
    fg_data = pbp_data[pbp_data['field_goal_attempt'] == 1].copy()
    fg_data = fg_data[fg_data['kicker_player_name'].notna()]
    
    # Extra point attempts  
    xp_data = pbp_data[pbp_data['extra_point_attempt'] == 1].copy()
    xp_data = xp_data[xp_data['kicker_player_name'].notna()]
    
    kicker_stats_list = []
    
    # Process each kicker's stats
    for kicker in fg_data['kicker_player_name'].unique():
        kicker_fg = fg_data[fg_data['kicker_player_name'] == kicker]
        kicker_xp = xp_data[xp_data['kicker_player_name'] == kicker]
        
        # Group by week for this kicker
        for week in pbp_data['week'].unique():
            week_fg = kicker_fg[kicker_fg['week'] == week]
            week_xp = kicker_xp[kicker_xp['week'] == week]
            
            if len(week_fg) > 0 or len(week_xp) > 0:
                # Field goal stats
                fg_made = len(week_fg[week_fg['field_goal_result'] == 'made'])
                fg_missed = len(week_fg[week_fg['field_goal_result'].isin(['missed', 'blocked'])])
                fg_attempts = len(week_fg)
                
                # Calculate FG points by distance
                fg_points = 0
                for _, fg in week_fg[week_fg['field_goal_result'] == 'made'].iterrows():
                    fg_points += fg_distance_points(fg['kick_distance'])
                
                # Extra point stats
                xp_made = len(week_xp[week_xp['extra_point_result'] == 'good'])
                xp_missed = len(week_xp[week_xp['extra_point_result'] == 'failed'])
                xp_attempts = len(week_xp)
                
                # Get season (should be consistent within the data)
                season = pbp_data['season'].iloc[0] if 'season' in pbp_data.columns else 2023
                
                kicker_stats_list.append({
                    'season': season,
                    'week': week,
                    'player_name': kicker,
                    'fg_made': fg_made,
                    'fg_missed': fg_missed,
                    'fg_attempts': fg_attempts,
                    'fg_points': fg_points,
                    'xp_made': xp_made,
                    'xp_missed': xp_missed,
                    'xp_attempts': xp_attempts
                })
    
    return pd.DataFrame(kicker_stats_list)

def calculate_kicker_fantasy_points(kicker_df):
    """Calculate fantasy points for kickers based on custom scoring"""
    df = kicker_df.copy()
    
    df['kicker_fantasy_points'] = (
        df['fg_points'] +  # Distance-based FG points
        df['xp_made'] * 1 +  # XP made: 1 point
        df['xp_missed'] * -1 +  # XP missed: -1 point
        df['fg_missed'] * -1  # FG missed: -1 point
    )
    
    return df

# Test with week 1 sample
print("Creating corrected kicker stats for week 1...")
kicker_week1_correct = create_kicker_stats_correct(pbp_sample)
print(f"Kicker stats shape: {kicker_week1_correct.shape}")

if len(kicker_week1_correct) > 0:
    print("\nSample kicker stats:")
    print(kicker_week1_correct.head())
    
    # Calculate fantasy points
    kicker_week1_scored = calculate_kicker_fantasy_points(kicker_week1_correct)
    
    print("\nKicker Fantasy Points (Week 1):")
    display_cols = ['player_name', 'fg_made', 'fg_missed', 'xp_made', 'xp_missed', 'kicker_fantasy_points']
    print(kicker_week1_scored[display_cols].sort_values('kicker_fantasy_points', ascending=False))
    
    # Show distance breakdown for made field goals
    print("\nField goal distance breakdown:")
    made_fgs = pbp_sample[(pbp_sample['field_goal_attempt'] == 1) & (pbp_sample['field_goal_result'] == 'made')]
    if len(made_fgs) > 0:
        distance_summary = made_fgs.groupby('kicker_player_name')['kick_distance'].agg(['count', 'mean', 'min', 'max'])
        print(distance_summary.round(1))
else:
    print("No kicker stats found")

In [50]:
# Step 13 (Complete): Create complete fantasy dataset with all positions

# First, redefine the custom fantasy scoring function
scoring_settings = {
    # Passing
    'passing_yards': 0.04,  # 1 point per 25 yards (25 * 0.04 = 1)
    'passing_tds': 4,       # 4 points per TD
    'interceptions': -2,    # -2 points per INT
    'passing_2pt_conversions': 2,
    
    # Rushing
    'rushing_yards': 0.1,   # 1 point per 10 yards
    'rushing_tds': 6,       # 6 points per TD
    'rushing_2pt_conversions': 2,
    
    # Receiving
    'receiving_yards': 0.1, # 1 point per 10 yards
    'receiving_tds': 6,     # 6 points per TD
    'receptions': 1,        # 1 point per reception (PPR)
    'receiving_2pt_conversions': 2,
    
    # Fumbles
    'rushing_fumbles_lost': -2,
    'receiving_fumbles_lost': -2,
    'sack_fumbles_lost': -2,
    
    # Special teams
    'special_teams_tds': 6,
}

def calculate_custom_fantasy_points(row, scoring_settings):
    """Calculate fantasy points based on custom scoring settings"""
    points = 0
    
    # Go through each scoring category
    for stat, point_value in scoring_settings.items():
        if stat in row and pd.notna(row[stat]):
            points += row[stat] * point_value
    
    return points

print("Creating comprehensive fantasy dataset for 2023...")

# Apply custom scoring to individual players
print("\nApplying custom fantasy scoring to individual players...")
fantasy_data['custom_fantasy_points'] = fantasy_data.apply(
    lambda row: calculate_custom_fantasy_points(row, scoring_settings), 
    axis=1
)

print(f"Individual player data shape: {fantasy_data.shape}")
print(f"Sample custom scoring:")
sample_players = fantasy_data.nlargest(5, 'custom_fantasy_points')[
    ['player_display_name', 'position', 'week', 'fantasy_points_ppr', 'custom_fantasy_points']
]
print(sample_players)

# Create kicker data (we already have the functions from previous steps)
print("\nCreating full season kicker stats...")
print("This may take a moment to process play-by-play data...")

kicker_2023_full = create_kicker_stats_correct(pbp_2023)
kicker_2023_scored = calculate_kicker_fantasy_points(kicker_2023_full)

print(f"Kicker data created: {len(kicker_2023_scored)} records")

# Format kicker data
kicker_formatted = kicker_2023_scored.rename(columns={
    'player_name': 'player_display_name',
    'kicker_fantasy_points': 'custom_fantasy_points'
})
kicker_formatted['position'] = 'K'
kicker_formatted['position_group'] = 'K'

# Format defense data (we already created this)
defense_formatted = defense_2023_scored.rename(columns={
    'defteam': 'player_display_name',
    'defense_fantasy_points': 'custom_fantasy_points'
})
defense_formatted['position'] = 'DEF'
defense_formatted['position_group'] = 'DEF'

# Combine all data
print("\nCombining all fantasy positions...")
base_cols = ['season', 'week', 'player_display_name', 'position', 'position_group', 'custom_fantasy_points']

final_fantasy_data = pd.concat([
    fantasy_data[base_cols],
    kicker_formatted[base_cols],
    defense_formatted[base_cols]
], ignore_index=True)

print(f"\n🎉 Final comprehensive dataset created!")
print(f"Shape: {final_fantasy_data.shape}")
print(f"Unique players/teams: {final_fantasy_data['player_display_name'].nunique()}")
print(f"Weeks covered: {sorted(final_fantasy_data['week'].unique())}")

# Position breakdown
print("\nFinal position breakdown:")
position_counts = final_fantasy_data['position'].value_counts().sort_index()
print(position_counts)

# Average points by position
print("\nAverage fantasy points by position:")
avg_by_position = final_fantasy_data.groupby('position')['custom_fantasy_points'].agg(['count', 'mean', 'std']).round(2)
print(avg_by_position)

# Top performers by position
print("\nTop performer in each position:")
for pos in sorted(final_fantasy_data['position'].unique()):
    top_player = final_fantasy_data[final_fantasy_data['position'] == pos].nlargest(1, 'custom_fantasy_points')
    if len(top_player) > 0:
        player_info = top_player.iloc[0]
        print(f"{pos}: {player_info['player_display_name']} - Week {player_info['week']} - {player_info['custom_fantasy_points']:.1f} pts")

print(f"\n✅ Phase 1 Complete! Ready for Phase 2: Draft Analysis & Modeling")
print(f"📊 Dataset includes: {', '.join(sorted(final_fantasy_data['position'].unique()))}")

Creating comprehensive fantasy dataset for 2023...

Applying custom fantasy scoring to individual players...
Individual player data shape: (5537, 54)
Sample custom scoring:
      player_display_name position  week  fantasy_points_ppr  \
3587        Ja'Marr Chase       WR     5           52.200001   
443          Amari Cooper       WR    16           51.500000   
5384        De'Von Achane       RB     3           51.299999   
1895             DJ Moore       WR     5           49.000000   
917   Christian McCaffrey       RB     4           48.700001   

      custom_fantasy_points  
3587                   52.2  
443                    51.5  
5384                   51.3  
1895                   49.0  
917                    48.7  

Creating full season kicker stats...
This may take a moment to process play-by-play data...
Kicker data created: 563 records

Combining all fantasy positions...

🎉 Final comprehensive dataset created!
Shape: (6670, 6)
Unique players/teams: 620
Weeks covered: [1

In [51]:
# Export dataset for Phase 2

import os
from datetime import datetime

# Create a data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Export the main dataset
print("Exporting fantasy football dataset...")

# Save as CSV (human readable, good for inspection)
final_fantasy_data.to_csv('data/fantasy_data_2023.csv', index=False)
print("✅ Saved: data/fantasy_data_2023.csv")

# Save as pickle (preserves data types, faster loading)
final_fantasy_data.to_pickle('data/fantasy_data_2023.pkl')
print("✅ Saved: data/fantasy_data_2023.pkl")

# Also save component datasets for reference
fantasy_data.to_csv('data/individual_players_2023.csv', index=False)
defense_2023_scored.to_csv('data/team_defense_2023.csv', index=False)
kicker_2023_scored.to_csv('data/kickers_2023.csv', index=False)

print("✅ Saved component datasets")

# Save metadata and info about the dataset
metadata = {
    'created_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_records': len(final_fantasy_data),
    'unique_players': final_fantasy_data['player_display_name'].nunique(),
    'positions': sorted(final_fantasy_data['position'].unique()),
    'weeks_covered': sorted(final_fantasy_data['week'].unique()),
    'scoring_settings': scoring_settings,
    'defense_scoring': defense_scoring,
    'kicking_scoring': {
        'pat_made': 1,
        'pat_missed': -1, 
        'fg_missed': -1,
        'fg_distance_tiers': {
            '0-19': 3, '20-29': 3, '30-39': 3,
            '40-49': 4, '50-59': 5, '60+': 6
        }
    }
}

import json
with open('data/dataset_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✅ Saved: data/dataset_metadata.json")

# Create a data summary
print("\n📋 Dataset Summary:")
print(f"Main dataset: {final_fantasy_data.shape[0]:,} records, {final_fantasy_data.shape[1]} columns")
print(f"Date range: Weeks {final_fantasy_data['week'].min()}-{final_fantasy_data['week'].max()}")
print(f"File sizes:")
print(f"  - fantasy_data_2023.csv: ~{os.path.getsize('data/fantasy_data_2023.csv')/1024:.1f} KB")
print(f"  - fantasy_data_2023.pkl: ~{os.path.getsize('data/fantasy_data_2023.pkl')/1024:.1f} KB")

# Show what files were created
print(f"\n📁 Files created in 'data/' directory:")
for file in sorted(os.listdir('data')):
    print(f"  - {file}")

print(f"\n🚀 Ready for Phase 2!")
print(f"Recommended imports for your new notebook:")
print(f"""
import pandas as pd
import numpy as np

# Load the main dataset
df = pd.read_pickle('data/fantasy_data_2023.pkl')
# OR: df = pd.read_csv('data/fantasy_data_2023.csv')

# Load metadata
import json
with open('data/dataset_metadata.json', 'r') as f:
    metadata = json.load(f)
    
print(f"Dataset loaded: {{df.shape[0]:,}} records")
print(f"Positions: {{metadata['positions']}}")
""")

Exporting fantasy football dataset...
✅ Saved: data/fantasy_data_2023.csv
✅ Saved: data/fantasy_data_2023.pkl
✅ Saved component datasets


TypeError: Object of type int64 is not JSON serializable

## Old Code

In [4]:
print("\n📊 Getting sample data (2023, Week 1)...")
sample_data = nfl.import_weekly_data([2023], columns=['player_display_name', 'position', 'recent_team', 'week', 'fantasy_points_ppr'])



📊 Getting sample data (2023, Week 1)...
Downcasting floats.


In [None]:
print(f"Sample data shape: {sample_data.shape}")
print(f"Columns available: {len(sample_data.columns)}")
sample_data.head()

In [9]:
print("Loading 2023 data to see all available columns...")
full_data_2023 = nfl.import_weekly_data([2023])

Loading 2023 data to see all available columns...
Downcasting floats.


In [10]:
# Group columns by category for easier understanding
stat_columns = [col for col in full_data_2023.columns if any(x in col for x in ['yards', 'tds', 'attempts', 'completions', 'carries', 'receptions', 'targets'])]
info_columns = [col for col in full_data_2023.columns if any(x in col for x in ['player', 'team', 'position', 'week', 'season'])]
fantasy_columns = [col for col in full_data_2023.columns if 'fantasy' in col]

In [14]:
print(f"\n📊 STATISTICAL COLUMNS ({len(stat_columns)}):")
for col in stat_columns[:10]:  # Show first 10
    print(f"  - {col}")
if len(stat_columns) > 10:
    print(f"  ... and {len(stat_columns)-10} more")

print(f"\n📋 INFO COLUMNS ({len(info_columns)}):")
for col in info_columns:
    print(f"  - {col}")

print(f"\n🏆 FANTASY COLUMNS ({len(fantasy_columns)}):")
for col in fantasy_columns:
    print(f"  - {col}")


📊 STATISTICAL COLUMNS (18):
  - completions
  - attempts
  - passing_yards
  - passing_tds
  - sack_yards
  - passing_air_yards
  - passing_yards_after_catch
  - carries
  - rushing_yards
  - rushing_tds
  ... and 8 more

📋 INFO COLUMNS (11):
  - player_id
  - player_name
  - player_display_name
  - position
  - position_group
  - recent_team
  - season
  - week
  - season_type
  - opponent_team
  - special_teams_tds

🏆 FANTASY COLUMNS (2):
  - fantasy_points
  - fantasy_points_ppr


In [None]:
# Show a few players from different positions
sample_players = full_data_2023[full_data_2023['week'] == 1].groupby('position').head(2)
display_columns = ['player_display_name', 'position', 'recent_team', 'fantasy_points_ppr', 'passing_yards', 'rushing_yards', 'receiving_yards']
print("Sample players from Week 1, 2023:")
sample_players[display_columns].head(10)

Sample players from Week 1, 2023:


Unnamed: 0,player_display_name,position,recent_team,fantasy_points_ppr,passing_yards,rushing_yards,receiving_yards
0,Aaron Rodgers,QB,NYJ,0.0,0.0,0.0,0.0
12,Matthew Stafford,QB,LA,14.46,334.0,11.0,0.0
49,Randall Cobb,WR,NYJ,0.0,0.0,0.0,0.0
90,Marvin Jones,WR,DET,0.8,0.0,0.0,8.0
142,Zach Ertz,TE,ARI,8.1,0.0,0.0,21.0
208,Latavius Murray,RB,BUF,2.7,0.0,8.0,9.0
312,Logan Thomas,TE,WAS,8.3,0.0,0.0,43.0
354,Jerick McKinnon,RB,KC,2.0,0.0,0.0,10.0
504,Michael Burton,FB,DEN,1.3,0.0,0.0,3.0
747,C.J. Ham,FB,MIN,2.7,0.0,0.0,7.0


In [None]:
# Cell 5: Create our data collection class (interactive version)
class NFLDataCollector:
    """
    Interactive NFL Data Collector for Jupyter learning
    """
    
    def __init__(self):
        self.player_stats = None
        self.years = None
        self.fantasy_positions = ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']
        print("🏈 NFL Data Collector initialized!")
        print("Ready to collect fantasy football data...")
        
    def collect_data(self, years=[2022, 2023], sample_size=None):
        """
        Collect player statistics - with option to sample for faster learning
        """
        print(f"📊 Collecting data for years: {years}")
        
        # Get the data
        self.player_stats = nfl.import_weekly_data(years)
        self.years = years
        
        # Sample if requested (good for initial exploration)
        if sample_size:
            self.player_stats = self.player_stats.sample(sample_size, random_state=42)
            print(f"🎯 Sampled {sample_size} records for faster processing")
        
        print(f"✅ Collected {len(self.player_stats):,} player-week records")
        print(f"📅 Years: {sorted(self.player_stats['season'].unique())}")
        print(f"👥 Positions: {sorted(self.player_stats['position'].unique())}")
        print(f"🏟️ Teams: {self.player_stats['recent_team'].nunique()} teams")
        
        return self.player_stats
    
    def quick_explore(self):
        """
        Quick data exploration
        """
        if self.player_stats is None:
            print("❌ No data collected yet! Run collect_data() first.")
            return
            
        df = self.player_stats
        
        print("🔍 QUICK DATA EXPLORATION")
        print("=" * 30)
        
        print(f"📊 Dataset shape: {df.shape}")
        print(f"👤 Unique players: {df['player_display_name'].nunique():,}")
        print(f"📅 Date range: {df['season'].min()} - {df['season'].max()}")
        print(f"📈 Week range: {df['week'].min()} - {df['week'].max()}")
        
        print(f"\n📋 Position breakdown:")
        print(df['position'].value_counts().head(10))
        
        print(f"\n🏆 Fantasy points summary:")
        if 'fantasy_points_ppr' in df.columns:
            print(df['fantasy_points_ppr'].describe())
        else:
            print("No pre-calculated fantasy points found - we'll create our own!")
        
        print(f"\n❓ Missing data check:")
        missing = df.isnull().sum()
        missing = missing[missing > 0].sort_values(ascending=False)
        if len(missing) > 0:
            print(missing.head(10))
        else:
            print("No missing data - great!")
    
    def create_custom_fantasy_points(self, scoring='half_ppr'):
        """
        Create our own fantasy points calculation
        """
        if self.player_stats is None:
            print("❌ No data collected yet!")
            return
            
        print(f"🏆 Creating custom fantasy points ({scoring} scoring)...")
        
        df = self.player_stats.copy()
        
        # Initialize
        df['custom_fantasy_points'] = 0.0
        
        # Passing (1 pt per 25 yards, 4 pts per TD, -1 per INT)
        df['custom_fantasy_points'] += df['passing_yards'].fillna(0) / 25
        df['custom_fantasy_points'] += df['passing_tds'].fillna(0) * 4
        df['custom_fantasy_points'] -= df['interceptions'].fillna(0) * 1
        
        # Rushing (1 pt per 10 yards, 6 pts per TD)
        df['custom_fantasy_points'] += df['rushing_yards'].fillna(0) / 10
        df['custom_fantasy_points'] += df['rushing_tds'].fillna(0) * 6
        
        # Receiving (1 pt per 10 yards, 6 pts per TD)
        df['custom_fantasy_points'] += df['receiving_yards'].fillna(0) / 10
        df['custom_fantasy_points'] += df['receiving_tds'].fillna(0) * 6
        
        # Reception bonus
        if scoring == 'ppr':
            df['custom_fantasy_points'] += df['receptions'].fillna(0) * 1.0
        elif scoring == 'half_ppr':
            df['custom_fantasy_points'] += df['receptions'].fillna(0) * 0.5
        
        # Fumbles
        df['custom_fantasy_points'] -= df['fumbles_lost'].fillna(0) * 2
        
        # Round to 2 decimal places
        df['custom_fantasy_points'] = df['custom_fantasy_points'].round(2)
        
        self.player_stats = df
        
        print(f"✅ Custom fantasy points created!")
        print(f"📊 Average points per game: {df['custom_fantasy_points'].mean():.2f}")
        print(f"🏆 Highest single game: {df['custom_fantasy_points'].max():.2f}")
        print(f"👤 Player with highest game: {df[df['custom_fantasy_points'] == df['custom_fantasy_points'].max()]['player_display_name'].iloc[0]}")
        
        return df
    
    def show_top_performers(self, position=None, metric='custom_fantasy_points'):
        """
        Show top performers
        """
        if self.player_stats is None:
            print("❌ No data available!")
            return
            
        df = self.player_stats
        
        if position:
            df = df[df['position'] == position]
            title = f"🏆 TOP {position} PERFORMERS"
        else:
            title = "🏆 TOP OVERALL PERFORMERS"
            
        print(title)
        print("=" * len(title))
        
        # Total points
        total_points = df.groupby('player_display_name')[metric].sum().sort_values(ascending=False)
        
        print(f"\n📊 By total {metric}:")
        for i, (player, points) in enumerate(total_points.head(10).items(), 1):
            print(f"{i:2d}. {player}: {points:.1f} points")
        
        # Average points
        avg_points = df.groupby('player_display_name')[metric].mean().sort_values(ascending=False)
        
        print(f"\n📈 By average {metric} per game:")
        for i, (player, points) in enumerate(avg_points.head(10).items(), 1):
            games = df[df['player_display_name'] == player].shape[0]
            print(f"{i:2d}. {player}: {points:.1f} ppg ({games} games)")


In [20]:
# Initialize our collector
collector = NFLDataCollector()

🏈 NFL Data Collector initialized!
Ready to collect fantasy football data...


In [21]:
print("\n📊 First, let's collect a sample to explore...")
sample_data = collector.collect_data(years=[2023], sample_size=5000)


📊 First, let's collect a sample to explore...
📊 Collecting data for years: [2023]
Downcasting floats.
🎯 Sampled 5000 records for faster processing
✅ Collected 5,000 player-week records
📅 Years: [2023]
👥 Positions: ['CB', 'DE', 'DT', 'FB', 'FS', 'G', 'ILB', 'OLB', 'P', 'QB', 'RB', 'SS', 'T', 'TE', 'WR']
🏟️ Teams: 32 teams


In [22]:
collector.quick_explore()

🔍 QUICK DATA EXPLORATION
📊 Dataset shape: (5000, 53)
👤 Unique players: 575
📅 Date range: 2023 - 2023
📈 Week range: 1 - 22

📋 Position breakdown:
WR     2064
RB     1245
TE      986
QB      611
FB       70
P         9
CB        3
OLB       3
FS        2
T         2
Name: position, dtype: int64

🏆 Fantasy points summary:
count    5000.000000
mean        8.367821
std         7.789565
min        -3.100000
25%         2.200000
50%         6.200000
75%        12.600000
max        52.200001
Name: fantasy_points_ppr, dtype: float64

❓ Missing data check:
dakota             4452
pacr               4381
passing_epa        4371
rushing_epa        2890
racr                959
receiving_epa       938
target_share        938
air_yards_share     938
wopr                938
dtype: int64
