In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import glob

# Set up paths
# The dataset location can be overridden with the BDB_DATA_DIR environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
data_dir = Path(
    os.environ.get(
        'BDB_DATA_DIR',
        '/Users/maxwhalen/Documents/GitHub/Big-Data-Bowl-26/data',
    )
).expanduser()
# Weekly input/output CSVs are read from this sub-directory.
train_dir = data_dir / 'train'

print("Loading data...")
print(f"Data directory: {data_dir}")
print(f"Train directory: {train_dir}")


Loading data...
Data directory: /Users/maxwhalen/Documents/GitHub/Big-Data-Bowl-26/data
Train directory: /Users/maxwhalen/Documents/GitHub/Big-Data-Bowl-26/data/train


In [2]:
# Load all input files (training data)
# Files are matched by name pattern inside train_dir; sorting fixes the load
# order so the concatenated frame is the same on every run.
input_files = sorted(glob.glob(str(train_dir / 'input_2023_*.csv')))
print(f"Found {len(input_files)} input files")

# Fail early with an actionable message: pd.concat on an empty list raises an
# opaque "No objects to concatenate" ValueError.
if not input_files:
    raise FileNotFoundError(
        f"No files matching 'input_2023_*.csv' found in {train_dir}"
    )

# Load all input data into a single dataframe
input_dfs = []
for file in input_files:
    file_path = Path(file)
    # The week label is the last underscore-separated token of the file stem
    # (e.g. 'input_2023_w01' -> 'w01').
    week = file_path.stem.split('_')[-1]
    df = pd.read_csv(file)
    df['week'] = week  # Add week identifier
    input_dfs.append(df)
    # Path.name is separator-agnostic, unlike splitting the string on '/'.
    print(f"Loaded {file_path.name}: {len(df):,} rows")

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
input_data = pd.concat(input_dfs, ignore_index=True)
print(f"\nTotal input data: {len(input_data):,} rows")
print(f"Columns: {list(input_data.columns)}")


Found 18 input files
Loaded input_2023_w01.csv: 285,714 rows
Loaded input_2023_w02.csv: 288,586 rows
Loaded input_2023_w03.csv: 297,757 rows
Loaded input_2023_w04.csv: 272,475 rows
Loaded input_2023_w05.csv: 254,779 rows
Loaded input_2023_w06.csv: 270,676 rows
Loaded input_2023_w07.csv: 233,597 rows
Loaded input_2023_w08.csv: 281,011 rows
Loaded input_2023_w09.csv: 252,796 rows
Loaded input_2023_w10.csv: 260,372 rows
Loaded input_2023_w11.csv: 243,413 rows
Loaded input_2023_w12.csv: 294,940 rows
Loaded input_2023_w13.csv: 233,755 rows
Loaded input_2023_w14.csv: 279,972 rows
Loaded input_2023_w15.csv: 281,820 rows
Loaded input_2023_w16.csv: 316,417 rows
Loaded input_2023_w17.csv: 277,582 rows
Loaded input_2023_w18.csv: 254,917 rows

Total input data: 4,880,579 rows
Columns: ['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id', 'play_direction', 'absolute_yardline_number', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 

In [None]:
# Load all output files (target predictions)
# Files are matched by name pattern inside train_dir; sorting fixes the load
# order so the concatenated frame is the same on every run.
output_files = sorted(glob.glob(str(train_dir / 'output_2023_*.csv')))
print(f"Found {len(output_files)} output files")

# Fail early with an actionable message: pd.concat on an empty list raises an
# opaque "No objects to concatenate" ValueError.
if not output_files:
    raise FileNotFoundError(
        f"No files matching 'output_2023_*.csv' found in {train_dir}"
    )

# Load all output data into a single dataframe
output_dfs = []
for file in output_files:
    file_path = Path(file)
    # The week label is the last underscore-separated token of the file stem.
    week = file_path.stem.split('_')[-1]
    df = pd.read_csv(file)
    df['week'] = week  # Add week identifier
    output_dfs.append(df)
    # Path.name is separator-agnostic, unlike splitting the string on '/'.
    print(f"Loaded {file_path.name}: {len(df):,} rows")

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
output_data = pd.concat(output_dfs, ignore_index=True)
print(f"\nTotal output data: {len(output_data):,} rows")
print(f"Columns: {list(output_data.columns)}")


In [None]:
# Read the play-level metadata table that sits next to the train directory
supplementary_path = data_dir / 'supplementary_data.csv'
supplementary_data = pd.read_csv(supplementary_path)

# Report the table's row count and column names
n_supplementary_rows = len(supplementary_data)
supplementary_columns = list(supplementary_data.columns)
print(f"Supplementary data: {n_supplementary_rows:,} rows")
print(f"Columns: {supplementary_columns}")


In [None]:
# Display summary information: shapes of the three loaded tables, a few
# cardinality counts from the input data, and the first rows of each table.
print("=" * 80)
print("DATA SUMMARY")
print("=" * 80)
print(f"\nInput Data Shape: {input_data.shape}")
print(f"Output Data Shape: {output_data.shape}")
print(f"Supplementary Data Shape: {supplementary_data.shape}")

# Plays are counted as distinct (game_id, play_id) pairs. Per the competition
# data description, play_id is not unique across games, so counting play_id
# alone would merge plays from different games that share an id.
unique_play_count = len(input_data[['game_id', 'play_id']].drop_duplicates())

print(f"\nUnique games: {input_data['game_id'].nunique()}")
print(f"Unique plays: {unique_play_count}")
print(f"Unique players: {input_data['nfl_id'].nunique()}")

# `display` is the IPython rich-display helper available in notebook kernels.
print("\n" + "=" * 80)
print("INPUT DATA - First few rows:")
print("=" * 80)
display(input_data.head())

print("\n" + "=" * 80)
print("OUTPUT DATA - First few rows:")
print("=" * 80)
display(output_data.head())

print("\n" + "=" * 80)
print("SUPPLEMENTARY DATA - First few rows:")
print("=" * 80)
display(supplementary_data.head())
