In [3]:
import duckdb
import os
import requests
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
DATABASE_URL = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
engine = create_engine(DATABASE_URL)

# Target seasons
seasons = [2021, 2022, 2023, 2024]
base_url = "https://github.com/nflverse/nflfastR-data/releases/download/player_stats/player_stats_{year}.parquet"
local_dir = "data/raw/nflfastr"
os.makedirs(local_dir, exist_ok=True)

# Download files
for season in seasons:
    url = base_url.format(year=season)
    local_path = os.path.join(local_dir, f"player_stats_{season}.parquet")
    if not os.path.exists(local_path):
        print(f"Downloading {season} data...")
        response = requests.get(url, allow_redirects=True)
        if response.headers['Content-Type'] != 'application/octet-stream':
            print("Warning: Downloaded content is not a Parquet file!")
        with open(local_path, 'wb') as file:
            file.write(response.content)
        print(f"✅ Saved: {local_path}")
    else:
        print(f"Already exists: {local_path}")

# Ingest via DuckDB
files = [os.path.join(local_dir, f"player_stats_{season}.parquet") for season in seasons]
query = "SELECT season, week, player_name, passing_yards, rushing_yards, receiving_yards, passing_tds + rushing_tds + receiving_tds AS touchdowns, fantasy_points_ppr AS fantasy_points FROM read_parquet('{}')"
dfs = [duckdb.query(query.format(file)).to_df() for file in files]
df = pd.concat(dfs)


Downloading 2021 data...
✅ Saved: data/raw/nflfastr/player_stats_2021.parquet
Downloading 2022 data...
✅ Saved: data/raw/nflfastr/player_stats_2022.parquet
Downloading 2023 data...
✅ Saved: data/raw/nflfastr/player_stats_2023.parquet
Downloading 2024 data...
✅ Saved: data/raw/nflfastr/player_stats_2024.parquet


InvalidInputException: Invalid Input Error: File 'data/raw/nflfastr/player_stats_2021.parquet' too small to be a Parquet file

In [6]:
import requests
import os

url = "https://github.com/nflverse/nflverse-data/releases/download/player_stats/player_stats_2022.parquet"
local_path = "data/raw/nflfastr/player_stats_2022.parquet"

os.makedirs(os.path.dirname(local_path), exist_ok=True)

response = requests.get(url, allow_redirects=True)

# Check if content is a Parquet file by content-type
content_type = response.headers.get('Content-Type', '')
print(f"Content-Type: {content_type}")
if 'html' in content_type.lower():
    raise ValueError("Downloaded content is HTML, not a Parquet file. Check URL or access permissions.")

with open(local_path, 'wb') as f:
    f.write(response.content)

print(f"Downloaded {local_path} ({len(response.content)} bytes)")

Content-Type: application/octet-stream
Downloaded data/raw/nflfastr/player_stats_2022.parquet (338789 bytes)


In [17]:
df = pd.read_parquet('../data/raw/nflfastr/player_stats_2019.parquet')
df.head(20)

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,1,REG,...,0.0,,0,,,,,0.0,25.64,25.64
1,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,2,REG,...,0.0,,0,,,,,0.0,24.66,24.66
2,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,3,REG,...,0.0,,0,,,,,0.0,20.14,20.14
3,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,4,REG,...,0.0,,0,,,,,0.0,3.7,3.7
4,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,5,REG,...,0.0,,0,,,,,0.0,23.92,23.92
5,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,6,REG,...,0.0,,0,,,,,0.0,21.96,21.96
6,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,7,REG,...,0.0,,0,,,,,0.0,11.96,11.96
7,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,8,REG,...,0.0,,0,,,,,0.0,18.36,18.36
8,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,9,REG,...,0.0,,0,,,,,0.0,13.4,13.4
9,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2019,11,REG,...,0.0,,0,,,,,0.0,8.54,8.54


In [25]:
import requests
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

load_dotenv()
DATABASE_URL = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
engine = create_engine(DATABASE_URL)

response = requests.get('https://api.sleeper.app/v1/players/nfl')
players_data = response.json()
players_df = pd.DataFrame.from_dict(players_data, orient='index')

depth_chart_df = players_df[['full_name', 'position', 'depth_chart_position', 'team']].copy()
depth_chart_df = depth_chart_df.rename(columns={
    'full_name': 'name',
    'depth_chart_position': 'depth_position'
})
depth_chart_df = depth_chart_df[depth_chart_df['name'].notnull() & depth_chart_df['position'].notnull()]

# Link team ID from existing teams table
teams_query = pd.read_sql('SELECT team_id, abbreviation FROM teams', engine)
depth_chart_df = depth_chart_df.merge(teams_query, how='left', left_on='team', right_on='abbreviation')

# Link player ID from existing players table
players_query = pd.read_sql('SELECT player_id, name FROM players', engine)
depth_chart_df = depth_chart_df.merge(players_query, how='left', on='name')

# Drop rows where we are missing player_id or team_id because those are required for our schema
final_df = depth_chart_df[['player_id', 'team_id', 'position', 'depth_position']].copy()

# Drop rows where player_id or team_id is None or NaN
final_df = final_df.dropna(subset=['player_id', 'team_id'])

# Ensure integers (Postgres needs int, not float from pandas NaNs)
final_df['player_id'] = final_df['player_id'].astype(int)
final_df['team_id'] = final_df['team_id'].astype(int)

# Depth position can be null, keep as is or fill with 0 if needed
final_df['depth_position'] = final_df['depth_position'].fillna(0)

final_df.to_sql('depth_chart', engine, if_exists='replace', index=False)
print("✅ Depth chart ingested into PostgreSQL.")


✅ Depth chart ingested into PostgreSQL.


In [23]:
final_df

Unnamed: 0,player_id,team_id,position,depth_position
6,6,14,CB,NB
8,8,30,RB,0
12,12,1,CB,0
14,14,9,K,K
18,18,29,DE,LOLB
...,...,...,...,...
15281,11079,10,CB,DB
15285,11082,32,WR,RWR
15294,11091,32,CB,DB
15295,11092,8,LB,ROLB


In [26]:
import requests
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

load_dotenv()
DATABASE_URL = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
engine = create_engine(DATABASE_URL)

response = requests.get('https://api.sleeper.app/v1/players/nfl')
players_data = response.json()
players_df = pd.DataFrame.from_dict(players_data, orient='index')

depth_chart_df = players_df[['full_name', 'position', 'depth_chart_position', 'team']].copy()
depth_chart_df = depth_chart_df.rename(columns={
    'full_name': 'name',
    'depth_chart_position': 'depth_position'
})
depth_chart_df = depth_chart_df[depth_chart_df['name'].notnull() & depth_chart_df['position'].notnull()]

# Link team ID from existing teams table
teams_query = pd.read_sql('SELECT team_id, abbreviation FROM teams', engine)
depth_chart_df = depth_chart_df.merge(teams_query, how='left', left_on='team', right_on='abbreviation')

# Link player ID from existing players table
players_query = pd.read_sql('SELECT player_id, name FROM players', engine)
depth_chart_df = depth_chart_df.merge(players_query, how='left', on='name')

# Drop rows where we are missing player_id or team_id because those are required for our schema
final_df = depth_chart_df[['player_id', 'team_id', 'position', 'depth_position']].copy()

# Drop rows where player_id or team_id is None or NaN
final_df = final_df.dropna(subset=['player_id', 'team_id'])

# Ensure integers (Postgres needs int, not float from pandas NaNs)
final_df['player_id'] = final_df['player_id'].astype(int)
final_df['team_id'] = final_df['team_id'].astype(int)

# Depth position can be null, keep as is or fill with 0 if needed
final_df['depth_position'] = final_df['depth_position'].fillna(0)

final_df.to_sql('depth_chart', engine, if_exists='replace', index=False)
print("✅ Depth chart ingested into PostgreSQL.")


✅ Depth chart ingested into PostgreSQL.


In [28]:
final_df.isnull().sum()

player_id         0
team_id           0
position          0
depth_position    0
dtype: int64