In [None]:
# -------------------------------------------------------------------------
# POSSIBLE FEATURES FOR NFL MONEYLINE / BETTING MODELS
#
# DIRECT FROM NFL API (nfl_data_py / nflfastR schedules, rosters, weekly)
#   - game_id, season, week, date, weekday, season_type (reg/post), game_type
#   - home_team, away_team, home_score, away_score
#   - stadium, location, roof, surface, neutral_site, international
#   - home_coach, away_coach, referee
#   - kickoff gametime, local_time, time_zone
#   - Vegas odds if present: spread_line, total_line, over_under_line,
#     home_moneyline, away_moneyline
#   - weather fields (sometimes included): temp, wind, weather description
#
# FROM PLAY-BY-PLAY (aggregated per team per game)
#   - pass attempts, completions, yards, TD, INT, sacks, sack_yards
#   - rush attempts, yards, TD
#   - first downs, penalties, penalty_yards
#   - turnovers (fumbles lost + INT)
#   - offensive plays count
#   - field goal attempts, makes; XP attempts, makes
#   - punts, punt yards
#   - drive-level: points per drive, turnover per drive, average start field pos
#
# FROM WEEKLY / SEASONAL PLAYER STATS (nfl_data_py)
#   - QB passing yards, TD, INT, attempts, sacks
#   - RB carries, rush yards, TD
#   - WR/TE targets, receptions, rec yards, TD
#   - Defensive stats: tackles, sacks, interceptions
#   - Snap counts (limited), availability indicators
#
# TEAM / PLAYER META
#   - roster info: QB experience, rookie flags, player height/weight
#   - draft pedigree, combine results (speed, bench, vertical)
#   - injury reports (status: Q, O, IR, PUP) if historical data is complete
#
# DERIVED / ENGINEERED (needs rolling windows)
#   - win_pct_last3, win_pct_last5, avg_margin_last5
#   - yards_per_play (offense & defense)
#   - pass vs rush efficiency (EPA/play if using pbp)
#   - 3rd down %, 4th down %, red zone TD %
#   - turnover margin, penalty yds/play
#   - pace: plays per minute, neutral pass rate, PROE (pass rate over expected)
#   - streak length (wins/losses), one-score game record
#
# CONTEXTUAL
#   - rest days since last game, bye week flag, short week flag
#   - travel distance, time zone difference
#   - primetime game flag
#   - divisional/conference/interconference game flag
#   - altitude of venue, dome vs outdoor
#
# EXTERNAL / NOT IN NFL API
#   - betting market movement: opening vs closing line, line moves
#   - public % bets vs handle (needs sportsbook/odds API)
#   - live weather forecast (hourly temp, wind, rain/snow)
#   - power ratings: ELO, SRS, external team ranks (538, PFF, DVOA, etc.)
#   - media sentiment, buzz, attendance
#
# REMINDER:
#   - Decide your "betting time" (Friday night vs pre-kickoff).
#   - Only include features that would be known at that time (avoid leakage).
# -------------------------------------------------------------------------

<!DOCTYPE html> <html lang="en"> <head><link rel="sitemap" href="/sitemap.txt"><meta name="color-scheme" content="dark light"><script>
      function setTheme(document) {
        const metaColorScheme = document.querySelector(
          'meta[name="color-scheme"]',
        )

        if (window.colorScheme) {
          if (window.colorScheme === 'dark') {
            document.documentElement.classList.add('dark')
            metaColorScheme.setAttribute('content', 'dark')
          } else {
            document.documentElement.classList.remove('dark')
            metaColorScheme.setAttribute('content', 'light')
          }
        } else {
          if (
            localStorage.theme === 'dark' ||
            (!('theme' in localStorage) &&
              window.matchMedia('(prefers-color-scheme: dark)').matches)
          ) {
            document.documentElement.classList.add('dark')
            metaColorScheme.setAttribute('content', 'dark')
          } else {
            document.doc

Export NFL injury reports to CSV (injuries.csv)

In [None]:
# First install nflreadpy if not already installed
# pip install nflreadpy

import nflreadpy as nfl
import os

def export_injuries(seasons, output_path_csv):
    """
    Load injury reports for the given seasons and save them as a CSV file.

    Args:
        seasons (list of int): e.g. list(range(2014, 2025))
        output_path_csv (str): path to write CSV file, e.g. "injuries_2014_2024.csv".
            A bare filename (no directory part) is written to the current
            working directory.

    Side effects:
        Creates the parent directory of `output_path_csv` if it is missing,
        writes the CSV, and prints a confirmation line.
    """
    # NOTE(review): relies on the module-level alias `nfl` being nflreadpy; a
    # later cell rebinds `nfl` to nfl_data_py, so re-running this function after
    # that cell would presumably fail -- confirm and consider distinct aliases.
    injuries = nfl.load_injuries(seasons=seasons)

    # os.makedirs("") raises FileNotFoundError, so only create a directory when
    # the path actually has a directory component.
    output_dir = os.path.dirname(output_path_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write out
    injuries.write_csv(output_path_csv)
    print(f"Saved injuries data for seasons {seasons} to {output_path_csv}")

# Seasons 2014 through 2024 inclusive (range() excludes its stop value).
seasons = [*range(2014, 2025)]
output_file = "../raw/injuries.csv"
export_injuries(seasons=seasons, output_path_csv=output_file)

nfl_data_py: game logs, dates, and in-game stats

In [None]:
# pip install nfl_data_py pandas
import pandas as pd
import nfl_data_py as nfl

# --- seasons span (edit as needed) ---
seasons = list(range(2014, 2024))  # 2014-2023 inclusive; range() excludes the stop

# 1) Base schedule with scores + Winner
sch = nfl.import_schedules(seasons)

base_cols = [
    "game_id","season","week","gameday",
    "home_team","away_team","home_score","away_score"
]


def _winner(games):
    """Return the winner of each game as a Series aligned to `games`.

    Values are the home or away team label when one side outscored the other,
    "TIE" when both scores are present and equal, and missing (NaN) when
    either score is missing. Comparisons against NaN are always False, so
    without the final `where` a game with no scores would be labelled "TIE".
    """
    home_pts = games["home_score"]
    away_pts = games["away_score"]
    winner = pd.Series("TIE", index=games.index, dtype="object")
    winner = winner.mask(home_pts > away_pts, games["home_team"])
    winner = winner.mask(away_pts > home_pts, games["away_team"])
    # Blank out rows where a score is missing instead of reporting a tie.
    return winner.where(home_pts.notna() & away_pts.notna())


# Vectorized (no row-wise apply), which also behaves on an empty schedule.
df = (
    sch[base_cols]
    .rename(columns={"gameday": "date"})
    .assign(Winner=_winner)
)

# 2) Team-game stats from PBP (aggregate ONCE)
pbp = nfl.import_pbp_data(seasons, downcast=True)

# Keep only plays that have a possessing team, then group per (game, team).
has_posteam = pbp["posteam"].notna()
p = pbp.loc[has_posteam].copy()
grp = p.groupby(by=["game_id", "posteam"], as_index=False)

def isum(s):
    """Integer-safe sum for 0/1 flags: missing values count as 0."""
    flags = s.fillna(0)
    return flags.astype(int).sum()

def fsum(s):
    """Float-safe sum (yards, etc.): missing values count as 0."""
    filled = s.fillna(0)
    return filled.sum()

def _if_present(column, func):
    """Return a named-aggregation spec for `column`, or a constant-0 placeholder.

    When `p` lacks `column`, "play_id" is aggregated with a function that always
    returns 0 so the output column still exists.
    """
    if column in p.columns:
        return (column, func)
    return ("play_id", lambda s: 0)


# One row per (game_id, posteam). Each keyword is an output column mapped to
# (source column, aggregation). The result lambdas use the group Series `s`
# directly: it already is the requested column, so looking it up again through
# p.loc[s.index, ...] is redundant and would over-count if p's index ever held
# duplicate labels.
team_stats = grp.agg(
    pass_cmp=("complete_pass", isum),
    pass_att=("pass_attempt", isum),
    pass_yds=("passing_yards", fsum),
    pass_td =("pass_touchdown", isum),
    pass_int=("interception", isum),
    pass_sacked=("sack", isum),
    pass_sacked_yds=_if_present("sack_yards", fsum),
    rush_att=("rush_attempt", isum),
    rush_yds=("rushing_yards", fsum),
    rush_td =("rush_touchdown", isum),
    first_down=("first_down", isum),
    penalties=("penalty", isum),
    penalties_yds=_if_present("penalty_yards", fsum),
    fumbles_lost=_if_present("fumble_lost", isum),
    plays_offense=("play_id", "count"),
    # Kicking / XP (optional)
    fga=_if_present("field_goal_attempt", isum),
    fgm=_if_present("field_goal_result", lambda s: s.fillna("").eq("made").sum()),
    xpa=_if_present("extra_point_attempt", isum),
    xpm=_if_present("extra_point_result", lambda s: s.fillna("").eq("good").sum()),
    # Punting (optional)
    punt=_if_present("punt", isum),
    punt_yds=_if_present("punt_yards", fsum),
)

# fumbles_lost is always produced above (real values or the 0 placeholder).
team_stats["turnovers"] = team_stats["pass_int"] + team_stats["fumbles_lost"]

# 3) Spread into home_/away_ and merge ONCE
def _side_frame(side):
    """Return team_stats keyed by game_id and `<side>_team`, stats prefixed `<side>_`."""
    key_names = {"game_id": "game_id", "posteam": f"{side}_team"}
    return team_stats.rename(columns=lambda col: key_names.get(col, f"{side}_{col}"))


home = _side_frame("home")
away = _side_frame("away")

# NOTE(review): these left merges match on team labels, so they assume the
# schedule and the play-by-play use identical team abbreviations -- confirm
# (e.g. for relocated franchises); mismatches would leave stats missing.
df = df.merge(home, on=["game_id","home_team"], how="left")
df = df.merge(away, on=["game_id","away_team"], how="left")

# 4) Merge schedule extras (ONLY what exists; once)
SCHED_OPTIONAL = [
    "spread_line","total_line","over_under_line",
    "home_moneyline","away_moneyline",
    "stadium","roof","surface","weather","temp","wind",
    "neutral_site","international","game_type","season_type",
    "weekday","gametime","location",
    "home_coach","away_coach","referee"
]
avail = [name for name in SCHED_OPTIONAL if name in sch.columns]
if avail:
    sched_extras = sch[["game_id", *avail]].copy()
    # Betting lines and weather readings are coerced to numbers; anything
    # unparseable becomes NaN.
    for c in ("spread_line","total_line","over_under_line",
              "home_moneyline","away_moneyline","temp","wind"):
        if c not in sched_extras.columns:
            continue
        sched_extras[c] = pd.to_numeric(sched_extras[c], errors="coerce")
    df = df.merge(sched_extras, on="game_id", how="left")

# 5) FINAL SAFETY: remove any accidental _x/_y duplicates (keep _x)
x_bases = {name[:-2] for name in df.columns if name.endswith("_x")}
paired_bases = [base for base in x_bases if f"{base}_y" in df.columns]
df = (
    df.drop(columns=[f"{base}_y" for base in paired_bases])
      .rename(columns={f"{base}_x": base for base in paired_bases})
)

# If any lone _x or _y remain without a partner, strip the suffix
# (only when the un-suffixed name is not already a column).
rename_map = {
    name: name[:-2]
    for name in df.columns
    if name.endswith(("_x", "_y")) and name[:-2] not in df.columns
}
if rename_map:
    df = df.rename(columns=rename_map)

# df is now de-duplicated and ready.

2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
Downcasting floats.


In [None]:
extra_sched_cols = [
    "season_type", "neutral_site", "international",
    "weather", "over_under_line", "attendance"
]
# Take only columns the schedule provides AND df does not already contain.
# Merging a column that is already in df would produce `_x`/`_y` duplicates,
# and it would do so again on every re-run of this cell.
have = [c for c in extra_sched_cols if c in sch.columns and c not in df.columns]
if have:
    extras = sch[["game_id"] + have].copy()
    # numeric-ish coercion (safe): unparseable values become NaN
    for c in ["over_under_line", "attendance"]:
        if c in extras.columns:
            extras[c] = pd.to_numeric(extras[c], errors="coerce")
    df = df.merge(extras, on="game_id", how="left")

Data Overview

In [44]:
# Header line followed by one column name per line.
print("\n".join(["CURRENT COLUMNS:", *map(str, df.columns)]))

CURRENT COLUMNS:
game_id
season
week
date
home_team
away_team
home_score
away_score
Winner
home_pass_cmp
home_pass_att
home_pass_yds
home_pass_td
home_pass_int
home_pass_sacked
home_pass_sacked_yds
home_rush_att
home_rush_yds
home_rush_td
home_first_down
home_penalties
home_penalties_yds
home_fumbles_lost
home_plays_offense
home_fga
home_fgm
home_xpa
home_xpm
home_punt
home_punt_yds
home_turnovers
away_pass_cmp
away_pass_att
away_pass_yds
away_pass_td
away_pass_int
away_pass_sacked
away_pass_sacked_yds
away_rush_att
away_rush_yds
away_rush_td
away_first_down
away_penalties
away_penalties_yds
away_fumbles_lost
away_plays_offense
away_fga
away_fgm
away_xpa
away_xpm
away_punt
away_punt_yds
away_turnovers
spread_line
total_line
home_moneyline
away_moneyline
stadium
roof
surface
temp
wind
game_type
weekday
gametime
location
home_coach
away_coach
referee


In [6]:
# save
import os

schedules_csv = "../raw/schedules_raw.csv"
# Create the output directory first (as export_injuries does); to_csv does not
# create missing parent directories.
os.makedirs(os.path.dirname(schedules_csv), exist_ok=True)
df.to_csv(schedules_csv, index=False)
print(df.head())

NameError: name 'df' is not defined

Rows: 2725
Columns: ['game_id', 'season', 'week', 'date', 'home_team', 'away_team', 'home_score', 'away_score', 'Winner', 'home_pass_cmp'] ...


Unnamed: 0,game_id,season,week,date,home_team,away_team,home_score,away_score,Winner,home_pass_cmp,...,surface,temp,wind,game_type,weekday,gametime,location,home_coach,away_coach,referee
0,2014_01_GB_SEA,2014,1,9/4/2014,SEA,GB,36,16,SEA,19.0,...,fieldturf,71.0,11.0,REG,Thursday,20:30,Home,Pete Carroll,Mike McCarthy,John Parry
1,2014_01_NO_ATL,2014,1,9/7/2014,ATL,NO,37,34,ATL,31.0,...,fieldturf,,,REG,Sunday,13:00,Home,Mike Smith,Sean Payton,Bill Leavy
2,2014_01_CIN_BAL,2014,1,9/7/2014,BAL,CIN,16,23,CIN,35.0,...,sportturf,74.0,8.0,REG,Sunday,13:00,Home,John Harbaugh,Marvin Lewis,Gene Stetatore
3,2014_01_BUF_CHI,2014,1,9/7/2014,CHI,BUF,20,23,BUF,34.0,...,grass,74.0,3.0,REG,Sunday,13:00,Home,Marc Trestman,Doug Marrone,Brad Allen
4,2014_01_WAS_HOU,2014,1,9/7/2014,HOU,WAS,17,6,HOU,14.0,...,grass,,,REG,Sunday,13:00,Home,Bill O'Brien,Jay Gruden,Jerome Boger
