In [None]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import string 
import time 
import nfl_data_py as nfl
import re

## Recommended data fields (per player)
- Identity: `player_id`, `name`, `position`, `draft_year`
- Draft capital: `draft_round`, `draft_pick_overall`, `team`
- College production (final season + career): rushing/receiving/passing as appropriate
- Context: `conference`, `age_on_draft_day`, `height_in`, `weight_lb`, `forty_yd`, etc.
- NFL rookie outcomes (for training): rushing/receiving/passing/fumbles/INT, games played

In [46]:
COLUMN_NAMES = ["player_id", "name", "position", "team", "draft_year", "draft_round", "draft_pick_overall", "age_on_draft_day", "conference", "conference_strength", "height_in", "weight_lb", "forty_yd\nC_Games", "C_Rec", "C_RecYds", "C_RecTD", "C_RushAtt", "C_RushYds", "C_RushTD", "C_PassAtt", "C_PassYds", "C_PassTD", "C_INT\nCar_Games", "Car_Rec", "Car_RecYds", "Car_RecTD", "Car_RushAtt", "Car_RushYds", "Car_RushTD", "Car_PassAtt", "Car_PassYds", "Car_PassTD", "Car_INT\nTeam_RecYds", "Team_RecTD", "Team_RushYds", "Team_RushTD", "Team_PassYds", "Team_PassTD\nRec", "RecYds", "RecTD", "RushYds", "RushTD", "PassYds", "PassTD", "Fumbles", "INT"]
df = pd.DataFrame(columns=COLUMN_NAMES)
print(df)

Empty DataFrame
Columns: [player_id, name, position, team, draft_year, draft_round, draft_pick_overall, age_on_draft_day, conference, conference_strength, height_in, weight_lb, forty_yd
C_Games, C_Rec, C_RecYds, C_RecTD, C_RushAtt, C_RushYds, C_RushTD, C_PassAtt, C_PassYds, C_PassTD, C_INT
Car_Games, Car_Rec, Car_RecYds, Car_RecTD, Car_RushAtt, Car_RushYds, Car_RushTD, Car_PassAtt, Car_PassYds, Car_PassTD, Car_INT
Team_RecYds, Team_RecTD, Team_RushYds, Team_RushTD, Team_PassYds, Team_PassTD
Rec, RecYds, RecTD, RushYds, RushTD, PassYds, PassTD, Fumbles, INT]
Index: []

[0 rows x 46 columns]


## Get Draft Data 
Download Excel files for the drafts of the last 5 years from https://www.pro-football-reference.com/

These are the fields of the above datasets:
- Rnd -- Round selected in draft
- Supplemental draft round indicated with 'S'
- Pick -- Overall Selection in Draft
- Pos -- Position
- Age -- Age as of September 1 of the draft year
- To -- Last Year
- Misc
- AP1 -- First-team all-pro selections
- PB -- Pro Bowl Selections
- St -- Number of years as primary starter for his team at his position
- Approx Val
- wAV -- Weighted Career Approximate Value.
- See the glossary for details on how we compute Approximate Value.
- The career AV is computed by summing
- 100 percent of the AV of his best season,
- 95 percent of the AV of his next-best season,
- 90 percent of the AV of his third-best season,
- and so on
- DrAV -- Weighted AV accumulated for team that drafted this player
- Games
- G -- Games played
#### Passing
- Cmp -- Passes completed
- Att -- Passes attempted
- Yds -- Yards Gained by Passing (For teams, sack yardage is deducted from this total)
- TD -- Passing Touchdowns
- Int -- Interceptions thrown
#### Rushing
- Att -- Rushing Attempts (sacks not included in NFL)
- Yds -- Rushing Yards Gained (sack yardage is not included by NFL)
- TD -- Rushing Touchdowns
#### Receiving
- Rec -- Receptions
- Yds -- Receiving Yards
- TD -- Receiving Touchdowns
#### Tackles
- Solo -- Tackles
- Before 1994: unofficial and inconsistently recorded from team to team. For amusement only.
- 1994-now: unofficial but consistently recorded.
#### Def Interceptions
- Int -- Passes intercepted on defense
- Sk -- Sacks (official since 1982, based on play-by-play, game film, and other research since 1960)

In [62]:
# Get Drafted College Football Players from the last 5 years 

df_2024 = pd.read_csv("2024_draft.csv")
df_2024["draft_year"] = 2024
df_2023 = pd.read_csv("2023_draft.csv")
df_2023["draft_year"] = 2023
df_2022 = pd.read_csv("2022_draft.csv")
df_2022["draft_year"] = 2022
df_2021 = pd.read_csv("2021.csv")
df_2021["draft_year"] = 2021
df_2020 = pd.read_csv("2020_draft.csv")
df_2020["draft_year"] = 2020
df_2019 = pd.read_csv("2019_draft.csv")
df_2019["draft_year"] = 2019


drafts_df = pd.concat([df_2020, df_2021, df_2022, df_2023, df_2024], axis=0)

for name in drafts_df["Player"]:
    if isinstance(name, str):  # Only process if it's a string
        clean_name = re.sub(r'[^\w\s-]', '', name.lower())  # remove punctuation 
        drafts_df.loc[drafts_df["Player"] == name, "url_name"] = clean_name.replace(" ", "-")

print(len(drafts_df))
print(drafts_df[:5])



1297
   Rnd  Pick   Tm          Player Pos   Age      To  AP1   PB   St  ...  \
0  1.0   1.0  CIN      Joe Burrow  QB  23.0  2024.0  0.0  2.0  5.0  ...   
1  1.0   2.0  WAS     Chase Young  DE  21.0  2024.0  0.0  1.0  1.0  ...   
2  1.0   3.0  DET     Jeff Okudah  CB  21.0  2024.0  0.0  0.0  2.0  ...   
3  1.0   4.0  NYG   Andrew Thomas   T  21.0  2024.0  0.0  0.0  5.0  ...   
4  1.0   5.0  MIA  Tua Tagovailoa  QB  22.0  2024.0  0.0  1.0  5.0  ...   

   Unnamed: 20  Receiving  Unnamed: 22  Unnamed: 23  Unnamed: 24  Unnamed: 25  \
0          NaN        NaN          NaN          NaN          NaN          NaN   
1          NaN        NaN          NaN          NaN          NaN          NaN   
2          NaN        NaN          NaN          NaN          NaN          NaN   
3          NaN        NaN          NaN          NaN          NaN          NaN   
4          NaN        NaN          NaN          NaN          NaN          NaN   

   Unnamed: 26  Unnamed: 27  Unnamed: 28        url_name 

In [63]:
# Filter for relevant positions
positions = ['QB', 'TE', 'WR', 'RB']

drafts_df = drafts_df[drafts_df["Pos"].isin(positions)]
print(len(drafts_df))

314


In [64]:
# Add draft info to master df 
df["name"], df["position"], df["team"], df["draft_year"], df["draft_round"], df["draft_pick_overall"], df["age_on_draft_day"] = drafts_df["Player"], drafts_df["Pos"], drafts_df["Tm"], drafts_df["draft_year"], drafts_df["Rnd"], drafts_df["Pick"], drafts_df["Age"]
print(df[:2])

df["url_name"] = drafts_df["url_name"] # MAKE SURE ORDER and length OF NAMES IS THE SAME!


  player_id            name position team  draft_year  draft_round  \
0       NaN      Joe Burrow       QB  CIN        2020          1.0   
4       NaN  Tua Tagovailoa       QB  MIA        2020          1.0   

   draft_pick_overall  age_on_draft_day conference conference_strength  ...  \
0                 1.0              23.0        NaN                 NaN  ...   
4                 5.0              22.0        NaN                 NaN  ...   

  Team_PassYds Team_PassTD\nRec RecYds RecTD RushYds RushTD PassYds PassTD  \
0          NaN              NaN    NaN   NaN     NaN    NaN     NaN    NaN   
4          NaN              NaN    NaN   NaN     NaN    NaN     NaN    NaN   

  Fumbles  INT  
0     NaN  NaN  
4     NaN  NaN  

[2 rows x 46 columns]


## College Football Stats

In [75]:
def get_player_stats(player_names):
    all_players_data = {}
    for name in player_names:
        found = False 
        for page in range(1,10): 
            url = f"https://www.sports-reference.com/cfb/players/{name}-{page}.html"
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    print(f"Found data for {name} at {url}")
                    soup = BeautifulSoup(response.text, "html.parser")

                    # Get tables 
                    tables = pd.read_html(response.text)

                    # Get height and weight
                    body_metrics = {}
                    paragraphs = soup.find_all("p")
                    for p in paragraphs:
                        spans = p.find_all("span")
                        if len(spans) >= 2:  # first span = height, second span = weight
                            feet, inches = map(int, spans[0].get_text().split("-"))
                            body_metrics["height"] = feet * 12 + inches
                            # Weight
                            body_metrics["weight"] = int(spans[1].get_text().replace("lb", "").strip())
                            break
                    # There are mulitple tables per page, map them to their html ids
                    table_map = {}
                    for i, table in enumerate(soup.find_all("table")):
                        table_id = table.get("id", f"table_{i}")
                        table_map[table_id] = tables[i]

                    # Filter only the tables we care about
                    relevant_ids = ["passing_standard", "receiving_standard", "punt_return_standard"]
                    player_tables = {tid: df for tid, df in table_map.items() if tid in relevant_ids}

                    all_players_data[name] = {
                        "height_in": body_metrics["height"],
                        "weight_lb": body_metrics["weight"],
                        "tables": player_tables
                    }

                    found = True
                    break
            except Exception as e:
                print(f"Error fetching {url}: {e}")
        if not found:
            print(f"No page found for {name}")

    return all_players_data 

In [81]:
player_names = df["url_name"]
college_stats = get_player_stats(player_names)
print(college_stats["bryce-young"]["tables"]["passing_standard"].head())
print(college_stats["bryce-young"]["height_in"])

Found data for joe-burrow at https://www.sports-reference.com/cfb/players/joe-burrow-1.html
Error fetching https://www.sports-reference.com/cfb/players/joe-burrow-1.html: lxml not found, please install it
No page found for joe-burrow
Found data for tua-tagovailoa at https://www.sports-reference.com/cfb/players/tua-tagovailoa-1.html
Error fetching https://www.sports-reference.com/cfb/players/tua-tagovailoa-1.html: lxml not found, please install it
No page found for tua-tagovailoa
Found data for justin-herbert at https://www.sports-reference.com/cfb/players/justin-herbert-1.html
Error fetching https://www.sports-reference.com/cfb/players/justin-herbert-1.html: lxml not found, please install it


KeyboardInterrupt: 

In [None]:
# Add relevent info to master df 
for name in player_names:
    df.loc[df["url_name"] == name, "height_in"] = test_stats[name]["height_in"]
    df.loc[df["url_name"] == name, "weight_lb"] = test_stats[name]["weight_lb"]

print(df["weight_lb"][:2])
print(df[:2])

72
0    194
1    202
Name: weight_lb, dtype: int64
  player_id         name position team  draft_year  draft_round  \
0       NaN  Bryce Young       QB  CAR        2023          1.0   
1       NaN  C.J. Stroud       QB  HOU        2023          1.0   

   draft_pick_overall  age_on_draft_day conference conference_strength  ...  \
0                 1.0              22.0        NaN                 NaN  ...   
1                 2.0              21.0        NaN                 NaN  ...   

   RecYds  RecTD RushYds RushTD PassYds PassTD Fumbles  INT          NaN  \
0     NaN    NaN     NaN    NaN     NaN    NaN     NaN  NaN  Bryce Young   
1     NaN    NaN     NaN    NaN     NaN    NaN     NaN  NaN  C.J. Stroud   

      url_name  
0  bryce-young  
1    cj-stroud  

[2 rows x 48 columns]


In [None]:
def get_nfl_pages(years):
    nfl_player_pages = []

    # Add headers to mimic a real browser -- site was blocking scraping 
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    for year in years:
        url = f"https://www.pro-football-reference.com/years/{year}/draft.htm"
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                print(f"Found data for {year} draft at {url}")
                soup = BeautifulSoup(response.text, "html.parser")

                # Get tables 
                tables = pd.read_html(response.text)

                # Get player nfl stats pages 
                base_url = "https://www.pro-football-reference.com/"

                container = soup.find("div", {"id": "div_drafts"})
                if container:
                    table = container.find("table")
                    if table:
                        tbody = table.find("tbody")
                        if tbody:
                            rows = tbody.find_all("tr")
                            for row in rows:
                                player_cell = row.find("td", {"data-stat": "player"})
                                if player_cell:
                                    link_tag = player_cell.find("a")
                                    if link_tag and "href" in link_tag.attrs:
                                        relative_url = link_tag["href"]
                                        full_url = base_url + relative_url
                                        player_name = link_tag.get_text(strip=True)
                                        nfl_player_pages.append({"name": player_name, "url": full_url})
                        else:
                            print(f"No tbody found for {year}")
                    else:
                        print(f"No table found in div_drafts for {year}")
                else:
                    print(f"No div_drafts container found for {year}")
            else:
                print(f"HTTP {response.status_code} for {url}")
                
        except Exception as e:
            print(f"Error fetching {url}: {e}")
    return nfl_player_pages

In [78]:
years = ["2023", "2024"]
nfl_pages = get_nfl_pages(["2023"])
print(nfl_pages)

HTTP 403 for https://www.pro-football-reference.com/years/2023/draft.htm
[]


In [None]:
def get_nfl_stats(nfl_player_pages):
    all_nfl_data = {}

    # Add headers to help with blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    for entry in nfl_player_pages:
        name = entry["name"]
        url = entry["url"]
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                print(f"Found data for {name} at {url}")
                soup = BeautifulSoup(response.text, "html.parser")

                # Get tables 
                tables = pd.read_html(response.text)
                # There are mulitple tables per page, map them to their html ids
                table_map = {}
                for i, table in enumerate(soup.find_all("table")):
                    table_id = table.get("id", f"table_{i}")
                    table_map[table_id] = tables[i]

                # Filter only the tables we care about
                relevant_ids = ["passing", "rushing_and_receiving"]
                player_tables = {tid: df for tid, df in table_map.items() if tid in relevant_ids}

                all_nfl_data[name] = player_tables

                # Add delay to be respectful
                time.sleep(2)
                
            else:
                print(f"HTTP {response.status_code} for {name} at {url}")
        except Exception as e:
            print(f"Error fetching {url}: {e}")
    # Check for any data at all
    if not all_nfl_data:
        print("No data found for any players - likely blocked by site")
    else:
        print(f"Successfully retrieved data for {len(all_nfl_data)} players")

    return all_nfl_data 
        

In [None]:
urls = [{"name": "Bryce Young", "url": "https://www.pro-football-reference.com/players/Y/YounBr01.htm"}]
rookie_data = get_nfl_stats(urls)

No page found for Bryce Young


## Get Rookie Season Data: Method 2 - Using NFL_data_py package
https://pypi.org/project/nfl-data-py/

option: 
nfl.import_seasonal_data(years, s_type)
Returns seasonal data, including various calculated market share stats specific to receivers

years (List[int]) : required, list of years to pull data for (earliest available is 1999)

s_type (str) : optional (default 'REG') season type to include in average ('ALL','REG','POST')


nfl.import_seasonal_pfr(s_type, years)
Returns a dataframe of season-aggregated data sourced from players' pages on pro-football-reference.com. E.g. Patrick Mahomes

s_type (str) : required, the type of stat data to request. Must be one of pass, rec, or rush.

years (List[int]) : optional, years to return data for

In [65]:
def get_player_id_map(names: list[str], years: list[int]) -> dict:
    """
    Build a mapping of player_id -> 'First Last' given player names and corresponding years.
    
    """
    if len(names) != len(years):
        raise ValueError("names and years must have the same length")
    
    player_map = {}
    
    # Loop through each (name, year) pair
    for name, year in zip(names, years):
        roster = nfl.import_seasonal_rosters([year])
        #roster["player_name"] = roster["first_name"] + " " + roster["last_name"]
        
        match = roster[roster["player_name"] == name]
        
        if not match.empty:
            player_id = match.iloc[0]["player_id"]
            player_map[player_id] = {
                "name": name,
                "rookie_year": year,
                "position": match.iloc[0]["position"],
                "team": match.iloc[0]["team"],
                "height_in": match.iloc[0].get("height_in"),
                "weight_lb": match.iloc[0].get("weight_lb"),
                "age_on_draft_day": match.iloc[0].get("age_on_draft_day")
            }
        else:
            print(f"No match found for {name} in {year}")
    
    return player_map


In [67]:
names = ["Patrick Mahomes", "C.J. Stroud", "Justin Jefferson"]
years = [2017, 2023, 2022]

player_map = get_player_id_map(df["name"], df["draft_year"])
print(player_map)

No match found for Henry Ruggs III in 2020
No match found for Michael Pittman Jr. in 2020
No match found for Laviska Shenault Jr. in 2020
No match found for KJ Hamler in 2020
No match found for AJ Dillon in 2020
No match found for Lynn Bowden Jr. in 2020
No match found for Anthony McFarland Jr. in 2020
No match found for Cole McDonald in 2020
No match found for Kenneth Walker III in 2022
No match found for Velus Jones Jr. in 2022
No match found for Brian Robinson Jr. in 2022
No match found for Calvin Austin III in 2022
No match found for Michael Woods II in 2022
No match found for Grant Dubose in 2023
No match found for Michael Penix in 2024
No match found for Brian Thomas in 2024
No match found for Audric Estime in 2024
No match found for Joe Milton in 2024
{'00-0036442': {'name': 'Joe Burrow', 'rookie_year': 2020, 'position': 'QB', 'team': 'CIN', 'height_in': None, 'weight_lb': None, 'age_on_draft_day': None}, '00-0036212': {'name': 'Tua Tagovailoa', 'rookie_year': 2020, 'position': 

In [68]:
def fill_master_with_player_info(master_df: pd.DataFrame, player_map: dict) -> pd.DataFrame:
    """
    Fill in player_id and other info in master DataFrame from player_map
    """
    master_df = master_df.copy()
    
    # Create DataFrame from player_map
    player_info_df = pd.DataFrame.from_dict(player_map, orient='index')
    player_info_df.reset_index(inplace=True)
    player_info_df.rename(columns={"index": "player_id"}, inplace=True)
    
    # Merge with master on 'name'
    merged_df = master_df.merge(player_info_df, on="name", how="left")
    
    return merged_df


In [69]:
df = fill_master_with_player_info(df, player_map)

In [70]:
# Get player stats for specific season
def get_rookie_stats(player_info_map: dict):
    all_stats = []
    
    for player_id, info in player_info_map.items():
        rookie_year = info["rookie_year"]
        seasonal_stats = nfl.import_seasonal_data([rookie_year])
        
        player_stats = seasonal_stats[seasonal_stats["player_id"] == player_id].copy()
        player_stats["player_name"] = info["name"]
        player_stats["position"] = info["position"]
        player_stats["team"] = info["team"]
        all_stats.append(player_stats)
    
    # Combine all players into a single DataFrame
    return pd.concat(all_stats, ignore_index=True)

In [71]:
rookie_stats = get_rookie_stats(player_map)

print(rookie_stats[:5])

    player_id  season season_type  completions  attempts  passing_yards  \
0  00-0036442    2020         REG          264       404         2688.0   
1  00-0036212    2020         REG          186       290         1814.0   
2  00-0036355    2020         REG          396       595         4336.0   
3  00-0036407    2020         REG            0         0            0.0   
4  00-0036358    2020         REG            0         0            0.0   

   passing_tds  interceptions  sacks  sack_yards  ...    rtd_sh    rfd_sh  \
0           13            5.0   32.0       231.0  ...  0.000000  0.000000   
1           11            5.0   20.0       136.0  ...  0.000000  0.000000   
2           31           10.0   32.0       218.0  ...  0.000000  0.000000   
3            0            0.0    0.0         0.0  ...  0.142857  0.221591   
4            0            0.0    1.0         4.0  ...  0.200000  0.199134   

   rtdfd_sh       dom     w8dom    yptmpa    ppr_sh     player_name  position  \
0  0.

In [28]:
# First, let's see what columns are available
seasonal_stats = nfl.import_seasonal_data([2023])
print("Available columns:")
print(seasonal_stats.columns.tolist())
roster_data = nfl.import_seasonal_rosters([2023])
print(roster_data.columns.tolist())

Available columns:
['player_id', 'season', 'season_type', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr_x', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr', 'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh', 'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh']
['season', 'team', 'position', 'depth_chart_positi

## Fantasy Scoring 
Assumptions for Half-PPR + TE Premium: 
- Passing TD: 4 points
- Passing Yards: 1 point per 25 yards
- Interceptions: -2 points
- Rushing TD: 6 points
- Rushing Yards: 1 point per 10 yards
- Receptions: 0.5 points (Half PPR)
- Receiving TD: 6 points
- Receiving Yards: 1 point per 10 yards
- Fumbles Lost: -2 points
- Tight End Premium: +1 point per reception if the player is a TE

In [26]:
def calculate_halfppr_tep(stats: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate Half-PPR fantasy points with Tight End Premium.

    Args:
        stats: Seasonal stats including player_id, position, receptions, etc.

    Returns:
        pd.DataFrame: Original DataFrame with new 'fantasy_points_halfppr_tep' column
    """
    df = stats.copy()
    
    # Base fantasy points
    df['fantasy_points_halfppr_tep'] = (
        df.get('passing_yards', 0) / 25 +
        df.get('passing_tds', 0) * 4 +
        df.get('interceptions', 0) * -2 +
        df.get('rushing_yards', 0) / 10 +
        df.get('rushing_tds', 0) * 6 +
        df.get('receiving_yards', 0) / 10 +
        df.get('receiving_tds', 0) * 6 +
        df.get('receptions', 0) * 0.5 +
        df.get('rushing_fumbles_lost', 0) * -2 +
        df.get('receiving_fumbles_lost', 0) * -2
    )
    
    # Tight End Premium: +1 point per reception for TEs
    if 'position' in df.columns:
        df.loc[df['position'] == 'TE', 'fantasy_points_halfppr_tep'] += df.loc[df['position'] == 'TE', 'receptions']
    
    return df


In [34]:
rookie_fnt_pts = calculate_halfppr_tep(rookie_stats)
print(rookie_fnt_pts[['player_name', 'position', 'fantasy_points_halfppr_tep']])

        player_name position  fantasy_points_halfppr_tep
0   Patrick Mahomes       QB                       10.36
1  Justin Jefferson       WR                      302.66


## Add Rookie Data to master data frame  

In [None]:
def add_rookie_stats_to_master(master_df: pd.DataFrame, rookie_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Only adds the following stats:
    Rec, RushYds, RecYds, PassYds, RushTD, RecTD, PassTD, Fumbles, INT

    """
    # Select relevant columns from rookie stats
    rookie_columns_map = {
        'receptions': 'Rec',
        'rushing_yards': 'RushYds',
        'receiving_yards': 'RecYds',
        'passing_yards': 'PassYds',
        'rushing_tds': 'RushTD',
        'receiving_tds': 'RecTD',
        'passing_tds': 'PassTD',
        'rushing_fumbles_lost': 'Fumbles',
        'receiving_fumbles_lost': 'Fumbles',  # Combine fumbles into single column
        'interceptions': 'INT'
    }

    # Copy rookie stats to avoid modifying original
    rookie_copy = rookie_stats.copy()

    # Combine fumbles from rushing and receiving
    rookie_copy['Fumbles'] = rookie_copy.get('rushing_fumbles_lost', 0) + rookie_copy.get('receiving_fumbles_lost', 0)

    # Build new DataFrame with player_id + mapped columns
    rookie_for_merge = pd.DataFrame({'player_id': rookie_copy['player_id']})
    
    for col, new_col in rookie_columns_map.items():
        if col in rookie_copy.columns:
            rookie_for_merge[new_col] = rookie_copy[col]

    # Drop duplicates in case a stat was counted twice (like fumbles)
    rookie_for_merge = rookie_for_merge.groupby('player_id', as_index=False).sum()

    # Merge rookie stats into master df on player_id
    merged_df = master_df.merge(rookie_for_merge, on='player_id', how='left')

    return merged_df


## College Stats: Using CFBD API
https://api.collegefootballdata.com/#/

In [74]:

API_KEY = "jhx47mf3+GY8ZsAD51GHBMi7THo3VtKVg6MXZ+AkXNsvS+MoM0/5qz+F3N5hjS7B"
BASE_URL = "https://api.collegefootballdata.com"
HEADERS = {"Authorization": f"Bearer {API_KEY}"}


In [86]:
def get_player_season_stats(year: int, team: str = None, position: str = None) -> pd.DataFrame:
    """Pull season-level stats for all players in a given year."""
    url = f"{BASE_URL}/stats/player/season"
    params = {"year": year}
    if team: params["team"] = team
    if position: params["position"] = position
    
    r = requests.get(url, headers=HEADERS, params=params)
    df = pd.DataFrame(r.json())
    
    # Convert stat (string) to numeric
    df["stat"] = pd.to_numeric(df["stat"], errors="coerce")
    
    df_wide = df.pivot_table(
        index=["playerId", "player", "position", "team", "conference"],
        columns="statType",
        values="stat",
        aggfunc="sum"   # if multiple rows per statType, sum them
    ).reset_index()
    
    return df_wide

In [None]:
def get_player_usage(year: int) -> pd.DataFrame:
    url = f"{BASE_URL}/player/usage"
    params = {"year": year}
    r = requests.get(url, headers=HEADERS, params=params)
    df = pd.DataFrame(r.json())
    
    # Expand nested "usage" dictionary into columns
    usage_df = df.drop(columns=["usage"]).join(pd.json_normalize(df["usage"]))
    return usage_df


In [None]:
def normalize_stats(season_stats, usage_stats):
    df = season_stats.merge(usage_stats, left_on=["playerId", "season"], right_on=["id", "season"], how="left")
    
    # Example normalizations
    if "overall" in df:
        df["yards_per_snap"] = df["receivingYards"] / df["overall"]
        df["tds_per_snap"] = (df["receivingTDs"] + df["rushingTDs"] + df["passingTDs"]) / df["overall"]
    
    # Per game as fallback
    if "games" in df:
        df["yards_per_game"] = df["receivingYards"] / df["games"]
    
    return df


In [None]:
# usage
players = ["Marvin Harrison Jr", "Brock Bowers"]
year = 2023

season_stats = get_player_season_stats(year)
usage_stats = get_player_usage(year)

# Filter to your players of interest
season_stats = season_stats[season_stats["player_name"].isin(players)]

# Normalize
final_df = normalize_player_stats(season_stats, usage_stats)
print(final_df.head())

TypeError: string indices must be integers, not 'str'