In [1]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

In [2]:
import pandas as pd
from src.config import RAW_DIR, PROCESSED_DIR

weekly_stats = pd.read_parquet(PROCESSED_DIR / "player_stats_2018_2024.parquet")
injuries = pd.read_parquet(RAW_DIR / "injuries_2018_2024.parquet")
profiles = pd.read_parquet(PROCESSED_DIR / "player_profiles.parquet")

In [3]:
profiles["allpro"]

0        NaN
1        0.0
2        NaN
3        NaN
4        NaN
        ... 
24527    NaN
24528    NaN
24529    NaN
24530    NaN
24531    NaN
Name: allpro, Length: 24532, dtype: float64

In [4]:
profiles.columns

Index(['gsis_id', 'name', 'common_first_name', 'first_name', 'last_name',
       'short_name', 'football_name', 'suffix', 'esb_id', 'nfl_id', 'pfr_id',
       'pff_id', 'otc_id', 'espn_id', 'smart_id', 'birth_date',
       'position_group', 'position', 'ngs_position_group', 'ngs_position',
       'height', 'weight', 'headshot', 'college_name', 'college_conference',
       'jersey_number', 'rookie_season', 'last_season', 'latest_team',
       'status', 'ngs_status', 'ngs_status_short_description',
       'years_of_experience', 'pff_position', 'pff_status', 'draft_year',
       'draft_round', 'draft_pick', 'draft_team', 'season', 'round', 'pick',
       'team', 'pfr_player_id', 'cfb_player_id', 'hof', 'category', 'side',
       'college', 'age', 'to', 'allpro', 'probowls', 'seasons_started', 'w_av',
       'car_av', 'dr_av', 'games', 'pass_completions', 'pass_attempts',
       'pass_yards', 'pass_tds', 'pass_ints', 'rush_atts', 'rush_yards',
       'rush_tds', 'receptions', 'rec_yards', 

In [5]:
injuries.head()

Unnamed: 0,season,game_type,team,week,gsis_id,position,full_name,first_name,last_name,report_primary_injury,report_secondary_injury,report_status,practice_primary_injury,practice_secondary_injury,practice_status,date_modified
0,2018.0,REG,ARI,1.0,00-0034687,T,Korey Cunningham,Korey,Cunningham,,,,Thumb,,Full Participation in Practice,2018-09-07 13:18:55+00:00
1,2018.0,REG,ARI,1.0,00-0027686,DT,Corey Peters,Corey,Peters,,,,Knee,,Full Participation in Practice,2018-09-07 13:19:02+00:00
2,2018.0,REG,ARI,1.0,00-0032127,DE,Markus Golden,Markus,Golden,Knee,,Out,Knee,,Did Not Participate In Practice,2018-09-07 13:20:00+00:00
3,2018.0,REG,ARI,1.0,00-0027873,TE,Jermaine Gresham,Jermaine,Gresham,Achilles,,Questionable,Achilles,,Limited Participation in Practice,2018-09-07 13:19:11+00:00
4,2018.0,REG,ARI,1.0,00-0033789,RB,T.J. Logan,T.J.,Logan,Ankle,,Questionable,Ankle,,Limited Participation in Practice,2018-09-07 13:19:11+00:00


In [6]:
injuries.columns

Index(['season', 'game_type', 'team', 'week', 'gsis_id', 'position',
       'full_name', 'first_name', 'last_name', 'report_primary_injury',
       'report_secondary_injury', 'report_status', 'practice_primary_injury',
       'practice_secondary_injury', 'practice_status', 'date_modified'],
      dtype='object')

In [7]:
import random
import pandas as pd

def generate_weekly_stats_queries(df, n_samples=100):

    stat_templates = {
        "completions": "How many passes did {player} complete in Week {week}, {season}?",
        "attempts": "How many passes did {player} attempt in Week {week}, {season}?",
        "passing_yards": "How many passing yards did {player} have in Week {week}, {season}?",
        "passing_tds": "How many passing touchdowns did {player} throw in Week {week}, {season}?",
        "interceptions": "How many interceptions did {player} throw in Week {week}, {season}?",
        "sacks": "How many times was {player} sacked in Week {week}, {season}?",
        "carries": "How many rushing attempts did {player} have in Week {week}, {season}?",
        "rushing_yards": "How many rushing yards did {player} have in Week {week}, {season}?",
        "rushing_tds": "How many rushing touchdowns did {player} score in Week {week}, {season}?",
        "receptions": "How many catches did {player} have in Week {week}, {season}?",
        "targets": "How many targets did {player} have in Week {week}, {season}?",
        "receiving_yards": "How many receiving yards did {player} have in Week {week}, {season}?",
        "receiving_tds": "How many receiving touchdowns did {player} score in Week {week}, {season}?",
        "fantasy_points": "How many fantasy points did {player} score in Week {week}, {season}?",
        "fantasy_points_ppr": "How many PPR fantasy points did {player} score in Week {week}, {season}?"
    } 

    qa_pairs = []
    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["player_display_name"]
        week = int(row["week"])
        season = int(row["season"])

        col = random.choice([c for c in stat_templates.keys() if pd.notnull(row[c])])
        val = row[col]

        q = stat_templates[col].format(player=player, week=week, season=season)

        qa_pairs.append({
            "question": q,
            "ground_truth": str(int(val)) if isinstance(val, (int,float)) and pd.notnull(val) else str(val),
            "metadata": {
                "player": player,
                "week": week,
                "season": season,
                "stat": col
            }
        })

    return pd.DataFrame(qa_pairs)

def generate_profile_queries(df, n_samples=100):
    queries = []

    profile_fields = [
        ("college", "Where did {player} play college football?"),
        ("draft_team", "Which team drafted {player}?"),
        ("draft_year", "What year was {player} drafted?"),
        ("position", "What position does {player} play?"),
        ("latest_team", "Which team does {player} currently play for?"),
        ("rookie_season", "What year did {player} enter the NFL?"),
        ("height", "How tall is {player}?"),
        ("weight", "What is the weight of {player}?"),
        ("draft_round", "What round was {player} drafted?"),
        ("draft_pick", "What pick was {player} in the draft?"),
        ("allpro", "How many times has {player} been named an All-Pro?"),
        ("jersey_number", "What number does {player} wear?")
    ]

    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["name"]
        field, template = random.choice([f for f in profile_fields if pd.notnull(row[f[0]])])
        query_text = template.format(player=player)
        truth_value = row[field]

        queries.append({
            "query": query_text,
            "truth_lookup": lambda df, r=row, f=field: r[f],
            "eval_fn": evaluate_text_answer,
            "index": "profiles"
        })

    return queries

def generate_injury_queries(df, n_samples=100):
    """
    Generates queries for the injuries index.
    """
    queries = []

    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["full_name"]
        week = int(row["week"])
        season = int(row["season"])
        injury_status = row["report_status"]

        # Only generate if status exists
        if pd.notnull(injury_status):
            query_text = f"What was {player}'s injury status in Week {week}, {season}?"
            queries.append({
                "query": query_text,
                "truth_lookup": lambda df, r=row: r["report_status"],
                "eval_fn": evaluate_label_answer,
                "index": "injuries"
            })

    return queries


In [8]:
injuries["report_status"].unique()

array([None, 'Out', 'Questionable', 'Doubtful', 'Note'], dtype=object)

In [9]:
import re
import random
import pandas as pd
from typing import Any, Dict
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from pinecone import Pinecone
from tavily import TavilyClient

from src.config import INDEX_DESCRIPTIONS
from src.rag_functions import answer_question

def evaluate_numeric_answer(pred: str, truth: float, tolerance: int = 5) -> Dict[str, Any]:
    try:
        pred_num = int(re.findall(r'\d+', pred)[0])
    except:
        return {"exact": 0, "within_tol": 0}
    exact = int(pred_num == truth)
    within_tol = int(abs(pred_num - truth) <= tolerance)
    return {"exact": exact, "within_tol": within_tol}

def evaluate_label_answer(pred: str, truth: str) -> Dict[str, Any]:
    pred_str = str(pred).lower()
    truth_str = str(truth).lower()

    statuses = ["out", "questionable", "doubtful"]

    found_status = None
    for s in statuses:
        if s in pred_str:
            found_status = s
            break
    
    exact = int(found_status == truth_str)
    return {"exact": exact}

def evaluate_text_answer(pred: str, truth: str) -> Dict[str, Any]:
    norm_pred, norm_truth = str(pred).lower(), str(truth).lower()
    exact = int(norm_pred == norm_truth)
    fuzzy = int(norm_truth in norm_pred or norm_pred in norm_truth)
    return {"exact": exact, "fuzzy": fuzzy}

def generate_weekly_stats_queries(df, n_samples=100):
    stat_templates = {
        "completions": "How many passes did {player} complete in Week {week}, {season}?",
        "attempts": "How many passes did {player} attempt in Week {week}, {season}?",
        "passing_yards": "How many passing yards did {player} have in Week {week}, {season}?",
        "passing_tds": "How many passing touchdowns did {player} throw in Week {week}, {season}?",
        "interceptions": "How many interceptions did {player} throw in Week {week}, {season}?",
        "sacks": "How many times was {player} sacked in Week {week}, {season}?",
        "carries": "How many rushing attempts did {player} have in Week {week}, {season}?",
        "rushing_yards": "How many rushing yards did {player} have in Week {week}, {season}?",
        "rushing_tds": "How many rushing touchdowns did {player} score in Week {week}, {season}?",
        "receptions": "How many catches did {player} have in Week {week}, {season}?",
        "targets": "How many targets did {player} have in Week {week}, {season}?",
        "receiving_yards": "How many receiving yards did {player} have in Week {week}, {season}?",
        "receiving_tds": "How many receiving touchdowns did {player} score in Week {week}, {season}?",
        "fantasy_points": "How many fantasy points did {player} score in Week {week}, {season}?",
        "fantasy_points_ppr": "How many PPR fantasy points did {player} score in Week {week}, {season}?"
    }

    queries = []
    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["player_display_name"]
        week = int(row["week"])
        season = int(row["season"])

        col = random.choice([c for c in stat_templates.keys() if pd.notnull(row[c])])
        query_text = stat_templates[col].format(player=player, week=week, season=season)

        queries.append({
            "query": query_text,
            "truth_lookup": lambda df, r=row, c=col: r[c],
            "eval_fn": evaluate_numeric_answer,
            "index": "weekly-stats"
        })

    return queries


def generate_profile_queries(df, n_samples=100):
    queries = []

    profile_fields = [
        ("college", "Where did {player} play college football?"),
        ("draft_team", "Which team drafted {player}?"),
        ("draft_year", "What year was {player} drafted?"),
        ("position", "What position does {player} play?"),
        ("latest_team", "Which team does {player} currently play for?"),
        ("rookie_season", "What year did {player} enter the NFL?"),
        ("height", "How tall is {player}?"),
        ("weight", "What is the weight of {player}?"),
        ("draft_round", "What round was {player} drafted?"),
        ("draft_pick", "What pick was {player} in the draft?"),
        ("allpro", "How many times has {player} been named an All-Pro?"),
        ("jersey_number", "What number does {player} wear?")
    ]

    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["name"]
        field, template = random.choice([f for f in profile_fields if pd.notnull(row[f[0]])])
        query_text = template.format(player=player)
        truth_value = row[field]

        queries.append({
            "query": query_text,
            "truth_lookup": lambda df, r=row, f=field: r[f],
            "eval_fn": evaluate_text_answer,
            "index": "profiles"
        })

    return queries

def generate_injury_queries(df, n_samples=100):
    """
    Generates queries for the injuries index.
    """
    queries = []

    for _ in range(n_samples):
        row = df.sample(1).iloc[0]
        player = row["full_name"]
        week = int(row["week"])
        season = int(row["season"])
        injury_status = row["report_status"]

        # Only generate if status exists
        if pd.notnull(injury_status):
            query_text = f"What was {player}'s injury status in Week {week}, {season}?"
            queries.append({
                "query": query_text,
                "truth_lookup": lambda df, r=row: r["report_status"],
                "eval_fn": evaluate_label_answer,
                "index": "injuries"
            })

    return queries

def run_rag(query: str, index: str) -> str:

    index_descriptions = INDEX_DESCRIPTIONS
    answer, results, source = answer_question(query=query,
                                              embedder=embedder,
                                              indices=indices,
                                              index_descriptions=index_descriptions,
                                              client=client,
                                              top_k=5,
                                              research_mode=False
                                            )
    return answer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
weekly_queries = generate_weekly_stats_queries(weekly_stats, n_samples=100)
profile_queries = generate_profile_queries(profiles, n_samples=100)
injury_queries = generate_injury_queries(injuries, n_samples=100)
all_queries = weekly_queries + profile_queries + injury_queries

In [11]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

indices = {
    "weekly_stats": pc.Index("weekly-stats"),
    "play_by_play": pc.Index("pbp"),
    "profiles": pc.Index("player-profiles"),
    "injuries": pc.Index("injuries"),
    "schedules": pc.Index("schedules")
}

In [12]:
results = []

embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = OpenAI(api_key=OPENAI_API_KEY)

for q in all_queries:
    # 1. Choose the correct dataframe for ground truth
    if q["index"] == "weekly-stats":
        df_lookup = weekly_stats
    elif q["index"] == "profiles":
        df_lookup = profiles
    else:
        df_lookup = injuries

    # 2. Get ground truth using the lambda
    truth = q["truth_lookup"](df_lookup)

    # 3. Run your RAG pipeline
    print(q["query"])
    prediction = run_rag(q["query"], q["index"])

    # 4. Evaluate prediction
    scores = q["eval_fn"](prediction, truth)

    # 5. Save everything
    results.append({
        "index": q["index"],
        "query": q["query"],
        "ground_truth": truth,
        "prediction": prediction,
        **scores
    })

How many rushing attempts did Velus Jones have in Week 5, 2022?
How many PPR fantasy points did Jamal Agnew score in Week 17, 2022?
How many catches did Melvin Gordon have in Week 1, 2021?
How many catches did DeVonta Smith have in Week 17, 2021?
How many passes did Sammy Watkins complete in Week 14, 2020?
How many interceptions did Mecole Hardman throw in Week 11, 2020?
How many passes did Equanimeous St. Brown complete in Week 4, 2022?
How many interceptions did Diontae Johnson throw in Week 18, 2024?
How many rushing attempts did Joseph Fortson have in Week 10, 2022?
How many passes did Ladd McConkey attempt in Week 8, 2024?
How many receiving yards did James Robinson have in Week 13, 2021?
How many passing yards did Courtland Sutton have in Week 8, 2021?
How many rushing attempts did Emanuel Wilson have in Week 3, 2023?
How many rushing yards did C.J. Stroud have in Week 8, 2024?
How many fantasy points did Collin Johnson score in Week 7, 2021?
How many rushing touchdowns did Nick 

In [13]:
df_results = pd.DataFrame(results)
df_results.to_csv(RAW_DIR / "rag_eval_results_full.csv", index=False)
df_results.head()

Unnamed: 0,index,query,ground_truth,prediction,exact,within_tol,fuzzy
0,weekly-stats,How many rushing attempts did Velus Jones have...,0.0,"Velus Jones had 0 rushing attempts in Week 5, ...",1,1.0,
1,weekly-stats,How many PPR fantasy points did Jamal Agnew sc...,0.8,Jamal Agnew scored 1.8 PPR fantasy points in W...,0,1.0,
2,weekly-stats,How many catches did Melvin Gordon have in Wee...,3.0,"Melvin Gordon had 0 catches in Week 1, 2021.",0,1.0,
3,weekly-stats,How many catches did DeVonta Smith have in Wee...,3.0,"In Week 17, 2021, DeVonta Smith had 3 receptions.",0,0.0,
4,weekly-stats,How many passes did Sammy Watkins complete in ...,0.0,"In Week 14, 2020, there is no data provided fo...",0,0.0,


In [14]:
df_results.tail()

Unnamed: 0,index,query,ground_truth,prediction,exact,within_tol,fuzzy
241,injuries,What was Anthony Levine's injury status in Wee...,Questionable,"In Week 17 of the 2018 season, Anthony Levine ...",1,,
242,injuries,What was Michael Mitchell's injury status in W...,Out,"In Week 15 of 2018, Michael Mitchell was repor...",1,,
243,injuries,What was Ronnie Perkins's injury status in Wee...,Questionable,"In Week 13 of the 2021 season, Ronnie Perkins ...",1,,
244,injuries,What was Mitchell Schwartz's injury status in ...,Questionable,Mitchell Schwartz did not have any reported in...,0,,
245,injuries,What was Tyquan Lewis's injury status in Week ...,Questionable,Tyquan Lewis was not reported to have any inju...,0,,


In [16]:
summary = df_results.groupby("index")[["exact", "within_tol", "fuzzy"]].mean()
summary

Unnamed: 0_level_0,exact,within_tol,fuzzy
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
injuries,0.586957,,
profiles,0.0,,0.14
weekly-stats,0.11,0.41,
