In [1]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [2]:
import pandas as pd
from src.config import KNOWLEDGE_DIR

df = pd.read_parquet(KNOWLEDGE_DIR / "snippet_embeddings.parquet")
df.head()

Unnamed: 0,snippet,embedding
0,"Tom Brady (QB, NE), Week 1, 2018 vs. HOU: | Pa...","[-0.025262221693992615, 0.012066834606230259, ..."
1,"Tom Brady (QB, NE), Week 2, 2018 vs. JAX: | Pa...","[-0.008193491958081722, 0.03533443436026573, 0..."
2,"Tom Brady (QB, NE), Week 3, 2018 vs. DET: | Pa...","[-0.008198426105082035, 0.03153906390070915, 0..."
3,"Tom Brady (QB, NE), Week 4, 2018 vs. MIA: | Pa...","[-0.010116074234247208, 0.029954370111227036, ..."
4,"Tom Brady (QB, NE), Week 5, 2018 vs. IND: | Pa...","[-0.016648583114147186, 0.05683114379644394, 0..."


In [12]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

In [4]:
index_name = "weekly-stats"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
snippets = df["snippet"].tolist()
embeddings = df["embedding"].tolist()

batch_size = 100
for i in range(0, len(snippets), batch_size):
    batch_ids = [str(i) for i in range(i, i+batch_size)]
    batch_embeddings = embeddings[i:i+batch_size]
    batch_snippets = snippets[i:i+batch_size]

    vectors = [
        {"id": batch_ids[k], "values": batch_embeddings[k], "metadata": {"text": batch_snippets[k]}}
        for k in range(len(batch_embeddings))
    ]

    index.upsert(vectors=vectors)

In [6]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

query = "Josh Allen rushing yards week 5"
query_embedding = client.embeddings.create(
    model="text-embedding-3-small",
    input=query
).data[0].embedding

results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True
)

for match in results.matches:
    print(match.score, match.metadata["text"])

0.778427601 Josh Allen (QB, BUF), Week 5, 2021 vs. KC: | Passing: 15/26, 315.0 yards, 3 TDs, 0.0 INTs, 14.434711516484073 EPA) | Rushing: 11 carries, 59.0 yards, 0.0 fumbles, 1 TDs, 4.874372204649262 EPA | Fantasy (PPR): 36.5 pts | Depth Chart: 1 | 57.0 offensive snaps, 1.0 offensive snap %, 0.0 defensive snaps, 0.0 defensive snap %, 0.0 special teams snaps, 0.0 special teams snap %
0.774203241 Josh Allen (QB, BUF), Week 5, 2022 vs. PIT: | Passing: 20/31, 424.0 yards, 4 TDs, 1.0 INTs, 24.854640499632122 EPA) | Rushing: 5 carries, 42.0 yards, 0.0 fumbles, 0 TDs, 2.8632336995215155 EPA | Fantasy (PPR): 35.2 pts | Depth Chart: 1 | 48.0 offensive snaps, 0.86 offensive snap %, 0.0 defensive snaps, 0.0 defensive snap %, 0.0 special teams snaps, 0.0 special teams snap %
0.770329297 Josh Allen (QB, BUF), Week 5, 2023 vs. JAX: | Passing: 27/40, 359.0 yards, 2 TDs, 1.0 INTs, 14.9453127423767 EPA) | Rushing: 4 carries, 14.0 yards, 0.0 fumbles, 1 TDs, 2.275630131654907 EPA | Fantasy (PPR): 27.8 pt

In [3]:
from src.config import RAW_DIR

pbp = pd.read_parquet(RAW_DIR / "pbp_2018_2024.parquet")

In [5]:
pd.set_option("display.max_columns", None)

pbp.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,drive,sp,qtr,down,goal_to_go,time,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,shotgun,no_huddle,qb_dropback,qb_kneel,qb_spike,qb_scramble,pass_length,pass_location,air_yards,yards_after_catch,run_location,run_gap,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,home_timeouts_remaining,away_timeouts_remaining,timeout,timeout_team,td_team,td_player_name,td_player_id,posteam_timeouts_remaining,defteam_timeouts_remaining,total_home_score,total_away_score,posteam_score,defteam_score,score_differential,posteam_score_post,defteam_score_post,score_differential_post,no_score_prob,opp_fg_prob,opp_safety_prob,opp_td_prob,fg_prob,safety_prob,td_prob,extra_point_prob,two_point_conversion_prob,ep,epa,total_home_epa,total_away_epa,total_home_rush_epa,total_away_rush_epa,total_home_pass_epa,total_away_pass_epa,air_epa,yac_epa,comp_air_epa,comp_yac_epa,total_home_comp_air_epa,total_away_comp_air_epa,total_home_comp_yac_epa,total_away_comp_yac_epa,total_home_raw_air_epa,total_away_raw_air_epa,total_home_raw_yac_epa,total_away_raw_yac_epa,wp,def_wp,home_wp,away_wp,wpa,vegas_wpa,vegas_home_wpa,home_wp_post,away_wp_post,vegas_wp,vegas_home_wp,total_home_rush_wpa,total_away_rush_wpa,total_home_pass_wpa,total_away_pass_wpa,air_wpa,yac_wpa,comp_air_wpa,comp_yac_wpa,total_home_comp_air_wpa,total_away_comp_air_wpa,total_home_comp_yac_wpa,total_away_comp_yac_wpa,total_home_raw_air_wpa,total_away_raw_air_wpa,total_home_raw_yac_wpa,total_away_raw_yac_wpa,punt_blocked,first_down_rush,first_down_pass,first_down_penalty,third_down_converted,third_down_failed,fourth_down_converted,fourth_down_failed,incomplete_pass,touchback,interception,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,kickoff_inside_twenty,kickoff_in_endzone,kickoff_out_of_bounds,kickoff_downed,kickoff_fair_catch,fumble_forced,fumble_not_forced,fumble_out_of_bounds,solo_tackle,safety,penalty,tackled_for_loss,fumble_lost,own_kickoff_recovery,own_kickoff_recovery_td,qb_hit,rush_attempt,pass_attempt,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,kickoff_attempt,punt_attempt,fumble,complete_pass,assist_tackle,lateral_reception,lateral_rush,lateral_return,lateral_recovery,passer_player_id,passer_player_name,passing_yards,receiver_player_id,receiver_player_name,receiving_yards,rusher_player_id,rusher_player_name,rushing_yards,lateral_receiver_player_id,lateral_receiver_player_name,lateral_receiving_yards,lateral_rusher_player_id,lateral_rusher_player_name,lateral_rushing_yards,lateral_sack_player_id,lateral_sack_player_name,interception_player_id,interception_player_name,lateral_interception_player_id,lateral_interception_player_name,punt_returner_player_id,punt_returner_player_name,lateral_punt_returner_player_id,lateral_punt_returner_player_name,kickoff_returner_player_name,kickoff_returner_player_id,lateral_kickoff_returner_player_id,lateral_kickoff_returner_player_name,punter_player_id,punter_player_name,kicker_player_name,kicker_player_id,own_kickoff_recovery_player_id,own_kickoff_recovery_player_name,blocked_player_id,blocked_player_name,tackle_for_loss_1_player_id,tackle_for_loss_1_player_name,tackle_for_loss_2_player_id,tackle_for_loss_2_player_name,qb_hit_1_player_id,qb_hit_1_player_name,qb_hit_2_player_id,qb_hit_2_player_name,forced_fumble_player_1_team,forced_fumble_player_1_player_id,forced_fumble_player_1_player_name,forced_fumble_player_2_team,forced_fumble_player_2_player_id,forced_fumble_player_2_player_name,solo_tackle_1_team,solo_tackle_2_team,solo_tackle_1_player_id,solo_tackle_2_player_id,solo_tackle_1_player_name,solo_tackle_2_player_name,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,assist_tackle_2_team,assist_tackle_3_player_id,assist_tackle_3_player_name,assist_tackle_3_team,assist_tackle_4_player_id,assist_tackle_4_player_name,assist_tackle_4_team,tackle_with_assist,tackle_with_assist_1_player_id,tackle_with_assist_1_player_name,tackle_with_assist_1_team,tackle_with_assist_2_player_id,tackle_with_assist_2_player_name,tackle_with_assist_2_team,pass_defense_1_player_id,pass_defense_1_player_name,pass_defense_2_player_id,pass_defense_2_player_name,fumbled_1_team,fumbled_1_player_id,fumbled_1_player_name,fumbled_2_player_id,fumbled_2_player_name,fumbled_2_team,fumble_recovery_1_team,fumble_recovery_1_yards,fumble_recovery_1_player_id,fumble_recovery_1_player_name,fumble_recovery_2_team,fumble_recovery_2_yards,fumble_recovery_2_player_id,fumble_recovery_2_player_name,sack_player_id,sack_player_name,half_sack_1_player_id,half_sack_1_player_name,half_sack_2_player_id,half_sack_2_player_name,return_team,return_yards,penalty_team,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv,safety_player_name,safety_player_id,season,cp,cpoe,series,series_success,series_result,order_sequence,start_time,time_of_day,stadium,weather,nfl_api_id,play_clock,play_deleted,play_type_nfl,special_teams_play,st_play_type,end_clock_time,end_yard_line,fixed_drive,fixed_drive_result,drive_real_start_time,drive_play_count,drive_time_of_possession,drive_first_downs,drive_inside20,drive_ended_with_score,drive_quarter_start,drive_quarter_end,drive_yards_penalized,drive_start_transition,drive_end_transition,drive_game_clock_start,drive_game_clock_end,drive_start_yard_line,drive_end_yard_line,drive_play_id_started,drive_play_id_ended,away_score,home_score,location,result,total,spread_line,total_line,div_game,roof,surface,temp,wind,home_coach,away_coach,stadium_id,game_stadium,aborted_play,success,passer,passer_jersey_number,rusher,rusher_jersey_number,receiver,receiver_jersey_number,pass,rush,first_down,special,play,passer_id,rusher_id,receiver_id,name,jersey_number,id,fantasy_player_name,fantasy_player_id,fantasy,fantasy_id,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,1.0,2018_01_ATL_PHI,2018090600,PHI,ATL,REG,1,,,,,,2018-09-06,900.0,1800.0,3600.0,Half1,0.0,,0.0,1.0,,0,15:00,PHI 35,0.0,,GAME,,,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,3.0,3.0,,,,,,,,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.770222,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433208,0.566792,0.566792,0.433208,-0.0,-0.0,0.0,,,0.487451,0.512549,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,2018,,,1.0,1.0,First down,1.0,"9/6/18, 21:05:29",,Lincoln Financial Field,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",10012018-0906-0018-4b62-2991b9bec18b,0,0.0,GAME_START,0.0,,,,1.0,Turnover on downs,,,,,,,,,,,,,,,,,,12,18,Home,6,30,1.0,44.5,0,outdoors,grass,81.0,8.0,Doug Pederson,Dan Quinn,PHI00,Lincoln Financial Field,0.0,0.0,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.0,-0.0,,,,,,,
1,37.0,2018_01_ATL_PHI,2018090600,PHI,ATL,REG,1,ATL,away,PHI,PHI,35.0,2018-09-06,900.0,1800.0,3600.0,Half1,0.0,1.0,0.0,1.0,,0,15:00,PHI 35,0.0,73.0,4-J.Elliott kicks 65 yards from PHI 35 to end ...,kickoff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,65.0,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004568,0.143585,0.002325,0.275986,0.215226,0.003265,0.355046,0.0,0.0,0.770222,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433208,0.566792,0.566792,0.433208,-0.0,-0.0,0.0,0.566792,0.433208,0.487451,0.512549,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,J.Elliott,00-0033787,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ATL,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,,,2018,,,1.0,1.0,First down,37.0,"9/6/18, 21:05:29",2018-09-07T01:05:29Z,Lincoln Financial Field,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",10012018-0906-0018-4b62-2991b9bec18b,0,0.0,KICK_OFF,1.0,,,,1.0,Turnover on downs,2018-09-07T01:05:29Z,10.0,4:09,3.0,1.0,0.0,1.0,1.0,-5.0,KICKOFF,DOWNS,15:00,10:51,ATL 25,PHI 1,37.0,278.0,12,18,Home,6,30,1.0,44.5,0,outdoors,grass,81.0,8.0,Doug Pederson,Dan Quinn,PHI00,Lincoln Financial Field,0.0,0.0,,,,,,,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,0.0,0.0,-0.0,,,,,,,
2,52.0,2018_01_ATL_PHI,2018090600,PHI,ATL,REG,1,ATL,away,PHI,ATL,75.0,2018-09-06,900.0,1800.0,3600.0,Half1,0.0,1.0,0.0,1.0,1.0,0,15:00,ATL 25,10.0,73.0,"(15:00) PENALTY on ATL-82-L.Paulsen, False Sta...",no_play,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004568,0.143585,0.002325,0.275986,0.215226,0.003265,0.355046,0.0,0.0,0.770222,-0.773778,0.773778,-0.773778,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433208,0.566792,0.566792,0.433208,-0.022318,-0.035972,0.035972,0.58911,0.41089,0.487451,0.512549,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,ATL,00-0027215,L.Paulsen,5.0,0.0,,False Start,0.0,0.0,0.0,0.0,,,2018,,,1.0,1.0,First down,52.0,"9/6/18, 21:05:29",2018-09-07T01:06:23Z,Lincoln Financial Field,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",10012018-0906-0018-4b62-2991b9bec18b,0,0.0,PENALTY,0.0,,,,1.0,Turnover on downs,2018-09-07T01:05:29Z,10.0,4:09,3.0,1.0,0.0,1.0,1.0,-5.0,KICKOFF,DOWNS,15:00,10:51,ATL 25,PHI 1,37.0,278.0,12,18,Home,6,30,1.0,44.5,0,outdoors,grass,81.0,8.0,Doug Pederson,Dan Quinn,PHI00,Lincoln Financial Field,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,0.0,0.0,-0.773778,,,,,,0.479781,
3,75.0,2018_01_ATL_PHI,2018090600,PHI,ATL,REG,1,ATL,away,PHI,ATL,80.0,2018-09-06,900.0,1800.0,3600.0,Half1,0.0,1.0,0.0,1.0,1.0,0,15:00,ATL 20,15.0,73.0,(15:00) 2-M.Ryan pass short right to 11-J.Jone...,pass,10.0,0.0,0.0,1.0,0.0,0.0,0.0,short,right,8.0,2.0,,,,,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004851,0.195573,0.003698,0.30456,0.174841,0.003477,0.313001,0.0,0.0,-0.003556,0.850118,-0.07634,0.07634,0.0,0.0,-0.850118,0.850118,0.321212,0.528906,0.321212,0.528906,-0.321212,0.321212,-0.528906,0.528906,-0.321212,0.321212,-0.528906,0.528906,0.41089,0.58911,0.58911,0.41089,0.009866,0.019719,-0.019719,0.579244,0.420756,0.451479,0.548521,0.0,0.0,-0.009866,0.009866,0.0,0.009866,0.0,0.009866,0.0,0.0,-0.009866,0.009866,0.0,0.0,-0.009866,0.009866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,00-0026143,M.Ryan,10.0,00-0027944,J.Jones,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,PHI,,00-0026990,,M.Jenkins,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,,,2018,0.713305,28.669465,1.0,1.0,First down,75.0,"9/6/18, 21:05:29",2018-09-07T01:07:18Z,Lincoln Financial Field,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",10012018-0906-0018-4b62-2991b9bec18b,0,0.0,PASS,0.0,,,,1.0,Turnover on downs,2018-09-07T01:05:29Z,10.0,4:09,3.0,1.0,0.0,1.0,1.0,-5.0,KICKOFF,DOWNS,15:00,10:51,ATL 25,PHI 1,37.0,278.0,12,18,Home,6,30,1.0,44.5,0,outdoors,grass,81.0,8.0,Doug Pederson,Dan Quinn,PHI00,Lincoln Financial Field,0.0,1.0,M.Ryan,2.0,,,J.Jones,11.0,1.0,0.0,0.0,0.0,1.0,00-0026143,,00-0027944,M.Ryan,2.0,00-0026143,J.Jones,00-0027944,J.Jones,00-0027944,1.0,0.0,0.850118,0.564953,3.515878,2.0,0.998706,0.147457,0.587117,41.288257
4,104.0,2018_01_ATL_PHI,2018090600,PHI,ATL,REG,1,ATL,away,PHI,ATL,70.0,2018-09-06,862.0,1762.0,3562.0,Half1,0.0,1.0,0.0,1.0,2.0,0,14:22,ATL 30,5.0,73.0,(14:22) 11-J.Jones left end pushed ob at ATL 4...,run,11.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,left,end,,,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004832,0.147758,0.00244,0.268651,0.210675,0.003249,0.362393,0.0,0.0,0.846563,1.005722,-1.082063,1.082063,-1.005722,1.005722,-0.850118,0.850118,,,0.0,0.0,-0.321212,0.321212,-0.528906,0.528906,-0.321212,0.321212,-0.528906,0.528906,0.420756,0.579244,0.579244,0.420756,0.034143,0.038261,-0.038261,0.545101,0.454899,0.471198,0.528802,-0.034143,0.034143,-0.009866,0.009866,,,0.0,0.0,0.0,0.0,-0.009866,0.009866,0.0,0.0,-0.009866,0.009866,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,00-0027944,J.Jones,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,PHI,,00-0033876,,D.Barnett,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,,,2018,,,1.0,1.0,First down,104.0,"9/6/18, 21:05:29",2018-09-07T01:07:58Z,Lincoln Financial Field,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",10012018-0906-0018-4b62-2991b9bec18b,0,0.0,RUSH,0.0,,,,1.0,Turnover on downs,2018-09-07T01:05:29Z,10.0,4:09,3.0,1.0,0.0,1.0,1.0,-5.0,KICKOFF,DOWNS,15:00,10:51,ATL 25,PHI 1,37.0,278.0,12,18,Home,6,30,1.0,44.5,0,outdoors,grass,81.0,8.0,Doug Pederson,Dan Quinn,PHI00,Lincoln Financial Field,0.0,1.0,,,J.Jones,11.0,,,0.0,1.0,1.0,0.0,1.0,,00-0027944,,J.Jones,11.0,00-0027944,J.Jones,00-0027944,J.Jones,00-0027944,1.0,0.0,1.005722,,,,,,0.498259,-49.825913


In [16]:
pbp["game_date"] = pd.to_datetime(pbp["game_date"], errors="coerce")

In [4]:
import pandas as pd

# Ensure all columns exist to avoid KeyErrors
columns_needed = ["desc","week","game_date","posteam","defteam","drive","qtr",
                  "down","ydstogo","yardline_100","play_type","success",
                  "yards_gained","air_yards","yards_after_catch","epa",
                  "stadium","weather"]

for col in columns_needed:
    if col not in pbp.columns:
        pbp[col] = ""

# Build labeled columns
pbp["week_str"] = "Week: " + pbp["week"].astype(str)
pbp["game_date_str"] = "Game Date: " + pbp["game_date"].astype(str)
pbp["posteam_str"] = "Offense: " + pbp["posteam"].astype(str)
pbp["defteam_str"] = "Defense: " + pbp["defteam"].astype(str)
pbp["drive_str"] = "Drive: " + pbp["drive"].astype(str)
pbp["qtr_str"] = "Quarter: " + pbp["qtr"].astype(str)
pbp["down_distance_str"] = "Down & Distance: " + pbp["down"].fillna("").astype(str) + " & " + pbp["ydstogo"].fillna("").astype(str)
pbp["field_pos_str"] = "Yardline: " + pbp["yardline_100"].fillna("").astype(str) + " to opponent endzone"
pbp["play_type_str"] = "Type: " + pbp["play_type"].astype(str)
pbp["success_str"] = "Success: " + pbp["success"].fillna("").astype(str)
pbp["yards_gained_str"] = "Yards gained: " + pbp["yards_gained"].fillna("").astype(str)
pbp["air_yards_str"] = "Air yards: " + pbp["air_yards"].fillna("").astype(str)
pbp["yac_str"] = "Yards after catch: " + pbp["yards_after_catch"].fillna("").astype(str)
pbp["epa_str"] = "EPA: " + pbp["epa"].fillna("").astype(str)
pbp["stadium_str"] = "Stadium: " + pbp["stadium"].fillna("").astype(str)
pbp["weather_str"] = "Weather: " + pbp["weather"].fillna("").astype(str)

# Combine all labeled columns into snippet
snippet_cols = ["desc", "week_str", "game_date_str", "posteam_str", "defteam_str",
                "drive_str", "qtr_str", "down_distance_str", "field_pos_str",
                "play_type_str", "success_str", "yards_gained_str", "air_yards_str",
                "yac_str", "epa_str", "stadium_str", "weather_str"]

# Vectorized join (fast for large datasets)
pbp["snippet"] = pbp[snippet_cols].agg(" | ".join, axis=1)

In [5]:
pbp_snippets = pbp["snippet"].tolist()

In [12]:
len(pbp_snippets)

340587

In [14]:
import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-3-small")

token_counts = pbp["snippet"].apply(lambda x: len(encoding.encode(x)))
pbp["num_tokens"] = token_counts
token_counts.sum()

np.int64(59767850)

In [15]:
total_tokens = token_counts.sum()
cost = total_tokens / 1000 * 0.0004
cost

np.float64(23.907140000000002)

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(pbp_snippets, show_progress_bar=True)

Batches: 100%|██████████| 10644/10644 [03:35<00:00, 49.32it/s]


In [7]:
embeddings

array([[-0.02441123,  0.01675814, -0.04312323, ..., -0.01906752,
        -0.08080325,  0.07056724],
       [ 0.01406047,  0.02960094,  0.00769014, ..., -0.03021434,
        -0.0508137 ,  0.06035511],
       [-0.05648218,  0.03164314, -0.06800178, ..., -0.04469106,
        -0.11255363,  0.06848561],
       ...,
       [-0.11566669,  0.0683311 , -0.00183371, ..., -0.05022172,
        -0.05815786,  0.09861643],
       [-0.11634778,  0.06026205, -0.00589712, ..., -0.05590302,
        -0.05801969,  0.10094178],
       [-0.0352438 ,  0.00464839, -0.0230139 , ...,  0.03155459,
        -0.0557024 ,  0.04580555]], shape=(340587, 384), dtype=float32)

In [9]:
from src.config import RAW_DIR

schedules = pd.read_parquet(RAW_DIR / "schedules.parquet")
pd.set_option("display.max_columns", None)
schedules.tail()

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,home_score,location,result,total,overtime,old_game_id,gsis,nfl_detail_id,pfr,pff,espn,ftn,away_rest,home_rest,away_moneyline,home_moneyline,spread_line,away_spread_odds,home_spread_odds,total_line,under_odds,over_odds,div_game,roof,surface,temp,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
7258,2025_18_DAL_NYG,2025,REG,18,2026-01-04,Sunday,13:00,DAL,,NYG,,Home,,,,2025010411,,,202501040nyg,,,,6,6,-125.0,105.0,-1.5,-110.0,-110.0,,,,1,outdoors,fieldturf,,,,,,,Brian Schottenheimer,Brian Daboll,,NYC01,MetLife Stadium
7259,2025_18_WAS_PHI,2025,REG,18,2026-01-04,Sunday,13:00,WAS,,PHI,,Home,,,,2025010412,,,202501040phi,,,,6,6,180.0,-218.0,4.5,-110.0,-110.0,,,,1,outdoors,grass,,,,,,,Dan Quinn,Nick Sirianni,,PHI00,Lincoln Financial Field
7260,2025_18_BAL_PIT,2025,REG,18,2026-01-04,Sunday,13:00,BAL,,PIT,,Home,,,,2025010413,,,202501040pit,,,,10,10,-218.0,180.0,-4.5,-110.0,-110.0,,,,1,outdoors,grass,,,,,,,John Harbaugh,Mike Tomlin,,PIT00,Acrisure Stadium
7261,2025_18_SEA_SF,2025,REG,18,2026-01-04,Sunday,13:00,SEA,,SF,,Home,,,,2025010414,,,202501040sfo,,,,9,5,180.0,-218.0,4.5,-110.0,-110.0,44.5,-105.0,-115.0,1,outdoors,grass,,,,,,,Mike Macdonald,Kyle Shanahan,,SFO01,Levi's Stadium
7262,2025_18_CAR_TB,2025,REG,18,2026-01-04,Sunday,13:00,CAR,,TB,,Home,,,,2025010415,,,202501040tam,,,,6,6,250.0,-310.0,7.0,-115.0,-105.0,,,,1,outdoors,grass,,,,,,,Dave Canales,Todd Bowles,,TAM00,Raymond James Stadium


In [17]:
row_strings = schedules.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in schedules.columns]), axis=1).tolist()
row_strings

['game_id: 1999_01_MIN_ATL | season: 1999 | game_type: REG | week: 1 | gameday: 1999-09-12 | weekday: Sunday | gametime: None | away_team: MIN | away_score: 17.0 | home_team: ATL | home_score: 14.0 | location: Home | result: -3.0 | total: 31.0 | overtime: 0.0 | old_game_id: 1999091210 | gsis: 598.0 | nfl_detail_id: None | pfr: 199909120atl | pff: nan | espn: 190912001 | ftn: nan | away_rest: 7 | home_rest: 7 | away_moneyline: nan | home_moneyline: nan | spread_line: -4.0 | away_spread_odds: nan | home_spread_odds: nan | total_line: 49.0 | under_odds: nan | over_odds: nan | div_game: 0 | roof: dome | surface: astroturf | temp: nan | wind: nan | away_qb_id: 00-0003761 | home_qb_id: 00-0002876 | away_qb_name: Randall Cunningham | home_qb_name: Chris Chandler | away_coach: Dennis Green | home_coach: Dan Reeves | referee: Gerry Austin | stadium_id: ATL00 | stadium: Georgia Dome',
 'game_id: 1999_01_KC_CHI | season: 1999 | game_type: REG | week: 1 | gameday: 1999-09-12 | weekday: Sunday | ga

In [18]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
row_embeddings = model.encode(row_strings, show_progress_bar=True)

Batches: 100%|██████████| 227/227 [00:06<00:00, 34.74it/s]


In [15]:
row_embeddings

array([[-0.04990634, -0.00449834, -0.0171146 , ...,  0.00329445,
        -0.0997844 ,  0.06891218],
       [-0.05120961,  0.01345685, -0.02529473, ..., -0.01139584,
        -0.07887076,  0.05797613],
       [-0.04920378,  0.00961506, -0.0217108 , ..., -0.0006668 ,
        -0.06712744,  0.05631376],
       ...,
       [-0.03819719, -0.00413516, -0.0035859 , ..., -0.01706785,
        -0.06488162,  0.05688783],
       [-0.03227217, -0.00254776, -0.01315786, ..., -0.00855806,
        -0.07974361,  0.06551852],
       [-0.03197099, -0.00823523,  0.00215451, ..., -0.0106712 ,
        -0.06561203,  0.05588233]], shape=(7263, 384), dtype=float32)

In [9]:
index_name = "pbp"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

In [None]:
batch_size = 100
for i in range(0, len(pbp_snippets), batch_size):
    batch_ids = [str(i) for i in range(i, i+batch_size)]
    batch_embeddings = embeddings[i:i+batch_size]
    batch_snippets = pbp_snippets[i:i+batch_size]

    vectors = [
        {"id": batch_ids[k], "values": batch_embeddings[k], "metadata": {"text": batch_snippets[k]}}
        for k in range(len(batch_embeddings))
    ]

    index.upsert(vectors=vectors)

In [13]:
index_name = "schedules"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

In [19]:
batch_size = 100
for i in range(0, len(row_strings), batch_size):
    batch_ids = [str(i) for i in range(i, i+batch_size)]
    batch_embeddings = row_embeddings[i:i+batch_size]
    batch_snippets = row_strings[i:i+batch_size]

    vectors = [
        {"id": batch_ids[k], "values": batch_embeddings[k], "metadata": {"text": batch_snippets[k]}}
        for k in range(len(batch_embeddings))
    ]

    index.upsert(vectors=vectors)

In [22]:
injuries = pd.read_parquet(RAW_DIR / "injuries_2018_2024.parquet")

injury_strings = injuries.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in injuries.columns]), axis=1).tolist()
injury_embeddings = model.encode(injury_strings, show_progress_bar=True)

index_name = "injuries"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

batch_size = 100
for i in range(0, len(injury_strings), batch_size):
    batch_ids = [str(i) for i in range(i, i+batch_size)]
    batch_embeddings = injury_embeddings[i:i+batch_size]
    batch_snippets = injury_strings[i:i+batch_size]

    vectors = [
        {"id": batch_ids[k], "values": batch_embeddings[k], "metadata": {"text": batch_snippets[k]}}
        for k in range(len(batch_embeddings))
    ]

    index.upsert(vectors=vectors)

Batches: 100%|██████████| 1228/1228 [00:20<00:00, 58.77it/s]


In [24]:
from src.config import PROCESSED_DIR

profiles = pd.read_parquet(PROCESSED_DIR / "player_profiles.parquet")

profile_strings = profiles.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in profiles.columns]), axis=1).tolist()
profile_embeddings = model.encode(profile_strings, show_progress_bar=True)

index_name = "player-profiles"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

batch_size = 100
for i in range(0, len(profile_strings), batch_size):
    batch_ids = [str(i) for i in range(i, i+batch_size)]
    batch_embeddings = profile_embeddings[i:i+batch_size]
    batch_snippets = profile_strings[i:i+batch_size]

    vectors = [
        {"id": batch_ids[k], "values": batch_embeddings[k], "metadata": {"text": batch_snippets[k]}}
        for k in range(len(batch_embeddings))
    ]

    index.upsert(vectors=vectors)

Batches: 100%|██████████| 767/767 [00:22<00:00, 34.54it/s]
