In [65]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys

# Make the repository root (the parent of the working directory) importable so
# the `src.*` modules below resolve. Guarded so that re-running this cell does
# not keep appending the same entry to sys.path.
_repo_parent = os.path.abspath("../")
if _repo_parent not in sys.path:
    sys.path.append(_repo_parent)


import faiss
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Raise pandas' rendering limits so wide frames are shown with more columns,
# a wider line width and more rows before truncation.
_display_options = {
    "display.max_columns": 100,
    "display.width": 1000,
    "display.max_rows": 400,
}
for _option, _value in _display_options.items():
    pd.set_option(_option, _value)


# Load settings from a .env file via python-dotenv.
# NOTE(review): presumably this supplies the API credentials used by the LLM
# clients later in the notebook — confirm which variables are required.
load_dotenv()


from src.data_utils import PodcastContainer, load_clean_scores
from src.player_utils import PlayerUtil
from src.utils import get_repo_root

# Load the cleaned per-player box scores for the 2022-23 and 2023-24 seasons
# and preview the first five rows.
seasons = ["2022-23", "2023-24"]
scores = load_clean_scores(seasons)
scores.head(5)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,fieldGoalsAttempted,fieldGoalsPercentage,threePointersMade,threePointersAttempted,threePointersPercentage,freeThrowsMade,freeThrowsAttempted,freeThrowsPercentage,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed
360551,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,200782,pj tucker,33.016667,3,5,0.6,0,2,0.0,0,0,0.0,2,2,4,0,0,1,2,2,6,-6,12.0,17.15,False
360552,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201143,al horford,23.1,2,7,0.286,2,5,0.4,0,0,0.0,1,4,5,1,0,0,0,4,6,8,13.75,35.15,False
360553,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201933,blake griffin,8.283333,0,2,0.0,0,1,0.0,1,2,0.5,2,3,5,1,0,0,0,3,1,-5,8.75,0.5,True
360554,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,201935,james harden,37.266667,9,14,0.643,5,9,0.556,12,12,1.0,0,8,8,7,0,0,3,3,35,1,54.0,44.05,True
360555,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,202699,tobias harris,34.233333,7,14,0.5,3,6,0.5,1,2,0.5,1,1,2,0,3,0,0,3,18,-1,26.5,25.95,True


In [45]:
# Distinct game dates present in the loaded scores.
scores["game_date"].unique()

<DatetimeArray>
['2023-10-24 00:00:00', '2023-10-25 00:00:00', '2023-10-26 00:00:00', '2023-10-27 00:00:00', '2023-10-28 00:00:00', '2023-10-29 00:00:00', '2023-10-30 00:00:00', '2023-10-31 00:00:00', '2023-11-01 00:00:00', '2023-11-02 00:00:00',
 ...
 '2024-04-03 00:00:00', '2024-04-04 00:00:00', '2024-04-05 00:00:00', '2024-04-06 00:00:00', '2024-04-07 00:00:00', '2024-04-09 00:00:00', '2024-04-10 00:00:00', '2024-04-11 00:00:00', '2024-04-12 00:00:00', '2024-04-14 00:00:00']
Length: 160, dtype: datetime64[ns]

In [62]:
from src.llm_feature_extractor import PromptFeatureExtractor

# Feature extractor that is run over the selected episodes in a later cell.
prompt_fe = PromptFeatureExtractor()

# Register the Rotowire transcript directory as the only podcast source.
rotowire_dir = get_repo_root() / "data/raw/DG RFB Transcripts/"
cont = PodcastContainer({"rotowire": rotowire_dir})

# Keep only episodes published in the half-open window [begin, end).
begin = pd.to_datetime("2023-10-22").date()
end = pd.to_datetime("2023-10-26").date()

all_episodes = cont.get_all_episodes()
published_in_window = (all_episodes.publication_date >= begin) & (
    all_episodes.publication_date < end
)
podcast_df = all_episodes[published_in_window]
podcast_df

Unnamed: 0,publication_date,file_name,file_path,content,duration,podcast_name
3,2023-10-23,western_conference_win_total_overunder_picks_w...,G:\My Drive\Columbia\Practical Deep Learning\F...,Welcome to the rotor wire fantasy basketball p...,4359,rotowire
4,2023-10-24,opening_week_preview_boom_or_bust_players_thre...,G:\My Drive\Columbia\Practical Deep Learning\F...,"Hello, friends, and welcome to the award winni...",2664,rotowire
5,2023-10-25,opening_night_takeaways_lebrons_workload_harde...,G:\My Drive\Columbia\Practical Deep Learning\F...,"Welcome to a regular season edition, Brandon K...",3264,rotowire


In [68]:
# podcast_df = cont.get_all_episodes()
# podcast_df = podcast_df[podcast_df.publication_date == pd.to_datetime('2023-10-24').date()]
# podcast_df

Unnamed: 0,publication_date,file_name,file_path,content,duration,podcast_name
4,2023-10-24,opening_week_preview_boom_or_bust_players_thre...,G:\My Drive\Columbia\Practical Deep Learning\F...,"Hello, friends, and welcome to the award winni...",2664,rotowire


In [71]:
# llm_feats2 = prompt_fe.extract_llm_feats(podcast_df)

In [63]:
# Run the prompt-based feature extractor over the filtered episodes.
# NOTE(review): presumably this issues LLM calls per episode and may be slow or
# billable — confirm before re-running.
llm_feats = prompt_fe.extract_llm_feats(podcast_df)

In [77]:
# Collapse the extracted features to one row per (podcast, player, episode
# date): mention counts are summed and the two flag columns are averaged.
group_keys = ["podcast_name", "personName", "podcast_date"]
feats = (
    llm_feats.groupby(group_keys)
    .agg(
        mentions=("mentions", "sum"),
        increased_playing_time=("increased_playing_time", "mean"),
        trending_upwards=("trending_upwards", "mean"),
    )
    .reset_index()
)

In [None]:
# TODO join feats with the box score dataframe

# TODO compute AUC with just the trending_upwards column


# Train XGBoost model



In [None]:
from sklearn.model_selection import train_test_split
from src.model_xg import add_lagged_features
from xgboost import XGBRegressor

# Box-score columns for which lagged features are generated.
lag_features = [
    "minutes",
    "fieldGoalsAttempted",
    "fieldGoalsPercentage",
    "threePointersAttempted",
    "threePointersPercentage",
    "freeThrowsAttempted",
    "freeThrowsPercentage",
    "reboundsDefensive",
    "reboundsTotal",
    "assists",
    "steals",
    "blocks",
    "turnovers",
    "foulsPersonal",
    "points",
]

# add_lagged_features returns the augmented frame and the names of the columns
# it added. NOTE(review): the third argument (5) is presumably the lag window —
# confirm against src.model_xg.
df_xg, new_feats = add_lagged_features(scores, lag_features, 5)
# Drop every row that has a missing value (presumably rows without enough
# history to fill all lags — confirm).
df_xg = df_xg.dropna()

# ---- Create train/test dataset ----
# NOTE(review): plusMinusPoints comes from the same game as the target, so it
# would not be known before tip-off — confirm it is meant to be a feature.
select_features = ["plusMinusPoints"] + new_feats
target_col = "fantasyPoints"

# Positional 80/20 split: the first 80% of rows train the model and the
# remaining 20% are held out. The size is computed from df_xg itself.
# NOTE(review): this is only a chronological split if df_xg is ordered by
# game_date — confirm the ordering produced by add_lagged_features.
train_size = int(len(df_xg) * 0.8)
train_df = df_xg.iloc[:train_size]
test_df = df_xg.iloc[train_size:]

X_train, X_test = train_df[select_features], test_df[select_features]
y_train, y_test = train_df[target_col], test_df[target_col]

# ---- Train XGBoost model ----
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# ---- Evaluate the model ----
y_pred = model.predict(X_test)

display(df_xg)

# Baseline: mean absolute error of the existing projectedFantasyPoints column
# on the held-out rows, printed next to the model's MAE on the same rows.
print(
    "Sliding Window MAE:", np.mean(np.abs(y_test - test_df["projectedFantasyPoints"]))
)
print("Model MAE:", np.mean(np.abs(y_test - y_pred)))

In [None]:
----

In [86]:
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel

# Prompt template asking the model for a per-player CSV; the trailing "{}" slot
# receives the podcast transcript via str.format.
system_prompt = """You are a specialized NBA analyst. Analyze the following text for NBA player mentions, including pronouns and nicknames. Provide a precise, structured analysis.

Requirements:
1. Identify all NBA players active in the 2023 season mentioned. 
2. Do not use player nicknames in the output, use their real names instead.
3. Count total mentions for each player
4. Analyze whether a player is likely to see increased playing time in upcoming games
5. Analyze whether a player is likely to outperform or trending upwards in upcoming games


Return the results as a CSV with the header:
player, mentions, increased_playing_time, trending_upwards

Context:
{}
"""

# temperature=0 keeps sampling randomness to a minimum.
llm = ChatOpenAI(temperature=0)

# `podcast_text` is not assigned in this cell; it must already exist in the
# kernel when this runs.
filled_prompt = system_prompt.format(podcast_text)
response = llm.invoke(filled_prompt)

In [141]:
from io import StringIO

import pandas as pd
from src.player_utils import normalize_name

# Parse the model's CSV reply into a per-player feature frame.
raw_csv = response.content.strip()

# Some models wrap the table in a Markdown fence (```csv ... ```). Drop the
# opening fence line and, when present, everything from the closing fence on.
if raw_csv.startswith("```"):
    raw_csv = raw_csv.partition("\n")[2]
    if "```" in raw_csv:
        raw_csv = raw_csv.rpartition("```")[0]

# skipinitialspace tolerates "a, b, c" spacing such as the header requested in
# the prompt, which would otherwise yield column names with a leading space.
df = pd.read_csv(StringIO(raw_csv), skipinitialspace=True)

# Header spelling varies between replies ("Increased Playing Time" vs
# "increased_playing_time"); normalise to lower snake_case before indexing.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

df["player"] = df["player"].apply(normalize_name)

# Encode the two flag columns as 1 for "yes" (any case, surrounding whitespace
# ignored) and 0 for anything else, including missing values.
for flag_col in ("increased_playing_time", "trending_upwards"):
    is_yes = df[flag_col].astype(str).str.strip().str.lower() == "yes"
    df[flag_col] = np.where(is_yes, 1, 0)

# Stamp every row with the episode date as a midnight Timestamp. `pub_date` is
# not assigned in this cell; it must already exist in the kernel.
df["podcast_date"] = pd.to_datetime(pd.to_datetime(pub_date).date())

df.head(20)

Unnamed: 0,player,mentions,increased_playing_time,trending_upwards,podcast_date
0,damian lillard,3,0,1,2023-10-27
1,tyrese maxey,1,1,1,2023-10-27
2,kelly oubre,1,0,0,2023-10-27
3,james harden,6,0,0,2023-10-27
4,joel embiid,4,0,0,2023-10-27
5,anthony davis,3,1,1,2023-10-27
6,miles turner,2,1,1,2023-10-27
7,brandon miller,1,0,0,2023-10-27
8,xavier tillman,3,1,1,2023-10-27
9,santi aldama,1,0,0,2023-10-27


In [95]:
# Distinct game dates in `scores`, for comparison with the podcast date.
scores["game_date"].unique()

<DatetimeArray>
['2023-10-24 00:00:00', '2023-10-25 00:00:00', '2023-10-26 00:00:00',
 '2023-10-27 00:00:00', '2023-10-28 00:00:00', '2023-10-29 00:00:00',
 '2023-10-30 00:00:00', '2023-10-31 00:00:00', '2023-11-01 00:00:00',
 '2023-11-02 00:00:00',
 ...
 '2024-04-03 00:00:00', '2024-04-04 00:00:00', '2024-04-05 00:00:00',
 '2024-04-06 00:00:00', '2024-04-07 00:00:00', '2024-04-09 00:00:00',
 '2024-04-10 00:00:00', '2024-04-11 00:00:00', '2024-04-12 00:00:00',
 '2024-04-14 00:00:00']
Length: 160, dtype: datetime64[ns]

In [109]:
# Display the publication date that is stamped onto the parsed LLM rows.
# NOTE(review): `pub_date` is not assigned in any visible cell — confirm where
# it comes from so the notebook survives a fresh kernel.
pub_date

datetime.date(2023, 10, 27)

In [143]:
# Games played from 2023-10-27 through 2023-10-31, both ends inclusive.
in_window = scores.game_date.between("2023-10-27", "2023-10-31")
s = scores[in_window]

# Optional name-overlap check between box scores and LLM output:
# set(s.personName) & set(df.player.str.lower())
s.game_date

392548   2023-10-27
392549   2023-10-27
392550   2023-10-27
392551   2023-10-27
392552   2023-10-27
            ...    
393603   2023-10-31
393604   2023-10-31
393605   2023-10-31
393606   2023-10-31
393607   2023-10-31
Name: game_date, Length: 1060, dtype: datetime64[ns]

In [142]:
# Podcast dates on the LLM frame; this is the right-hand key of the as-of merge.
df["podcast_date"]

0     2023-10-27
1     2023-10-27
2     2023-10-27
3     2023-10-27
4     2023-10-27
         ...    
168   2023-10-27
169   2023-10-27
170   2023-10-27
171   2023-10-27
172   2023-10-27
Name: podcast_date, Length: 173, dtype: datetime64[ns]

In [146]:
# As-of join: each game row is matched, per player, to a podcast row whose
# podcast_date is within five days of the game date (merge_asof's default
# direction looks backward, i.e. podcast on or before the game).
max_podcast_age = pd.Timedelta("5d")
m = pd.merge_asof(
    left=s,
    right=df,
    left_on="game_date",
    right_on="podcast_date",
    left_by="personName",
    right_by="player",
    tolerance=max_podcast_age,
)
m

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,...,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed,player,mentions,increased_playing_time,trending_upwards,podcast_date
0,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,202330,gordon hayward,33.866667,6,...,19,-10,40.75,33.550000,True,,,,,NaT
1,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,202692,alec burks,30.750000,5,...,24,28,36.00,20.650000,True,,,,,NaT
2,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,203925,joe harris,11.316667,1,...,2,3,5.50,7.950000,False,,,,,NaT
3,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1626179,terry rozier,34.700000,8,...,20,-11,26.25,24.200000,True,,,,,NaT
4,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1628373,frank ntilikina,0.000000,0,...,0,0,0.00,11.100000,False,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,2023-24,2023-10-31,22300114,LAC vs. ORL,clippers,clippers,1631217,moussa diabate,2.916667,1,...,2,2,5.75,2.850000,True,,,,,NaT
1056,2023-24,2023-10-31,22300114,ORL @ LAC,magic,magic,1641710,anthony black,3.933333,0,...,0,-1,1.50,3.500000,False,,,,,NaT
1057,2023-24,2023-10-31,22300114,ORL @ LAC,magic,magic,1641724,jett howard,3.933333,1,...,2,-1,3.25,0.500000,True,,,,,NaT
1058,2023-24,2023-10-31,22300114,LAC vs. ORL,clippers,clippers,1641738,kobe brown,19.033333,3,...,7,11,14.25,3.833333,True,,,,,NaT


In [149]:
# First 40 merged rows for games played on 2023-10-27.
played_on_oct_27 = m.game_date == "2023-10-27"
m[played_on_oct_27].head(40)

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,...,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed,player,mentions,increased_playing_time,trending_upwards,podcast_date
0,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,202330,gordon hayward,33.866667,6,...,19,-10,40.75,33.55,True,,,,,NaT
1,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,202692,alec burks,30.75,5,...,24,28,36.0,20.65,True,,,,,NaT
2,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,203925,joe harris,11.316667,1,...,2,3,5.5,7.95,False,,,,,NaT
3,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1626179,terry rozier,34.7,8,...,20,-11,26.25,24.2,True,,,,,NaT
4,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1628373,frank ntilikina,0.0,0,...,0,0,0.0,11.1,False,,,,,NaT
5,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,1628963,marvin bagley iii,13.15,5,...,14,6,20.5,13.7,True,,,,,NaT
6,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1628970,miles bridges,0.0,0,...,0,0,0.0,30.55,False,,,,,NaT
7,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1629023,pj washington,28.983333,6,...,13,-12,26.25,39.55,False,,,,,NaT
8,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1630163,lamelo ball,33.066667,4,...,20,-7,48.75,47.65,True,,,,,NaT
9,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,1630164,james wiseman,0.0,0,...,0,0,0.0,16.3,False,,,,,NaT


In [147]:
# Game rows for which the as-of merge found a matching podcast entry.
has_podcast_match = m["player"].notnull()
m[has_podcast_match]

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,...,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed,player,mentions,increased_playing_time,trending_upwards,podcast_date
15,2023-24,2023-10-27,22300077,DET @ CHA,pistons,pistons,1630595,cade cunningham,24.133333,4,...,12,7,23.0,33.7,False,cade cunningham,2.0,1.0,1.0,2023-10-27
24,2023-24,2023-10-27,22300077,CHA vs. DET,hornets,hornets,1641706,brandon miller,33.416667,6,...,17,-2,30.0,19.25,True,brandon miller,1.0,0.0,0.0,2023-10-27
44,2023-24,2023-10-27,22300078,MEM vs. DEN,grizzlies,grizzlies,1630214,xavier tillman,34.05,5,...,13,-5,33.5,19.15,True,xavier tillman,3.0,1.0,1.0,2023-10-27
131,2023-24,2023-10-27,22300081,OKC @ CLE,thunder,thunder,1630198,isaiah joe,22.3,2,...,5,12,11.25,16.0,False,isaiah joe,2.0,1.0,1.0,2023-10-27
140,2023-24,2023-10-27,22300081,OKC @ CLE,thunder,thunder,1631096,chet holmgren,30.85,5,...,16,6,52.25,22.0,True,chet holmgren,4.0,0.0,0.0,2023-10-27
150,2023-24,2023-10-27,22300082,CHI vs. TOR,bulls,bulls,203083,andre drummond,12.333333,3,...,8,1,9.5,10.75,False,andre drummond,2.0,1.0,1.0,2023-10-27
224,2023-24,2023-10-27,22300084,BKN @ DAL,nets,nets,1630560,cam thomas,33.166667,12,...,30,3,45.75,19.15,True,cam thomas,4.0,1.0,1.0,2023-10-27
288,2023-24,2023-10-27,22300087,GSW @ SAC,warriors,warriors,101108,chris paul,33.166667,5,...,10,2,36.5,34.25,True,chris paul,2.0,1.0,1.0,2023-10-27
318,2023-24,2023-10-28,22300088,CHI @ DET,bulls,bulls,203083,andre drummond,14.466667,1,...,2,-5,19.25,10.45,True,andre drummond,2.0,1.0,1.0,2023-10-27
331,2023-24,2023-10-28,22300088,DET vs. CHI,pistons,pistons,1630595,cade cunningham,39.616667,9,...,25,10,43.5,33.4,True,cade cunningham,2.0,1.0,1.0,2023-10-27


{'andre drummond',
 'brandon miller',
 'cade cunningham',
 'cam thomas',
 'chet holmgren',
 'chris paul',
 'isaiah joe',
 'xavier tillman'}

Unnamed: 0,player,mentions,increased_playing_time,trending_upwards
0,Damian Lillard,3.0,no,yes
1,Tyrese Maxey,1.0,yes,yes
2,Kelly Oubre,1.0,no,no
3,James Harden,6.0,no,no
4,Joel Embiid,3.0,no,no
5,Terrence Mann,1.0,no,no
6,Norm Powell,1.0,no,no
7,Bones Hyland,1.0,no,no
8,Anthony Davis,1.0,no,no
9,Miles Turner,2.0,no,no


In [73]:
import spacy

# Load SpaCy's large English pipeline and run it over the transcript.
nlp = spacy.load("en_core_web_lg")
doc = nlp(podcast_text)

# Count the entities labelled PERSON whose text is exactly the target name.
target = "James Harden"
person_hits = [
    ent for ent in doc.ents if ent.label_ == "PERSON" and ent.text == target
]
count = len(person_hits)
count

2

In [None]:
-----

In [None]:
from langchain_google_vertexai import ChatVertexAI

# The transcript is sent as the user turn.
prompt = podcast_text

# `system_prompt` ends with a "{}" slot meant for the transcript. Because the
# transcript travels in the user message here, fill the slot with a pointer to
# it rather than sending a literal "{}" to the model.
messages = [
    {
        "role": "system",
        "content": system_prompt.format("(transcript provided in the user message)"),
    },
    {"role": "user", "content": prompt},
]

# temperature=0 keeps sampling randomness to a minimum.
llm = ChatVertexAI(model="gemini-1.5-flash", temperature=0)
response = llm.invoke(messages)

print("Response:", response.content)

In [19]:
# Inspect the raw reply text before parsing it as CSV.
response.content

"```csv\nPlayer,Mentions,Increased Playing Time,Trending Upwards\nDamian Lillard,4,No,No\nJames Harden,6,No,No\nTyrese Maxey,1,Yes,Yes\nKelly Oubre,1,No,No\nJoel Embiid,2,No,No\nTerrence Mann,1,No,No\nNorm Powell,1,No,No\nBones Hyland,1,No,No\nBogdanovic,1,No,No\nAnthony Davis,2,No,No\nLeBron James,1,No,No\nKevin Durant,2,No,No\nYousef Nurkic,1,No,No\nEric Gordon,3,Yes,No\nJordan Goodwin,1,Yes,Yes\nJay Eric Gordon,1,No,No\nBradley Beal,3,No,No\nMiles Turner,2,No,No\nBrandon Miller,2,No,No\nXavier Tillman,3,Yes,Yes\nSanti Aldama,2,Yes,Yes\nJaren Jackson,1,No,No\nJa Morant,1,No,No\nKobe White,3,Yes,Yes\nDerek Lively,4,Yes,Yes\nMaxi Kleber,1,No,No\nKyrie Irving,1,No,No\nJason Kidd,1,No,No\nJalen Johnson,4,Yes,Yes\nSadik Bey,3,No,No\nDeandre Hunter,2,No,No\nDonovic Mitchell,1,No,No\nAJ Griffin,2,No,No\nCam Thomas,6,Yes,Yes\nRoyce O'Neil,1,No,No\nDorian Finney Smith,1,No,No\nBen Simmons,2,No,No\nNick Claxton,2,No,No\nCam Johnson,2,No,No\nSpencer Dinwiddie,1,No,No\nMax Struss,5,Yes,Yes\nJare

In [4]:
# Smoke test: check that the OpenAI chat client answers a trivial question.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

prompt = "What is the capital of France?"

messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that answers questions.",
    },
    {"role": "user", "content": prompt},
]
# Send the full message list so the system message is actually delivered;
# invoking with the bare prompt string would silently drop it.
response = llm.invoke(messages)
response.content

'The capital of France is Paris.'

In [2]:
from langchain_google_vertexai import ChatVertexAI

# Smoke test: check that the Vertex AI chat client answers a trivial question.
llm = ChatVertexAI(model="gemini-1.5-flash", temperature=0)

prompt = "What is the capital of France?"
system_message = {
    "role": "system",
    "content": "You are a helpful assistant that answers questions.",
}
user_message = {"role": "user", "content": prompt}
messages = [system_message, user_message]

response = llm.invoke(messages)
print(f"Response: {response.content}")

Response: The capital of France is **Paris**. 

