In [11]:
%load_ext autoreload
%autoreload 2
    
import os
import re
import sys

# Put the parent directory on the import path so the `src.*` imports below can resolve
# (presumably the notebook lives one level below the repo root — confirm).
sys.path.append(os.path.abspath("../"))


import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from dotenv import load_dotenv

import numpy as np
from langchain.chat_models import ChatOpenAI

# Read variables from a local .env file into the process environment
# (presumably the credentials the LLM clients need — confirm).
load_dotenv()


from src.data_utils import PodcastContainer, load_clean_scores
from src.player_utils import PlayerUtil
from src.utils import  get_repo_root

# Load the podcast transcripts into a DataFrame.
# (The former mid-cell `podcast_df.head(10)` was removed: only the last expression
# of a cell is displayed, so its result was computed and thrown away.)
cont = PodcastContainer()
podcast_df = cont.get_podcast_data()

# Per-player game logs, restricted to the 2023-24 season.
scores = load_clean_scores(['2023-24'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Distinct game dates present in the loaded scores.
scores["game_date"].unique()

<DatetimeArray>
['2023-10-24 00:00:00', '2023-10-25 00:00:00', '2023-10-26 00:00:00',
 '2023-10-27 00:00:00', '2023-10-28 00:00:00', '2023-10-29 00:00:00',
 '2023-10-30 00:00:00', '2023-10-31 00:00:00', '2023-11-01 00:00:00',
 '2023-11-02 00:00:00',
 ...
 '2024-04-03 00:00:00', '2024-04-04 00:00:00', '2024-04-05 00:00:00',
 '2024-04-06 00:00:00', '2024-04-07 00:00:00', '2024-04-09 00:00:00',
 '2024-04-10 00:00:00', '2024-04-11 00:00:00', '2024-04-12 00:00:00',
 '2024-04-14 00:00:00']
Length: 160, dtype: datetime64[ns]

In [37]:
# Show every game log for one player; the name is lower-cased before comparing.
# To restrict to a single day instead: d = scores[scores.game_date == '2023-11-04']
player_name = 'Xavier Tillman'.lower()
d = scores
d.loc[d["personName"] == player_name]

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,...,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed
392390,2023-24,2023-10-25,22300071,MEM vs. NOP,grizzlies,grizzlies,1630214,xavier tillman,33.700000,7,...,4,3,1,0,2,17,3,47.50,16.55,True
392592,2023-24,2023-10-27,22300078,MEM vs. DEN,grizzlies,grizzlies,1630214,xavier tillman,34.050000,5,...,2,3,2,5,5,13,-5,33.50,19.15,True
392903,2023-24,2023-10-28,22300089,MEM @ WAS,grizzlies,grizzlies,1630214,xavier tillman,23.733333,3,...,3,0,0,1,4,8,-14,20.75,22.55,False
393378,2023-24,2023-10-30,22300106,MEM vs. DAL,grizzlies,grizzlies,1630214,xavier tillman,28.866667,3,...,1,2,1,0,0,6,4,22.25,25.50,False
393892,2023-24,2023-11-01,22300125,MEM @ UTA,grizzlies,grizzlies,1630214,xavier tillman,13.850000,0,...,0,1,0,2,3,0,-7,6.00,24.80,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422689,2023-24,2024-04-07,22301134,BOS vs. POR,celtics,celtics,1630214,xavier tillman,17.200000,4,...,1,1,2,0,0,9,6,22.75,6.55,True
423069,2023-24,2024-04-09,22301148,BOS @ MIL,celtics,celtics,1630214,xavier tillman,18.416667,2,...,2,1,2,0,1,6,-15,15.00,9.90,True
423585,2023-24,2024-04-11,22301167,BOS vs. NYK,celtics,celtics,1630214,xavier tillman,6.050000,1,...,1,0,0,0,0,2,6,3.50,11.50,False
423735,2023-24,2024-04-12,22301173,BOS vs. CHA,celtics,celtics,1630214,xavier tillman,13.333333,1,...,1,2,0,1,3,2,-4,10.75,11.65,False


In [17]:
# Pick one episode by position and keep its publication date and transcript
# for the extraction cells that follow.
row = podcast_df.iloc[7]
pub_date = row["publication_date"]
podcast_text = row["content"]
row

publication_date                            2023-10-27 17:21:13+00:00
file_name            fantasy_basketball_waiver_wire_for_week_2_202324
file_path           G:\My Drive\Columbia\Practical Deep Learning\F...
content             Welcome to the award winning Fantasy Basketbal...
duration                                                         2875
Name: 7, dtype: object

In [75]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from langchain.chat_models import ChatOpenAI
from typing import List


# Structured result the LLM is asked to produce for a single player.
# Documented with `#` comments on purpose: a class docstring would become part of
# the pydantic schema and therefore change the format instructions sent to the model.
class PlayerAnalysis(BaseModel):
    player: str  # player's name as returned by the model
    mentions: int  # total number of mentions counted for this player
    increased_playing_time: bool  # model's judgement: likely to see more playing time
    trending_upwards: bool  # model's judgement: likely to outperform / trend upwards

# Top-level container handed to the output parser: all per-player analyses from one reply.
# (`#` comments only — a docstring would alter the schema sent to the model.)
class PlayerAnalysisList(BaseModel):
    players: List[PlayerAnalysis]  # one entry per identified player
    
# Output parser: supplies the format instructions for the prompt and turns the
# model's reply into a PlayerAnalysisList.
parser = PydanticOutputParser(pydantic_object=PlayerAnalysisList)

# Two placeholders: {format_instructions} is bound once below,
# {podcast_text} is supplied on every invocation.
system_prompt = """You are a specialized NBA analyst. Analyze the following text for NBA player mentions, including pronouns and nicknames. Provide a precise, structured analysis.

Requirements:
1. Identify all NBA players active in the 2023 season mentioned. Use their real names in the output.
3. Count total mentions for each player
4. Analyze whether a player is likely to see increased playing time in upcoming games
5. Analyze whether a player is likely to outperform or trending upwards in upcoming games

{format_instructions}

Context:
{podcast_text}
"""

llm = ChatOpenAI(temperature=0)

format_instructions = parser.get_format_instructions()
prompt_template = PromptTemplate(
    input_variables=['podcast_text'],
    partial_variables={'format_instructions': format_instructions},
    template=system_prompt,
)

# Pipeline: fill the prompt -> call the chat model -> parse into pydantic objects.
prompt_and_model = prompt_template | llm | parser
response = prompt_and_model.invoke({"podcast_text": podcast_text})

struct_players = response.players
struct_players

[PlayerAnalysis(player='Damian Lillard', mentions=1, increased_playing_time=False, trending_upwards=True),
 PlayerAnalysis(player='Tyrese Maxey', mentions=1, increased_playing_time=True, trending_upwards=True),
 PlayerAnalysis(player='Kelly Oubre', mentions=1, increased_playing_time=False, trending_upwards=False),
 PlayerAnalysis(player='James Harden', mentions=6, increased_playing_time=False, trending_upwards=False),
 PlayerAnalysis(player='Joel Embiid', mentions=4, increased_playing_time=False, trending_upwards=True),
 PlayerAnalysis(player='Terrence Mann', mentions=1, increased_playing_time=True, trending_upwards=True),
 PlayerAnalysis(player='Norm Powell', mentions=1, increased_playing_time=False, trending_upwards=False),
 PlayerAnalysis(player='Anthony Davis', mentions=1, increased_playing_time=True, trending_upwards=True),
 PlayerAnalysis(player='Miles Turner', mentions=2, increased_playing_time=True, trending_upwards=True),
 PlayerAnalysis(player='Brandon Miller', mentions=2, in

In [76]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from langchain.chat_models import ChatOpenAI
from typing import List

# Variant of the extraction prompt that asks for CSV output instead of JSON.
# The lone `{}` is a positional placeholder for the transcript.
system_prompt = """You are a specialized NBA analyst. Analyze the following text for NBA player mentions, including pronouns and nicknames. Provide a precise, structured analysis.

Requirements:
1. Identify all NBA players active in the 2023 season mentioned. Use their real names in the output.
3. Count total mentions for each player
4. Analyze whether a player is likely to see increased playing time in upcoming games
5. Analyze whether a player is likely to outperform or trending upwards in upcoming games


Return the results as a CSV with the header:
player, mentions, increased_playing_time, trending_upwards

Context:
{}
"""

llm = ChatOpenAI(temperature=0)

# Substitute the transcript into the template and send it as a single prompt string.
filled_prompt = system_prompt.format(podcast_text)
response = llm.invoke(filled_prompt)
import pandas as pd
from io import StringIO

# The model is asked for bare CSV, but chat models may wrap the answer in a Markdown
# code fence (```csv ... ```). Strip a leading/trailing fence so read_csv sees only the table;
# text without a fence passes through unchanged.
csv_text = re.sub(r"^```[a-zA-Z]*\s*\n|\n?```\s*$", "", response.content.strip())

# skipinitialspace: the requested header is "player, mentions, ..." (space after each
# comma), which would otherwise yield column names and values with a leading space.
df = pd.read_csv(StringIO(csv_text), skipinitialspace=True)
df

Unnamed: 0,player,mentions,increased_playing_time,trending_upwards
0,Damian Lillard,3,no,yes
1,Tyrese Maxey,1,yes,yes
2,Kelly Oubre,1,no,no
3,James Harden,6,no,no
4,Joel Embiid,4,no,no
...,...,...,...,...
140,Jalen Duran,1,yes,yes
141,Markelle Fultz,1,yes,yes
142,Andre Drummond,1,no,no
143,Victor Wim Biyana,1,no,no


Unnamed: 0,player,mentions,increased_playing_time,trending_upwards
0,Damian Lillard,3.0,no,yes
1,Tyrese Maxey,1.0,yes,yes
2,Kelly Oubre,1.0,no,no
3,James Harden,6.0,no,no
4,Joel Embiid,3.0,no,no
5,Terrence Mann,1.0,no,no
6,Norm Powell,1.0,no,no
7,Bones Hyland,1.0,no,no
8,Anthony Davis,1.0,no,no
9,Miles Turner,2.0,no,no


In [73]:
import spacy

# Run the large English spaCy pipeline over the transcript to obtain named entities.
nlp = spacy.load("en_core_web_lg")
doc = nlp(podcast_text)

# Count PERSON entities whose text is exactly the target name
# (an entity with any other text, e.g. a bare surname, is not counted).
target = "James Harden"
count = sum(
    ent.label_ == "PERSON" and ent.text == target
    for ent in doc.ents
)
count

2

In [None]:
-----

In [None]:

from langchain_google_vertexai import ChatVertexAI

# NOTE: `system_prompt` is not defined in this cell — it is whatever an earlier cell
# last assigned, and it is sent as-is (its placeholder is not filled in here).
# The transcript itself goes in the user turn.
prompt = podcast_text
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]

llm = ChatVertexAI(model="gemini-1.5-flash", temperature=0)
response = llm.invoke(messages)

print("Response:", response.content)

In [19]:
# Inspect the raw text of the most recent reply held in `response`.
response.content

"```csv\nPlayer,Mentions,Increased Playing Time,Trending Upwards\nDamian Lillard,4,No,No\nJames Harden,6,No,No\nTyrese Maxey,1,Yes,Yes\nKelly Oubre,1,No,No\nJoel Embiid,2,No,No\nTerrence Mann,1,No,No\nNorm Powell,1,No,No\nBones Hyland,1,No,No\nBogdanovic,1,No,No\nAnthony Davis,2,No,No\nLeBron James,1,No,No\nKevin Durant,2,No,No\nYousef Nurkic,1,No,No\nEric Gordon,3,Yes,No\nJordan Goodwin,1,Yes,Yes\nJay Eric Gordon,1,No,No\nBradley Beal,3,No,No\nMiles Turner,2,No,No\nBrandon Miller,2,No,No\nXavier Tillman,3,Yes,Yes\nSanti Aldama,2,Yes,Yes\nJaren Jackson,1,No,No\nJa Morant,1,No,No\nKobe White,3,Yes,Yes\nDerek Lively,4,Yes,Yes\nMaxi Kleber,1,No,No\nKyrie Irving,1,No,No\nJason Kidd,1,No,No\nJalen Johnson,4,Yes,Yes\nSadik Bey,3,No,No\nDeandre Hunter,2,No,No\nDonovic Mitchell,1,No,No\nAJ Griffin,2,No,No\nCam Thomas,6,Yes,Yes\nRoyce O'Neil,1,No,No\nDorian Finney Smith,1,No,No\nBen Simmons,2,No,No\nNick Claxton,2,No,No\nCam Johnson,2,No,No\nSpencer Dinwiddie,1,No,No\nMax Struss,5,Yes,Yes\nJare

In [4]:
# Smoke test: confirm the OpenAI chat model is reachable with a trivial question.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

prompt = "What is the capital of France?"

# System + user turns, in the same role/content dict shape used for the Vertex AI call.
messages = [
    {"role": "system", "content": "You are a helpful assistant that answers questions."},
    {"role": "user", "content": prompt}
]

# Send the full message list. Previously only `prompt` was passed, which left
# `messages` unused and silently dropped the system message.
response = llm.invoke(messages)
response.content

'The capital of France is Paris.'

In [2]:
from langchain_google_vertexai import ChatVertexAI

# Smoke test: ask Gemini on Vertex AI a trivial question to confirm the client works.
prompt = "What is the capital of France?"
messages = [
    dict(role="system", content="You are a helpful assistant that answers questions."),
    dict(role="user", content=prompt),
]

llm = ChatVertexAI(model="gemini-1.5-flash", temperature=0)
response = llm.invoke(messages)

print("Response:", response.content)

Response: The capital of France is **Paris**. 

