In [21]:
import pandas as pd
import time
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import boxscoretraditionalv2
from langchain.vectorstores import Chroma

pd.set_option('display.max_columns', None)

In [49]:
from dotenv import load_dotenv
load_dotenv('../../../openai.env')

True

# get data

In [None]:
dallas_id = teams.find_team_by_abbreviation('DAL')['id']
season_2122 = leaguegamefinder.LeagueGameFinder(team_id_nullable=dallas_id, season_nullable='2021-22', season_type_nullable=leaguegamefinder.SeasonTypeNullable.regular)
season = season_2122.get_data_frames()[0]
print(season.shape)
season.head()

In [32]:
season['metadata_source'] = season.GAME_DATE.astype(str) + ' ' + season.MATCHUP
game_id_to_source = season.set_index('GAME_ID').metadata_source.to_dict()

In [60]:
history = season.query('GAME_DATE < "2022-02-19"')
target_game = season.query('GAME_DATE == "2022-03-19"')
print(history.shape)
target_game

(59, 29)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,metadata_source
11,22021,1610612742,DAL,Dallas Mavericks,22101059,2022-03-19,DAL @ CHA,L,241,108,39,89,0.438,14,38,0.368,16,20,0.8,12,32,44,18,4,4,13,26,-21.0,2022-03-19 DAL @ CHA


In [61]:
bs_frames = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=target_game.GAME_ID.iloc[0]).get_data_frames()
len(bs_frames)

3

In [62]:
def prepare_bs(df_frame_0):
    game_id = df_frame_0.GAME_ID.iloc[0]
    result = (
        df_frame_0
        .query(f'TEAM_ID == {dallas_id}')
        .dropna(axis=0)
        .drop(columns=['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID', 'NICKNAME', 'START_POSITION', 'COMMENT'])
        .copy()
    )
    result['metadata_source'] = game_id_to_source[game_id]
    return result
prepare_bs(bs_frames[0])#.columns

Unnamed: 0,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,metadata_source
0,Dorian Finney-Smith,28.000000:00,3.0,10.0,0.3,2.0,6.0,0.333,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,8.0,-22.0,2022-03-19 DAL @ CHA
1,Maxi Kleber,25.000000:24,3.0,4.0,0.75,1.0,2.0,0.5,2.0,2.0,1.0,1.0,6.0,7.0,0.0,1.0,1.0,1.0,4.0,9.0,-6.0,2022-03-19 DAL @ CHA
2,Dwight Powell,21.000000:40,0.0,4.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,4.0,2.0,0.0,0.0,0.0,1.0,2.0,-12.0,2022-03-19 DAL @ CHA
3,Jalen Brunson,22.000000:42,3.0,8.0,0.375,0.0,2.0,0.0,4.0,4.0,1.0,0.0,2.0,2.0,2.0,2.0,0.0,1.0,1.0,10.0,-4.0,2022-03-19 DAL @ CHA
4,Luka Doncic,29.000000:52,13.0,20.0,0.65,8.0,12.0,0.667,3.0,5.0,0.6,0.0,4.0,4.0,3.0,0.0,0.0,4.0,2.0,37.0,-30.0,2022-03-19 DAL @ CHA
5,Trey Burke,22.000000:35,2.0,11.0,0.182,1.0,5.0,0.2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,3.0,4.0,5.0,-13.0,2022-03-19 DAL @ CHA
6,Marquese Chriss,16.000000:39,4.0,9.0,0.444,0.0,1.0,0.0,2.0,4.0,0.5,3.0,6.0,9.0,1.0,0.0,2.0,0.0,2.0,10.0,0.0,2022-03-19 DAL @ CHA
7,Davis Bertans,11.000000:48,1.0,4.0,0.25,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,6.0,2.0,-9.0,2022-03-19 DAL @ CHA
8,Josh Green,21.000000:08,4.0,6.0,0.667,1.0,2.0,0.5,0.0,0.0,0.0,2.0,1.0,3.0,2.0,0.0,0.0,3.0,0.0,9.0,-6.0,2022-03-19 DAL @ CHA
9,Sterling Brown,16.000000:51,1.0,4.0,0.25,0.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,4.0,1.0,0.0,0.0,0.0,3.0,4.0,5.0,2022-03-19 DAL @ CHA


In [63]:
def load_game(game_id):
    return prepare_bs(boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0])
prepared_history = pd.concat([load_game(_id) for _id in sorted(history.GAME_ID)], axis=0)
print(prepared_history.shape)
print(prepared_history.metadata_source.nunique())
prepared_history.head()

(673, 22)
59


Unnamed: 0,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,metadata_source
0,Dorian Finney-Smith,28.000000:47,2.0,12.0,0.167,1.0,6.0,0.167,0.0,0.0,0.0,4.0,4.0,8.0,2.0,2.0,0.0,1.0,4.0,5.0,-21.0,2021-10-21 DAL @ ATL
1,Kristaps Porzingis,28.000000:49,4.0,13.0,0.308,1.0,4.0,0.25,2.0,2.0,1.0,2.0,3.0,5.0,0.0,1.0,2.0,4.0,3.0,11.0,-23.0,2021-10-21 DAL @ ATL
2,Dwight Powell,20.000000:13,1.0,5.0,0.2,0.0,1.0,0.0,2.0,3.0,0.667,2.0,2.0,4.0,1.0,1.0,0.0,1.0,2.0,4.0,-21.0,2021-10-21 DAL @ ATL
3,Tim Hardaway Jr.,27.000000:56,4.0,9.0,0.444,2.0,6.0,0.333,4.0,4.0,1.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,3.0,14.0,-17.0,2021-10-21 DAL @ ATL
4,Luka Doncic,35.000000:24,6.0,17.0,0.353,2.0,7.0,0.286,4.0,4.0,1.0,0.0,11.0,11.0,7.0,1.0,0.0,5.0,1.0,18.0,-24.0,2021-10-21 DAL @ ATL


In [64]:
prepared_history.to_csv('dallas_2021_22_before_2022-03-19.csv')

In [65]:
prepared_target = load_game(target_game.GAME_ID.iloc[0])
prepared_target

Unnamed: 0,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,metadata_source
0,Dorian Finney-Smith,28.000000:00,3.0,10.0,0.3,2.0,6.0,0.333,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,8.0,-22.0,2022-03-19 DAL @ CHA
1,Maxi Kleber,25.000000:24,3.0,4.0,0.75,1.0,2.0,0.5,2.0,2.0,1.0,1.0,6.0,7.0,0.0,1.0,1.0,1.0,4.0,9.0,-6.0,2022-03-19 DAL @ CHA
2,Dwight Powell,21.000000:40,0.0,4.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,4.0,2.0,0.0,0.0,0.0,1.0,2.0,-12.0,2022-03-19 DAL @ CHA
3,Jalen Brunson,22.000000:42,3.0,8.0,0.375,0.0,2.0,0.0,4.0,4.0,1.0,0.0,2.0,2.0,2.0,2.0,0.0,1.0,1.0,10.0,-4.0,2022-03-19 DAL @ CHA
4,Luka Doncic,29.000000:52,13.0,20.0,0.65,8.0,12.0,0.667,3.0,5.0,0.6,0.0,4.0,4.0,3.0,0.0,0.0,4.0,2.0,37.0,-30.0,2022-03-19 DAL @ CHA
5,Trey Burke,22.000000:35,2.0,11.0,0.182,1.0,5.0,0.2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,3.0,4.0,5.0,-13.0,2022-03-19 DAL @ CHA
6,Marquese Chriss,16.000000:39,4.0,9.0,0.444,0.0,1.0,0.0,2.0,4.0,0.5,3.0,6.0,9.0,1.0,0.0,2.0,0.0,2.0,10.0,0.0,2022-03-19 DAL @ CHA
7,Davis Bertans,11.000000:48,1.0,4.0,0.25,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,6.0,2.0,-9.0,2022-03-19 DAL @ CHA
8,Josh Green,21.000000:08,4.0,6.0,0.667,1.0,2.0,0.5,0.0,0.0,0.0,2.0,1.0,3.0,2.0,0.0,0.0,3.0,0.0,9.0,-6.0,2022-03-19 DAL @ CHA
9,Sterling Brown,16.000000:51,1.0,4.0,0.25,0.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,4.0,1.0,0.0,0.0,0.0,3.0,4.0,5.0,2022-03-19 DAL @ CHA


In [71]:
prepared_target.to_csv('dallas_2021_22_on_2022-03-19.csv')

In [66]:
prepared_history.query('FG3M > 7')#.describe()

Unnamed: 0,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,metadata_source


# store in verctor db

In [67]:
from langchain.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path='./dallas_2021_22_before_2022-03-19.csv', source_column='metadata_source')
data = loader.load()

In [68]:
from langchain.storage import InMemoryStore, LocalFileStore
from langchain.embeddings import  OpenAIEmbeddings, CacheBackedEmbeddings

In [69]:
underlying_embeddings = OpenAIEmbeddings()

fs = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, fs, namespace=underlying_embeddings.model
)

In [81]:
db = Chroma.from_documents(data, embedding=cached_embedder)

In [79]:
query_data = CSVLoader(file_path='./dallas_2021_22_on_2022-03-19.csv', source_column='metadata_source').load()
query_1 = query_data[0]
# query_1.
query_embeddings = cached_embedder.embed_documents([_q.page_content for _q in query_data])
print(len(query_embeddings))

11


In [82]:
docs = db.similarity_search_by_vector(query_embeddings[0])
print(docs[0].page_content)

: 0
PLAYER_NAME: Dorian Finney-Smith
MIN: 31.000000:40
FGM: 3.0
FGA: 6.0
FG_PCT: 0.5
FG3M: 3.0
FG3A: 5.0
FG3_PCT: 0.6
FTM: 0.0
FTA: 0.0
FT_PCT: 0.0
OREB: 1.0
DREB: 6.0
REB: 7.0
AST: 1.0
STL: 0.0
BLK: 0.0
TO: 1.0
PF: 0.0
PTS: 9.0
PLUS_MINUS: -6.0
metadata_source: 2021-11-17 DAL @ PHX
