In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys

sys.path.append(os.path.abspath("../"))


import faiss
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 400)


load_dotenv()


from src.data_utils import PodcastContainer, load_clean_scores
from src.player_utils import PlayerUtil
from src.utils import get_repo_root

# Load the cleaned scores table for the 2022-23 and 2023-24 seasons and
# preview its first five rows.
scores = load_clean_scores(["2022-23", "2023-24"])
scores.head(5)

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,fieldGoalsAttempted,fieldGoalsPercentage,threePointersMade,threePointersAttempted,threePointersPercentage,freeThrowsMade,freeThrowsAttempted,freeThrowsPercentage,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed
360551,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,200782,pj tucker,33.016667,3,5,0.6,0,2,0.0,0,0,0.0,2,2,4,0,0,1,2,2,6,-6,12.0,17.15,False
360552,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201143,al horford,23.1,2,7,0.286,2,5,0.4,0,0,0.0,1,4,5,1,0,0,0,4,6,8,13.75,35.15,False
360553,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201933,blake griffin,8.283333,0,2,0.0,0,1,0.0,1,2,0.5,2,3,5,1,0,0,0,3,1,-5,8.75,0.5,True
360554,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,201935,james harden,37.266667,9,14,0.643,5,9,0.556,12,12,1.0,0,8,8,7,0,0,3,3,35,1,54.0,44.05,True
360555,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,202699,tobias harris,34.233333,7,14,0.5,3,6,0.5,1,2,0.5,1,1,2,0,3,0,0,3,18,-1,26.5,25.95,True


In [2]:
cont = PodcastContainer({"rotowire": PodcastContainer.ROTOWIRE_DIR})

# Date window for a date-based episode selection. It is NOT applied below:
# the previous date-filtered frame was overwritten by the file-name selection
# before it was ever read, so that dead filter (and the second
# get_all_episodes() call it required) has been removed. The names are kept
# defined in case other cells refer to them.
begin, end = pd.to_datetime("2023-10-27"), pd.to_datetime("2023-11-05")

# Fetch the episode table once and select the episode(s) whose file name
# contains "week_15".
df = cont.get_all_episodes()
podcast_df = df[df.file_name.str.contains("week_15")]

display(podcast_df)

Unnamed: 0,publication_date,file_name,file_path,content,duration,podcast_name
56,2024-01-26,fantasy_basketball_waiver_wire_for_week_15_202324,G:\My Drive\Columbia\Practical Deep Learning\F...,Welcome everybody to the award winning rotor w...,2839,rotowire


## Load Annotated Data

In [54]:
import json
from pathlib import Path


# Load the JSON file
def load_label_studio_data(filepath: Path) -> list[dict]:
    """Read an annotation export and flatten it into text/entity records.

    The file is expected to hold a JSON list of items. For every item the text
    is taken from ``item["data"]["text"]``; the entities come from the
    ``"result"`` list of the item's first annotation only, keeping results
    whose ``"type"`` is ``"labels"``.

    Args:
        filepath: Path of the JSON file to read (opened as UTF-8).

    Returns:
        One dict per item, in file order, shaped as
        ``{"text": str, "entities": [(start, end, label), ...]}``. ``label`` is
        the first entry of the result's ``"labels"`` list. Items with no
        annotations get an empty entity list.

    Raises:
        KeyError: If an item lacks ``data.text`` or a kept result lacks one of
            the fields read from it.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        tasks = json.load(handle)

    def _labelled_spans(task):
        # Missing, empty or null "annotations" all mean "no entities".
        task_annotations = task.get("annotations", [])
        if not task_annotations:
            return []
        # Only the first annotation is consulted; non-"labels" results are skipped.
        return [
            (res["value"]["start"], res["value"]["end"], res["value"]["labels"][0])
            for res in task_annotations[0]["result"]
            if res["type"] == "labels"
        ]

    records = []
    for task in tasks:
        # Read the text before touching annotations, matching the lookup order.
        task_text = task["data"]["text"]
        records.append({"text": task_text, "entities": _labelled_spans(task)})

    return records


root = get_repo_root()
annotated_file = (
    root
    / "data/processed/annotated/fantasy_basketball_waiver_wire_for_week_2_202324_annotated.json"
)

ner_data = load_label_studio_data(annotated_file)

# Annotated labels that must be rewritten to another spelling before they are
# compared with model output. Built once, outside the loop, since it is constant.
replace_dct = {
    'xavier tillman sr': 'xavier tillman'
}

# One (chunk text, expected player names) pair per annotated chunk.
chunk_with_labels = []

for chunk_label in ner_data:
    # Apply the corrections first and de-duplicate afterwards, so two labels
    # that map to the same corrected name are not listed twice. Sorting makes
    # the label order reproducible (plain set iteration order is not).
    corrected_players = sorted(
        {replace_dct.get(label, label) for _, _, label in chunk_label["entities"]}
    )
    chunk_with_labels.append((chunk_label["text"], corrected_players))

## Run NER Model

In [57]:
%%time
from src.llm_feature_extractor import PlayerNER
from tqdm import tqdm

ner = PlayerNER()

# Run player extraction on every annotated chunk. Results stay in the same
# order as chunk_with_labels so the two lists can be zipped for evaluation.
# tqdm already reports progress, so the manual counter (never read) is gone.
ner_players = [
    ner.extract_all_players(chunk) for chunk, _ in tqdm(chunk_with_labels)
]

100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [01:05<00:00,  1.61it/s]

CPU times: total: 2 s
Wall time: 1min 6s





In [45]:
import jellyfish

# Compare the Metaphone codes of two spellings of the same surname.
p = ('wembanyama', 'wenbin yama')

tuple(jellyfish.metaphone(spelling) for spelling in p)

('WMBNYM', 'WNBN YM')

In [30]:
# Fetch the full player table and show the rows whose personName contains
# "saddiq".
pu = PlayerUtil()
p = pu.get_all_players()

p[p.personName.str.contains('saddiq')]

Unnamed: 0,personId,personName,teamName
1385,1630180,saddiq bey,hawks


## Compare NER with Annotated

In [58]:
# Evaluate the NER output AFTER name correction against the annotated labels.
chunk_count = 0  # chunks in which every expected player was found
total_entities = 0  # distinct expected players, summed over chunks
matched_entities = 0  # expected players that also appear in the prediction

ner = PlayerNER()
# Pass each chunk's raw predictions through PlayerNER.correct_players first.
corrected_ner_players = [ner.correct_players(n) for n in ner_players]

# strict=True raises if the two lists have different lengths, which would mean
# the predictions are stale relative to chunk_with_labels.
for (_, expected), actual in zip(chunk_with_labels, corrected_ner_players, strict=True):
    expected_set = set(expected)
    actual_set = set(actual)

    total_entities += len(expected_set)
    matched_entities += len(expected_set & actual_set)

    # A chunk counts as correct when all expected players were predicted;
    # additional predicted names are not penalized.
    is_correct = actual_set.issuperset(expected_set)
    if is_correct:
        chunk_count += 1
    elif len(actual) != len(expected):
        # This view only lists misses where the number of names differs.
        print(f"Expected: {expected} - Actual: {actual} - Correct: {is_correct}")

# Percentages are truncated to whole numbers. An empty denominator is reported
# as 0% instead of raising ZeroDivisionError.
n_chunks = len(chunk_with_labels)
chunk_pct = int(chunk_count * 100.0 / n_chunks) if n_chunks else 0
entity_pct = int(matched_entities * 100.0 / total_entities) if total_entities else 0

# Output results
print(f"\nNumber of correct chunks: {chunk_count} / {n_chunks} = {chunk_pct}%")
print(f"Total entities matched: {matched_entities} / {total_entities} = {entity_pct}%")

Expected: ['norman powell', 'joel embiid', 'james harden'] - Actual: ['james harden', 'joel embiid', 'terance mann', 'norm powell', "na'shon hylands"] - Correct: False
Expected: ["nah'shon hyland", 'bojan bogdanovic', 'terance mann', 'norman powell'] - Actual: ["na'shon hyland", 'terance mann', 'norm powell'] - Correct: False
Expected: ['lebron james', 'bojan bogdanovic', 'kevin durant', 'bradley beal', 'devin booker'] - Actual: ['anthony davis', 'lebron james', 'kevin durant'] - Correct: False
Expected: ['bradley beal', 'devin booker', 'jusuf nurkic', 'kevin durant'] - Actual: ['kevin durant', 'yusuf nurkic'] - Correct: False
Expected: ['santi aldama', 'xavier tillman', 'david roddy', 'steven adams'] - Actual: ['steven adams', 'xavier tillman', 'david roddy'] - Correct: False
Expected: ['ja morant', 'santi aldama', 'xavier tillman', 'jaren jackson jr'] - Actual: ['jaren jackson jr', 'ja morant'] - Correct: False
Expected: ['alex caruso', 'ayo dosunmu', 'coby white'] - Actual: ['coby w

In [37]:
ner.correct_players(['xavier tillman'])

['xavier tillman']

In [39]:
p = pu.get_all_players()
p[p.personName.str.contains('tillman')]

Unnamed: 0,personId,personName,teamName
1247,1629460,justin tillman,hawks
1417,1630214,xavier tillman,celtics


In [59]:
# Evaluate the RAW NER output (no name correction) against the annotated labels.
chunk_count = 0  # chunks in which every expected player was found
total_entities = 0  # distinct expected players, summed over chunks
matched_entities = 0  # expected players that also appear in the prediction

# strict=True raises if the two lists have different lengths, which would mean
# the predictions are stale relative to chunk_with_labels.
for (_, expected), actual in zip(chunk_with_labels, ner_players, strict=True):
    expected_set = set(expected)
    actual_set = set(actual)

    total_entities += len(expected_set)
    matched_entities += len(expected_set & actual_set)

    # A chunk counts as correct when all expected players were predicted;
    # additional predicted names are not penalized.
    is_correct = actual_set.issuperset(expected_set)
    if is_correct:
        chunk_count += 1
    elif len(actual) == len(expected):
        # This view only lists misses where the number of names is the same,
        # i.e. the same count was predicted but at least one name differs.
        print(f"Expected: {expected} - Actual: {actual} - Correct: {is_correct}")

# Percentages are truncated to whole numbers. An empty denominator is reported
# as 0% instead of raising ZeroDivisionError.
n_chunks = len(chunk_with_labels)
chunk_pct = int(chunk_count * 100.0 / n_chunks) if n_chunks else 0
entity_pct = int(matched_entities * 100.0 / total_entities) if total_entities else 0

# Output results
print(f"\nNumber of correct chunks: {chunk_count} / {n_chunks} = {chunk_pct}%")
print(f"Total entities matched: {matched_entities} / {total_entities} = {entity_pct}%")

Expected: ['james harden', 'damian lillard', 'kelly oubre jr', 'tyrese maxey'] - Actual: ['damian lillard', 'tyrese maxey', 'james harden', 'kelly oubre'] - Correct: False
Expected: ['kelly oubre jr', 'james harden'] - Actual: ['james harden', 'kelly oubre'] - Correct: False
Expected: ['brandon miller', 'anthony davis', 'myles turner'] - Actual: ['miles turner', 'brandon miller', 'anthony davis'] - Correct: False
Expected: ['myles turner'] - Actual: ['miles turner'] - Correct: False
Expected: ['ayo dosunmu', 'coby white'] - Actual: ['kobe white', 'ayo dosunmu'] - Correct: False
Expected: ['coby white'] - Actual: ['kobe white'] - Correct: False
Expected: ['saddiq bey', "de'andre hunter", 'jalen johnson'] - Actual: ['sadik bey', 'deandre hunter', 'jalen johnson'] - Correct: False
Expected: ['saddiq bey', 'aj griffin'] - Actual: ['sadik bey', 'aj griffin'] - Correct: False
Expected: ['cameron johnson', 'ben simmons', 'cam thomas'] - Actual: ['cam johnson', 'cam thomas', 'ben simmons'] - C

In [2]:
import jellyfish

# Pairs of spellings to compare phonetically.
test_pairs = [
    ("sadik bey", "saddiq bey"),
    ("kobe white", "kobe"),
    ("kobe white", "cobe white"),
    ("miles turner", "myles turner"),
]

# For each pair print the two spellings, then their Soundex codes, then their
# Metaphone codes (one line each).
for left, right in test_pairs:
    print(left, right)
    for encode in (jellyfish.soundex, jellyfish.metaphone):
        print(encode(left), encode(right))

sadik bey saddiq bey
S321 S321
STK B STK B
kobe white kobe
K130 K100
KB HT KB
kobe white cobe white
K130 C130
KB HT KB HT
miles turner myles turner
M423 M423
MLS TRNR MLS TRNR


In [None]:
# Following are pairs of common misspellings and the correct legal name of the player:
# dereck lively -> dereck lively ii
# kelly oubre -> kelly oubre jr
# xavier tillman -> xavier tillman sr
# kobe white -> coby white
# jaylen johnson -> jalen johnson
# herb jones -> herbert jones
# kris porzingis -> kristaps porzingis
# deandre hunter -> de'andre hunter

# Following are the legal names of select players:
# saddiq bey, victor wembanyama, monte morris, trey murphy iii

In [60]:
# Tally raw NER predictions against the annotated labels and list every chunk
# in which at least one expected player is missing.
chunk_count = 0
total_entities = 0
matched_entities = 0

for (_, expected), actual in zip(chunk_with_labels, ner_players, strict=True):
    expected_set, actual_set = set(expected), set(actual)

    # Denominator: distinct expected players; numerator: those also predicted.
    total_entities += len(expected_set)
    matched_entities += len(expected_set.intersection(actual_set))

    # Correct means every expected player is among the predictions.
    is_correct = expected_set.issubset(actual_set)
    if not is_correct:
        print(f"Expected: {expected} - Actual: {actual} - Correct: {is_correct}")
        continue
    chunk_count += 1

# Summary counts
print(f"\nNumber of correct chunks: {chunk_count} / {len(chunk_with_labels)}")
print(f"Total entities matched: {matched_entities} / {total_entities}")

Expected: ['damian lillard', 'kelly oubre jr', 'tyrese maxey', 'james harden'] - Actual: ['damian lillard', 'tyrese maxey', 'james harden', 'kelly oubre'] - Correct: False
Expected: ['kelly oubre jr', 'james harden'] - Actual: ['james harden', 'kelly oubre'] - Correct: False
Expected: ['norman powell', 'james harden', 'joel embiid'] - Actual: ['james harden', 'joel embiid'] - Correct: False
Expected: ['norman powell', 'bojan bogdanovic', "nah'shon hyland", 'terance mann'] - Actual: ['terrence mann', 'norman powell', 'bones hyland', 'bojan bogdanovic'] - Correct: False
Expected: ['lebron james', 'bradley beal', 'kevin durant', 'bojan bogdanovic', 'devin booker'] - Actual: ['bogdan bogdanovic', 'anthony davis', 'lebron james', 'kevin durant'] - Correct: False
Expected: ['kevin durant', 'jusuf nurkic', 'bradley beal', 'devin booker'] - Actual: ['kevin durant', 'yusuf nurkic'] - Correct: False
Expected: ['brandon miller', 'myles turner', 'anthony davis'] - Actual: ['miles turner', 'brandon

In [55]:
len(ner_players)

10

In [46]:
ner_players

[['damian lillard'],
 ['damian lillard', 'tyrese maxey', 'james harden', 'kelly oubre jr'],
 ['james harden', 'kelly oubre'],
 ['james harden'],
 ['james harden'],
 ['james harden', 'joel embiid'],
 ['joel embiid', 'james harden'],
 ['james harden', 'joel embiid'],
 ['terrence mann', 'norman powell', 'bones hyland', 'bojan bogdanovic'],
 ['bogdan bogdanovic', 'anthony davis', 'lebron james', 'kevin durant']]

In [41]:
chunk_with_labels[9]

("comes to mind. But Hopefully, Bogdanovic is available. Well, let's get on. Let's get back to real basketball. The other game last night, for players playing, Lakers beat Phoenix in the LA home debut. AD bounced back with a nice 30.12 rebound, 3 block, 3 steal performance. LeBron, who was allegedly on a minutes restriction, played 35 minutes, 21 points, 8 boards, 9 dimes, 2 steals, 2 blocks. Of course, this is against the Suns team with no Booker, no Beal. KD had a very nice 39 and 11 game. I'll",
 ['kevin durant', 'lebron james'])

In [66]:
scores[scores.personName.str.contains("cam thomas")]

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,fieldGoalsAttempted,fieldGoalsPercentage,threePointersMade,threePointersAttempted,threePointersPercentage,freeThrowsMade,freeThrowsAttempted,freeThrowsPercentage,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed
360715,2022-23,2022-10-19,22200006,BKN vs. NOP,nets,nets,1630560,cam thomas,12.716667,1,4,0.25,0,1,0.0,0,0,0.0,0,0,0,1,0,0,0,0,2,3,3.5,8.4,False
361140,2022-23,2022-10-21,22200021,BKN vs. TOR,nets,nets,1630560,cam thomas,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,5.3,False
361915,2022-23,2022-10-24,22200049,BKN @ MEM,nets,nets,1630560,cam thomas,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,1.8,False
362203,2022-23,2022-10-26,22200060,BKN @ MIL,nets,nets,1630560,cam thomas,0.983333,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,1,0.0,1.8,False
362361,2022-23,2022-10-27,22200066,BKN vs. DAL,nets,nets,1630560,cam thomas,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,1.8,False
362811,2022-23,2022-10-29,22200083,BKN vs. IND,nets,nets,1630560,cam thomas,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.7,False
363209,2022-23,2022-10-31,22200098,BKN vs. IND,nets,nets,1630560,cam thomas,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,False
363340,2022-23,2022-11-01,22200103,BKN vs. CHI,nets,nets,1630560,cam thomas,0.816667,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,2,0.0,0.0,False
363851,2022-23,2022-11-04,22200123,BKN @ WAS,nets,nets,1630560,cam thomas,31.833333,6,13,0.462,2,4,0.5,3,3,1.0,0,4,4,6,1,0,0,2,17,36,33.0,0.0,True
364102,2022-23,2022-11-05,22200133,BKN @ CHA,nets,nets,1630560,cam thomas,28.783333,5,11,0.455,1,2,0.5,10,11,0.909,0,4,4,4,1,1,1,2,21,13,35.5,6.6,True


In [None]:
# Unfinished exploration cell: the original content was the incomplete
# expression `scores.`, a syntax error that stops a full notebook run.
# Listing the columns is a stand-in for the intended attribute access.
scores.columns

In [18]:
expected_player

['damian lillard']

In [11]:
players

['damian lillard']

In [199]:
from src.llm_feature_extractor import FaissFeatureExtractor

# Take the file name, content and publication date from the first row of
# podcast_df and build the feature extractor from them.
title = podcast_df.file_name.iloc[0]
text = podcast_df.content.iloc[0]
podcast_date = podcast_df.publication_date.iloc[0]

fe = FaissFeatureExtractor(title, text, podcast_date)

Loading from disk
Loading from disk


In [200]:
%%time

# NOTE(review): `players` is assigned here but is not passed to
# extract_llm_feats() below, so it does not appear to influence `feats` —
# confirm whether it is still needed. It also rebinds the `players` name that
# other cells in this notebook assign and inspect.
players = [
    "giannis antetokounmpo",
    "joel embiid",
    "jayson tatum",
    "damian lillard",
    "tyrese haliburton",
    "lebron james",
    "nikola jokic",
    "kevin durant",
    "luka doncic",
]


# Run feature extraction on the extractor built from the selected episode.
feats = fe.extract_llm_feats()

CPU times: total: 93.8 ms
Wall time: 12.9 s


In [201]:
feats

Unnamed: 0,personName,increased_playing_time,podcast_date
0,damian lillard,0.0,2024-01-26
1,mitchell robinson,-1.0,2024-01-26
2,isaiah hartenstein,0.0,2024-01-26
3,og anunoby,0.5,2024-01-26
4,kristaps porzingis,-0.5,2024-01-26
5,bam adebayo,0.0,2024-01-26
6,terry rozier,0.5,2024-01-26
7,kyle lowry,-0.5,2024-01-26
8,tyler herro,0.0,2024-01-26
9,jimmy butler,0.0,2024-01-26


In [197]:
# Retrieve the chunk indices for this player/topic pair and print each matching
# chunk followed by a blank line.
dist, indices = fe.index.retrieve_relevant_indices(
    "mitchell robinson", "Increase or decrease in minutes and playing time"
)
for p in [fe.chunks[i] for i in indices]:
    print(p, end="\n\n")

out the Nuggets last night. 122 to 84 at MSG. UJ Anunoby with 26 points and 6 steals, all Knicks starters, plus or better in point differential. Knicks won this despite still being without Isaiah Hartenstein again. Achilles issued. So he's missed 2 games, and, of course, Robinson's out for the year. The Nuggets did have a day of rest, but it was the end of a 5 game road trip. And Denver did go 3 and 2. Alex and Shannon, any reason for concern for the Nuggets? Yes and no. I mean, they still



In [198]:
# Retrieve the chunk indices for this player/topic pair and print each matching
# chunk followed by a blank line.
dist, indices = fe.index.retrieve_relevant_indices(
    "damian lillard", "Increase or decrease in minutes and playing time"
)
for p in [fe.chunks[i] for i in indices]:
    print(p, end="\n\n")

ore deserving than Dame Damian Lillard, but solid collection. Probably, you know, arguably, the 10 best players in the league or 10 of the 12 best players in the league are in the starting lineup. So it's hard to really nitpick any of those options. Yeah. I mean, I'm a Bucks fan. I don't I don't hate the idea of either Mitchell or of, Brunson being there. I mean, again, Lillard's been good. Just he's Damian Lillard. He's only gonna be so disappointing to an extent. Ken, you still hate this all

he's Damian Lillard. He's only gonna be so disappointing to an extent. Ken, you still hate this all star game? I remember we used to we you and I did, like, a social media breakout one time where I'm like, I love all star weekend, slam dunk contest. Sign me up. You were like, no. This is a joke. It's a joke. No defense. I don't want anybody getting hurt. It just I don't even like to watch it. It's a good time to do chores around the house. It it definitely has become less and less watchable. I



In [182]:
len(fe.chunks), len(indices)

(100, 72)

In [184]:
len(text), len(" ".join(combined))

(40948, 34353)

In [173]:
" ".join(d[1])

"t. Like, I'm on my phone, you know, and, Accumming, I don't care. Just get something done with this garbage is on the air. Yeah. Yeah. Okay. Well, the I'm I mean, I'm still looking forward to all star weekend, but, mostly for the break. For the lack of injury reporting? Exactly. And, hopefully, we can get some good participants in the, slam dunk contest. I don't even know if that's been out. Have any none of those have been announced, have they? Like, 3 points slam dunk. Yeah. We'll see. Okay. Let's go get some real basketball, Ken. Yeah. Let's go to New York where Spike Lee's Knicks blew out the Nuggets last night. 122 to 84 at MSG. UJ Anunoby with 26 points and 6 steals, all Knicks starters, plus or better in point differential. Knicks won this despite still being without Isaiah Hartenstein again. Achilles issued. So he's missed 2 games, and, of course, Robinson's out for the year. The Nuggets did have a day of rest, but it was the end of a 5 game road trip. And Denver did go 3 and 

In [172]:
len(text)

40948

In [163]:
len(text)

40948

In [138]:
# Keep only the first element of every pair in d.
p = [first for first, _ in d]

['giannis antetokounmpo',
 'joel embiid',
 'jayson tatum',
 'damian lillard',
 'tyrese haliburton',
 'lebron james',
 'nikola jokic',
 'kevin durant',
 'luka doncic',
 'shai gilgeous-alexander',
 'donovan mitchell',
 'jalen brunson',
 'og anunoby',
 'isaiah hartenstein',
 'mitchell robinson',
 'terry rozier',
 'kristaps porzingis',
 'bam adebayo',
 'kyle lowry',
 'tyler herro',
 'jimmy butler',
 'josh richardson',
 'caleb martin',
 'duncan robinson',
 'hawkeye',
 'highsmith',
 'hawkeyes',
 'harrison barnes',
 'andrew wiggins',
 'kevin huerter',
 'keegan murray',
 'jabari walker',
 'james clutch',
 'grayson allen',
 'gary terry',
 'bilal koulibaly',
 'tyus jones',
 'jordan poole',
 'deni avdija',
 'kyle kuzma',
 'nick richards',
 'mark williams',
 'jonathan kuminga',
 'draymond green',
 'stephen curry',
 'kirk',
 'moses moody',
 'jon conchar',
 'luke kennard',
 'kembe mutombo',
 'vince williams',
 'brandon miller',
 'lamelo ball',
 'miles bridges',
 'gordon hayward',
 'sam merrill',
 'd

In [149]:
# Names in p that have no exact match among the personName values in scores.
set(p).difference(scores.personName.unique())

{'bilal koulibaly',
 'cam johnson',
 'chris middleton',
 'dennis schroeder',
 'emmanuel quickley',
 'gary terry',
 'hawkeye',
 'hawkeyes',
 'highsmith',
 'jacob pirtle',
 'james clutch',
 'jared allen',
 'jon conchar',
 'jontay murray',
 'kembe mutombo',
 'kirk',
 'luca fontecchio',
 'luke cornett',
 'terrence mann',
 'tim hardaway',
 'vince williams'}

In [146]:
len(p)

96

In [145]:
scores[scores.personName.str.contains("tim harda")]

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,fieldGoalsAttempted,fieldGoalsPercentage,threePointersMade,threePointersAttempted,threePointersPercentage,freeThrowsMade,freeThrowsAttempted,freeThrowsPercentage,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed
360897,2022-23,2022-10-19,22200013,DAL @ PHX,mavericks,mavericks,203501,tim hardaway jr,25.016667,3,10,0.3,1,5,0.2,2,2,1.0,0,2,2,1,1,0,2,1,9,-2,14.0,16.7,False
361482,2022-23,2022-10-22,22200034,DAL vs. MEM,mavericks,mavericks,203501,tim hardaway jr,25.883333,3,10,0.3,2,6,0.333,8,8,1.0,0,2,2,2,0,0,0,1,16,23,21.5,15.05,True
362343,2022-23,2022-10-27,22200066,DAL @ BKN,mavericks,mavericks,203501,tim hardaway jr,30.283333,6,13,0.462,4,8,0.5,2,2,1.0,0,2,2,1,0,2,1,1,18,-1,25.5,15.65,True
362870,2022-23,2022-10-29,22200086,DAL vs. OKC,mavericks,mavericks,203501,tim hardaway jr,22.183333,2,7,0.286,1,3,0.333,3,4,0.75,0,2,2,1,1,0,2,3,8,3,13.0,17.0,False
363054,2022-23,2022-10-30,22200093,DAL vs. ORL,mavericks,mavericks,203501,tim hardaway jr,27.116667,5,15,0.333,3,9,0.333,8,8,1.0,0,1,1,3,1,0,0,2,21,9,28.75,17.1,True
363632,2022-23,2022-11-02,22200115,DAL vs. UTA,mavericks,mavericks,203501,tim hardaway jr,20.766667,1,9,0.111,1,6,0.167,0,0,0.0,0,1,1,2,0,0,1,2,3,-2,6.75,20.55,False
363933,2022-23,2022-11-04,22200127,DAL vs. TOR,mavericks,mavericks,203501,tim hardaway jr,19.1,2,8,0.25,2,6,0.333,0,0,0.0,0,2,2,3,1,0,2,2,6,-6,14.0,19.1,False
364638,2022-23,2022-11-07,22200155,DAL vs. BKN,mavericks,mavericks,203501,tim hardaway jr,13.8,4,8,0.5,2,4,0.5,1,2,0.5,0,1,1,1,0,0,0,0,11,6,13.75,17.6,False
364792,2022-23,2022-11-09,22200161,DAL @ ORL,mavericks,mavericks,203501,tim hardaway jr,22.183333,2,10,0.2,1,7,0.143,0,0,0.0,0,2,2,2,2,0,0,0,5,-3,14.5,15.25,False
365076,2022-23,2022-11-10,22200172,DAL @ WAS,mavericks,mavericks,203501,tim hardaway jr,28.083333,5,12,0.417,5,11,0.455,1,2,0.5,1,1,2,1,0,0,0,0,16,-7,20.0,15.55,True


In [110]:
player_mapping = fe.index.player_chunk_mapping
text_chunks = fe.index.text_chunks

In [104]:
text_chunks[31]

"surprised if the if the injury drags on maybe a little bit longer than than was hoped just it's a pretty serious injury. So the current current stance is he was cleared for basketball activity around January 19th, ruled out of the 4 game road trip that Cleveland was on at the time. Yeah. They're gonna return to home January 29th against the Clippers. So we're looking at Monday night. He could return as soon as next week. But did they also I they don't even need to rush him back. They don't lose anymore. So, like, he's been playing great. Jared Allen has been awesome lately, down low without Mobley. Anyway, we digress. Whose turn is it for a pick here? It's my we digress. Whose turn is it for a pick here? I think it's my turn. I think it is. You got a second long shot, don't you, Alex? I do. Sorry. I was looking up Brandon Miller stats. I was like, how is he? Brandon Miller hasn't missed a shot at the rim in the past 4 games, by the way. 12 of 12, and getting to the foul line. I owe"

In [84]:
relevant_chunks = [text_chunks[idx] for idx in player_mapping["dennis schroder"]]
relevant_chunks

["RJ Barrett, Dennis Schroeder as value plays the same at the same time. RJ Barrett is probably going to be checked by Paul George or Kawhi Leonard for a decent chunk of the game or Terrence man. So I would be a little bit worried about that. I would almost just prefer Dennis Schroder at 59100. But to me at this again, I don't feel like 5,900 is a is like a steal for Dennis Schroder. You know, I don't feel like I'm slotting him my my lineup and getting almost guaranteed value. Now even before they acquired Quickley, Schroder was oftentimes in that $6,000 range. So I agree. He is a a better value, on DraftKings than he is FanDuel for tonight's slate. 5,200 for Schroeder on DraftKings. I like him considerably more at 5,200 than 59 on FanDuel. But I think if he's if he's announced as a starter, he is in play. Another guard from a different game that I like quite a bit right in that same range. 54100 from Markelle Fultz on FanDuel. Yeah. He is back in the starting lineup, played 29 minutes

In [111]:
rc = fe.index.retrieve_relevant_chunks("andrew wiggins", "minutes playing time")
rc

TypeError: combine_chunks_with_overlap() missing 1 required positional argument: 'overlap_length'

In [108]:
rc

["gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,",
 

In [105]:
relevant_chunks = [text_chunks[idx] for idx in player_mapping["andrew wiggins"]]
relevant_chunks
# d = combine_chunks_with_overlap(relevant_chunks, overlap_length = 200)

["gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,",
 

In [101]:
for s in d:
    print(s)
    print()

gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,

and 

In [82]:
for c in relevant_chunks:
    print(c)
    print()

gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,

and 

In [62]:
r = ner.extract_all_players(chunks[9])
r

['Hawkeyes', 'Harrison Barnes', 'Andrew Wiggins']

In [58]:
r.players

['Harrison Barnes', 'Andrew Wiggins']

In [63]:
len(chunks)

51

In [52]:
chunks[9]

"gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,"

In [46]:
chunks[9]

"gonna take some time for the chemistry to build. I'll say this for the heat. You know, they're on 5 game losing streak, and Hawkeyes has been out for 6 games. So he's missed all 5 of those losses. I think he's a critical part of this heat rotation. They should move Hawkeyes in the starting lineup in my opinion, or at least make sure he's getting plus 27 minutes a game. I think there's a good chance he starts over high. Smith wants healthy. Yep. Alright. Let's go to Golden State. Last night, the Kings beat the warriors in a thriller. Kings, Kings 1, 134, 133. Harrison Barnes, 39 points. His second straight, 30 plus point scoring night. And fantasy wise, Wiggins got to start again for the warriors. He went 174 and 4 and 31 minutes. So Wiggins slowly earning legit minutes for the warriors. Shannon's, I think you pointed out last week's show, but back to the Kings after a 4 game slide, they won 2 in a row. Is the arrow pointing up for Sacramento, gentlemen? You know, sort of, I mean,"

In [34]:
# NOTE(review): this call passes a single free-text query, while another cell
# in this notebook calls retrieve_relevant_chunks with two arguments (a player
# name and a topic) — presumably the signature changed between runs; confirm
# which form is current before re-running.
rel_chunks = fe.index.retrieve_relevant_chunks(
    "Duncan Robinson from the Knicks minutes playtime"
)
rel_chunks

[(0.3326186,
  "Have any none of those have been announced, have they? Like, 3 points slam dunk. Yeah. We'll see. Okay. Let's go get some real basketball, Ken. Yeah. Let's go to New York where Spike Lee's Knicks blew out the Nuggets last night. 122 to 84 at MSG. UJ Anunoby with 26 points and 6 steals, all Knicks starters, plus or better in point differential. Knicks won this despite still being without Isaiah Hartenstein again. Achilles issued. So he's missed 2 games, and, of course, Robinson's out for the year. The Nuggets did have a day of rest, but it was the end of a 5 game road trip. And Denver did go 3 and 2. Alex and Shannon, any reason for concern for the Nuggets? Yes and no. I mean, they still can't win road games. They're 14 and 11 on the road. We always know their better at home. There's a championship hangover thing that makes me not as worried where I think, you know, they they know they need to coast this year compared to last year a little bit more. At the same time, to 

In [23]:
dist

NameError: name 'dist' is not defined

In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Take the file name and content from the first row of podcast_df.
title = podcast_df.file_name.iloc[0]
text = podcast_df.content.iloc[0]

# Split the content into overlapping chunks. chunk_size=1000 and
# chunk_overlap=200 are presumably character counts (the splitter's default
# length function) — TODO confirm against the installed langchain version.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(text)

In [14]:
chunks[0]

"Welcome everybody to the award winning rotor wire fantasy basketball podcast brought to you by Underdog Fantasy and FanDuel. It is Friday, January 26, Alex Barutha here with Shannon McEwen and Ken K train crates. All aboard. We had the All Star starters announced last night in the East, yanis, embiid Tatum, Lillard and Halliburton in the West LeBron. That's James. Jokic KD, Luca and SGA Shannon as is tradition. Is there anybody that you think was either snubbed or maybe you just would have preferred to see as an all star or one person that you think maybe doesn't deserve to be there. Yeah. I I think in general, all 10 all 10 starters, definitely deserve, the the the nomination. I the only one I really have any qualms about is Lillard. He's having a fine year for Milwaukee in his 1st year as a buck, but there are 2 east cars. Yeah. I I actually, Donovan Mitchell would be my number one choice to to replace Lillard and then Jalen Brunson. So so one of those two guards, I think, would be"

In [15]:
chunks[1]

"as a buck, but there are 2 east cars. Yeah. I I actually, Donovan Mitchell would be my number one choice to to replace Lillard and then Jalen Brunson. So so one of those two guards, I think, would be more deserving than Dame Damian Lillard, but solid collection. Probably, you know, arguably, the 10 best players in the league or 10 of the 12 best players in the league are in the starting lineup. So it's hard to really nitpick any of those options. Yeah. I mean, I'm a Bucks fan. I don't I don't hate the idea of either Mitchell or of, Brunson being there. I mean, again, Lillard's been good. Just he's Damian Lillard. He's only gonna be so disappointing to an extent. Ken, you still hate this all star game? I remember we used to we you and I did, like, a social media breakout one time where I'm like, I love all star weekend, slam dunk contest. Sign me up. You were like, no. This is a joke. It's a joke. No defense. I don't want anybody getting hurt. It just I don't even like to watch it."