In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys

sys.path.append(os.path.abspath("../"))


import faiss
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Raise pandas' display limits so wide frames (such as the score table
# previewed at the end of this cell) are not truncated when rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 400)


# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this has to run before the `src` imports and the
# LLM calls so that API keys are available -- confirm.
load_dotenv()


# Project-local modules; these resolve through the "../" entry appended to
# sys.path earlier in this cell.
from src.data_utils import PodcastContainer, load_clean_scores
from src.player_utils import PlayerUtil
from src.utils import get_repo_root

# Load the cleaned score table for the two listed seasons and preview the
# first five rows.
scores = load_clean_scores(["2022-23", "2023-24"])
scores.head(5)

Unnamed: 0,season_year,game_date,gameId,matchup,teamName,teamSlug,personId,personName,minutes,fieldGoalsMade,fieldGoalsAttempted,fieldGoalsPercentage,threePointersMade,threePointersAttempted,threePointersPercentage,freeThrowsMade,freeThrowsAttempted,freeThrowsPercentage,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints,fantasyPoints,projectedFantasyPoints,outperformed,outperform_next,outperform_next_5,outperform_next_10,injured_next,injured,fantasyDiff
359494,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,200782,pj tucker,33.016667,3,5,0.6,0,2,0.0,0,0,0.0,2,2,4,0,0,1,2,2,6,-6,12.0,17.15,0,1,False,False,0.0,0,-5.15
359495,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201143,al horford,23.1,2,7,0.286,2,5,0.4,0,0,0.0,1,4,5,1,0,0,0,4,6,8,13.75,35.15,0,0,False,False,0.0,0,-21.4
359496,2022-23,2022-10-18,22200001,BOS vs. PHI,celtics,celtics,201933,blake griffin,8.283333,0,2,0.0,0,1,0.0,1,2,0.5,2,3,5,1,0,0,0,3,1,-5,8.75,0.5,1,0,False,False,1.0,0,8.25
359497,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,201935,james harden,37.266667,9,14,0.643,5,9,0.556,12,12,1.0,0,8,8,7,0,0,3,3,35,1,54.0,44.05,1,1,True,True,0.0,0,9.95
359498,2022-23,2022-10-18,22200001,PHI @ BOS,76ers,sixers,202699,tobias harris,34.233333,7,14,0.5,3,6,0.5,1,2,0.5,1,1,2,0,3,0,0,3,18,-1,26.5,25.95,1,1,False,False,0.0,0,0.55


In [2]:
cont = PodcastContainer({"rotowire": PodcastContainer.ROTOWIRE_DIR})

# Fetch the episode table once; every selection below is derived from it.
df = cont.get_all_episodes()

# Selection 1: episodes published in the half-open window [begin, end).
# This used to be stored in `podcast_df` and was then silently overwritten by
# the file-name filter below; it now has its own name so neither selection
# clobbers the other.
begin, end = pd.to_datetime("2023-10-27"), pd.to_datetime("2023-11-05")
podcast_df_in_window = df[(df.publication_date >= begin) & (df.publication_date < end)]

# Selection 2: the episode(s) whose file name mentions week 15. This is the
# frame the rest of the notebook sees as `podcast_df`.
podcast_df = df[df.file_name.str.contains("week_15")]

display(podcast_df)

Unnamed: 0,publication_date,file_name,file_path,content,duration,podcast_name
56,2024-01-26,fantasy_basketball_waiver_wire_for_week_15_202324,G:\My Drive\Columbia\Practical Deep Learning\F...,Welcome everybody to the award winning rotor w...,2839,rotowire


## Load Annotated Data

In [3]:
import json
from pathlib import Path


# Load the JSON file
def load_label_studio_data(filepath: Path) -> list[dict]:
    """Read a Label Studio JSON export and flatten it into NER records.

    Args:
        filepath: Path to a JSON file containing a list of exported tasks.
            Each task must provide ``task["data"]["text"]`` and may provide
            an ``"annotations"`` list.

    Returns:
        One dict per task, in file order, of the form
        ``{"text": <task text>, "entities": [(start, end, label), ...]}``.
        Only the first annotation of a task is read, only results whose
        ``"type"`` is ``"labels"`` are kept, and only the first label of each
        result is used. Tasks without annotations get an empty entity list.

    Raises:
        KeyError / IndexError: if a task or result lacks an expected field.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        exported_tasks = json.load(handle)

    records = []
    for task in exported_tasks:
        chunk_text = task["data"]["text"]
        task_annotations = task.get("annotations", [])

        # Only the first annotation is consulted; later ones are ignored.
        first_results = task_annotations[0]["result"] if task_annotations else []

        spans = [
            (
                res["value"]["start"],
                res["value"]["end"],
                res["value"]["labels"][0],
            )
            for res in first_results
            if res["type"] == "labels"  # skip non-NER result types
        ]

        records.append({"text": chunk_text, "entities": spans})

    return records


root = get_repo_root()
annotated_file = (
    root
    / "data/processed/annotated/fantasy_basketball_waiver_wire_for_week_2_202324_annotated.json"
)

ner_data = load_label_studio_data(annotated_file)

# Maps annotation label variants to the spelling used for comparison.
# Defined once, outside the loop, because it does not depend on the chunk.
replace_dct = {"xavier tillman sr": "xavier tillman"}

# One (chunk text, expected player names) pair per annotated chunk.
chunk_with_labels = []

for chunk_label in ner_data:
    # Normalise each label *before* de-duplicating, so a chunk annotated with
    # both a variant and its replacement yields a single entry. Sorting makes
    # the list order reproducible instead of depending on set iteration order.
    corrected_players = sorted(
        {replace_dct.get(label, label) for _, _, label in chunk_label["entities"]}
    )

    chunk_with_labels.append((chunk_label["text"], corrected_players))

In [12]:
# Every distinct expected player name across all labelled chunks.
player_set = {
    player_name
    for _, expected_players in chunk_with_labels
    for player_name in expected_players
}


len(player_set)

77

## Run NER Model

In [57]:
%%time
from src.llm_feature_extractor import PlayerNER
from tqdm import tqdm

ner_players = []
ner = PlayerNER()

count = 0
for chunk, _ in tqdm(chunk_with_labels):
    count += 1
    players = ner.extract_all_players(chunk)
    ner_players.append(players)

100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [01:05<00:00,  1.61it/s]

CPU times: total: 2 s
Wall time: 1min 6s





In [45]:
import jellyfish

# Two spellings of the same surname; the cell shows the Metaphone code of each
# so they can be compared side by side.
p = ("wembanyama", "wenbin yama")

tuple(jellyfish.metaphone(spelling) for spelling in p)

('WMBNYM', 'WNBN YM')

In [30]:
# Check how the player table spells names containing "saddiq".
pu = PlayerUtil()
p = pu.get_all_players()

is_saddiq = p["personName"].str.contains("saddiq")
p.loc[is_saddiq]

Unnamed: 0,personId,personName,teamName
1385,1630180,saddiq bey,hawks


## Baseline NER

In [59]:
def evaluate_player_ner(labelled_chunks, predictions, show_mismatches=True):
    """Score predicted player names against the annotated labels.

    Args:
        labelled_chunks: Sequence of ``(text, expected_players)`` pairs.
        predictions: One sized collection of predicted player names per
            chunk, in the same order as ``labelled_chunks``.
        show_mismatches: When true, print every failing chunk whose
            prediction has the same length as its label list.

    Returns:
        ``(chunk_count, total_entities, matched_entities)`` where
        ``chunk_count`` is the number of chunks whose predictions contain
        every expected name (extra predictions are not penalised),
        ``total_entities`` is the number of distinct expected names summed
        over chunks, and ``matched_entities`` is how many of those were
        predicted.

    Raises:
        ValueError: If the two inputs differ in length (``zip(strict=True)``).
    """
    chunk_count = total_entities = matched_entities = 0

    for (_, expected), actual in zip(labelled_chunks, predictions, strict=True):
        expected_set, actual_set = set(expected), set(actual)

        total_entities += len(expected_set)
        matched_entities += len(expected_set & actual_set)

        # A chunk counts as correct when nothing expected is missing.
        is_correct = actual_set.issuperset(expected_set)
        if is_correct:
            chunk_count += 1
        elif show_mismatches and len(actual) == len(expected):
            # Equal-length failures only: a miss is printed only when the
            # prediction list is the same size as the label list.
            print(f"Expected: {expected} - Actual: {actual} - Correct: {is_correct}")

    return chunk_count, total_entities, matched_entities


def _percent(part, whole):
    """Return ``part / whole`` as a truncated whole percentage; 0 if ``whole`` is 0."""
    return int(part * 100.0 / whole) if whole else 0


chunk_count, total_entities, matched_entities = evaluate_player_ner(
    chunk_with_labels, ner_players
)

# Output results
print(
    f"\nNumber of correct chunks: {chunk_count} / {len(chunk_with_labels)} = {_percent(chunk_count, len(chunk_with_labels))}%"
)
print(
    f"Total entities matched: {matched_entities} / {total_entities} = {_percent(matched_entities, total_entities)}%"
)

Expected: ['james harden', 'damian lillard', 'kelly oubre jr', 'tyrese maxey'] - Actual: ['damian lillard', 'tyrese maxey', 'james harden', 'kelly oubre'] - Correct: False
Expected: ['kelly oubre jr', 'james harden'] - Actual: ['james harden', 'kelly oubre'] - Correct: False
Expected: ['brandon miller', 'anthony davis', 'myles turner'] - Actual: ['miles turner', 'brandon miller', 'anthony davis'] - Correct: False
Expected: ['myles turner'] - Actual: ['miles turner'] - Correct: False
Expected: ['ayo dosunmu', 'coby white'] - Actual: ['kobe white', 'ayo dosunmu'] - Correct: False
Expected: ['coby white'] - Actual: ['kobe white'] - Correct: False
Expected: ['saddiq bey', "de'andre hunter", 'jalen johnson'] - Actual: ['sadik bey', 'deandre hunter', 'jalen johnson'] - Correct: False
Expected: ['saddiq bey', 'aj griffin'] - Actual: ['sadik bey', 'aj griffin'] - Correct: False
Expected: ['cameron johnson', 'ben simmons', 'cam thomas'] - Actual: ['cam johnson', 'cam thomas', 'ben simmons'] - C

## Baseline NER + Post-Processing

In [60]:
# Re-score the raw NER output after passing every chunk's predictions through
# PlayerNER.correct_players.
ner = PlayerNER()
corrected_ner_players = [ner.correct_players(n) for n in ner_players]

chunk_count = 0  # chunks whose predictions contain every expected name
total_entities = 0  # distinct expected names, summed over chunks
matched_entities = 0  # expected names that were also predicted

for (_, expected), actual in zip(chunk_with_labels, corrected_ner_players, strict=True):
    expected_set = set(expected)
    actual_set = set(actual)

    total_entities += len(expected_set)
    matched_entities += len(expected_set & actual_set)

    # A chunk counts as correct when nothing expected is missing; extra
    # predictions are not penalised.
    is_correct = actual_set.issuperset(expected_set)
    if is_correct:
        chunk_count += 1
    elif len(actual) == len(expected):
        # Equal-length failures only: a miss is printed only when the
        # prediction list is the same size as the label list.
        print(f"Expected: {expected} - Actual: {actual} - Correct: {is_correct}")

# Truncated whole percentages; report 0% instead of raising ZeroDivisionError
# when there are no chunks or no expected entities.
n_chunks = len(chunk_with_labels)
chunk_pct = int(chunk_count * 100.0 / n_chunks) if n_chunks else 0
entity_pct = int(matched_entities * 100.0 / total_entities) if total_entities else 0

# Output results
print(f"\nNumber of correct chunks: {chunk_count} / {n_chunks} = {chunk_pct}%")
print(f"Total entities matched: {matched_entities} / {total_entities} = {entity_pct}%")

Expected: ['josh okogie'] - Actual: ['cameron johnson'] - Correct: False
Expected: ['josh okogie'] - Actual: ['herbert jones'] - Correct: False
Expected: ['nic claxton'] - Actual: ["day'ron sharpe"] - Correct: False

Number of correct chunks: 69 / 105 = 65%
Total entities matched: 172 / 240 = 71%


## Misc: phonetic-encoding experiments and spot checks

In [2]:
import jellyfish

# Name pairs to compare; for each pair the cell prints the two names, then
# their Soundex codes, then their Metaphone codes.
test_pairs = [
    ("sadik bey", "saddiq bey"),
    ("kobe white", "kobe"),
    ("kobe white", "cobe white"),
    ("miles turner", "myles turner"),
]


for a, b in test_pairs:
    print(a, b)
    for encode in (jellyfish.soundex, jellyfish.metaphone):
        print(encode(a), encode(b))

sadik bey saddiq bey
S321 S321
STK B STK B
kobe white kobe
K130 K100
KB HT KB
kobe white cobe white
K130 C130
KB HT KB HT
miles turner myles turner
M423 M423
MLS TRNR MLS TRNR


In [41]:
# Spot-check one labelled chunk: each element of chunk_with_labels is a
# (chunk text, expected player names) tuple.
chunk_with_labels[9]

("comes to mind. But Hopefully, Bogdanovic is available. Well, let's get on. Let's get back to real basketball. The other game last night, for players playing, Lakers beat Phoenix in the LA home debut. AD bounced back with a nice 30.12 rebound, 3 block, 3 steal performance. LeBron, who was allegedly on a minutes restriction, played 35 minutes, 21 points, 8 boards, 9 dimes, 2 steals, 2 blocks. Of course, this is against the Suns team with no Booker, no Beal. KD had a very nice 39 and 11 game. I'll",
 ['kevin durant', 'lebron james'])