In [3]:
%load_ext autoreload
%autoreload 2
    
import os
import re
import sys

sys.path.append(os.path.abspath("../"))

import nltk
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize
from src.utils import get_nba_season

from src.data_utils import load_clean_scores
from src.player_utils import PlayerUtil

# Set display options
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 400)

df = load_clean_scores(["2023-24"])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Download NLTK data (for sentence tokenization)
nltk.download("punkt")
nltk.download("punkt_tab")

# Load SpaCy model for NER
nlp = spacy.load("en_core_web_sm")


def segment_by_semantics(text, max_tokens=50):
    """
    Segments text into semantically meaningful chunks based on SpaCy's parsing.
    """
    doc = nlp(text)
    segments = []
    chunk = []
    token_count = 0

    for sent in doc.sents:
        token_count += len(sent)
        chunk.append(sent.text)
        if token_count >= max_tokens:
            segments.append(" ".join(chunk))
            chunk = []
            token_count = 0

    # Add any remaining sentences
    if chunk:
        segments.append(" ".join(chunk))

    return segments


# Define a function to clean text
def clean_text(text):
    """
    Removes irrelevant sections and prepares text for further processing.
    """
    # Remove advertisements or specific unwanted patterns
    ad_patterns = [
        r"(?i)sponsored by .*",  # Example: "Sponsored by XYZ"
        r"(?i)ad break.*",  # Example: "Ad break starts here"
        r"http\S+",  # URLs
        r"\[.*?\]",  # Content in brackets (e.g., [Music])
    ]
    for pattern in ad_patterns:
        text = re.sub(pattern, "", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Define a function to annotate entities
def annotate_entities(sentences):
    """
    Uses SpaCy's NER to detect and annotate entities.
    Returns a list of sentences with annotated entities.
    """
    annotations = []
    for sentence in sentences:
        doc = nlp(sentence)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        annotations.append({"sentence": sentence, "entities": entities})
    return annotations


# Process all text files in a directory
def process_text_files(directory):
    """
    Processes all .txt files in the specified directory.
    Cleans text, segments it into sentences, and annotates entities.
    """
    results = []

    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):
            file_path = os.path.join(directory, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            # Clean text
            # cleaned_text = clean_text(text)

            # Segment text
            sentences = segment_by_semantics(text)

            # Annotate entities
            annotated_sentences = annotate_entities(sentences)

            # Save results
            results.extend(
                [
                    {
                        "file_name": file_name,
                        "sentence": ann["sentence"],
                        "word_count": len(ann["sentence"].split()),
                        "entities": ann["entities"],
                    }
                    for ann in annotated_sentences
                ]
            )
            break

    return results


def process_text_file(file_path: str):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Segment text
    sentences = segment_by_semantics(text)

    # Annotate entities
    annotated_sentences = annotate_entities(sentences)

    file_name = os.path.basename(file_path)

    # Save results
    results = [
        {
            "file_name": file_name,
            "sentence": ann["sentence"],
            "word_count": len(ann["sentence"].split()),
            "entities": ann["entities"],
        }
        for ann in annotated_sentences
    ]

    return results


# Directory containing text files
directory_path = "../data/raw/rotowire_2023_2024"

date = "2023-12-08"
file_path = (
    directory_path
    + "/Fantasy Basketball Waiver Wire - Adds for Week 8 2023-24_transcript_6b67bc46-0000-2bfc-bcc0-2405887bfb7c.txt"
)

results = process_text_file(file_path)

# Convert to a DataFrame for easy analysis
segment_df = pd.DataFrame(results)


current_season = get_nba_season(date)

minutes = get_player_minute_stats(df)
minutes = minutes[minutes.season_year == current_season]

players = get_players_for_date(df, date)
players = players.merge(minutes, on=['personId', 'personName'], how='left')
# players.sort_values(['games_over_5_minutes', 'avg_minutes_per_game'], ascending=False)
filtered_players = players[(players['games_over_5_minutes'] >= 10) & (players['avg_minutes_per_game'] >= 10)]

print(len(players))
print(len(filtered_players))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


NameError: name 'get_player_minute_stats' is not defined

In [37]:
sentence = ''.join([segment_df.iloc[i].sentence for i in [5,6,7]])
sentence

# TODO how can topic models be used?
# Hierarchical Dirichlet Process (HDP)
# Latent semantic Indexing (LSI or LDI)
# Latent Dirichlet Allocation (LDA)

"yeah Shannon we saw like I mentioned Trey Murphy was a team's highest score for the Pelicans he still coming off the bench though  any concerns about that or do you think like either still a good chance he ends up playing 30 minutes a game off the bench maybe the Pokemon starting lineup at some point you know her Jones is still in there right now  yeah I mean that that 43 to 17 third quarter by the Lakers basically makes last night's game you know absolutely and there's no reason to really there's no big takeaways from last night's game is Susan turned into a blowout you know nothing really matters what happened with the Pelicans from there so the 22 minutes that Murphy saw last night I'm not worried about like you said you still the leading scorer for 14 points in those 22 minutesand anytime you was on the floor he was very aggressive on the offensive end you know he's played 2230 and 22 minutes in the three games he's returned from injury having some rest kind of being East back int

In [39]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from gensim.models import HdpModel
from gensim.corpora.dictionary import Dictionary
import nltk

# Download NLTK data
nltk.download("stopwords")
nltk.download("punkt")

# Define stopwords
stop_words = set(stopwords.words("english"))

# Preprocess the text
def preprocess_text(text):
    """
    Preprocess text by lowercasing, removing special characters,
    tokenizing, and removing stopwords.
    """
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r"[^a-z\s]", "", text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# File path to the podcast transcript
directory_path = "../data/raw/rotowire_2023_2024"
file_path = (
    directory_path
    + "/Fantasy Basketball Waiver Wire - Adds for Week 8 2023-24_transcript_6b67bc46-0000-2bfc-bcc0-2405887bfb7c.txt"
)

# Read and preprocess the text
# with open(file_path, "r", encoding="utf-8") as f:
#     text = f.read()

text = sentence
# Preprocess the text into tokenized segments
processed_text = preprocess_text(text)

# Segment the text into smaller chunks (e.g., paragraphs or 50-token chunks)
chunk_size = 50
chunks = [processed_text[i:i + chunk_size] for i in range(0, len(processed_text), chunk_size)]

# Prepare the data for HDP
dictionary = Dictionary(chunks)
corpus = [dictionary.doc2bow(chunk) for chunk in chunks]

# Train the HDP model
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)

# Print the discovered topics
topics = hdp_model.print_topics(num_topics=15, num_words=10)
for i, topic in enumerate(topics):
    print(f"Topic {i + 1}: {topic}")

Topic 1: (0, '0.033*inseason + 0.029*reason + 0.028*third + 0.027*teams + 0.025*injury + 0.024*right + 0.024*still + 0.022*herbert + 0.021*basically + 0.020*know')
Topic 2: (1, '0.046*said + 0.036*hit + 0.036*blowout + 0.029*herbert + 0.029*mentioned + 0.027*thats + 0.026*stick + 0.025*end + 0.023*theyre + 0.022*would')
Topic 3: (2, '0.048*champ + 0.040*rest + 0.036*yeah + 0.033*points + 0.027*hes + 0.026*blowout + 0.025*makes + 0.024*quarter + 0.021*ends + 0.020*mentioned')
Topic 4: (3, '0.063*blowout + 0.040*know + 0.035*absolutely + 0.034*well + 0.033*injury + 0.031*floor + 0.031*turned + 0.029*teams + 0.026*big + 0.024*reason')
Topic 5: (4, '0.064*said + 0.048*rotation + 0.043*stick + 0.036*night + 0.034*lakers + 0.030*quarter + 0.028*thats + 0.026*would + 0.025*three + 0.025*happened')
Topic 6: (5, '0.055*injury + 0.032*games + 0.031*leading + 0.027*stick + 0.026*happened + 0.026*alleviates + 0.022*yeah + 0.021*points + 0.021*pelicans + 0.021*minutes')
Topic 7: (6, '0.058*maybe + 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
from gensim.models.coherencemodel import CoherenceModel

# Compute the coherence score
coherence_model = CoherenceModel(model=hdp_model, texts=chunks, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.2795729567203427


In [None]:
# TODO better segmentation for better topic modeling?
# TODO look at topic distributions across documents
# TODO topic visualization in 2D space

# import pyLDAvis.gensim_models as gensimvis
# import pyLDAvis

# # Prepare the visualization
# vis_data = gensimvis.prepare(hdp_model, corpus, dictionary)
# pyLDAvis.display(vis_data)

## Pipeline


✘ No compatible package found for 'en_coreference_web_trf' (spaCy v3.8.2)



In [52]:
from fastcoref import FCoref
from transformers import pipeline, AutoTokenizer, AutoModel
import numpy as np
import torch

# Load the FastCoref model
coref_model = FCoref(model_name="biu-nlp/f-coref")

# Load a BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

def get_players():
    # Replace with your logic to retrieve player names
    return {"LeBron James", "Stephen Curry", "Anthony Davis"}

def resolve_coreferences(text):
    """
    Resolves coreferences in the text using FastCoref.
    """
    doc = coref_model.predict(text, aggregation_strategy="average")
    resolved_text = doc.resolved_text
    return resolved_text

def extract_player_sentences(podcast_text, players):
    """
    Extracts sentences mentioning specific players using coreference resolution.
    """
    resolved_text = resolve_coreferences(podcast_text)
    sentences = resolved_text.split(". ")  # Simple sentence splitting

    player_sentences = {player: [] for player in players}

    for sentence in sentences:
        # Check if a player is mentioned directly in the sentence
        for player in players:
            if player in sentence:
                player_sentences[player].append(sentence.strip())

    return player_sentences

def generate_player_vectors(player_sentences):
    """
    Generate vector representations for each player using BERT.
    """
    player_vectors = {}

    for player, sentences in player_sentences.items():
        # Tokenize and process all sentences related to the player
        sentence_embeddings = []
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = bert_model(**inputs)
                # Use [CLS] token embedding as the representation
                sentence_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
                sentence_embeddings.append(sentence_embedding.numpy())

        # Aggregate sentence embeddings for the player (mean pooling)
        if sentence_embeddings:
            player_vectors[player] = np.mean(sentence_embeddings, axis=0)
        else:
            player_vectors[player] = None  # No context found for the player

    return player_vectors


podcast_text = """
LeBron James played an incredible game last night, scoring 40 points. He showed great leadership on the court.
Meanwhile, Stephen Curry impressed everyone with his three-point shooting, hitting 7 threes.
Anthony Davis was solid on defense, but his offensive output was limited.
"""

# Retrieve players of interest
players = get_players()

# Extract sentences mentioning each player
player_sentences = extract_player_sentences(podcast_text, players)
print("Player Sentences:")
for player, sentences in player_sentences.items():
    print(f"{player}: {sentences}")

# Generate vector representations
player_vectors = generate_player_vectors(player_sentences)
print("\nPlayer Vectors:")
for player, vector in player_vectors.items():
    print(f"{player}: {vector}")


[autoreload of numpy.core.multiarray failed: Traceback (most recent call last):
  File "C:\Users\Patrick\anaconda3\envs\py310-ml-gpu\lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "C:\Users\Patrick\anaconda3\envs\py310-ml-gpu\lib\site-packages\IPython\extensions\autoreload.py", line 475, in superreload
    module = reload(module)
  File "C:\Users\Patrick\anaconda3\envs\py310-ml-gpu\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:\Users\Patrick\anaconda3\envs\py310-ml-gpu\lib\site-packages\numpy\core\multiarray.py", line 1, in <module>
    from numpy._core import multiarray
  File "C:\Users\Patrick\anaconda3\envs\py310-ml-gpu\lib\site-packages\numpy\_core\mul

TypeError: FCoref.__init__() got an unexpected keyword argument 'model_name'

In [48]:
extract_player_sentences(text, get_players())

AttributeError: [E046] Can't retrieve unregistered extension attribute 'coref_clusters'. Did you forget to call the `set_extension` method?

## Bert Question Answering

In [41]:
# File path to the podcast transcript
directory_path = "../data/raw/rotowire_2023_2024"
file_path = (
    directory_path
    + "/Fantasy Basketball Waiver Wire - Adds for Week 8 2023-24_transcript_6b67bc46-0000-2bfc-bcc0-2405887bfb7c.txt"
)

# Read and preprocess the text
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

In [45]:
from transformers import pipeline

# Load a QA pipeline
# qa_pipeline = pipeline("question-answering", model="deberta-v3-large")
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Input context (podcast transcript)
# context = "LeBron James played exceptionally well last night, scoring 40 points. However, his defense was questionable..."
context = text

# Example questions
questions = [
    "What was said about Herbert Jones performance?",
    "What was the sentiment around Herbert Jones?",
    "Were there any criticisms of Herbert Jones?"
]

# Get answers for each question
for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}\nA: {result['answer']}\n")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Q: What was said about Herbert Jones performance?
A: the thing that disturbs me is he's not a young kid

Q: What was the sentiment around Herbert Jones?
A: he's not a young kid

Q: Were there any criticisms of Herbert Jones?
A: he's got two guys on the team that are better players than him



In [None]:
# Alternative Approaches
# If the above adjustments still seem complex, consider alternative modeling approaches:

# 1. Use a Transformer-Based Model for End-to-End Analysis
# Models like GPT or BERT-based architectures can handle NER, context understanding, and sentiment analysis jointly.
# Fine-tune these models on annotated podcast text for:
# Identifying players
# Extracting context
# Classifying sentiment directly
# 2. Treat it as a Question-Answering Task
# Formulate the task as:
# "What is the sentiment around Player X in the podcast?"
# Use QA models to identify the most relevant segment and its sentiment.
# 3. Use Context-Aware Embedding Models
# Embed the entire podcast using a context-aware model like Sentence-BERT.
# Use similarity-based search to locate the most relevant context around a player mention and then analyze sentiment.

In [None]:
# Improving Your Plan
# Instead of completely changing your plan, you can make it "smarter" and more manageable by addressing the key challenges strategically:

# 1. Named Entity Recognition (NER)
# Fine-tune or use a domain-specific model:

# Use pre-trained models like SpaCy or Hugging Face Transformers.
# Fine-tune these models on basketball-related text datasets (e.g., annotated transcripts, sports news, or player rosters).
# Incorporate a list of basketball player names:

# Use external data sources like NBA rosters or fantasy basketball datasets to build a lookup table for player names.
# Combine rule-based methods with NER:
# Check for matches between recognized entities and your roster list.
# Validate contextually by checking co-occurring terms like "team," "game," or "points."
# 2. Context Extraction
# Dynamic Window-Based Context:

# Extract a fixed-size window of text (e.g., N sentences or tokens) before and after the player mention.
# Experiment with window sizes to balance granularity and relevance.
# Semantic Segmentation:

# Use semantic chunking techniques (like segment_by_semantics) to split the text into coherent units.
# Identify the chunk containing the player's mention and analyze it.
# Dependency Parsing:

# Use syntactic dependency parsing to extract clauses or phrases related to the player’s name (e.g., actions or descriptors linked to the player).
# 3. Sentiment Analysis
# Fine-tune a Sentiment Model for Sports:

# Fine-tune a sentiment analysis model on basketball-specific text to handle domain-specific nuances like sarcasm or conditional praise.
# Example: Use Hugging Face models with labeled sentiment data from sports articles or social media.
# Aspect-Based Sentiment Analysis (ABSA):

# Train or use pre-built ABSA models to detect sentiment specifically related to the player.
# ABSA focuses on entities (e.g., players) and their associated sentiment directly.
# 4. Automate and Iterate
# Use a pipeline approach to combine NER, context extraction, and sentiment analysis:
# Perform NER to detect player mentions.
# Extract context dynamically using a combination of window-based and semantic chunking methods.
# Analyze sentiment on extracted context.
# Iteratively evaluate and refine the pipeline using test cases and manually annotated data.

In [20]:
sentence = ''.join([segment_df.iloc[i].sentence for i in [5,6,7]])
sentence

"yeah Shannon we saw like I mentioned Trey Murphy was a team's highest score for the Pelicans he still coming off the bench though  any concerns about that or do you think like either still a good chance he ends up playing 30 minutes a game off the bench maybe the Pokemon starting lineup at some point you know her Jones is still in there right now  yeah I mean that that 43 to 17 third quarter by the Lakers basically makes last night's game you know absolutely and there's no reason to really there's no big takeaways from last night's game is Susan turned into a blowout you know nothing really matters what happened with the Pelicans from there so the 22 minutes that Murphy saw last night I'm not worried about like you said you still the leading scorer for 14 points in those 22 minutesand anytime you was on the floor he was very aggressive on the offensive end you know he's played 2230 and 22 minutes in the three games he's returned from injury having some rest kind of being East back int

In [31]:
import spacy
import neuralcoref


nlp = spacy.load("en_core_web_sm")

doc = nlp(sentence)
doc._.coref_resolved

ModuleNotFoundError: No module named 'neuralcoref'

In [28]:
annotated_sentence = annotate_entities([sentence])

In [29]:
annotated_sentence

[{'sentence': "yeah Shannon we saw like I mentioned Trey Murphy was a team's highest score for the Pelicans he still coming off the bench though  any concerns about that or do you think like either still a good chance he ends up playing 30 minutes a game off the bench maybe the Pokemon starting lineup at some point you know her Jones is still in there right now  yeah I mean that that 43 to 17 third quarter by the Lakers basically makes last night's game you know absolutely and there's no reason to really there's no big takeaways from last night's game is Susan turned into a blowout you know nothing really matters what happened with the Pelicans from there so the 22 minutes that Murphy saw last night I'm not worried about like you said you still the leading scorer for 14 points in those 22 minutesand anytime you was on the floor he was very aggressive on the offensive end you know he's played 2230 and 22 minutes in the three games he's returned from injury having some rest kind of being

In [25]:
# annotate_entities(segment_by_semantics(sentence)[0])

In [17]:
segment_df.iloc[6].sentence

"yeah I mean that that 43 to 17 third quarter by the Lakers basically makes last night's game you know absolutely and there's no reason to really there's no big takeaways from last night's game is Susan turned into a blowout you know nothing really matters what happened with the Pelicans from there so the 22 minutes that Murphy saw last night I'm not worried about like you said you still the leading scorer for 14 points in those 22 minutes"

In [16]:
segment_df.iloc[7].sentence

"and anytime you was on the floor he was very aggressive on the offensive end you know he's played 2230 and 22 minutes in the three games he's returned from injury having some rest kind of being East back into the rotation champ be a big surprise but the fact that he already hit 30 minutes basically alleviates any concerns I would have will he be in the starting lineup I think that's a bigger question mark I don't Herbert Herbert Jones been playing so well for the Pelicans that I think they're gonna stick with them for at least a little while I mean they're 12 and 11 they had some assess the in-season tournament"

In [None]:
df.iloc[0].sentence

In [None]:
df.iloc[0].sentence

In [None]:
entities = df.iloc[0].entities