In [2]:
import os
import re
import sys

import nltk
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize

sys.path.append(os.path.abspath("../"))

# Set display options
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 400)

In [6]:
# Download NLTK data (for sentence tokenization)
nltk.download("punkt")
nltk.download('punkt_tab')

# Load SpaCy model for NER
nlp = spacy.load("en_core_web_sm")


# Define a function to clean text
def clean_text(text):
    """
    Removes irrelevant sections and prepares text for further processing.
    """
    # Remove advertisements or specific unwanted patterns
    ad_patterns = [
        r"(?i)sponsored by .*",  # Example: "Sponsored by XYZ"
        r"(?i)ad break.*",  # Example: "Ad break starts here"
        r"http\S+",  # URLs
        r"\[.*?\]",  # Content in brackets (e.g., [Music])
    ]
    for pattern in ad_patterns:
        text = re.sub(pattern, "", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Define a function to segment text into sentences
def segment_text(text):
    """
    Segments cleaned text into sentences using NLTK.
    """
    return sent_tokenize(text)


# Define a function to annotate entities
def annotate_entities(sentences):
    """
    Uses SpaCy's NER to detect and annotate entities.
    Returns a list of sentences with annotated entities.
    """
    annotations = []
    for sentence in sentences:
        doc = nlp(sentence)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        annotations.append({"sentence": sentence, "entities": entities})
    return annotations


# Process all text files in a directory
def process_text_files(directory):
    """
    Processes all .txt files in the specified directory.
    Cleans text, segments it into sentences, and annotates entities.
    """
    results = []

    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):
            file_path = os.path.join(directory, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            # Clean text
            # cleaned_text = clean_text(text)

            # Segment text
            sentences = segment_text(text)

            # Annotate entities
            annotated_sentences = annotate_entities(sentences)

            # Save results
            results.extend(
                [
                    {
                        "file_name": file_name,
                        "sentence": ann["sentence"],
                        "entities": ann["entities"],
                    }
                    for ann in annotated_sentences
                ]
            )
            break

    return results


# Directory containing text files
directory_path = "../data/raw/rotowire_2023_2024"

# Process text files and save results
results = process_text_files(directory_path)

# Convert to a DataFrame for easy analysis
df = pd.DataFrame(results)

# Save to CSV for inspection
# df.to_csv("annotated_text.csv", index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [9]:
df

Unnamed: 0,file_name,sentence,entities
0,Fantasy Basketball Waiver Wire - Adds for Week...,welcome to the award-winning RotoWire fantasy ...,"[(RotoWire, PRODUCT), (Friday December 8th 202..."


In [10]:
df.iloc[0].sentence

"welcome to the award-winning RotoWire fantasy basketball podcast brought to you by Underdog fantasy it's Friday December 8th 2023 Alex barutha here with Shannon McEwan and Ken k train Crites all aboard  I will apologize to everybody in advance and recovering from being sick I'll try not to coffin everybody's ears who is listening to this podcast but that's why I sound like I have tissue paper stuck up my nose  last night the Lakers beat the Pelicans 133 to 89 in Las Vegas during the late in season Tournament game  Lakers took off in the second quarter they never looked back I mean he was basically a home game for the Lakers we can we can kind of talk about some of the Dynamics of this when we talk about the next game which I'm not looking forward to but New Orleans struggled 36% from the field  Trey Murphy who came off the bench was the team's highest score with 14 points LeBron went for 30 um very free charges Darius yeah efficient 30 took three charges Lakers shoot 55% it was it was

In [8]:
entities = df.iloc[0].entities

[('RotoWire', 'PRODUCT'),
 ('Friday December 8th 2023', 'DATE'),
 ('Alex', 'PERSON'),
 ('Shannon McEwan', 'PERSON'),
 ('Ken', 'PERSON'),
 ('Crites', 'ORG'),
 ('last night', 'TIME'),
 ('Pelicans', 'NORP'),
 ('133 to', 'CARDINAL'),
 ('89', 'CARDINAL'),
 ('Las Vegas', 'GPE'),
 ('the late in season', 'DATE'),
 ('the second quarter', 'DATE'),
 ('the Dynamics of', 'ORG'),
 ('New Orleans', 'GPE'),
 ('36%', 'PERCENT'),
 ('Trey Murphy', 'PERSON'),
 ('14', 'CARDINAL'),
 ('LeBron', 'ORG'),
 ('30', 'CARDINAL'),
 ('Darius', 'PERSON'),
 ('30', 'CARDINAL'),
 ('three', 'CARDINAL'),
 ('Lakers', 'PRODUCT'),
 ('55%', 'PERCENT'),
 ('Ken', 'PERSON'),
 ('Pelicans', 'NORP'),
 ('the 500,000 dollars', 'MONEY'),
 ('Target', 'ORG'),
 ('Las Vegas', 'GPE'),
 ('Austin', 'PERSON'),
 ('three', 'CARDINAL'),
 ('LeBron', 'PERSON'),
 ('Williamson', 'PERSON'),
 ('James', 'PERSON'),
 ('age 30', 'DATE'),
 ('three', 'CARDINAL'),
 ('three', 'CARDINAL'),
 ('nine', 'CARDINAL'),
 ('12', 'CARDINAL'),
 ('four', 'CARDINAL'),
 ('the