In [81]:
%load_ext autoreload
%autoreload 2

import json
import os
import re
import sys
from datetime import datetime
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

sys.path.append(os.path.abspath("../"))

# Set display options
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 400)

from src.utils import get_repo_root

root = get_repo_root()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load text file
with open(
    root
    / "data/raw/DG RFB Transcripts/Fantasy Basketball Waiver Wire for Week 15 2023-24_transcript.txt",
    "r",
) as f:
    full_text = f.readlines()[0]

# Load the labels (player names)
with open(root / "labels.txt", "r") as f:
    labels = [line.strip() for line in f.readlines()]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)


In [24]:
import json

import torch
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from tqdm import tqdm
from transformers import pipeline


# Define schema for annotations
class Annotation(BaseModel):
    start: int
    end: int
    label: str


class ParagraphAnnotation(BaseModel):
    annotations: list[Annotation]


# Load a lightweight local model using Hugging Face
# device = 0 if torch.cuda.is_available() else -1
device = "cpu"

from transformers import pipeline

class Annotation(BaseModel):
    start: int
    end: int
    label: str

class ParagraphAnnotation(BaseModel):
    annotations: list[Annotation]

template = """
You are an expert annotator. Identify all mentions of NBA players from the given text 
and provide their start and end character positions in a structured JSON format. 
The JSON structure must strictly follow this schema:
{format_instructions}

Labels: 
{labels}

Text:
{text}
"""

# llm = ChatOpenAI(model="gpt-4-0-mini", temperature=0)
local_model = pipeline("text-generation", model="distilgpt2", device=device, max_length=4056)
llm = HuggingFacePipeline(pipeline=local_model)

parser = PydanticOutputParser(pydantic_object=ParagraphAnnotation)

prompt_template = PromptTemplate(
    template=template,
    input_variables=["labels", "text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

pipeline = prompt_template | llm | parser

In [63]:
from src.data_utils import PodcastContainer

pod = PodcastContainer()
pdf = pod.get_episodes_for_podcast('rotowire')
pdf = pdf[(pdf.publication_date >= pd.to_datetime('2023-10-24')) & (pdf.publication_date < pd.to_datetime('2023-10-30'))]

In [77]:

    break

In [79]:
row.file_name

'opening_week_preview_boom_or_bust_players_three_bold_fantasy_predictions'

In [84]:
from tqdm import tqdm

base_dir = get_repo_root() / 'data/raw/unannotated'
base_dir.mkdir(parents=True, exist_ok=True)

for idx, row in tqdm(pdf.iterrows()):
    label_studio_data = []    
    file_name = row.file_name
    full_text = row.content
    
    paragraphs = text_splitter.split_text(full_text)
    # Generate annotations for each paragraph
    for idx, para in tqdm(enumerate(paragraphs, start=1)):
        try:
            json_data = {
                "id": f"{file_name}_{idx}",
                "data": {"text": para}
            }
            label_studio_data.append(json_data)
        except Exception as e:
            print(f"Error processing paragraph {idx}: {e}")

    # Save to JSON file
    output_path = base_dir / f"{file_name}_raw.json"
    with open(output_path, "w") as f:
        json.dump(label_studio_data, f, indent=2)

print(f"Label Studio annotations saved to {output_path}")

0it [00:00, ?it/s]
98it [00:00, ?it/s][A

152it [00:00, 152011.02it/s]

117it [00:00, ?it/s]A

105it [00:00, ?it/s]A
4it [00:00, 38.46it/s]

Label Studio annotations saved to G:\My Drive\Columbia\Practical Deep Learning\FantasyPodcastInsights\data\raw\unannotated\fantasy_basketball_waiver_wire_for_week_2_202324_raw.json





In [None]:
def generate_annotated_json(labels, paragraph, annotation):

    # Invoke the annotation pipeline
    para_annotated = pipeline.invoke({"labels": labels, "text": para})

    # Convert to Label Studio format
    return {
        "id": idx,
        "data": {"text": paragraph},
        "predictions": [
            {
                "model_version": "model_1",
                "result": [
                    {
                        "from_name": "label",
                        "to_name": "text",
                        "type": "labels",
                        "value": {
                            "start": annotation.start,
                            "end": annotation.end,
                            "labels": [annotation.label],
                        },
                    }
                    for annotation in para_annotated.annotations
                ],
            }
        ],
    }

In [11]:
1

1

In [26]:
from openai import OpenAI

client = OpenAI()


# def annotate_paragraph(paragraph, labels):
#     prompt = (
#         f"You are an expert annotator. Identify all mentions of NBA players from the following text "
#         f'and provide their start and end character positions. Here is the text:\n\n"{paragraph}"\n\n'
#         "Labels: " + ", ".join(labels)
#     )
#     response = client.chat.completions.create(
#         model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}]
#     )
#     return response.choices[0].message.content


def annotate_paragraph(paragraph, labels):
    prompt = (
        f"You are an expert annotator. Identify all mentions of NBA players from the following text "
        f"and provide their start and end character positions in JSON format with this structure:\n"
        f'{{"annotations": [{{"start": start_position, "end": end_position, "label": "PLAYER"}}]}}\n\n'
        "Do not output any comments. Only output valid json"
        f'Here is the text:\n\n"{paragraph}"\n\n'
        "Labels: " + ", ".join(labels)
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}]
    )

    # Parse response content
    return response.choices[0].message.content.strip()
    try:
        return json.loads(structured_json)
    except json.JSONDecodeError:
        raise ValueError("LLM response is not valid JSON.")


r = annotate_paragraph(paragraphs[0], labels)
r

'```json\n{"annotations":[{"start":105,"end":110,"label":"PLAYER"},{"start":112,"end":118,"label":"PLAYER"},{"start":120,"end":125,"label":"PLAYER"},{"start":127,"end":134,"label":"PLAYER"},{"start":136,"end":145,"label":"PLAYER"},{"start":222,"end":226,"label":"PLAYER"},{"start":228,"end":233,"label":"PLAYER"},{"start":235,"end":237,"label":"PLAYER"},{"start":239,"end":244,"label":"PLAYER"},{"start":246,"end":250,"label":"PLAYER"},{"start":298,"end":301,"label":"PLAYER"},{"start":303,"end":309,"label":"PLAYER"},{"start":315,"end":320,"label":"PLAYER"},{"start":322,"end":327,"label":"PLAYER"},{"start":329,"end":331,"label":"PLAYER"},{"start":333,"end":339,"label":"PLAYER"},{"start":341,"end":349,"label":"PLAYER"},{"start":351,"end":354,"label":"PLAYER"},{"start":356,"end":363,"label":"PLAYER"},{"start":367,"end":372,"label":"PLAYER"}]}\n```'

JSONDecodeError: Expecting ',' delimiter: line 1 column 837 (char 836)

In [15]:
# Annotate paragraphs
annotations = []
for idx, paragraph in enumerate(paragraphs):
    llm_response = annotate_paragraph(paragraph, labels)
    # Parse LLM output (ensure it matches your required format)
    spans = json.loads(llm_response)
    annotations.append(
        {
            "id": idx + 1,
            "text": paragraph.strip(),
            "annotations": [
                {
                    "result": [
                        {
                            "from_name": "label",
                            "to_name": "text",
                            "type": "labels",
                            "value": {
                                "start": span["start"],
                                "end": span["end"],
                                "labels": ["PLAYER"],
                            },
                        }
                        for span in spans
                    ]
                }
            ],
        }
    )

# Save to a JSON file
output_path = root / "/data/preannotated_transcript_label_studio.json"
with open(output_path, "w") as f:
    json.dump(annotations, f, indent=4)

print(f"Pre-annotated data saved to {output_path}")

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
