In [49]:
import json
import os
from pathlib import Path
from typing import Dict, List

import tiktoken
from openai import OpenAI
from pydantic import BaseModel, RootModel

from datasets import load_dataset

In [11]:
cache_dir = Path("../cache")
assert cache_dir.exists(), f"Cache directory not found at {cache_dir}"
dataset = load_dataset(
    "Studeni/Pile-NER-type-conll", cache_dir=cache_dir, split="train"
).to_pandas()

In [12]:
dataset["labels_raw"] = dataset["labels"].apply(
    lambda labels: set([label.split("-")[-1] for label in labels if label != "O"])
)

In [13]:
dataset["labels_raw"].explode().nunique()

12654

In [33]:
LABELS_DESCRIPTION_PROMPT = """
###TASK###
Your task is to generate three different variety descriptions for the target NER (Named Entity Recognition) label based on the provided word pairs and NER labels. 
Each description should be one sentence long and may include examples if needed to better explain the label. 
The examples must be generalist and not specific to any particular domain.

###INSTRUCTIONS###
Follow these steps:

1. Analyze the word pairs:
   - Look for patterns and commonalities among the words in each pair.
   - Consider how these words relate to the target NER label.

2. Generate descriptions:
   - Create three distinct, one-sentence descriptions for the target label.
   - Sentences needs to be direct and concise.
   - Each description should capture the essence of the label based on the patterns observed in the word pairs.
   - If necessary, include generalist examples to clarify the label's meaning.
   - Ensure that the descriptions are varied in their approach and wording.

3. Format the output:
   - Present your generated descriptions in JSON format.
   - Use the keys "description_1", "description_2", and "description_3" for each description.

###OUTPUT EXAMPLE###
Your final output should be structured as follows:

{{
   "target_label_1": ["Your first generated description", "Your second generated description", "Your third generated description"],
   "target_label_2": ["Your first generated description", "Your second generated description", "Your third generated description"],
   ...
}}

###SUMMARY###
Remember to make your descriptions clear, concise, and informative, focusing on the general concept of the NER label rather than specific instances.

###INPUT###
List of words:
{words}

List of NER labels:
{ner_labels}

Target label:
{target_labels}
"""

In [29]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [64]:
class LabelDescriptions(BaseModel):
    description_1: str
    description_2: str
    description_3: str


class Response(RootModel):
    root: Dict[str, List[str]]

In [65]:
Response.schema()

{'additionalProperties': {'items': {'type': 'string'}, 'type': 'array'},
 'title': 'Response',
 'type': 'object'}

In [66]:
def get_descriptions(
    client: OpenAI,
    prompt: str,
    response_format: BaseModel,
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
):
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
        ],
        temperature=temperature,
        response_format=response_format,
    )
    return response

In [67]:
prompt = LABELS_DESCRIPTION_PROMPT.format(
    words=dataset.iloc[0].words,
    ner_labels=dataset.iloc[0].ner_tags,
    target_labels=["PROGRAMMING_CONCEPT", "DATABASE", "DATE"],
)

response = get_descriptions(client, prompt, Response)

BadRequestError: Error code: 400 - {'error': {'message': "Invalid schema for response_format 'Response': In context=(), object schema missing properties.", 'type': 'invalid_request_error', 'param': 'response_format', 'code': None}}

In [43]:
json.loads(response.choices[0].message.content)

{'description_1': 'A programming concept refers to fundamental ideas and principles that guide the design and implementation of software, such as variables, loops, and functions.',
 'description_2': 'Programming concepts are the building blocks of coding, encompassing techniques like object-oriented programming and recursion that help developers solve problems efficiently.',
 'description_3': 'Examples of programming concepts include data structures, algorithms, and control flow, which are essential for creating effective and maintainable code.'}

In [None]:
response