In [1]:
import random
from pathlib import Path
from typing import Dict, List, Literal

import pandas as pd

from datasets import (
    ClassLabel,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
)

In [2]:
# Root folder for every generated JSONL file; created on first run if it is missing.
output_dir = Path("processed_data")
output_dir.mkdir(parents=False, exist_ok=True)

In [5]:
# Local checkout of the entity-recognition-datasets repository, expected under the
# current user's home directory.
repo_path = Path.home() / "Development" / "entity-recognition-datasets"
assert repo_path.exists(), "Please clone the repository with the datasets"

## Utils

In [157]:
def check_labels(words: List[str], ner_tags: List[str]) -> str:
    """Render a sentence and its tags as two vertically aligned text lines.

    Each word/tag pair occupies one column that is as wide as the longer of
    the two, followed by a single separating space.

    Args:
        words: Tokens of the sentence.
        ner_tags: One tag per token. Tags are passed through ``str()`` so that
            integer class ids can be displayed as well as string labels.

    Returns:
        The word line and the tag line joined by a newline. Pairs beyond the
        shorter of the two inputs are ignored (``zip`` semantics).
    """
    word_cells = []
    tag_cells = []

    for word, tag in zip(words, ner_tags):
        # Coerce to text: len() and concatenation below need a string.
        label = str(tag)
        # +1 keeps one blank between neighbouring columns.
        width = max(len(word), len(label)) + 1
        word_cells.append(word.ljust(width))
        tag_cells.append(label.ljust(width))

    return "".join(word_cells) + "\n" + "".join(tag_cells)

In [4]:
def load_conll_data(
    file_path: Path, split_by: str = "\t"
) -> List[Dict[str, object]]:
    """Parse a two-column CoNLL-style file into a list of sentence records.

    Every non-empty line must contain exactly one token and one tag separated
    by ``split_by``. Blank lines mark sentence boundaries; consecutive blank
    lines do not produce empty sentences.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        split_by: Separator between the token and the tag on each line.

    Returns:
        One dict per sentence with the keys ``"id"`` (running index as a
        string), ``"words"`` and ``"ner_tags"`` (parallel lists of strings).

    Raises:
        ValueError: If a non-empty line does not split into exactly two fields.
    """
    dataset: List[Dict[str, object]] = []
    current_words: List[str] = []
    current_tags: List[str] = []

    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank line: close the sentence collected so far, if any.
                if current_words:
                    dataset.append(
                        {
                            "id": str(len(dataset)),
                            "words": current_words,
                            "ner_tags": current_tags,
                        }
                    )
                    current_words = []
                    current_tags = []
                continue

            columns = line.split(split_by)
            if len(columns) != 2:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected 2 fields separated "
                    f"by {split_by!r}, got {len(columns)}: {line!r}"
                )
            word, tag = columns
            current_words.append(word)
            current_tags.append(tag)

    # The last sentence is still pending when the file has no trailing blank line.
    if current_words:
        dataset.append(
            {
                "id": str(len(dataset)),
                "words": current_words,
                "ner_tags": current_tags,
            }
        )

    return dataset

In [21]:
def format_labels(labels: List[str]) -> List[str]:
    """Rewrite a tag sequence so every entity starts with ``B-`` and continues with ``I-``.

    Wikigold only has O and I tags, but we need to have B, I and O tags.

    A label containing ``-`` is split at the FIRST hyphen into a scheme prefix
    and an entity type, so hyphenated types such as ``WORK-OF-ART`` are kept
    whole. A label without a hyphen is treated as a bare entity type.

    Args:
        labels: Tags of one sentence, e.g. ``["I-PER", "I-PER", "O"]``.

    Returns:
        A new list of the same length. A token gets ``B-<type>`` when its type
        differs from the previous token's type or when the input explicitly
        marked it with a ``B`` prefix, ``I-<type>`` when it continues the
        previous type, and ``O`` otherwise.
    """
    new_labels: List[str] = []
    # Type of the previous token; "O" also stands for "start of sentence".
    previous_type = "O"

    for label in labels:
        entity_type = label
        starts_entity = False
        if "-" in label:
            prefix, entity_type = label.split("-", 1)
            # Keep boundaries the input already marked between adjacent
            # entities of the same type.
            starts_entity = prefix == "B"

        if entity_type == "O":
            new_labels.append("O")
        elif entity_type == previous_type and not starts_entity:
            new_labels.append(f"I-{entity_type}")
        else:
            new_labels.append(f"B-{entity_type}")
        previous_type = entity_type

    return new_labels


def pust_to_hf(
    repo_id: str,
    label_names: List[str],
    data_files: Dict[str, str] | str,
    test_split_percentage: float = None,
) -> DatasetDict:
    """Load JSON data files with a fixed NER schema and push them to the hub.

    Args:
        repo_id: Target repository id passed to ``push_to_hub``.
        label_names: Class names used for the ``ner_tags`` ``ClassLabel`` feature.
        data_files: A single path, or a mapping of split name to path, forwarded
            to ``load_dataset("json", ...)``.
        test_split_percentage: When truthy, the ``"train"`` split is divided with
            ``train_test_split`` (seed 42) using this value as ``test_size``.

    Returns:
        The dataset object that was pushed.
    """
    token_feature = Sequence(feature=Value(dtype="string"))
    tag_feature = Sequence(feature=ClassLabel(names=label_names))
    schema = Features(
        {
            "id": Value(dtype="string"),
            "words": token_feature,
            "ner_tags": tag_feature,
        }
    )

    dataset = load_dataset("json", data_files=data_files, features=schema)

    if test_split_percentage:
        train_split = dataset["train"]
        dataset = train_split.train_test_split(
            test_size=test_split_percentage, seed=42
        )

    dataset.push_to_hub(repo_id=repo_id)
    print(f"Dataset {repo_id} pushed to the hub")

    return dataset


def sort_labels(labels):
    """Order labels as ``O``, then each ``B-X`` immediately followed by its ``I-X``.

    ``B-`` labels are sorted alphabetically. Labels that fit none of these
    slots (an ``I-X`` without a ``B-X`` partner, or an unprefixed label) are
    appended at the end in sorted order instead of being dropped, so the result
    always contains every input label.

    Args:
        labels: Iterable-once-safe list of label strings.

    Returns:
        A new list with the labels in the order described above.
    """
    # 'O' first.
    sorted_labels = [label for label in labels if label == "O"]

    i_labels = [label for label in labels if label.startswith("I-")]
    for b_label in sorted(label for label in labels if label.startswith("B-")):
        sorted_labels.append(b_label)
        # Pair the B- label with the I- label(s) of the same entity type.
        sorted_labels.extend(
            i_label for i_label in i_labels if i_label[2:] == b_label[2:]
        )

    # Anything not placed yet would otherwise be lost.
    placed = set(sorted_labels)
    sorted_labels.extend(sorted(label for label in labels if label not in placed))

    return sorted_labels


def print_analysis(df: pd.DataFrame, format: Literal["count", "percentage"] = "count"):
    """Print the number of rows in ``df`` and a markdown table of its tag distribution.

    Args:
        df: Frame with a ``ner_tags`` column holding one list of tags per row.
        format: ``"count"`` prints absolute frequencies; any other value prints
            relative frequencies multiplied by 100.
    """
    print(f"Number of sentences in the dataset: {len(df)}")

    # One row per individual tag.
    tags = df["ner_tags"].explode()
    if format == "count":
        table = tags.value_counts().to_markdown()
        print(f"Label count in dataset:\n{table}")
    else:
        table = (tags.value_counts(normalize=True) * 100).to_markdown()
        print(f"Label percentage in dataset:\n{table}")


def vizualize_ner_dataset(dataset: List[Dict[str, List[str]]], num_samples: int = 5):
    """Print randomly chosen sentences with their tags aligned underneath.

    Args:
        dataset: Indexable collection of records with ``"words"`` and
            ``"ner_tags"`` entries (the structure built by ``load_conll_data``).
        num_samples: How many sentences to show. ``0`` prints nothing. Values
            larger than the dataset are reduced to the dataset size instead of
            making ``random.sample`` raise.
    """
    if not num_samples:
        return

    # random.sample refuses to draw more items than the population holds.
    sample_size = min(num_samples, len(dataset))
    random_ids = random.sample(range(len(dataset)), sample_size)

    print("=" * 50 + "START" + "=" * 50)
    for sentence_id in random_ids:
        sentence = dataset[sentence_id]
        print(f"Sentence ID: {sentence_id}")
        print(check_labels(sentence["words"], sentence["ner_tags"]))
        print("-" * 100)
    print("=" * 50 + "END" + "=" * 50)


def get_statistics(dataset: DatasetDict):
    """Print count and percentage tag tables for every split of ``dataset``."""
    for split_name in dataset:
        print(f"Split: {split_name}")
        split_df = dataset[split_name].to_pandas()
        # Absolute numbers first, then the same table as percentages.
        for report_format in ("count", "percentage"):
            print_analysis(split_df, format=report_format)


def proces_ner_dataset(
    file_path: Path,
    output_path: Path = None,
    split_by: str = "\t",
    vizualize: int = 5,
) -> pd.DataFrame:
    """Read a CoNLL file, preview a few sentences and optionally export it as JSONL.

    Args:
        file_path: Existing CoNLL file to parse.
        output_path: Optional ``.jsonl`` destination; its parent folders are
            created when needed.
        split_by: Column separator handed to ``load_conll_data``.
        vizualize: Number of random sentences printed by ``vizualize_ner_dataset``.

    Returns:
        A DataFrame built from the parsed sentence records.
    """
    assert file_path.exists(), f"File not found at {file_path}"
    if output_path:
        assert output_path.suffix == ".jsonl", "Output path should be a JSONL file"
        output_path.parent.mkdir(parents=True, exist_ok=True)

    sentences = load_conll_data(file_path, split_by=split_by)
    print(f"Number of sentences in the train set: {len(sentences)}")

    vizualize_ner_dataset(sentences, num_samples=vizualize)

    df = pd.DataFrame(sentences)
    if output_path is None:
        return df

    # One JSON object per line, one line per sentence.
    df.to_json(path_or_buf=output_path, orient="records", lines=True)
    return df

## Wikigold

In [60]:
file_path = repo_path / "data/wikigold/CONLL-format/data/wikigold.conll.txt"
# Take the dataset folder name relative to the repository root ("data/<name>/...")
# so it does not depend on how deep the user's home directory is.
dataset_name = file_path.relative_to(repo_path).parts[1]
output_path = output_dir / dataset_name / "wikigold.jsonl"

df = proces_ner_dataset(file_path, split_by=" ", vizualize=0)

Number of sentences in the train set: 1841


In [31]:
df["ner_tags"] = df["ner_tags"].apply(format_labels)

# Write the re-labelled sentences to disk: the push cell further down loads the
# dataset from `output_path`, and nothing else in this section creates that file.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_json(path_or_buf=output_path, orient="records", lines=True)

In [None]:
# Absolute tag frequencies of the Wikigold frame.
print_analysis(df, format="count")

In [None]:
# Distinct tags found in the frame, then arranged as O, B-X, I-X, ...
label_names = df["ner_tags"].explode().value_counts().index.tolist()
label_names = sort_labels(label_names)

# Hold out 20% of the sentences as a test split before uploading.
dataset = pust_to_hf(
    repo_id="Studeni/Wikigold-NER-conll",
    label_names=label_names,
    data_files=str(output_path),
    test_split_percentage=0.2,
)

In [None]:
# Per-split tag counts and percentages of the uploaded dataset.
get_statistics(dataset)

## GUM Dataset

In [69]:
# GUM ships separate CoNLL files for the train and test splits.
file_path_train = repo_path.joinpath(
    "data", "GUM", "CONLL-format", "data", "train", "gum-train.conll"
)
assert file_path_train.exists(), f"File not found at {file_path_train}"
file_path_test = repo_path.joinpath(
    "data", "GUM", "CONLL-format", "data", "test", "gum-test.conll"
)
assert file_path_test.exists(), f"File not found at {file_path_test}"

dataset_name = "GUM"

In [None]:
output_path_train = output_dir / dataset_name / "gum-train.jsonl"

# Export to the GUM train path defined above (not the Wikigold `output_path`),
# so the push cell finds gum-train.jsonl and the Wikigold file is left alone.
df_train = proces_ner_dataset(
    file_path_train, output_path_train, split_by="\t", vizualize=2
)

In [None]:
output_path_test = output_dir / dataset_name / "gum-test.jsonl"

# Pass the output path so gum-test.jsonl is actually written; the push cell
# loads the test split from this file.
df_test = proces_ner_dataset(
    file_path_test, output_path_test, split_by="\t", vizualize=2
)

In [73]:
# Collect tags from BOTH splits: a tag that only occurs in the test file must
# still be part of the ClassLabel names used to encode `ner_tags`.
label_names = list(
    pd.concat([df_train["ner_tags"], df_test["ner_tags"]], ignore_index=True)
    .explode()
    .value_counts()
    .keys()
)
label_names_sorted = sort_labels(label_names)

dataset = pust_to_hf(
    repo_id="Studeni/GUM-NER-conll",
    label_names=label_names_sorted,
    data_files={"train": str(output_path_train), "test": str(output_path_test)},
)

## Label generation

In [29]:
import json
import os
from pathlib import Path

from openai import OpenAI
from pydantic import BaseModel

from datasets import load_dataset

In [2]:
# The cache folder must already exist; the train split is converted to a
# pandas DataFrame right after loading.
cache_dir = Path("../cache")
assert cache_dir.exists(), f"Cache directory not found at {cache_dir}"
dataset = load_dataset(
    path="Studeni/Pile-NER-type-conll",
    split="train",
    cache_dir=cache_dir,
).to_pandas()

In [35]:
# Per row: the set of entity types, i.e. the text after the last "-" of every non-"O" label.
dataset["labels_raw"] = dataset["labels"].apply(
    lambda labels: {label.split("-")[-1] for label in labels if label != "O"}
)

In [36]:
# Number of distinct entity types across all rows (missing values are not counted).
dataset["labels_raw"].explode().nunique(dropna=True)

12654

In [44]:
# Prompt template that is filled in with str.format(). The placeholders are
# {words}, {ner_labels} and {target_label}; the doubled braces around the JSON
# example are format escapes that render as literal "{" and "}".
LABELS_DESCRIPTION_PROMPT = """
###TASK###
Your task is to generate three different variety descriptions for the target NER (Named Entity Recognition) label based on the provided word pairs and NER labels. 
Each description should be one sentence long and may include examples if needed to better explain the label. 
The examples must be generalist and not specific to any particular domain.

###INSTRUCTIONS###
Follow these steps:

1. Analyze the word pairs:
   - Look for patterns and commonalities among the words in each pair.
   - Consider how these words relate to the target NER label.

2. Generate descriptions:
   - Create three distinct, one-sentence descriptions for the target label.
   - Sentences needs to be direct and concise.
   - Each description should capture the essence of the label based on the patterns observed in the word pairs.
   - If necessary, include generalist examples to clarify the label's meaning.
   - Ensure that the descriptions are varied in their approach and wording.

3. Format the output:
   - Present your generated descriptions in JSON format.
   - Use the keys "description_1", "description_2", and "description_3" for each description.

###OUTPUT EXAMPLE###
Your final output should be structured as follows:

{{
   "description_1": "Your first generated description",
   "description_2": "Your second generated description",
   "description_3": "Your third generated description"
}}

###SUMMARY###
Remember to make your descriptions clear, concise, and informative, focusing on the general concept of the NER label rather than specific instances.

###INPUT###
List of words:
{words}

List of NER labels:
{ner_labels}

Target label:
{target_label}
"""

In [45]:
# The API key is read from the environment; a missing variable raises KeyError here.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [46]:
# Structured-output schema passed as `response_format` to get_descriptions.
# The three field names mirror the JSON keys requested in LABELS_DESCRIPTION_PROMPT.
class Response(BaseModel):
    description_1: str
    description_2: str
    description_3: str

In [47]:
def get_descriptions(
    client: OpenAI,
    prompt: str,
    response_format: BaseModel,
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
):
    """Send ``prompt`` as a single system message and return the parsed completion.

    Args:
        client: Client whose ``beta.chat.completions.parse`` method is called.
        prompt: Full prompt text; it is sent with the ``"system"`` role.
        response_format: Forwarded unchanged as ``response_format``.
        model: Model name forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        Whatever ``client.beta.chat.completions.parse`` returns.
    """
    messages = [{"role": "system", "content": prompt}]
    return client.beta.chat.completions.parse(
        model=model,
        messages=messages,
        temperature=temperature,
        response_format=response_format,
    )

In [48]:
# Summary statistics of how many distinct entity types each row contains.
dataset["labels_raw"].map(len).describe()

count    45889.000000
mean         7.701911
std          4.621669
min          0.000000
25%          4.000000
50%          7.000000
75%         10.000000
max         44.000000
Name: labels_raw, dtype: float64

In [49]:
# Fill the template with the first row of the frame and a fixed example label,
# then request the three descriptions.
prompt = LABELS_DESCRIPTION_PROMPT.format(
    words=dataset.iloc[0]["words"],
    ner_labels=dataset.iloc[0]["ner_tags"],
    target_label="PROGRAMMING_CONCEPT",
)

response = get_descriptions(client, prompt=prompt, response_format=Response)

In [31]:
# Decode the JSON text of the first choice's message into a dict for display.
json.loads(response.choices[0].message.content)

{'description_1': 'A programming concept refers to an abstract idea or principle that is fundamental to the practice of programming, such as variables, loops, and functions, which help in structuring and organizing code.',
 'description_2': 'Programming concepts encompass the foundational elements of coding, including data types, control structures, and algorithms, which are essential for developing software applications.',
 'description_3': 'Examples of programming concepts include object-oriented programming, which focuses on the use of objects and classes, and functional programming, which emphasizes the use of functions as the primary building blocks of software.'}

In [32]:
# Show the complete object returned by get_descriptions.
response

ParsedChatCompletion[Response](id='chatcmpl-ADdpP41ccK38FOYBBZ1kVe3C7kQzh', choices=[ParsedChoice[Response](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[Response](content='{\n   "description_1": "A programming concept refers to an abstract idea or principle that is fundamental to the practice of programming, such as variables, loops, and functions, which help in structuring and organizing code.",\n   "description_2": "Programming concepts encompass the foundational elements of coding, including data types, control structures, and algorithms, which are essential for developing software applications.",\n   "description_3": "Examples of programming concepts include object-oriented programming, which focuses on the use of objects and classes, and functional programming, which emphasizes the use of functions as the primary building blocks of software."\n}', refusal=None, role='assistant', function_call=None, tool_calls=[], parsed=Response(description_1='

In [50]:
# Decode the JSON text of the first choice's message into a dict for display.
json.loads(response.choices[0].message.content)

{'description_1': 'A programming concept refers to a fundamental idea or principle that underlies the structure and behavior of programming languages, such as variables, loops, and functions.',
 'description_2': 'Programming concepts are the building blocks of software development, encompassing techniques and methodologies like object-oriented programming and recursion that guide how code is written and organized.',
 'description_3': 'Examples of programming concepts include data types, control structures, and algorithms, which are essential for creating efficient and effective software solutions.'}

In [51]:
# Show the complete object returned by get_descriptions.
response

ParsedChatCompletion[Response](id='chatcmpl-ADe0DwY94T92uQihlkbUe9ZrMpJ12', choices=[ParsedChoice[Response](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[Response](content='{"description_1":"A programming concept refers to a fundamental idea or principle that underlies the structure and behavior of programming languages, such as variables, loops, and functions.","description_2":"Programming concepts are the building blocks of software development, encompassing techniques and methodologies like object-oriented programming and recursion that guide how code is written and organized.","description_3":"Examples of programming concepts include data types, control structures, and algorithms, which are essential for creating efficient and effective software solutions."}', refusal=None, role='assistant', function_call=None, tool_calls=[], parsed=Response(description_1='A programming concept refers to a fundamental idea or principle that underlies the structur