In [3]:
import os
import random
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

from datasets import (
    ClassLabel,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
)

In [4]:
output_dir = Path("./processed_data")
output_dir.mkdir(exist_ok=True)

In [5]:
def check_labels(words: List[str], ner_tags: List[int]) -> str:

    line1 = ""
    line2 = ""

    for word, tag in zip(words, ner_tags):
        full_label = tag
        max_length = max(len(word), len(full_label))
        line1 += word + " " * (max_length - len(word) + 1)
        line2 += full_label + " " * (max_length - len(full_label) + 1)

    return line1 + "\n" + line2

In [12]:
def load_conll_data(
    file_path: Path, split_by: str = "\t"
) -> Dict[int, Dict[str, List[str]]]:
    dataset = []
    sentence_id = 0
    current_words = []
    current_tags = []

    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line indicates a sentence boundary
                if current_words:  # If we have collected words and tags for a sentence
                    dataset.append(
                        {
                            "id": str(sentence_id),
                            "words": current_words,
                            "ner_tags": current_tags,
                        }
                    )
                    sentence_id += 1
                    current_words = []
                    current_tags = []
            else:
                word, tag = line.split(split_by)  # Split by tab
                current_words.append(word)
                current_tags.append(tag)

        # Append the last sentence if the file doesn't end with a newline
        if current_words:
            dataset.append(
                {
                    "id": str(sentence_id),
                    "words": current_words,
                    "ner_tags": current_tags,
                }
            )

    return dataset

In [7]:
repo_path = Path("~/Development/entity-recognition-datasets").expanduser()
assert repo_path.exists(), "Please clone the repository with the datasets"

## GUM Dataset

In [4]:
file_path_train = repo_path / "data/GUM/CONLL-format/data/train/gum-train.conll"
assert file_path_train.exists(), f"File not found at {file_path_train}"
file_path_test = repo_path / "data/GUM/CONLL-format/data/test/gum-test.conll"
assert file_path_test.exists(), f"File not found at {file_path_test}"

In [7]:
train = load_conll_data(file_path_train)
print(f"Number of sentences in the train set: {len(train)}")
test = load_conll_data(file_path_test)
print(f"Number of sentences in the test set: {len(test)}")

Number of sentences in the train set: 2495
Number of sentences in the test set: 1000


In [9]:
random_ids = random.sample(range(len(test)), 5)

In [10]:
for id in random_ids:
    print(f"Sentence ID: {id}")
    print(check_labels(test[id]["words"], test[id]["ner_tags"]))
    print("-" * 100)

Sentence ID: 437
" Pregnant   women      in         Australia  are        getting    about      half       as         much       as         what       they       require    on         a          daily      basis      .          
O B-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract 
----------------------------------------------------------------------------------------------------
Sentence ID: 189
The     accident occurred at a          time       when the     mosque  was relatively uncrowded . 
B-event I-event  O        O  B-abstract I-abstract O    B-place I-place O   O          O         O 
----------------------------------------------------------------------------------------------------
Sentence ID: 411
We             took quite a              few            new            girls          over there   back then in 2005   , le

### Analysis

In [8]:
output_path = output_dir / "gum"
output_path.mkdir(exist_ok=True)


df_train = pd.DataFrame(train)
train_path = output_path / "gum-train.jsonl"
df_train.to_json(
    path_or_buf=train_path,
    orient="records",
    lines=True,
)

df_test = pd.DataFrame(test)
test_path = output_path / "gum-test.jsonl"
df_test.to_json(
    path_or_buf=test_path,
    orient="records",
    lines=True,
)

In [None]:
print(f"Number of sentences in the train set: {len(df_train)}")
print(f"Label count in train set: {df_train['ner_tags'].explode().value_counts()}")
print(f"Number of sentences in the test set: {len(df_test)}")
print(f"Label count in test set: {df_test['ner_tags'].explode().value_counts(normalize=)}")

### Formating

In [12]:
label_names = list(df_train["ner_tags"].explode().value_counts().keys())

In [43]:
def sort_labels(labels):
    # Separate the 'O' label from the rest
    o_labels = [label for label in labels if label == "O"]

    # Separate B- labels and corresponding I- labels
    b_labels = sorted([label for label in labels if label.startswith("B-")])
    i_labels = [label for label in labels if label.startswith("I-")]

    # Sort I- labels based on their corresponding B- labels
    sorted_labels = o_labels  # 'O' first
    for b_label in b_labels:
        sorted_labels.append(b_label)
        # Add the corresponding I- label
        corresponding_i_labels = [
            i_label for i_label in i_labels if i_label[2:] == b_label[2:]
        ]
        sorted_labels.extend(corresponding_i_labels)

    return sorted_labels

In [15]:
label_names_sorted = sort_labels(label_names)

In [16]:
features = Features(
    {
        "id": Value(dtype="string"),
        "words": Sequence(feature=Value(dtype="string")),
        "ner_tags": Sequence(feature=ClassLabel(names=label_names_sorted)),
    }
)

In [None]:
gum_dataset_train = load_dataset(
    "json",
    data_files={"train": str(train_path), "test": str(test_path)},
    features=features,
)

In [None]:
gum_dataset_train.push_to_hub(repo_id="Studeni/GUM-NER")

In [30]:
def format_labels(labels: List[str]) -> List[str]:
    """This method fixes the labels to be in the correct IOB format. Wikigold only has O and I tags, but we need to have B, I and O tags."""
    if any(["-" in label for label in labels if label != "O"]):
        labels = [label.split("-")[-1] for label in labels]

    new_labels = []
    current_label = None
    for label in labels:
        if current_label is None:
            current_label = label
            if label == "O":
                new_labels.append(label)
            else:
                new_labels.append(f"B-{label}")
        else:
            if label == "O":
                new_labels.append(label)
                current_label = label
            else:
                if label == current_label:
                    new_labels.append(f"I-{current_label}")
                else:
                    new_labels.append(f"B-{label}")
                    current_label = label

    return new_labels


def pust_to_hf(
    repo_id: str,
    label_names: List[str],
    data_files: Dict[str, str] | str,
    test_split_percentage: float = 0.2,
) -> DatasetDict:
    features = Features(
        {
            "id": Value(dtype="string"),
            "words": Sequence(feature=Value(dtype="string")),
            "ner_tags": Sequence(feature=ClassLabel(names=label_names)),
        }
    )
    dataset = load_dataset(
        "json",
        data_files=data_files,
        features=features,
    )

    if isinstance(data_files, str):
        dataset = dataset["train"].train_test_split(
            test_size=test_split_percentage, seed=42
        )

    dataset.push_to_hub(repo_id=repo_id)
    print(f"Dataset {repo_id} pushed to the hub")

    return dataset


def sort_labels(labels):
    # Separate the 'O' label from the rest
    o_labels = [label for label in labels if label == "O"]

    # Separate B- labels and corresponding I- labels
    b_labels = sorted([label for label in labels if label.startswith("B-")])
    i_labels = [label for label in labels if label.startswith("I-")]

    # Sort I- labels based on their corresponding B- labels
    sorted_labels = o_labels  # 'O' first
    for b_label in b_labels:
        sorted_labels.append(b_label)
        # Add the corresponding I- label
        corresponding_i_labels = [
            i_label for i_label in i_labels if i_label[2:] == b_label[2:]
        ]
        sorted_labels.extend(corresponding_i_labels)

    return sorted_labels


def print_analysis(df: pd.DataFrame):
    print(f"Number of sentences in the dataset: {len(df)}")
    print(f"Label count in dataset: {df['ner_tags'].explode().value_counts()}")


def vizualize_ner_dataset(dataset: DatasetDict, num_samples: int = 5):
    random_ids = random.sample(range(len(dataset)), num_samples)

    print("=" * 50 + "START" + "=" * 50)
    for id in random_ids:
        print(f"Sentence ID: {id}")
        print(check_labels(dataset[id]["words"], dataset[id]["ner_tags"]))
        print("-" * 100)
    print("=" * 50 + "END" + "=" * 50)


def proces_ner_dataset(
    file_path: Path, output_path: Path = None, split_by: str = "\t"
) -> pd.DataFrame:
    assert file_path.exists(), f"File not found at {file_path}"
    if output_path is None:
        assert output_path.suffix == ".jsonl", "Output path should be a JSONL file"

    output_path.parent.mkdir(exist_ok=True, parents=True)

    raw_data = load_conll_data(file_path, split_by=split_by)
    print(f"Number of sentences in the train set: {len(raw_data)}")
    vizualize_ner_dataset(raw_data)

    df = pd.DataFrame(raw_data)

    if output_path is not None:
        df.to_json(
            path_or_buf=output_path,
            orient="records",
            lines=True,
        )

    return df

## Wikigold

In [14]:
file_path = repo_path / "data/wikigold/CONLL-format/data/wikigold.conll.txt"
dataset_name = file_path.parts[6]
output_path = output_dir / dataset_name / "wikigold.jsonl"

df = proces_ner_dataset(file_path, output_path, split_by=" ")

Number of sentences in the train set: 1841
Sentence ID: 295
They were compelled to wear a distinctive dress , to which , in some places , was attached the foot of a goose or duck ( whence they were sometimes called Canards ) . 
O    O    O         O  O    O O           O     O O  O     O O  O    O      O O   O        O   O    O  O O     O  O    O O      O    O    O         O      I-MISC  O O 
----------------------------------------------------------------------------------------------------
Sentence ID: 987
In 1928 , Henrique and José  Pedr  left the company , and Caloi was directed by Guido Caloi alone . 
O  O    O I-PER    O   I-PER I-PER O    O   O       O O   I-ORG O   O        O  I-PER I-PER O     O 
----------------------------------------------------------------------------------------------------
Sentence ID: 1225
The Clarence Dock  is a dock , on the River Mersey and part of the Port  of    Liverpool it is situated in the northern end of the dock system , connected to Salisbu

In [31]:
df["ner_tags"] = df["ner_tags"].apply(format_labels)

In [32]:
df.head()

Unnamed: 0,id,words,ner_tags
0,0,"[010, is, the, tenth, album, from, Japanese, P...","[B-MISC, O, O, O, O, O, B-MISC, O, O, O, B-ORG..."
1,1,"[This, album, proved, to, be, more, commercial...","[O, O, O, O, O, O, O, O, O, O, O, B-MISC, O, O..."
2,2,"[Founding, member, Kojima, Minoru, played, gui...","[O, O, B-PER, I-PER, O, O, O, B-MISC, I-MISC, ..."
3,3,"[XXX, can, of, This, had, a, different, meanin...","[B-MISC, I-MISC, I-MISC, I-MISC, O, O, O, O, O..."
4,4,"[it, was, later, explained, that, the, song, w...","[O, O, O, O, O, O, O, O, O, B-MISC, O, O, O, O..."


In [33]:
print_analysis(df)

Number of sentences in the dataset: 1841
Label count in dataset: ner_tags
O         32721
I-ORG      1060
B-LOC      1014
B-PER       934
B-ORG       898
B-MISC      712
I-PER       700
I-MISC      680
I-LOC       433
Name: count, dtype: int64


In [36]:
label_names = list(df["ner_tags"].explode().value_counts().keys())
label_names = sort_labels(label_names)

dataset = pust_to_hf(
    repo_id="Studeni/Wikigold-NER",
    label_names=label_names,
    data_files=str(output_path),
    test_split_percentage=0.2,
)

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset Studeni/Wikigold-NER pushed to the hub


In [42]:
df_train = dataset["train"].to_pandas()
df_train["ner_tags"] = df_train["ner_tags"].apply(
    lambda x: [label_names[id] for id in x]
)
print_analysis(df_train)
df_train["ner_tags"].explode().value_counts(normalize=True) * 100

Number of sentences in the dataset: 1472
Label count in dataset: ner_tags
O         26086
I-ORG      1470
I-PER      1303
I-LOC      1175
I-MISC     1151
Name: count, dtype: int64


ner_tags
O         83.649190
I-ORG      4.713805
I-PER      4.178291
I-LOC      3.767837
I-MISC     3.690877
Name: proportion, dtype: float64

In [43]:
df_test = dataset["test"].to_pandas()
df_test["ner_tags"] = df_test["ner_tags"].apply(lambda x: [label_names[id] for id in x])
print_analysis(df_test)
df_test["ner_tags"].explode().value_counts(normalize=True) * 100

Number of sentences in the dataset: 369
Label count in dataset: ner_tags
O         6635
I-ORG      488
I-PER      331
I-LOC      272
I-MISC     241
Name: count, dtype: int64


ner_tags
O         83.281034
I-ORG      6.125267
I-PER      4.154638
I-LOC      3.414083
I-MISC     3.024978
Name: proportion, dtype: float64