In [1]:
import os
import random
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

from datasets import (
    ClassLabel,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
)

In [3]:
# Root folder for the converted output, relative to the current working
# directory. `exist_ok=True` makes re-running this cell a no-op when the
# folder is already there.
output_dir = Path("./processed_data")
output_dir.mkdir(exist_ok=True)

In [4]:
def check_labels(words: List[str], ner_tags: List[str]) -> str:
    """Render tokens and their NER tags as two column-aligned text lines.

    Each token and its tag are padded to a shared width (the longer of the
    two, plus one separating space) so that every tag sits directly under
    the token it belongs to.

    Args:
        words: Tokens of one sentence.
        ner_tags: One tag per token. Tags are normally label strings such as
            "B-person"; any other value (e.g. an integer class id) is
            rendered through ``str()``.

    Returns:
        The token line and the tag line joined by a newline. Pairs are
        formed with ``zip``, so surplus items in the longer input are
        ignored.
    """
    word_cells = []
    tag_cells = []

    for word, tag in zip(words, ner_tags):
        # Coerce to text so integer-encoded tags do not break len()/padding.
        label = str(tag)
        # +1 keeps at least one space between neighbouring columns.
        width = max(len(word), len(label)) + 1
        word_cells.append(word.ljust(width))
        tag_cells.append(label.ljust(width))

    return "".join(word_cells) + "\n" + "".join(tag_cells)

In [5]:
def load_conll_data(file_path: Path) -> List[Dict[str, object]]:
    """Parse a two-column, tab-separated CoNLL file into sentence records.

    Every non-blank line must hold exactly ``token<TAB>tag`` (surrounding
    whitespace is stripped first); blank lines separate sentences.

    Args:
        file_path: Path of the CoNLL file to read. It is decoded as UTF-8.

    Returns:
        One dict per sentence, in file order, with the keys
        ``"id"`` (the running sentence index as a string),
        ``"words"`` (list of tokens) and ``"ner_tags"`` (list of tag strings).

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields. The message names the file and line.
    """
    dataset = []
    current_words = []
    current_tags = []

    def flush_sentence() -> None:
        # Close the sentence being collected, if any. The id is the number of
        # sentences stored so far, which equals a running counter.
        if current_words:
            dataset.append(
                {
                    "id": str(len(dataset)),
                    "words": list(current_words),
                    "ner_tags": list(current_tags),
                }
            )
            current_words.clear()
            current_tags.clear()

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:  # Blank line marks a sentence boundary
                flush_sentence()
                continue

            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'token<TAB>tag', "
                    f"got {line!r}"
                )
            word, tag = fields
            current_words.append(word)
            current_tags.append(tag)

    # The file may end without a trailing blank line; keep that last sentence.
    flush_sentence()

    return dataset

## GUM Dataset

In [6]:
# Location of the local clone of the entity-recognition-datasets repository.
# Set the ENTITY_RECOGNITION_DATASETS environment variable to use a clone
# stored elsewhere; when it is unset the original default below is used.
repo_path = Path(
    os.environ.get(
        "ENTITY_RECOGNITION_DATASETS",
        "~/Development/entity-recognition-datasets",
    )
).expanduser()
assert repo_path.exists(), "Please clone the repository with the datasets"

In [7]:
# Resolve the GUM train/test CoNLL files inside the cloned repository and
# fail early, naming the missing file, if either one is absent.
file_path_train = repo_path.joinpath("data/GUM/CONLL-format/data/train/gum-train.conll")
file_path_test = repo_path.joinpath("data/GUM/CONLL-format/data/test/gum-test.conll")
for split_file in (file_path_train, file_path_test):
    assert split_file.exists(), f"File not found at {split_file}"

In [8]:
# Parse both GUM splits into lists of sentence records and report how many
# sentences each one contains.
train = load_conll_data(file_path_train)
print(f"Number of sentences in the train set: {len(train)}")
test = load_conll_data(file_path_test)
print(f"Number of sentences in the test set: {len(test)}")

Number of sentences in the train set: 2495
Number of sentences in the test set: 1000


In [9]:
# Pick up to five test sentences to inspect by eye. A dedicated, seeded
# generator keeps the selection identical across "Restart & Run All" without
# touching the global `random` state, and the min() guard avoids a ValueError
# when the split holds fewer than five sentences.
random_ids = random.Random(42).sample(range(len(test)), min(5, len(test)))

In [10]:
# Print each sampled sentence with its tags aligned underneath. The loop
# variable is deliberately not called `id`: in a notebook it would stay in
# the kernel namespace and shadow the builtin for every later cell.
for sentence_idx in random_ids:
    sentence = test[sentence_idx]
    print(f"Sentence ID: {sentence_idx}")
    print(check_labels(sentence["words"], sentence["ner_tags"]))
    print("-" * 100)

Sentence ID: 437
" Pregnant   women      in         Australia  are        getting    about      half       as         much       as         what       they       require    on         a          daily      basis      .          
O B-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract 
----------------------------------------------------------------------------------------------------
Sentence ID: 189
The     accident occurred at a          time       when the     mosque  was relatively uncrowded . 
B-event I-event  O        O  B-abstract I-abstract O    B-place I-place O   O          O         O 
----------------------------------------------------------------------------------------------------
Sentence ID: 411
We             took quite a              few            new            girls          over there   back then in 2005   , le

### Formatting

In [11]:
# Write each GUM split as JSON Lines (one sentence record per line) under
# <output_dir>/gum. The frames and paths stay available for later cells.
output_path = output_dir / "gum"
output_path.mkdir(exist_ok=True)

df_train = pd.DataFrame(train)
train_path = output_path / "gum-train.jsonl"

df_test = pd.DataFrame(test)
test_path = output_path / "gum-test.jsonl"

for split_frame, split_destination in ((df_train, train_path), (df_test, test_path)):
    split_frame.to_json(path_or_buf=split_destination, orient="records", lines=True)

In [12]:
# Collect every distinct tag, most frequent first. Both splits are scanned:
# a tag that occurs only in the test split would otherwise be missing from
# the ClassLabel vocabulary built from this list.
label_names = list(
    pd.concat([df_train["ner_tags"], df_test["ner_tags"]], ignore_index=True)
    .explode()
    .value_counts()
    .keys()
)

In [14]:
def sort_labels(labels: List[str]) -> List[str]:
    """Order BIO tag names as ``O``, then ``B-x``/``I-x`` pairs by entity type.

    Entity types are sorted alphabetically and, for each type, the ``B-`` tag
    comes immediately before its ``I-`` tag. No input label is lost:

    * an ``I-`` tag whose ``B-`` counterpart is absent is still emitted at
      its type's alphabetical position;
    * tags that are neither ``O`` nor ``B-``/``I-`` prefixed are appended at
      the end in sorted order;
    * duplicate input labels are collapsed to a single entry.

    Args:
        labels: Tag names in any order.

    Returns:
        A new list with the ordering described above; ``labels`` itself is
        not modified.
    """
    # De-duplicate while keeping a plain list of strings to scan.
    unique_labels = list(dict.fromkeys(labels))

    begin_types = {label[2:] for label in unique_labels if label.startswith("B-")}
    inside_types = {label[2:] for label in unique_labels if label.startswith("I-")}

    # 'O' first
    sorted_labels = [label for label in unique_labels if label == "O"]

    # Walk the union of types so an I- tag without a B- tag is not dropped.
    for entity_type in sorted(begin_types | inside_types):
        if entity_type in begin_types:
            sorted_labels.append("B-" + entity_type)
        if entity_type in inside_types:
            sorted_labels.append("I-" + entity_type)

    # Anything outside the O / B- / I- scheme goes last rather than vanishing.
    sorted_labels.extend(
        sorted(
            label
            for label in unique_labels
            if label != "O" and not label.startswith(("B-", "I-"))
        )
    )

    return sorted_labels

In [15]:
# Put the tag names into a stable order ("O" first, then the B-/I- tags by
# entity type); this list is passed to ClassLabel as its `names` below.
label_names_sorted = sort_labels(label_names)

In [16]:
# Schema of the exported records: a string id, a sequence of string tokens,
# and a sequence of tags drawn from the ordered tag vocabulary.
token_sequence = Sequence(feature=Value(dtype="string"))
tag_sequence = Sequence(feature=ClassLabel(names=label_names_sorted))

features = Features(
    {
        "id": Value(dtype="string"),
        "words": token_sequence,
        "ner_tags": tag_sequence,
    }
)

In [None]:
# Read the JSONL exports back with the explicit schema. Despite its name,
# `gum_dataset_train` is loaded from BOTH the "train" and the "test" files.
split_files = {"train": str(train_path), "test": str(test_path)}
gum_dataset_train = load_dataset("json", data_files=split_files, features=features)

In [None]:
# Upload the loaded dataset to the Hugging Face Hub repository
# "Studeni/GUM-NER".
# NOTE(review): presumably needs prior Hub authentication with write access
# to that repository — confirm before running.
gum_dataset_train.push_to_hub(repo_id="Studeni/GUM-NER")