In [1]:
import os
import random
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

from datasets import (
    ClassLabel,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
)

In [2]:
output_dir = Path("./processed_data")
output_dir.mkdir(exist_ok=True)

In [30]:
def check_labels(words: List[str], ner_tags: List[int]) -> str:

    line1 = ""
    line2 = ""

    for word, tag in zip(words, ner_tags):
        full_label = tag
        max_length = max(len(word), len(full_label))
        line1 += word + " " * (max_length - len(word) + 1)
        line2 += full_label + " " * (max_length - len(full_label) + 1)

    return line1 + "\n" + line2

In [6]:
def load_conll_data(file_path: Path) -> Dict[int, Dict[str, List[str]]]:
    dataset = []
    sentence_id = 0
    current_words = []
    current_tags = []

    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line indicates a sentence boundary
                if current_words:  # If we have collected words and tags for a sentence
                    dataset.append(
                        {
                            "id": str(sentence_id),
                            "words": current_words,
                            "ner_tags": current_tags,
                        }
                    )
                    sentence_id += 1
                    current_words = []
                    current_tags = []
            else:
                word, tag = line.split("\t")  # Split by tab
                current_words.append(word)
                current_tags.append(tag)

        # Append the last sentence if the file doesn't end with a newline
        if current_words:
            dataset.append(
                {
                    "id": str(sentence_id),
                    "words": current_words,
                    "ner_tags": current_tags,
                }
            )

    return dataset

In [3]:
repo_path = Path("~/Development/entity-recognition-datasets").expanduser()
assert repo_path.exists(), "Please clone the repository with the datasets"

## GUM Dataset

In [4]:
file_path_train = repo_path / "data/GUM/CONLL-format/data/train/gum-train.conll"
assert file_path_train.exists(), f"File not found at {file_path_train}"
file_path_test = repo_path / "data/GUM/CONLL-format/data/test/gum-test.conll"
assert file_path_test.exists(), f"File not found at {file_path_test}"

In [7]:
train = load_conll_data(file_path_train)
print(f"Number of sentences in the train set: {len(train)}")
test = load_conll_data(file_path_test)
print(f"Number of sentences in the test set: {len(test)}")

Number of sentences in the train set: 2495
Number of sentences in the test set: 1000


In [9]:
random_ids = random.sample(range(len(test)), 5)

In [10]:
for id in random_ids:
    print(f"Sentence ID: {id}")
    print(check_labels(test[id]["words"], test[id]["ner_tags"]))
    print("-" * 100)

Sentence ID: 437
" Pregnant   women      in         Australia  are        getting    about      half       as         much       as         what       they       require    on         a          daily      basis      .          
O B-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract I-abstract 
----------------------------------------------------------------------------------------------------
Sentence ID: 189
The     accident occurred at a          time       when the     mosque  was relatively uncrowded . 
B-event I-event  O        O  B-abstract I-abstract O    B-place I-place O   O          O         O 
----------------------------------------------------------------------------------------------------
Sentence ID: 411
We             took quite a              few            new            girls          over there   back then in 2005   , le

### Analysis

In [8]:
output_path = output_dir / "gum"
output_path.mkdir(exist_ok=True)


df_train = pd.DataFrame(train)
train_path = output_path / "gum-train.jsonl"
df_train.to_json(
    path_or_buf=train_path,
    orient="records",
    lines=True,
)

df_test = pd.DataFrame(test)
test_path = output_path / "gum-test.jsonl"
df_test.to_json(
    path_or_buf=test_path,
    orient="records",
    lines=True,
)

In [None]:
print(f"Number of sentences in the train set: {len(df_train)}")
print(f"Label count in train set: {df_train['ner_tags'].explode().value_counts()}")
print(f"Number of sentences in the test set: {len(df_test)}")
print(f"Label count in test set: {df_test['ner_tags'].explode().value_counts(normalize=)}")

### Formating

In [12]:
label_names = list(df_train["ner_tags"].explode().value_counts().keys())

In [43]:
def sort_labels(labels):
    # Separate the 'O' label from the rest
    o_labels = [label for label in labels if label == "O"]

    # Separate B- labels and corresponding I- labels
    b_labels = sorted([label for label in labels if label.startswith("B-")])
    i_labels = [label for label in labels if label.startswith("I-")]

    # Sort I- labels based on their corresponding B- labels
    sorted_labels = o_labels  # 'O' first
    for b_label in b_labels:
        sorted_labels.append(b_label)
        # Add the corresponding I- label
        corresponding_i_labels = [
            i_label for i_label in i_labels if i_label[2:] == b_label[2:]
        ]
        sorted_labels.extend(corresponding_i_labels)

    return sorted_labels

In [15]:
label_names_sorted = sort_labels(label_names)

In [16]:
features = Features(
    {
        "id": Value(dtype="string"),
        "words": Sequence(feature=Value(dtype="string")),
        "ner_tags": Sequence(feature=ClassLabel(names=label_names_sorted)),
    }
)

In [None]:
gum_dataset_train = load_dataset(
    "json",
    data_files={"train": str(train_path), "test": str(test_path)},
    features=features,
)

In [None]:
gum_dataset_train.push_to_hub(repo_id="Studeni/GUM-NER")

In [56]:
def pust_to_hf(
    repo_id: str,
    label_names: List[str],
    data_files: Dict[str, str] | str,
    test_split_percentage: float = 0.2,
) -> DatasetDict:
    features = Features(
        {
            "id": Value(dtype="string"),
            "words": Sequence(feature=Value(dtype="string")),
            "ner_tags": Sequence(feature=ClassLabel(names=label_names)),
        }
    )
    dataset = load_dataset(
        "json",
        data_files=data_files,
        features=features,
    )

    if isinstance(data_files, str):
        dataset = dataset["train"].train_test_split(
            test_size=test_split_percentage, seed=42
        )

    dataset.push_to_hub(repo_id=repo_id)
    print(f"Dataset {repo_id} pushed to the hub")

    return dataset


def sort_labels(labels):
    # Separate the 'O' label from the rest
    o_labels = [label for label in labels if label == "O"]

    # Separate B- labels and corresponding I- labels
    b_labels = sorted([label for label in labels if label.startswith("B-")])
    i_labels = [label for label in labels if label.startswith("I-")]

    # Sort I- labels based on their corresponding B- labels
    sorted_labels = o_labels  # 'O' first
    for b_label in b_labels:
        sorted_labels.append(b_label)
        # Add the corresponding I- label
        corresponding_i_labels = [
            i_label for i_label in i_labels if i_label[2:] == b_label[2:]
        ]
        sorted_labels.extend(corresponding_i_labels)

    return sorted_labels


def print_analysis(df: pd.DataFrame):
    print(f"Number of sentences in the dataset: {len(df)}")
    print(f"Label count in dataset: {df['ner_tags'].explode().value_counts()}")


def vizualize_ner_dataset(dataset: DatasetDict, num_samples: int = 5):
    random_ids = random.sample(range(len(dataset)), num_samples)

    print("=" * 50 + "START" + "=" * 50)
    for id in random_ids:
        print(f"Sentence ID: {id}")
        print(check_labels(dataset[id]["words"], dataset[id]["ner_tags"]))
        print("-" * 100)
    print("=" * 50 + "END" + "=" * 50)


def proces_ner_dataset(file_path: Path, output_path: Path = None) -> pd.DataFrame:
    assert file_path.exists(), f"File not found at {file_path_train}"
    if output_path is None:
        assert output_path.suffix == ".jsonl", "Output path should be a JSONL file"

    output_path.parent.mkdir(exist_ok=True, parents=True)

    raw_data = load_conll_data(file_path_train)
    print(f"Number of sentences in the train set: {len(raw_data)}")
    vizualize_ner_dataset(raw_data)

    df = pd.DataFrame(raw_data)

    if output_path is not None:
        df.to_json(
            path_or_buf=output_path,
            orient="records",
            lines=True,
        )

    return df

## Wikigold

In [39]:
file_path = repo_path / "data/wikigold/CONLL-format/data/wikigold.conll.txt"
dataset_name = file_path.parts[6]
output_path = output_dir / dataset_name / "wikigold.jsonl"

df = proces_ner_dataset(file_path, output_path)

Number of sentences in the train set: 2495
Sentence ID: 1565
15       players  
B-person I-person 
----------------------------------------------------------------------------------------------------
Sentence ID: 387
Belle    
B-person 
----------------------------------------------------------------------------------------------------
Sentence ID: 1865
Read up on different  types      of         basil      and pick out one        - or several    - that appeal to you      , then order the     seeds   or buy them    at a       garden  store   . 
O    O  O  B-abstract I-abstract I-abstract I-abstract O   O    O   B-abstract O O  B-abstract O O    O      O  B-person O O    O     B-plant I-plant O  O   B-plant O  B-place I-place I-place O 
----------------------------------------------------------------------------------------------------
Sentence ID: 662
When Tulsa   expanded beyond the     bounds  of      its     original plat    , the     expanded areas   were platted in alignment with 

In [41]:
print_analysis(df)

Number of sentences in the dataset: 2495
Label count in dataset: ner_tags
O                 20460
I-abstract         4687
I-event            2707
I-place            2212
B-abstract         2002
B-person           1920
I-person           1866
I-object           1732
B-place            1150
B-object           1017
B-event             738
I-time              663
I-organization      552
I-substance         458
B-time              401
B-organization      397
B-substance         278
I-quantity          203
I-plant             166
B-plant             144
B-animal            141
I-animal            120
B-quantity           97
Name: count, dtype: int64


In [58]:
label_names = list(df_train["ner_tags"].explode().value_counts().keys())
label_names = sort_labels(label_names)

dataset = pust_to_hf(
    repo_id="Studeni/Wikigold-NER",
    label_names=label_names,
    data_files=str(output_path),
    test_split_percentage=0.2,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Dataset Studeni/Wikigold-NER pushed to the hub


In [59]:
print_analysis(dataset["train"].to_pandas())

Number of sentences in the dataset: 1996
Label count in dataset: ner_tags
0     16334
2      3790
6      2158
14     1740
1      1621
11     1515
12     1502
8      1381
13      920
7       809
5       581
22      542
10      460
20      385
9       333
21      320
19      237
18      168
16      125
15      116
3       115
4       106
17       78
Name: count, dtype: int64


In [60]:
print_analysis(dataset["test"].to_pandas())

Number of sentences in the dataset: 499
Label count in dataset: ner_tags
0     4126
2      897
6      549
14     472
11     405
1      381
12     364
8      351
13     230
7      208
5      157
22     121
10      92
21      81
20      73
9       64
16      41
19      41
18      35
15      28
3       26
17      19
4       14
Name: count, dtype: int64
