In [174]:
!pip install datasets --upgrade

Collecting datasets
  Downloading datasets-1.6.2-py3-none-any.whl (221 kB)
[K     |████████████████████████████████| 221 kB 2.4 MB/s eta 0:00:01
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 1.5.0.dev0
    Uninstalling datasets-1.5.0.dev0:
      Successfully uninstalled datasets-1.5.0.dev0
Successfully installed datasets-1.6.2


In [3]:
from datasets import load_dataset,concatenate_datasets, load_from_disk

In [4]:
feature_column = ["tokens","ner_tags"]
split_list = ["train","validation","test"]

def remove_columns_from_dataset_dict(dataset_dict,feature_columns):
    assert sorted(split_list) == sorted(list(dataset_dict.keys())), "Dataset is not containing all splits for train,test,val"
    for split in split_list:
        remove_column_list = [col for col in list(dataset_dict[split].features) if col not in feature_column ]
        dataset_dict[split] = dataset_dict[split].remove_columns(remove_column_list)
    return dataset_dict


def merging_all_splits_from_dataset_dict(dataset1,dataset2):
    for split in split_list:
        assert dataset1[split].features.type == dataset2[split].features.type
        dataset1[split] = concatenate_datasets([dataset1[split],dataset2[split]])
    return dataset1    
            

# Preprocessing `wikiann`

In [5]:
wikiann= load_dataset("wikiann","en")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3948.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=12594.0, style=ProgressStyle(descriptio…

Reusing dataset wikiann (/Users/philipp/.cache/huggingface/datasets/wikiann/en/1.1.0/c0a0280cc1c835e2bb7db29f43ef89c8ea30e145b78c1ba2746d709fae3da112)





In [6]:
additional_selected_validation_wikiann = wikiann["validation"].train_test_split(test_size=0.5)
additional_selected_test_wikiann = wikiann["test"].train_test_split(test_size=0.5)

In [7]:
assert wikiann["train"].features.type == additional_selected_validation_wikiann["train"].features.type

In [8]:
wikiann["train"] = concatenate_datasets([additional_selected_test_wikiann["train"],wikiann["train"]])
wikiann["validation"] = additional_selected_validation_wikiann["test"]
wikiann["test"] = additional_selected_test_wikiann["test"]

In [9]:
wikiann_cleaned = remove_columns_from_dataset_dict(wikiann,feature_column)

In [10]:
wikiann_cleaned

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 25000
    })
})

In [96]:
wikiann_cleaned.save_to_disk("../data/wikiann")

# Preprocessing `conll2003`

In [11]:
conll = load_dataset("conll2003")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2603.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1781.0, style=ProgressStyle(description…

Reusing dataset conll2003 (/Users/philipp/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)





In [12]:
conll_cleaned = remove_columns_from_dataset_dict(conll,feature_column)

In [13]:
conll_cleaned

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3453
    })
})

In [97]:
conll_cleaned.save_to_disk("../data/conll")

# Merging the datasets

In [99]:
loaded_conll = load_from_disk("../data/conll")
wikiann_cleaned = load_from_disk("../data/conll")

In [14]:
merged_dataset = merging_all_splits_from_dataset_dict(conll_cleaned,wikiann_cleaned)

# Filter `ner_tags` to 3 or 4 classes

In [149]:
def change_label_to_zero(example):
    example["ner_tags"] = [0 if label==7 or label==8 else label for label in example["ner_tags"]]
    return example

In [150]:
conll_cleaned["train"] = conll_cleaned["train"].map(change_label_to_zero,batched=True)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [164]:
import datasets

In [165]:
new_feature = datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "O",
                                "B-PER",
                                "I-PER",
                                "B-ORG",
                                "I-ORG",
                                "B-LOC",
                                "I-LOC",
                            ]
                        )
                    ),

In [170]:
conll_cleaned["train"].features["ner_tags"] =new_feature

In [171]:
conll_cleaned["train"].features["ner_tags"]

(Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None), length=-1, id=None),)

# test

In [1]:
import os
import sys
parent = os.path.dirname(os.getcwd())

sys.path.insert(0,f'{parent}/src/training')

In [2]:
from preprocess_utils import merge_datasets
from datasets import load_dataset

In [3]:
wikiann= load_dataset("wikiann","en")
conll = load_dataset("conll2003")

ds = merge_datasets(conll,wikiann,class_num=3)

Reusing dataset wikiann (/Users/philipp/.cache/huggingface/datasets/wikiann/en/1.1.0/c0a0280cc1c835e2bb7db29f43ef89c8ea30e145b78c1ba2746d709fae3da112)
Reusing dataset conll2003 (/Users/philipp/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 28082
    })
    validation: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 6500
    })
    test: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 6906
    })
})

In [5]:
ds["train"].features["ner_tags"]

(Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None), length=-1, id=None),)

# load_ner

In [1]:
import os
import sys
parent = os.path.dirname(os.getcwd())

sys.path.insert(0,f'{parent}/src/training')

In [2]:
from preprocess_utils import load_ner_dataset

In [3]:
datasets, num_labels, label_to_id, label_list = load_ner_dataset('wikiann-conll2003',3)

Reusing dataset conll2003 (/Users/philipp/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
Reusing dataset wikiann (/Users/philipp/.cache/huggingface/datasets/wikiann/en/1.1.0/c0a0280cc1c835e2bb7db29f43ef89c8ea30e145b78c1ba2746d709fae3da112)


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


{'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0], 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']}


ValueError: 

In [5]:
for d in datasets["train"]:
    if 7 in d["ner_tags"]:
        print(d)
        raise

{'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0], 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']}


RuntimeError: No active exception to reraise