# Necessary imports

In [1]:
from datasets import load_dataset, load_from_disk
import numpy as np
from tqdm import tqdm

In [2]:
# Load an existing NER dataset that contains mountain annotations from Hugging Face
# (Few-NERD, "supervised" configuration).
dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")

Downloading builder script:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/3359329 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

  0%|          | 0/482037 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

  0%|          | 0/958765 [00:00<?, ?it/s]

In [37]:
# Handles on the three raw splits. Later cells rebind these same names to the
# remapped (0/1) versions, so re-run this cell to get the raw tags back.
train_dataset = dataset["train"]
test_dataset = dataset["test"]
val_dataset = dataset["validation"]

Tokens that belong to a mountain name carry the tag id 24 in the `fine_ner_tags` column.

In [21]:
def print_stats(dataset_x, mountain_tag=1):
    """
    Print mountain statistics for a dataset.

    All counts are per token: "#mountains" is the number of tokens tagged with
    mountain_tag, "#distinct mountains" is the number of different token strings
    among them, and "#samples with mountains" is the number of samples holding
    at least one such token.

    :param dataset_x: iterable of samples, each with 'tokens' and 'fine_ner_tags'
    :param mountain_tag: the tag value that marks a mountain token
    """
    tagged_total = 0
    samples_hit = 0
    seen_tokens = set()
    for sample in tqdm(dataset_x):
        positions = [idx for idx, label in enumerate(sample['fine_ner_tags'])
                     if label == mountain_tag]
        if not positions:
            # no mountain token in this sample
            continue
        samples_hit += 1
        tagged_total += len(positions)
        words = sample['tokens']
        seen_tokens.update(words[idx] for idx in positions)
    print(f'\n#mountains = {tagged_total}')
    print(f'#distinct mountains = {len(seen_tokens)}')
    print(f'#samples with mountains = {samples_hit}')

In [22]:
# The raw tags are still in place here, so pass the mountain id 24 explicitly.
print_stats(train_dataset, 24)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 131767/131767 [00:35<00:00, 3749.41it/s]


#mountains = 4500
#distinct mountains = 1871
#samples with mountains = 1502





In [38]:
# Mountain statistics of the raw test split (mountain tag id 24).
print_stats(test_dataset, 24)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 37648/37648 [00:10<00:00, 3639.80it/s]


#mountains = 1366
#distinct mountains = 776
#samples with mountains = 448





In [43]:
# Mountain statistics of the raw validation split (mountain tag id 24).
print_stats(val_dataset, 24)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 18824/18824 [00:05<00:00, 3507.44it/s]


#mountains = 734
#distinct mountains = 474
#samples with mountains = 218





# Process the dataset

In [9]:
def mapping_function(example, *, mountain_tag=24):
    """
    A helper function that turns the fine-grained tags into binary ones:
    mountain_tag (24 by default) -> 1
    other -> 0

    The sample is updated in place and also returned, which is the shape
    `Dataset.map` expects from a non-batched mapping function.

    :param example: a dataset sample with a 'fine_ner_tags' sequence
    :param mountain_tag: the original tag id that marks mountains. Keyword-only,
        so extra positional arguments supplied by `map` (e.g. indices) can never
        be mistaken for it; pass it through `fn_kwargs` when mapping.
    :return: the example with tags modified
    """
    example["fine_ner_tags"] = [
        1 if tag == mountain_tag else 0 for tag in example["fine_ner_tags"]
    ]
    return example

def modify_dataset(dataset):
    """
    Change the tags for every sample in the dataset (mountain -> 1, other -> 0).

    Only the tag values are rewritten; the feature definition of the
    'fine_ner_tags' column is left as it is.

    :param dataset: the dataset; must provide a `map` method
    :return: a modified dataset
    """
    # apply the mapping function to each sample in the dataset, one sample at a time
    return dataset.map(mapping_function, batched=False)

In [33]:
# Rebinds train_dataset to its 0/1-tagged version. Not idempotent: a second run finds
# no tag 24 left and turns every tag into 0, so reload the raw split before re-running.
train_dataset = modify_dataset(train_dataset)

In [39]:
# Rebinds test_dataset to its 0/1-tagged version; running this twice would zero all tags.
test_dataset = modify_dataset(test_dataset)

In [44]:
# Rebinds val_dataset to its 0/1-tagged version; running this twice would zero all tags.
val_dataset = modify_dataset(val_dataset)

In [13]:
def reduce_dataset(dataset, p, seed=None):
    """
    Reduce the dataset by keeping every sample that contains a mountain (tag 1)
    and only about a fraction p of the samples that do not.

    Each mountain-free sample is kept independently with probability p, so the
    number of kept samples varies from run to run unless the draws are seeded.

    :param dataset: the dataset (already remapped to 0/1 tags); must provide `filter`
    :param p: the fraction, 0<=p<=1
    :param seed: optional seed for a private random generator, making the selection
        repeatable. When None (the default) the draws come from NumPy's global
        random state, so `np.random.seed(...)` keeps its effect.
    :return: the reduced dataset
    """
    # Pick the source of uniform draws once; without a seed, stay on the global RNG.
    draw = np.random.uniform if seed is None else np.random.default_rng(seed).uniform

    def filter_samples(example):
        """
        a helper function to filter samples
        :param example: the sample
        :return: True, if keep, False, otherwise
        """
        if 1 in example['fine_ner_tags']:
            # samples with a mountain are always kept and consume no random draw
            return True
        return bool(draw() < p)

    return dataset.filter(filter_samples)

In [35]:
# p = (#samples with mountains) / (#samples without): 1502 of the 131767 training samples
# contain a mountain, so on average as many mountain-free samples are kept as mountain ones.
reduced_train = reduce_dataset(train_dataset, 1502 / (131767 - 1502))

In [40]:
# p = (#samples with mountains) / (#samples without): 448 of the 37648 test samples
# contain a mountain.
reduced_test = reduce_dataset(test_dataset, 448 / (37648 - 448))

Filter:   0%|          | 0/37648 [00:00<?, ? examples/s]

In [45]:
# p = (#samples with mountains) / (#samples without): 218 of the 18824 validation samples
# contain a mountain.
reduced_val = reduce_dataset(val_dataset, 218 / (18824 - 218))

Filter:   0%|          | 0/18824 [00:00<?, ? examples/s]

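Check dataset integrity with respect to the mountains: the reduction must not drop any sample that contains a mountain, so the counts below should match the ones of the full splits.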

In [47]:
# Tags are 0/1 by now, so the default mountain_tag=1 applies.
print_stats(reduced_train)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2995/2995 [00:01<00:00, 1817.08it/s]


#mountains = 4500
#distinct mountains = 1871
#samples with mountains = 1502





# Same check for the reduced test split (default mountain_tag=1).
print_stats(reduced_test)

In [49]:
# Same check for the reduced validation split (default mountain_tag=1).
print_stats(reduced_val)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 449/449 [00:00<00:00, 2608.78it/s]


#mountains = 734
#distinct mountains = 474
#samples with mountains = 218





# Save the datasets

In [52]:
# Persist the reduced training split; the path is relative to the working directory.
reduced_train.save_to_disk("./data/train_data")

Saving the dataset (0/1 shards):   0%|          | 0/2995 [00:00<?, ? examples/s]

In [53]:
# Persist the reduced test split; the path is relative to the working directory.
reduced_test.save_to_disk("./data/test_data")

Saving the dataset (0/1 shards):   0%|          | 0/886 [00:00<?, ? examples/s]

In [54]:
# Persist the reduced validation split; the path is relative to the working directory.
reduced_val.save_to_disk("./data/val_data")

Saving the dataset (0/1 shards):   0%|          | 0/449 [00:00<?, ? examples/s]

# Check the saved data

In [55]:
# Round-trip check: reload the saved training split and print its mountain statistics
# (tags are 0/1, so the default mountain_tag=1 is used).
train_loaded = load_from_disk('./data/train_data')
print_stats(train_loaded)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2995/2995 [00:00<00:00, 3014.00it/s]


#mountains = 4500
#distinct mountains = 1871
#samples with mountains = 1502





In [56]:
# Round-trip check: reload the saved test split and print its mountain statistics.
test_loaded = load_from_disk('./data/test_data')
print_stats(test_loaded)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 886/886 [00:00<00:00, 2732.35it/s]


#mountains = 1366
#distinct mountains = 776
#samples with mountains = 448





In [57]:
# Round-trip check: reload the saved validation split and print its mountain statistics.
val_loaded = load_from_disk('./data/val_data')
print_stats(val_loaded)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 449/449 [00:00<00:00, 2971.15it/s]


#mountains = 734
#distinct mountains = 474
#samples with mountains = 218



