In [2]:
# pip install datasets
# pip install evaluate
# pip install transformers

# HuggingFace Datasets library - Quick overview

```
Models come and go (linear models, LSTM, Transformers, ...) but two core 
elements have consistently been the beating heart of Natural Language
Processing: Datasets & Metrics
```
* Solution: HuggingFace Library 🤗 provides efficient and extensive functionality to facilitate pre- and post-machine learning tasks.



### About HuggingFace Datasets

* 🤗 Datasets is a fast and efficient library to easily share and load datasets and evaluation metrics

* Makes it easy to load datasets from common file formats

* Provides access to 150+ datasets and dozens of evaluation metrics

* Supports caching

  * Supports (interoperable with) common machine learning frameworks, pandas and numpy

  * Uses Apache Arrow to memory-map large files (frees memory)



### Main datasets API

This notebook is a quick dive into the main user API for loading datasets with the `datasets` library


In [3]:
# install datasets
# !pip install datasets

# Make sure that we have pyarrow version > 6
import pyarrow
print(pyarrow.__version__)

9.0.0


## Listing the currently available datasets and metrics

In [7]:

from datasets import list_datasets
from pprint import pprint

# Currently available datasets and metrics
datasets = list_datasets()
print(f"There are {len(datasets)} available on theh HuggingFace hub")


There are 13571 available on theh HuggingFace hub


In [8]:
pprint(datasets[0:100], compact=True)

['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc',
 'afrikaans_ner_corpus', 'ag_news', 'ai2_arc', 'air_dialogue',
 'ajgt_twitter_ar', 'allegro_reviews', 'allocine', 'alt', 'amazon_polarity',
 'amazon_reviews_multi', 'amazon_us_reviews', 'ambig_qa', 'americas_nli', 'ami',
 'amttl', 'anli', 'app_reviews', 'aqua_rat', 'aquamuse', 'ar_cov19',
 'ar_res_reviews', 'ar_sarcasm', 'arabic_billion_words', 'arabic_pos_dialect',
 'arabic_speech_corpus', 'arcd', 'arsentd_lev', 'art', 'arxiv_dataset',
 'ascent_kb', 'aslg_pc12', 'asnq', 'asset', 'assin', 'assin2', 'atomic',
 'autshumato', 'babi_qa', 'banking77', 'bbaw_egyptian', 'bbc_hindi_nli',
 'bc2gm_corpus', 'beans', 'best2009', 'bianet', 'bible_para', 'big_patent',
 'billsum', 'bing_coronavirus_query_set', 'biomrc', 'biosses', 'blbooks',
 'blbooksgenre', 'blended_skill_talk', 'blimp', 'blog_authorship_corpus',
 'bn_hate_speech', 'bnl_newspapers', 'bookcorpus', 'bookcorpusopen', 'boolq',
 'bprec', 'break_data', 'brwac', 'bsd_ja

In [6]:
import evaluate
# from datasets.inspect import list_metrics
# metrics = list_metrics()

metrics = evaluate.list_evaluation_modules()

print(f"There are {len(metrics)} implemented in HuggingFace")

There are 110 implemented in HuggingFace


In [9]:
pprint(metrics[0:10], compact=True)

['lvwerra/test', 'precision', 'code_eval', 'roc_auc', 'cuad', 'xnli', 'rouge',
 'pearsonr', 'mse', 'super_glue']


In [10]:
datasets.index('squad')

573

In [11]:
# You can access various attributes of the datasets before downloading them
squad_dataset = list_datasets(with_details=True)[datasets.index('squad')]

pprint(squad_dataset.__dict__)  # It's a simple python dataclass

{'_id': '621ffdd236468d709f181f95',
 'author': None,
 'cardData': {'annotations_creators': ['crowdsourced'],
              'dataset_info': {'config_name': 'plain_text',
                               'dataset_size': 89789763,
                               'download_size': 35142551,
                               'features': [{'dtype': 'string', 'name': 'id'},
                                            {'dtype': 'string',
                                             'name': 'title'},
                                            {'dtype': 'string',
                                             'name': 'context'},
                                            {'dtype': 'string',
                                             'name': 'question'},
                                            {'name': 'answers',
                                             'sequence': [{'dtype': 'string',
                                                           'name': 'text'},
                                 

## An example with SQuAD

* Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset
* Consisting of questions posed by crowdworkers on a set of Wikipedia articles
  * The answer to every question is a segment of text, or span, from the corresponding reading passage
  * Question might be unanswerable.


See: https://huggingface.co/datasets/squad



In [13]:
# Downloading and loading a dataset
from datasets import load_dataset
ds = load_dataset('squad')
ds

Found cached dataset squad (/Users/mahdi/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 555.10it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [15]:
dataset = load_dataset('squad', split='validation[:10%]')
dataset

Found cached dataset squad (/Users/mahdi/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})

### Loading an External Dataset

* `datasets.load_dataset()` does:
 
1. Download and import in the library the **SQuAD python processing script** from HuggingFace


2. Run the SQuAD python processing script which will:
    - **Download the SQuAD dataset**

3. Return a **dataset built from the splits** asked by the user (default: all); in the above example we create a dataset with the first 10% of the validation split.

In [16]:
# Information about the dataset (description, citation, size, splits, format...)
# are provided in `dataset.info` (a simple python dataclass) and also as direct attributes in the dataset object

pprint(dataset.info.__dict__)

{'builder_name': 'squad',
 'citation': '@article{2016arXiv160605250R,\n'
             '       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and '
             '{Lopyrev},\n'
             '                 Konstantin and {Liang}, Percy},\n'
             '        title = "{SQuAD: 100,000+ Questions for Machine '
             'Comprehension of Text}",\n'
             '      journal = {arXiv e-prints},\n'
             '         year = 2016,\n'
             '          eid = {arXiv:1606.05250},\n'
             '        pages = {arXiv:1606.05250},\n'
             'archivePrefix = {arXiv},\n'
             '       eprint = {1606.05250},\n'
             '}\n',
 'config_name': 'plain_text',
 'dataset_size': 89819092,
 'description': 'Stanford Question Answering Dataset (SQuAD) is a reading '
                'comprehension dataset, consisting of questions posed by '
                'crowdworkers on a set of Wikipedia articles, where the answer '
                'to every question is a segment of

### Inspecting and Using the Dataset: Elements, Slices and Columns


* `Dataset` is memory mapped 

  * Backed by an Apache Arrow table

* Conceptually similar to other column-based implementations of a dataset

  * Interacting with a dataset is conceptually similar to, albeit less functionally rich than, working with Spark
  
 

In [None]:
print(dataset)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})


In [None]:
dataset.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

```python
dir(dataset)
```

```
...
 'add_faiss_index',
 ...
 'get_index',
 'get_nearest_examples',
 'get_nearest_examples_batch',
...
 'to_csv',
 'to_dict',
 'to_json',
 'to_pandas',
 'to_parquet',
 'to_sql',
 'to_tf_dataset',
 ```

### Accessing Data

* You can query its length and get items or slices like you would do normally with Python objects.

* A `__getitem__` method with polymorphic behavior

* The method will return different formats depending on the type of query:

* Items like `dataset[0]` are returned as a dictionary of elements.

* Slices like `dataset[10:20]` are returned as a dictionary of lists of elements.

* Columns like `dataset['question']` are returned as a list of elements.

* Makes it easier to process data, even though the returned types differ depending on how the Dataset object is queried

* Many additional misc properties to get information about the data


In [None]:
print(f"👉 Dataset len(dataset): {len(dataset)}")
print("\n👉 First item 'dataset[0]':")
pprint(dataset[0])

👉 Dataset len(dataset): 1057

👉 First item 'dataset[0]':
{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            'suspending the tradition of naming each Super Bowl game with '
            'Roman numerals (under whic

In [None]:
print("\n👉 It 2 in the dataset:")
pprint(dataset[2])


👉 It 2 in the dataset:
{'answers': {'answer_start': [403, 355, 355],
             'text': ['Santa Clara, California',
                      "Levi's Stadium",
                      "Levi's Stadium in the San Francisco Bay Area at Santa "
                      'Clara, California.']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            

In [None]:
# Or get slices with several examples:

print("\n👉Slice of the two items 'dataset[10:12]':")

pprint(dataset[10:12])



👉Slice of the two items 'dataset[10:12]':
{'answers': [{'answer_start': [334, 334, 334],
              'text': ['February 7, 2016', 'February 7', 'February 7, 2016']},
             {'answer_start': [177, 177, 177],
              'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']}],
 'context': ['Super Bowl 50 was an American football game to determine the '
             'champion of the National Football League (NFL) for the 2015 '
             'season. The American Football Conference (AFC) champion Denver '
             'Broncos defeated the National Football Conference (NFC) champion '
             'Carolina Panthers 24–10 to earn their third Super Bowl title. '
             "The game was played on February 7, 2016, at Levi's Stadium in "
             'the San Francisco Bay Area at Santa Clara, California. As this '
             'was the 50th Super Bowl, the league emphasized the "golden '
             'anniversary" with various gold-themed initiatives, as well as '
   

In [None]:
# You can get a full column of the dataset by indexing with its name as a string:
print(dataset['question'][:10])

['Which NFL team represented the AFC at Super Bowl 50?', 'Which NFL team represented the NFC at Super Bowl 50?', 'Where did Super Bowl 50 take place?', 'Which NFL team won Super Bowl 50?', 'What color was used to emphasize the 50th anniversary of the Super Bowl?', 'What was the theme of Super Bowl 50?', 'What day was the game played on?', 'What is the AFC short for?', 'What was the theme of Super Bowl 50?', 'What does AFC stand for?']


In [None]:
print(dataset[0]['question'] == dataset['question'][0])
print(dataset[10:20]['context'] == dataset['context'][10:20])

True
True


### Dataset are Internally Typed and Structured

* The dataset is backed by an Apache Arrow table
  * Tables are formatted and the types are clearly defined
*  You can load datasets of arbitrary size without worrying about RAM memory limitations
  * Datasets take no space in RAM and are read directly from disk when needed, with fast IO access

In [17]:
# You can inspect the dataset column names and types 
print("Column names:")
pprint(dataset.column_names)
print("Features:")
pprint(dataset.features)

Column names:
['id', 'title', 'context', 'question', 'answers']
Features:
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None)}


In [18]:
# Some properties of the data

print("The number of rows", dataset.num_rows, "also available as len(dataset)", len(dataset))
print("The number of columns", dataset.num_columns)
print("The shape (rows, columns)", dataset.shape)

The number of rows 1057 also available as len(dataset) 1057
The number of columns 5
The shape (rows, columns) (1057, 5)


### Modifying the Dataset with `dataset.map` and `dataset.filter`

* `.map()` and `.filter()` can be used to apply functions to the data

  * Can work with examples one at a time or in batch

* Take a callable that accepts a dict as argument (same dict as the one returned by dataset[i]) 

  * Convert the data to an Example type

  * Iterates over the dataset by calling the function on each example.

### Modifying the Dataset - Cont'd

* Using `dataset.map` and `dataset.filter` is similar to:
```python
for example in dataset:
    function(example)
```
* With `map`, function can modify any of the dataset features


* With `filter`, the function needs to return `True` or `False`
  * `True`: keep example
  * `False`: do not keep example


In [19]:
dataset.shape

(1057, 5)

In [20]:
type(dataset)

datasets.arrow_dataset.Dataset

In [21]:
type(dataset[:2])

dict

In [22]:
type(dataset[1])

dict

In [23]:
dataset.select(range(10))

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

In [24]:
dataset.select(range(10)).map(lambda some_instance: print(type(some_instance)))

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 11302.36ex/s]

<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>
<class 'datasets.arrow_dataset.Example'>





Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

In [25]:
dataset.select(range(10)).map(lambda some_instance: print(type(dict(some_instance))))

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 22215.59ex/s]

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>





Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

In [26]:
dataset.select(range(10)).map(lambda some_instance: print(some_instance['id']))

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 23967.45ex/s]

56be4db0acb8001400a502ec
56be4db0acb8001400a502ed
56be4db0acb8001400a502ee
56be4db0acb8001400a502ef
56be4db0acb8001400a502f0
56be8e613aeaaa14008c90d1
56be8e613aeaaa14008c90d2
56be8e613aeaaa14008c90d3
56bea9923aeaaa14008c91b9
56bea9923aeaaa14008c91ba





Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10
})

In [27]:
def uppercase_id(example):
  """Upper-case the ``id`` value of a single example.

  The example mapping is modified in place and that same object is
  returned, so it can be passed directly to ``Dataset.map``.
  """
  upper_id = example["id"].upper()
  example["id"] = upper_id
  return example

new_dataset = dataset.select(range(10)).map(uppercase_id)

100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9129.96ex/s]


In [28]:
new_dataset["id"]

['56BE4DB0ACB8001400A502EC',
 '56BE4DB0ACB8001400A502ED',
 '56BE4DB0ACB8001400A502EE',
 '56BE4DB0ACB8001400A502EF',
 '56BE4DB0ACB8001400A502F0',
 '56BE8E613AEAAA14008C90D1',
 '56BE8E613AEAAA14008C90D2',
 '56BE8E613AEAAA14008C90D3',
 '56BEA9923AEAAA14008C91B9',
 '56BEA9923AEAAA14008C91BA']

In [29]:
dataset.select(range(10)).filter(lambda x: x['id'][-2] == 'e' )

  0%|                                                                                         | 0/1 [00:00<?, ?ba/s]


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 4
})

### Modifying the dataset example by example

* `.map()` is typically used to update and modify the content, while leveraging smart caching and a fast backend.

  * Requires a function with the following signature: `function(example: dict) -> dict`.

* You can also remove columns when running map with the `remove_columns=List[str]` argument.

In [30]:
# Let's add the prefix 'SOME RANDOM PREFIX: ' to each of our titles

def add_prefix_to_title(example):
    """Prepend a fixed prefix to the ``title`` of a single example.

    The example mapping is updated in place and the same object is returned.
    """
    prefix = 'SOME RANDOM PREFIX: '
    prefixed_title = prefix + example['title']
    example['title'] = prefixed_title
    return example

prefixed_dataset = dataset.map(add_prefix_to_title)

print(prefixed_dataset.unique('title'))  # `.unique()` is a super fast way to print the unique elemnts in a column (see the doc for all the methods)

100%|████████████████████████████████████████████████████████████████████████| 1057/1057 [00:00<00:00, 24034.37ex/s]

['SOME RANDOM PREFIX: Super_Bowl_50', 'SOME RANDOM PREFIX: Warsaw']





In [31]:
# Since the input example dict is updated with our function output dict,
# we can actually just return the updated 'title' field
titled_dataset = dataset.map(lambda example: {'title': 'My cutest title: ' + example['title']})

print(titled_dataset[0])

100%|████████████████████████████████████████████████████████████████████████| 1057/1057 [00:00<00:00, 27443.28ex/s]

{'id': '56be4db0acb8001400a502ec', 'title': 'My cutest title: Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'Which NFL team represented the AFC at Super Bowl 50?', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denve




In [None]:
# This will remove the 'title' column while doing the update (after having sent it to the mapped function, so you can still use it in your function!)
less_columns_dataset = dataset.map(lambda example: {'new_title': 'Wouhahh: ' + example['title']}, remove_columns=['title'])

print(less_columns_dataset.column_names)
print(less_columns_dataset.unique('new_title'))

  0%|          | 0/1057 [00:00<?, ?ex/s]

['id', 'context', 'question', 'answers', 'new_title']
['Wouhahh: Super_Bowl_50', 'Wouhahh: Warsaw']


#### Using Examples Indices 

* With `with_indices=True`, dataset indices (from `0` to `len(dataset)-1`) will be supplied to the function 
  * Function must have the following signature: `function(example: dict, index: int) -> dict`

In [None]:
# This will prefix each question with its index in the dataset, stored in a new 'question_with_number' column
with_indices_dataset = dataset.map(lambda example, idx: {'question_with_number': f'{idx}: ' + example['question']},
                                   with_indices=True)
with_indices_dataset.shuffle().select([1,2,3])["question_with_number"]



['585: What brand sponsored the "Crash the Super Bowl" contest?',
 '610: Along with 10 Cloverfield Lane, what Paramount trailer appeared during the Super Bowl?',
 '1036: What was the total nominal GDP of Warsaw in 2010?']

In [32]:
# map() can also add several new columns at once: here 'q' and 't' are copies of 'question' and 'title'
dataset.map(lambda example: {"q": example["question"], "t": example["title"]})


100%|████████████████████████████████████████████████████████████████████████| 1057/1057 [00:00<00:00, 26008.94ex/s]


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'q', 't'],
    num_rows: 1057
})

In [33]:
dataset.map(lambda example: {"q": example["question"], "t": example["title"]}).remove_columns(['id', 'title', 'context', 'question', 'answers'])

Loading cached processed dataset at /Users/mahdi/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-3bf9b0cbed3a7681.arrow


Dataset({
    features: ['q', 't'],
    num_rows: 1057
})

### Modifying the dataset with batched updates

* `.map()` can also work with batches of examples (slices of the dataset).

  * Useful for functions that can handle batches, e.g., HuggingFace `tokenizers`.

* Set using `batched=True`

* Function should have the following signature:  
`function(examples: Dict[List]) -> Dict[List]` or, if you use indices, `function(examples: Dict[List], indices: List[int]) -> Dict[List]`.

  * Function should accept input with the format of a slice of the dataset, e.g., `function(dataset[:10]).`



In [35]:
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})

In [38]:
# https://github.com/huggingface/datasets/issues/5117
# https://discuss.huggingface.co/t/while-tokenizing-the-dataset-im-getting-this-type-of-error/24779
import random
i = 1
def process_one_batch(batch):
  """Replace every ``id`` in a batch with a random integer in [0, 100].

  Prints a running batch number (kept in the module-level counter ``i``)
  together with the types of the batch and of its ``id`` column, then
  returns a new dict holding the fresh ids plus the ``title``, ``context``,
  ``question`` and ``answers`` columns passed through unchanged.
  """
  global i
  print(f"processing one batch {i}")
  i = i + 1
  print(type(batch))
  ids = batch['id']
  print(type(ids))

  # One random draw per row, so the new column has the batch's length
  processed = {"id": [random.randint(0, 100) for _ in ids]}
  for column in ('title', 'context', 'question', 'answers'):
    processed[column] = batch[column]
  return processed


temp_db = dataset.select([0, 1,2,3,4,5]).map(process_one_batch, batched=True, batch_size=2)
temp_db["id"]

 67%|█████████████████████████████████████████████████████▎                          | 2/3 [00:00<00:00, 540.96ba/s]

processing one batch 1
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>
processing one batch 2
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>
processing one batch 3
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>





[89, 20, 59, 68, 93, 48]

In [39]:
# use map to change existing column
i = 1
def process_one_batch(batch):
  """Return only a replacement ``id`` column of random integers in [0, 100].

  Prints a running batch number (kept in the module-level counter ``i``)
  together with the types of the batch and of its ``id`` column. Only the
  ``id`` key is returned, with one value per row of the batch.
  """
  global i
  print(f"processing one batch {i}")
  i = i + 1
  print(type(batch))
  ids = batch['id']
  print(type(ids))

  return {"id": [random.randint(0, 100) for _ in ids]}


temp_db = dataset.select([0, 1,2,3,4,5]).map(process_one_batch, batched=True, batch_size=2)
temp_db["id"]

 67%|█████████████████████████████████████████████████████▎                          | 2/3 [00:00<00:00, 597.05ba/s]

processing one batch 1
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>
processing one batch 2
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>
processing one batch 3
<class 'datasets.arrow_dataset.Batch'>
<class 'list'>





[16, 62, 32, 11, 34, 90]

In [40]:
temp_db['id']

[16, 62, 32, 11, 34, 90]

In [41]:
# use map to augment the data
def process_one_batch(batch):
  """Build a ``SOME_FIELD`` column of random integers in [0, 100] for a batch.

  Only the new column is returned, with one value per row of the batch
  (sized from its ``id`` column).

  The function intentionally keeps no global state. The previous version
  incremented a module-level counter ``i`` that nothing read, which made
  the cell fail with ``NameError`` unless an earlier cell had defined ``i``.
  Such a counter is also not a meaningful batch count when ``map`` is run
  with ``num_proc > 1``.
  """
  return {
      "SOME_FIELD": [random.randint(0, 100) for _ in range(len(batch['id']))],
  }

temp_db = dataset.map(process_one_batch, batched=True, batch_size=100, num_proc=4)
temp_db

#0:  67%|██████████████████████████████████████████████████▋                         | 2/3 [00:00<00:00, 297.13ba/s]

#1:   0%|                                                                                     | 0/3 [00:00<?, ?ba/s][A

#1:  67%|██████████████████████████████████████████████████▋                         | 2/3 [00:00<00:00, 249.19ba/s][A[A



#2:  67%|██████████████████████████████████████████████████▋                         | 2/3 [00:00<00:00, 207.77ba/s][A[A[A
#3:  67%|██████████████████████████████████████████████████▋                         | 2/3 [00:00<00:00, 259.70ba/s]


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'SOME_FIELD'],
    num_rows: 1057
})

In [42]:
temp_db.select([1,2,3,4])['SOME_FIELD']

[66, 35, 71, 21]

In [43]:
# Let's import a fast tokenizer that can work on batched inputs
# (the 'Fast' tokenizers in HuggingFace)
from transformers import BertTokenizerFast, logging as transformers_logging

transformers_logging.set_verbosity_warning()

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading: 100%|████████████████████████████████████████████████████████████████| 436k/436k [00:00<00:00, 623kB/s]


In [44]:
# Now let's batch tokenize our dataset 'context'

encoded_dataset = dataset.map(lambda example: tokenizer(example['context']), batched=True)
print("encoded_dataset[0]")
pprint(encoded_dataset[0], compact=True)

 50%|████████████████████████████████████████▌                                        | 1/2 [00:00<00:00,  3.97ba/s]

encoded_dataset[0]
{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1],
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Co




In [45]:
# map() added new columns to `encoded_dataset`; the original `dataset` still has only its five columns
pprint(dataset.column_names)

['id', 'title', 'context', 'question', 'answers']


In [46]:
pprint(encoded_dataset.column_names, compact=True)

['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids',
 'attention_mask']


In [47]:
list(enumerate(dataset.select([0])["answers"]))

[(0,
  {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
   'answer_start': [177, 177, 177]})]

In [48]:
# Let's show a more complex processing with the full preparation of the SQuAD dataset
# for training a model from Transformers
def convert_to_features(batch):
    """Tokenize a batch of SQuAD examples and attach answer-span token labels.

    Args:
        batch: a slice of the dataset as a dict of lists. The ``context``,
            ``question`` and ``answers`` columns are read; each entry of
            ``answers`` is a dict with parallel ``answer_start`` and ``text``
            lists.

    Returns:
        The tokenizer's encodings for the (context, question) pairs, updated
        with two extra keys, ``start_positions`` and ``end_positions``: the
        token indices of the first and last character of each example's
        first listed answer.

    Note:
        Only the first answer of each example is used. The values returned
        by ``char_to_token`` are stored as-is.
        NOTE(review): ``char_to_token`` is documented to return ``None`` when
        no token covers the character (e.g. the answer was truncated away),
        so the label lists may contain ``None`` -- confirm before training.
    """
    # Tokenize each context together with its question as a sentence pair
    encodings = tokenizer(batch['context'], batch['question'], truncation=True)

    # Map the answer's character span in the context to a token span
    start_positions, end_positions = [], []
    for row, answer in enumerate(batch['answers']):
        first_char = answer['answer_start'][0]
        # Inclusive index of the answer's last character
        last_char = first_char + len(answer['text'][0]) - 1
        start_positions.append(encodings.char_to_token(row, first_char))
        end_positions.append(encodings.char_to_token(row, last_char))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

encoded_dataset = dataset.map(convert_to_features, batched=True)

 50%|████████████████████████████████████████▌                                        | 1/2 [00:00<00:00,  2.87ba/s]


In [49]:
encodings = tokenizer(dataset.select([0])['context'], dataset.select([0])['question'], truncation=True)
pprint(encodings['input_ids'], compact=True)

[[101, 3198, 5308, 1851, 1108, 1126, 1237, 1709, 1342, 1106, 4959, 1103, 3628,
  1104, 1103, 1305, 2289, 1453, 113, 4279, 114, 1111, 1103, 1410, 1265, 119,
  1109, 1237, 2289, 3047, 113, 10402, 114, 3628, 7068, 14722, 2378, 1103, 1305,
  2289, 3047, 113, 24743, 114, 3628, 2938, 13598, 1572, 782, 1275, 1106, 7379,
  1147, 1503, 3198, 5308, 1641, 119, 1109, 1342, 1108, 1307, 1113, 1428, 128,
  117, 1446, 117, 1120, 12388, 112, 188, 3339, 1107, 1103, 1727, 2948, 2410,
  3894, 1120, 3364, 10200, 117, 1756, 119, 1249, 1142, 1108, 1103, 13163, 3198,
  5308, 117, 1103, 2074, 13463, 1103, 107, 5404, 5453, 107, 1114, 1672, 2284,
  118, 12005, 11751, 117, 1112, 1218, 1112, 7818, 28117, 20080, 16264, 1103,
  3904, 1104, 10505, 1296, 3198, 5308, 1342, 1114, 2264, 183, 15447, 16179, 113,
  1223, 1134, 1103, 1342, 1156, 1138, 1151, 1227, 1112, 107, 3198, 5308, 149,
  107, 114, 117, 1177, 1115, 1103, 7998, 1180, 15199, 2672, 1103, 4944, 183,
  15447, 16179, 1851, 119, 102, 5979, 4279, 1264, 2533, 110

In [50]:
encodings.char_to_token(0, 177)

34

In [51]:
encodings['input_ids'][0][34]

7068

In [52]:
tokenizer.convert_ids_to_tokens(7068)

'Denver'

In [57]:
dataset[0]["answers"]["text"][0]

'Denver Broncos'

In [60]:
len(dataset.select([0])['answers'][0]['text'][0]) - 1

13

In [61]:
encodings.char_to_token(0, 177+13)

35

In [62]:
encodings['input_ids'][0][35]

14722

In [63]:
tokenizer.convert_ids_to_tokens(14722)

'Broncos'

In [64]:
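# Now our dataset comprises the labels for the start and end positions
# (token indices of the answer span). Note that no offset mapping is stored here:
# converting tokens back to spans of the original string for evaluation
# would additionally need the tokenizer's `return_offsets_mapping=True`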
print("column_names", encoded_dataset.column_names)
print("start_positions", encoded_dataset[:5]['start_positions'])

column_names ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
start_positions [34, 45, 80, 34, 98]


## Formatting Outputs for PyTorch, Tensorflow, Numpy, Pandas

* Common to change the data format so it's compatible with the ML library used
  * For example: pandas, PyTorch or TensorFlow

* Using the `set_format()` method, we can:
  * change the datatype to numpy/pytorch/tensorflow tensors instead of python objects, and
  * Return only the subset of the columns needed

* Use `.set_format(type: Union[None, str], columns: Union[None, str, List[str]])` where:
  * `type` one of `[None, 'numpy', 'pandas', 'torch', 'tensorflow']`  
    * (`None` means return python objects), and
  * `columns` takes the name of column(s)
    * `None` means return all columns

In [65]:
encoded_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1057
})

In [66]:
columns_to_return = ['input_ids', 'token_type_ids', 
                     'attention_mask', 'start_positions', 
                     'end_positions']

# Uncomment whichever one is appropriate for you
encoded_dataset.set_format(type='torch', columns=columns_to_return)

encoded_dataset.select([0,1,2])["title"]

['Super_Bowl_50', 'Super_Bowl_50', 'Super_Bowl_50']

In [67]:
encoded_dataset[0]

{'input_ids': tensor([  101,  3198,  5308,  1851,  1108,  1126,  1237,  1709,  1342,  1106,
          4959,  1103,  3628,  1104,  1103,  1305,  2289,  1453,   113,  4279,
           114,  1111,  1103,  1410,  1265,   119,  1109,  1237,  2289,  3047,
           113, 10402,   114,  3628,  7068, 14722,  2378,  1103,  1305,  2289,
          3047,   113, 24743,   114,  3628,  2938, 13598,  1572,   782,  1275,
          1106,  7379,  1147,  1503,  3198,  5308,  1641,   119,  1109,  1342,
          1108,  1307,  1113,  1428,   128,   117,  1446,   117,  1120, 12388,
           112,   188,  3339,  1107,  1103,  1727,  2948,  2410,  3894,  1120,
          3364, 10200,   117,  1756,   119,  1249,  1142,  1108,  1103, 13163,
          3198,  5308,   117,  1103,  2074, 13463,  1103,   107,  5404,  5453,
           107,  1114,  1672,  2284,   118, 12005, 11751,   117,  1112,  1218,
          1112,  7818, 28117, 20080, 16264,  1103,  3904,  1104, 10505,  1296,
          3198,  5308,  1342,  1114,  2

In [68]:
# Note that the columns are not removed from the dataset, just not returned when calling __getitem__
# Similarly the inner type of the dataset is not changed to torch.Tensor, the conversion and filtering is done on-the-fly when querying the dataset

print(encoded_dataset.column_names)

['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']


In [69]:
# We can remove the formatting with `.reset_format()`
# or, identically, a call to `.set_format()` with no arguments

encoded_dataset.reset_format()

pprint(encoded_dataset[1], compact=True)

{'answers': {'answer_start': [249, 249, 249],
             'text': ['Carolina Panthers', 'Carolina Panthers',
                      'Carolina Panthers']},
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015

In [70]:
# The current format can be checked with `.format`,
# which is a dict of the type and formatting

pprint(encoded_dataset.format)

{'columns': ['id',
             'title',
             'context',
             'question',
             'answers',
             'input_ids',
             'token_type_ids',
             'attention_mask',
             'start_positions',
             'end_positions'],
 'format_kwargs': {},
 'output_all_columns': False,
 'type': None}


# Metrics API

* `datasets` also provides easy access and sharing of metrics.

* Like datasets, metrics are added as small scripts wrapping common metrics in a common API.
  *  Metrics in datasets leverage the powerful backend to provide smart features out-of-the-box like support for distributed evaluation in `PyTorch`

* Example with a NER metric: `seqeval`

In [76]:
from datasets import load_metric

# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of `evaluate.load` -- confirm against the installed version.
ner_metric = load_metric('seqeval')

# Gold tags and predicted tags for two sentences. The first prediction starts
# the MISC entity one token too early; the second sentence matches exactly.
references = [
    ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]
predictions = [
    ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]

# The result holds precision/recall/F1 per entity type plus overall scores.
ner_metric.compute(predictions=predictions, references=references)

  ner_metric = load_metric('seqeval')
Downloading builder script: 6.33kB [00:00, 1.03MB/s]                                                                


{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.8}

In [200]:
from datasets import load_metric

In [77]:
# Load the `seqeval` metric used for sequence-labelling (NER) evaluation.
ner_metric = load_metric('seqeval')

# One inner list of tags per sentence; predictions and references are aligned
# token by token.
references = [
    ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]
predictions = [
    ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]

# Returns a dict of per-entity-type scores and overall scores.
ner_metric.compute(predictions=predictions, references=references)

{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.8}

In [None]:
### Convert to other types

* Datasets can be converted to many other data types, including numpy and pandas
* Useful for exploring the data in depth
* Is it possible to change the format without having to copy the data?
  * In the spirit of using arrow references, we can update the getter dunder method


In [204]:
# Switch the output format to pandas: the indexing calls in the following
# cells display tabular (DataFrame-style) results instead of dicts.
dataset.set_format('pandas')

In [207]:
# The repr still lists the same features and row count after the format change.
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})

In [208]:
# With the pandas format set, a single integer index displays as a one-row table.
dataset[0]

Unnamed: 0,id,title,context,question,answers
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ..."


In [210]:
# A slice displays the selected rows (here the first ten) as a table.
dataset[0:10]

Unnamed: 0,id,title,context,question,answers
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth..."
2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S..."
3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
4,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,"{'text': ['gold', 'gold', 'gold'], 'answer_sta..."
5,56be8e613aeaaa14008c90d1,Super_Bowl_50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?,"{'text': ['""golden anniversary""', 'gold-themed..."
6,56be8e613aeaaa14008c90d2,Super_Bowl_50,Super Bowl 50 was an American football game to...,What day was the game played on?,"{'text': ['February 7, 2016', 'February 7', 'F..."
7,56be8e613aeaaa14008c90d3,Super_Bowl_50,Super Bowl 50 was an American football game to...,What is the AFC short for?,"{'text': ['American Football Conference', 'Ame..."
8,56bea9923aeaaa14008c91b9,Super_Bowl_50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?,"{'text': ['""golden anniversary""', 'gold-themed..."
9,56bea9923aeaaa14008c91ba,Super_Bowl_50,Super Bowl 50 was an American football game to...,What does AFC stand for?,"{'text': ['American Football Conference', 'Ame..."


In [212]:
# Slice the whole dataset; with the pandas format active the result is used
# as a DataFrame in the cells that follow (`.head()`, `.groupby()`).
df = dataset[:]

In [214]:
# Preview the first five rows.
df.head()

Unnamed: 0,id,title,context,question,answers
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth..."
2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S..."
3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
4,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,"{'text': ['gold', 'gold', 'gold'], 'answer_sta..."


In [225]:
# Count the ids under each article title, i.e. how many examples each title has.
df[["id", "title"]].groupby("title").count()

Unnamed: 0_level_0,id
title,Unnamed: 1_level_1
Super_Bowl_50,810
Warsaw,247


### Cost-Free Format Conversion

* Is it possible to change the format without having to copy the data?
  * In the spirit of using Arrow references, we can override the getter dunder method (`__getitem__`) so that the same underlying data is presented differently





In [79]:
class MyCollection:
    """Bare wrapper around a collection of elements.

    No ``__getitem__`` is defined, so ``obj[i]`` raises ``TypeError``.
    """

    def __init__(self, elems):
        # Keep a reference to the given elements; nothing is copied.
        self.my_elems = elems

In [80]:
# Wrap a small list so the indexing behaviour can be tried out below.
col = MyCollection([1,2,3,4,5])

In [81]:
# MyCollection defines no `__getitem__` yet, so subscripting raises TypeError.
# The error is the point of this cell: catch it and print the message so the
# notebook still runs top to bottom with "Restart Kernel -> Run All".
try:
    col[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'MyCollection' object is not subscriptable

In [82]:
class MyCollection:
    """Wrapper that forwards ``obj[idx]`` to the wrapped elements."""

    def __init__(self, elems):
        self.my_elems = elems

    def __getitem__(self, idx):
        # Delegate to the underlying container, which decides what kinds of
        # index it accepts.
        wrapped = self.my_elems
        return wrapped[idx]


col = MyCollection([1, 2, 3, 4, 5])

In [83]:
class MyCollection:
    """Wrapper whose indexing can be overridden per instance.

    If an instance has an ``instance_getitem`` attribute in its own
    ``__dict__``, ``obj[item]`` calls it as ``instance_getitem(obj, item)``.
    Otherwise the lookup goes straight to the wrapped elements.
    """

    def __init__(self, elems):
        self.my_elems = elems

    def __getitem__(self, item):
        # Only a hook stored on the instance itself counts.
        if "instance_getitem" not in vars(self):
            return self.my_elems[item]
        # A function stored on the instance is not bound, so the instance is
        # passed explicitly as the first argument.
        return self.instance_getitem(self, item)


col = MyCollection([1, 2, 3, 4, 5])
col[0]

1

In [273]:
# Attach a per-instance override: indexing now reads from the reversed element
# list, so `col[0]` gives the last element (5) without changing the class.
col.instance_getitem = lambda self, item: self.my_elems[::-1][item]
col[0]

5