# Load Datasets

Use `load_dataset_builder()` to inspect a dataset's metadata (description, features, splits) before downloading it.

In [1]:
from datasets import load_dataset_builder

# Create a builder for the dataset; the cells below read its `info`
# to explore the dataset before downloading it.
ds_builder = load_dataset_builder("rotten_tomatoes")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the full DatasetInfo record, a separator line, then the description
# field on its own -- emitted by a single print call, one item per line.
print(
    ds_builder.info,
    '------------------------------',
    ds_builder.info.description,
    sep='\n',
)

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='rotten_tomatoes', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, shard_lengths=None, dataset_name=None), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=None)
------------------------------



In [3]:
# Column schema: maps each column name to its feature type.
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

Load the dataset

In [7]:
from datasets import load_dataset

# Passing `split` returns that single split as a Dataset (see the repr below).
dataset = load_dataset("rotten_tomatoes", split="train")

# Bare last expression so the notebook displays the Dataset summary.
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

### Split

In [8]:
from datasets import get_dataset_split_names

# List the names of the splits this dataset provides.
get_dataset_split_names("rotten_tomatoes")

['train', 'validation', 'test']

In [9]:
# Pick one of the split names listed above via the `split` argument.
dataset = load_dataset("rotten_tomatoes", split="train")

# Display the summary (column names and row count) of the loaded split.
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

### Configurations

- Some datasets contain several sub-datasets. These sub-datasets are known as configurations.
- Use the `get_dataset_config_names()` function to retrieve a list of all the configurations available for a dataset.


In [11]:
from datasets import get_dataset_config_names

# Retrieve the configuration (sub-dataset) names for this dataset.
configs = get_dataset_config_names("PolyAI/minds14", trust_remote_code=True)
print(configs)

['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN', 'all']


In [15]:
# List the splits of the 'en-US' configuration. trust_remote_code=True is passed
# explicitly, matching the get_dataset_config_names() call made for this same
# repository, so the loading script is opted into rather than relied on implicitly.
get_dataset_split_names("PolyAI/minds14", 'en-US', trust_remote_code=True)

['train']

In [21]:
from datasets import load_dataset

# Load the train split of the 'en-US' configuration. This repository is queried
# with trust_remote_code=True elsewhere in this notebook, so opt in here as well
# to keep every call against it consistent.
minds_14 = load_dataset("PolyAI/minds14", "en-US", split='train', trust_remote_code=True)

### Remote code

- Some dataset repositories on the Hub include Python code (a loading script) that is executed to build the dataset.
- Set `trust_remote_code=True` to use a dataset that relies on a loading script.


# Know the dataset

There are two types of dataset objects:
- Dataset : provides fast random access to the rows, and memory-mapping so that loading even large datasets only uses a relatively small amount of device memory
- IterableDataset : allows you to access and use the dataset without waiting for it to download completely!

In [22]:
# List the configurations available for nvidia/HelpSteer2.
configs = get_dataset_config_names("nvidia/HelpSteer2", trust_remote_code=True)
print(configs)

Downloading readme: 100%|██████████| 10.6k/10.6k [00:00<00:00, 38.1MB/s]


['default']


In [24]:
# Splits available under the 'default' configuration.
get_dataset_split_names("nvidia/HelpSteer2", 'default')

['train', 'validation']

In [25]:
# NOTE: this rebinds `dataset`, which earlier cells used for the
# rotten_tomatoes train split; from here on it is the HelpSteer2 train split.
dataset = load_dataset("nvidia/HelpSteer2", "default", split='train')

Downloading data: 100%|██████████| 11.3M/11.3M [00:01<00:00, 7.30MB/s]
Downloading data: 100%|██████████| 582k/582k [00:00<00:00, 1.21MB/s]
Generating train split: 100%|██████████| 20324/20324 [00:00<00:00, 78589.31 examples/s]
Generating validation split: 100%|██████████| 1038/1038 [00:00<00:00, 76564.51 examples/s]


In [26]:
# Display the Dataset summary: column names and number of rows.
dataset

Dataset({
    features: ['prompt', 'response', 'helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'],
    num_rows: 20324
})

### Indexing

In [27]:
# Integer index -> a single row, returned as a dict of column name to value.
dataset[0]

{'prompt': 'c#',
 'response': 'C# is a high-level, object-oriented programming language developed by Microsoft as part of its .NET initiative. It was created as a modern alternative to Java and supports a variety of programming paradigms, including imperative, functional, and event-driven. C# is primarily used for Windows application development, but it can also be used for web, mobile, and game development. The language is designed to be safe, secure, and efficient, and it provides developers with a rich set of libraries and tools for building robust and scalable applications. C# is also widely used in the game development industry, particularly in the development of games for the Xbox 360 and Xbox One consoles.',
 'helpfulness': 3,
 'correctness': 4,
 'coherence': 4,
 'complexity': 2,
 'verbosity': 1}

In [28]:
# Column name -> the values of that column for every row.
# NOTE(review): this displays the whole column (the Dataset repr above reports
# 20324 rows); consider slicing, e.g. dataset['correctness'][:10], to keep the
# output short.
dataset['correctness']

[4,
 4,
 4,
 4,
 3,
 3,
 3,
 4,
 4,
 3,
 1,
 4,
 4,
 4,
 4,
 3,
 2,
 1,
 4,
 4,
 3,
 3,
 0,
 4,
 1,
 4,
 2,
 4,
 2,
 4,
 3,
 3,
 4,
 2,
 3,
 4,
 4,
 4,
 4,
 3,
 2,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 1,
 0,
 3,
 4,
 4,
 3,
 3,
 1,
 1,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 0,
 1,
 4,
 4,
 2,
 4,
 2,
 2,
 2,
 2,
 1,
 3,
 2,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 3,
 2,
 4,
 2,
 2,
 2,
 2,
 3,
 4,
 4,
 4,
 2,
 0,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 3,
 1,
 4,
 2,
 4,
 3,
 3,
 3,
 0,
 4,
 2,
 4,
 3,
 3,
 2,
 3,
 2,
 4,
 4,
 3,
 0,
 3,
 2,
 3,
 3,
 2,
 2,
 3,
 4,
 1,
 3,
 4,
 2,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 0,
 4,
 3,
 4,
 4,
 3,
 0,
 0,
 1,
 3,
 4,
 2,
 3,
 3,
 4,
 4,
 1,
 1,
 0,
 1,
 4,
 4,
 4,
 0,
 4,
 3,
 4,
 2,
 4,
 3,
 4,
 3,
 3,
 4,
 3,
 4,
 4,
 2,
 1,
 1,
 2,
 1,
 3,
 3,
 4,
 4,
 2,
 4,
 4,
 4,
 4,
 1,
 4,
 4,
 0,
 1,
 4,
 4,
 1,
 3,
 4,
 4,
 0,
 0,
 3,
 2,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 2,
 2,
 4,
 3,
 0,
 0,


In [29]:
# Row index first, then column name -> one scalar value.
dataset[0]['correctness']

4

In [31]:
# Slice -> dict mapping each column name to the list of values for rows 0-2.
dataset[:3]

{'prompt': ['c#', 'c#', 'bacillus subtilus'],
 'response': ['C# is a high-level, object-oriented programming language developed by Microsoft as part of its .NET initiative. It was created as a modern alternative to Java and supports a variety of programming paradigms, including imperative, functional, and event-driven. C# is primarily used for Windows application development, but it can also be used for web, mobile, and game development. The language is designed to be safe, secure, and efficient, and it provides developers with a rich set of libraries and tools for building robust and scalable applications. C# is also widely used in the game development industry, particularly in the development of games for the Xbox 360 and Xbox One consoles.',
  'C# (pronounced "C sharp") is a modern, object-oriented programming language developed by Microsoft. It is widely used for building various types of applications, including web applications, desktop applications, mobile applications, and games

## IterableDataset

- An `IterableDataset` is returned when you set `streaming=True` in `load_dataset()`.

In [32]:
# streaming=True makes load_dataset() return an IterableDataset.
iterable_dataset = load_dataset("food101", split="train", streaming=True)
# Print only the first example, then stop iterating.
for example in iterable_dataset:
    print(example)
    break

Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 38.3MB/s]


{'image': <PIL.Image.Image image mode=RGB size=384x512 at 0x7FC5E60A9D80>, 'label': 6}


In [33]:
# Convert the already-loaded HelpSteer2 Dataset into an IterableDataset.
# NOTE: this rebinds `iterable_dataset`, replacing the food101 stream above.
iterable_dataset = dataset.to_iterable_dataset()

- You don’t get random access to examples in an `IterableDataset`. Instead, iterate over its elements, for example by calling `next(iter(iterable_dataset))` or by using a `for` loop, to return the next item from the `IterableDataset`.

In [35]:
# Fetch just the first example by advancing a fresh iterator once.
next(iter(iterable_dataset))

{'prompt': 'c#',
 'response': 'C# is a high-level, object-oriented programming language developed by Microsoft as part of its .NET initiative. It was created as a modern alternative to Java and supports a variety of programming paradigms, including imperative, functional, and event-driven. C# is primarily used for Windows application development, but it can also be used for web, mobile, and game development. The language is designed to be safe, secure, and efficient, and it provides developers with a rich set of libraries and tools for building robust and scalable applications. C# is also widely used in the game development industry, particularly in the development of games for the Xbox 360 and Xbox One consoles.',
 'helpfulness': 3,
 'correctness': 4,
 'coherence': 4,
 'complexity': 2,
 'verbosity': 1}

In [36]:
# Same first example via a for loop: print it and stop after one iteration.
for example in iterable_dataset:
    print(example)
    break

{'prompt': 'c#', 'response': 'C# is a high-level, object-oriented programming language developed by Microsoft as part of its .NET initiative. It was created as a modern alternative to Java and supports a variety of programming paradigms, including imperative, functional, and event-driven. C# is primarily used for Windows application development, but it can also be used for web, mobile, and game development. The language is designed to be safe, secure, and efficient, and it provides developers with a rich set of libraries and tools for building robust and scalable applications. C# is also widely used in the game development industry, particularly in the development of games for the Xbox 360 and Xbox One consoles.', 'helpfulness': 3, 'correctness': 4, 'coherence': 4, 'complexity': 2, 'verbosity': 1}


In [37]:
# Get first three examples: take(3) limits the stream and list() collects them.
list(iterable_dataset.take(3))

[{'prompt': 'c#',
  'response': 'C# is a high-level, object-oriented programming language developed by Microsoft as part of its .NET initiative. It was created as a modern alternative to Java and supports a variety of programming paradigms, including imperative, functional, and event-driven. C# is primarily used for Windows application development, but it can also be used for web, mobile, and game development. The language is designed to be safe, secure, and efficient, and it provides developers with a rich set of libraries and tools for building robust and scalable applications. C# is also widely used in the game development industry, particularly in the development of games for the Xbox 360 and Xbox One consoles.',
  'helpfulness': 3,
  'correctness': 4,
  'coherence': 4,
  'complexity': 2,
  'verbosity': 1},
 {'prompt': 'c#',
  'response': 'C# (pronounced "C sharp") is a modern, object-oriented programming language developed by Microsoft. It is widely used for building various types

Unlike slicing a `Dataset` (which returns a dictionary of column values), `IterableDataset.take()` creates a new `IterableDataset`.

# Preprocessing

## Tokenize text