# Creating a dataset
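This notebook shows how to create a 🤗 Datasets dataset from our own data: local files, remote files, datasets on the Hub, Python generators, and in-memory dictionaries.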

In [9]:
# Uncomment on a fresh environment (e.g. Colab) if the `datasets` package is not installed yet.
# `%pip` installs into the running kernel's environment:
# %pip install -q datasets

Datasets supports many common file formats, such as `csv`, `json`/`jsonl`, `parquet`, and `txt`. To load a dataset file, pass the format name and the file path to `load_dataset`.

In [10]:
from datasets import load_dataset

# A single file passed without a split name ends up in the "train" split.
csv_path = "/content/classification.csv"
dataset = load_dataset("csv", data_files=csv_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Show the DatasetDict summary: split names, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'intent'],
        num_rows: 1500
    })
})

In [12]:
# Passing a {split name: file} mapping produces one split per key.
# NOTE: both entries point at the same CSV, so "test" holds the same rows as "train";
# this only demonstrates the mapping syntax and is not a real held-out split.
split_files = {
    "train": "/content/classification.csv",
    "test": "/content/classification.csv",
}
dataset = load_dataset("csv", data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [13]:
# Show the DatasetDict summary; it now lists both the "train" and "test" splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'intent'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'intent'],
        num_rows: 1500
    })
})

In [14]:
# Remote loading: `data_files` also accepts URLs, which are downloaded before parsing.
# `field="data"` tells the JSON loader to read the records stored under the top-level "data" key.
base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
squad_files = {
    "train": base_url + "train-v1.1.json",
    "validation": base_url + "dev-v1.1.json",
}
dataset = load_dataset("json", data_files=squad_files, field="data")

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [15]:
# Show the DatasetDict summary for the remotely loaded "train" and "validation" splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    validation: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [16]:
# multiprocessing
from datasets import load_dataset

# `num_proc` requests parallel worker processes while the dataset is prepared.
# Splits that consist of a single shard are processed with one worker regardless
# (the loader logs "Setting num_proc from 8 back to 1 ..." for those splits).
ml_librispeech_spanish = load_dataset(
    "facebook/multilingual_librispeech",
    "spanish",
    num_proc=8,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

9_hours-00000-of-00001.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

1_hours-00000-of-00001.parquet:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

Setting num_proc from 8 back to 1 for the dev split to disable multiprocessing as it only contains one shard.


Generating dev split:   0%|          | 0/2408 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/2385 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/220701 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the 9_hours split to disable multiprocessing as it only contains one shard.


Generating 9_hours split:   0%|          | 0/2110 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the 1_hours split to disable multiprocessing as it only contains one shard.


Generating 1_hours split:   0%|          | 0/233 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [18]:
# Show the DatasetDict summary: every split with its columns and row count.
ml_librispeech_spanish

DatasetDict({
    dev: Dataset({
        features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
        num_rows: 2408
    })
    test: Dataset({
        features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
        num_rows: 2385
    })
    train: Dataset({
        features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
        num_rows: 220701
    })
    9_hours: Dataset({
        features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
        num_rows: 2110
    })
    1_hours: Dataset({
        features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
        num_rows: 233
    })
})

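## Folder-based builders
For image and audio data, arrange the files as `folder/<split>/<label>/<file>`, for example `folder/train/bottle/bottle_1.png`. The folder-based builders (`imagefolder`, `audiofolder`) use this layout to infer the split and the label of each file.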

## From generators and dictionaries

In [19]:
from datasets import Dataset

def gen():
  """Yield two example rows, each a dict with the keys "data1" and "data2"."""
  rows = (
      {"data1": "val1", "data2": "val2"},
      {"data1": "val3", "data2": "val4"},
  )
  yield from rows

# Build a Dataset from the rows yielded by `gen`; each yielded dict becomes one row,
# and its keys become the column names.
ds = Dataset.from_generator(gen)
ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['data1', 'data2'],
    num_rows: 2
})

In [20]:
from datasets import IterableDataset

# Wrap the same generator as an IterableDataset instead of a Dataset.
# Its repr reports `num_shards` rather than `num_rows`.
# NOTE: this rebinds `ds`, replacing the Dataset created in the previous cell.
ds = IterableDataset.from_generator(gen)
ds

IterableDataset({
    features: ['data1', 'data2'],
    num_shards: 1
})

In [21]:
# Iterating over the dataset yields one dict per row (column name -> value).
for example in ds:
  print(example)

{'data1': 'val1', 'data2': 'val2'}
{'data1': 'val3', 'data2': 'val4'}


In [22]:
from datasets import Dataset

# Column-oriented input: each key is a column name and each list holds that
# column's values, so two entries per list give a dataset with two rows.
columns = {
    "data1": ["val1", "val2"],
    "data2": ["val3", "val4"],
}
ds = Dataset.from_dict(columns)
ds

Dataset({
    features: ['data1', 'data2'],
    num_rows: 2
})