# Quickstart

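This notebook works through the Audio, Vision, and NLP examples from the Hugging Face Datasets quickstart: https://huggingface.co/docs/datasets/quickstart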

## Audio

In [None]:
from datasets import load_dataset, Audio

# Load the "train" split of the "en-US" configuration of PolyAI/minds14.
dataset = load_dataset("PolyAI/minds14", "en-US", split="train")

In [None]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

# Load the audio-classification model and its feature extractor from the same
# checkpoint. The recorded output below reports that the classifier/projector
# weights are newly initialized, i.e. the classification head is untrained.
model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Re-declare the "audio" column with a 16000 Hz sampling rate, the same rate
# that preprocess_function passes to the feature extractor.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
def preprocess_function(examples):
    """Turn a batch of decoded audio into fixed-length model inputs.

    Args:
        examples: Batch mapping whose "audio" entry is a sequence of dicts,
            each holding the raw waveform under "array".

    Returns:
        The output of ``feature_extractor`` for the batch (the notebook later
        reads its "input_values" field).
    """
    audio_arrays = [x["array"] for x in examples["audio"]]
    # Pad every clip to max_length instead of to the longest clip in the
    # batch: with padding=True the padded width depends on which clips happen
    # to share a map() batch, so rows coming from different batches could have
    # different lengths and could not be stacked by the DataLoader.
    return feature_extractor(
        audio_arrays,
        sampling_rate=16000,
        padding="max_length",
        max_length=100000,
        truncation=True,
    )

# Run the preprocessing over the dataset in batches; the extractor output is
# stored on the dataset ("input_values" is read from it further down).
dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Rename the target column to "labels" -- presumably the argument name the
# model expects for targets; confirm against the model's forward signature.
dataset = dataset.rename_column("intent_class", "labels")

In [None]:
from torch.utils.data import DataLoader

# Expose only these two columns, formatted as torch tensors.
dataset.set_format(type="torch", columns=["input_values", "labels"])
# Batches of 4 examples, using the DataLoader's default collation.
dataloader = DataLoader(dataset, batch_size=4)

In [None]:
# Pull the first batch from the loader for inspection.
batch = next(iter(dataloader))

In [None]:
# Show which fields the batch contains.
batch.keys()

dict_keys(['labels', 'input_values'])

In [None]:
# Shape of the batched waveforms; the recorded run shows [4, 100000].
batch["input_values"].shape

torch.Size([4, 100000])

## Vision

In [None]:
from datasets import load_dataset, Image

# Load the "train" split of the beans dataset.
dataset = load_dataset("beans", split="train")

In [None]:
# Declare the "image" column as images in RGB mode.
dataset = dataset.cast_column("image", Image(mode="RGB"))

In [None]:
from torchvision.transforms import Compose, ColorJitter, ToTensor

# Augmentation pipeline: colour jitter (brightness and hue), then conversion
# of the image to a tensor.
jitter = Compose(
    [ColorJitter(brightness=0.5, hue=0.5), ToTensor()]
)

In [None]:
def transforms(examples):
    """Add an augmented "pixel_values" entry to a batch of examples.

    Each item under ``examples["image"]`` is converted to RGB and passed
    through the notebook-level ``jitter`` pipeline. The batch is updated in
    place and returned.
    """
    augmented = []
    for image in examples["image"]:
        rgb_image = image.convert("RGB")
        augmented.append(jitter(rgb_image))
    examples["pixel_values"] = augmented
    return examples

In [None]:
# Attach the augmentation via with_transform (applied when examples are
# accessed, rather than precomputed with map).
dataset = dataset.with_transform(transforms)

In [None]:
import torch
from torch.utils.data import DataLoader

def collate_fn(examples):
    """Assemble a list of examples into one batch.

    Args:
        examples: Sequence of mappings, each with a "pixel_values" tensor and
            a "labels" value.

    Returns:
        Dict with "pixel_values" (the per-example tensors stacked along a new
        leading dimension) and "labels" (a tensor built from the label values).
    """
    # Read both fields of each example together before building any tensor.
    pairs = [(example["pixel_values"], example["labels"]) for example in examples]
    stacked_images = torch.stack([image for image, _ in pairs])
    label_tensor = torch.tensor([label for _, label in pairs])
    return {"pixel_values": stacked_images, "labels": label_tensor}

# Batches of 4 examples, assembled by collate_fn instead of the default collation.
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=4)

In [None]:
# Pull the first batch from the loader for inspection.
batch = next(iter(dataloader))

In [None]:
# Show which fields the batch contains.
batch.keys()

dict_keys(['pixel_values', 'labels'])

In [None]:
# Shape of the stacked image tensors; the recorded run shows [4, 3, 500, 500].
batch["pixel_values"].shape

torch.Size([4, 3, 500, 500])

## NLP

In [None]:
from datasets import load_dataset

# Load the "train" split of the "mrpc" configuration of GLUE.
dataset = load_dataset("glue", "mrpc", split="train")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the sequence-classification model and the matching tokenizer from the
# same checkpoint. The recorded output below reports that the classifier
# weights are newly initialized, i.e. the classification head is untrained.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def encode(examples):
    """Tokenize a batch of sentence pairs with the notebook-level tokenizer.

    The "sentence1" and "sentence2" entries are encoded together, with
    truncation enabled and padding set to "max_length".
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    encoded = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize the whole split in batches.
dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [None]:
# Expose the values of the "label" column under the name "labels".
# NOTE(review): the Audio section uses rename_column for the same purpose;
# consider doing the same here to avoid an extra pass over the data.
dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [None]:
import torch

# Expose only the tokenizer outputs and the labels, formatted as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
# Batches of 32 examples, using the DataLoader's default collation.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

In [None]:
# Pull the first batch from the loader for inspection.
batch = next(iter(dataloader))

In [None]:
# Show which fields the batch contains.
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
# Shape of the batched token ids; the recorded run shows [32, 512].
batch['input_ids'].shape

torch.Size([32, 512])