In [None]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



# Preprocess

Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for:

* Text, use a [Tokenizer](https://huggingface.co/docs/transformers/main/en/./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors.
* Speech and audio, use a [Feature extractor](https://huggingface.co/docs/transformers/main/en/./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors.
* Image inputs, use an [ImageProcessor](https://huggingface.co/docs/transformers/main/en/./main_classes/image) to convert images into tensors.
* Multimodal inputs, use a [Processor](https://huggingface.co/docs/transformers/main/en/./main_classes/processors) to combine a tokenizer and a feature extractor or image processor.

<Tip>

`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor.

</Tip>

Before you begin, install 🤗 Datasets so you can load some datasets to experiment with:

```bash
pip install datasets
```

## Natural Language Processing

The main tool for preprocessing textual data is a [tokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer.

<Tip>

If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and that the same token-to-index mapping (usually referred to as the *vocab*) is used as during pretraining.

</Tip>

Get started by loading a pretrained tokenizer with the [AutoTokenizer.from_pretrained()](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) method. This downloads the *vocab* a model was pretrained with:

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the `bert-base-cased` checkpoint (this downloads its vocab).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Then split your text into tokens with the tokenizer's `tokenize` method:

In [None]:
# `tokenize` only splits the text into tokens and returns them as a list of strings;
# it does not produce ids or any other model inputs.
tokens = tokenizer.tokenize("Using a Transformer network is simple.")
print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple', '.']


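The `tokenize` method only splits the text into tokens, as the list above shows. When you call the tokenizer directly on your text instead (as in the batch examples below), it returns a dictionary with three important items: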

* [input_ids](https://huggingface.co/docs/transformers/main/en/glossary#input-ids) are the indices corresponding to each token in the sentence.
* [attention_mask](https://huggingface.co/docs/transformers/main/en/glossary#attention-mask) indicates whether a token should be attended to or not.
* [token_type_ids](https://huggingface.co/docs/transformers/main/en/glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence.

Convert the tokens to their `input_ids` with `convert_tokens_to_ids`, then return your input by decoding the `input_ids`:

In [None]:
# Look up the vocabulary index of every token: one id per token, no special tokens added.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(token_ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014, 119]


In [None]:
# Map the ids back to text; the displayed result is the original sentence.
tokenizer.decode(token_ids)

'Using a Transformer network is simple.'

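Because `tokenize` and `convert_tokens_to_ids` don't add special tokens, the decoded text matches the input exactly. When you call the tokenizer directly, it adds two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence; they appear as the ids `101` and `102` at the start and end of each sequence in the batch output below. Not all models need
special tokens, but if they do, the tokenizer automatically adds them for you.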

If there are several sentences you want to preprocess, pass them as a list to the tokenizer:

In [None]:
from pprint import pprint

In [None]:
# Calling the tokenizer on a list encodes every sentence. Without padding each
# sentence keeps its own length, so the lists in the printed output differ in size.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_inputs = tokenizer(batch_sentences)
pprint(encoded_inputs)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102],
               [101,
                1790,
                112,
                189,
                1341,
                1119,
                3520,
                1164,
                1248,
                6462,
                117,
                21902,
                1643,
                119,
                102],
               [101, 1327, 1164, 5450, 23434, 136, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0]]}


### Pad

Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences.

Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence:

In [None]:
# padding=True pads the shorter sequences to the length of the longest one in the batch.
# In the printed output the padded positions have input id 0 and attention_mask 0.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True)
pprint(encoded_input)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
 'input_ids': [[101,
                1252,
                1184,
                1164,
                1248,
                6462,
                136,
                102,
                0,
                0,
                0,
                0,
                0,
                0,
                0],
               [101,
                1790,
                112,
                189,
                1341,
                1119,
                3520,
                1164,
                1248,
                6462,
                117,
                21902,
                1643,
                119,
                102],
               [101,
                1327,
                1164,
                5450,
                23434,
                136,
                102,
             

The first and third sentences are now padded with `0`'s because they are shorter.

### Truncation

On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length.

Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model:

In [None]:
# truncation=True truncates sequences to the maximum length accepted by the model.
# These sentences are short, so the output shown is unchanged from the padding-only cell.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
pprint(encoded_input)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
 'input_ids': [[101,
                1252,
                1184,
                1164,
                1248,
                6462,
                136,
                102,
                0,
                0,
                0,
                0,
                0,
                0,
                0],
               [101,
                1790,
                112,
                189,
                1341,
                1119,
                3520,
                1164,
                1248,
                6462,
                117,
                21902,
                1643,
                119,
                102],
               [101,
                1327,
                1164,
                5450,
                23434,
                136,
                102,
             

<Tip>

Check out the [Padding and truncation](https://huggingface.co/docs/transformers/main/en/./pad_truncation) concept guide to learn more about the different padding and truncation arguments.

</Tip>

### Multilingual processing

In [None]:
# Multilingual BERT checkpoint; it is used in the following cells to tokenize a Hindi sentence.
model_name = "bert-base-multilingual-cased"

# Load the tokenizer that belongs to this checkpoint.
multilinguial_tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Tokenize a Hindi sentence with the multilingual tokenizer.
# Note: this rebinds `tokens`, replacing the English token list created earlier.
text = "इस साल के एशियन गेम्स में भारत ने 100 से अधिक पदक जीते।"
tokens = multilinguial_tokenizer.tokenize(text)
print(tokens)

['इस', 'साल', 'के', 'ए', '##श', '##ियन', 'ग', '##ेम', '##्स', 'में', 'भारत', 'ने', '100', 'से', 'अधिक', 'पद', '##क', 'जी', '##ते', '।']


In [None]:
# Convert tokens to input IDs: one vocabulary index per token, no special tokens added.
input_ids = multilinguial_tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[14265, 53749, 10412, 860, 21835, 31332, 867, 80261, 18869, 10532, 14311, 13088, 10407, 11072, 23586, 110126, 12151, 49069, 17203, 920]


In [None]:
# Second checkpoint for comparison. This rebinds `model_name`, overwriting the
# multilingual BERT checkpoint name set earlier.
# NOTE(review): the checkpoint name suggests a wav2vec speech model, and the output of
# the next cell is character-level — confirm this is the intended contrast with the
# subword tokenizer used above.
model_name = "ai4bharat/indicwav2vec-hindi"

# Load the tokenizer that belongs to this checkpoint.
hindi_tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
# Same Hindi sentence as in the multilingual BERT cell, tokenized with `hindi_tokenizer`.
# The printed tokens are single characters, with `|` where the sentence has spaces.
text = "इस साल के एशियन गेम्स में भारत ने 100 से अधिक पदक जीते।"
tokens = hindi_tokenizer.tokenize(text)
print(tokens)

['इ', 'स', '|', 'स', 'ा', 'ल', '|', 'क', 'े', '|', 'ए', 'श', 'ि', 'य', 'न', '|', 'ग', 'े', 'म', '्', 'स', '|', 'म', 'े', 'ं', '|', 'भ', 'ा', 'र', 'त', '|', 'न', 'े', '|', '1', '0', '0', '|', 'स', 'े', '|', 'अ', 'ध', 'ि', 'क', '|', 'प', 'द', 'क', '|', 'ज', 'ी', 'त', 'े', '।']
