# Transformers
Huggingface Transformers provides API and tools to easily download and train state of the art pretrained models.

In [None]:
# !pip install transformers

In [None]:
# !pip install datasets

In [None]:
# !pip install evaluate

In [None]:
# !pip install accelerate

# The Pipeline
We can use pipeline() to use a pretrained model for inference and for many different tasks of different modalities.

- Text Classification
- Text Generation
- Summarization
- NER
- QA

In [5]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
# download and caches a default pretrained model and tokenizer for sentiment analysis

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
classifier("I am happy to learn NLP")

[{'label': 'POSITIVE', 'score': 0.9998495578765869}]

In [7]:
# for more than one input

results = classifier(
    ["I am very happy to lern NLP", "I am sad to not learn NLP"]
)

results

[{'label': 'POSITIVE', 'score': 0.9997096657752991},
 {'label': 'NEGATIVE', 'score': 0.997201681137085}]

In [8]:
# using model of our choice

MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
classifier = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer
)

classifier(
    ["I am very happy to lern NLP", "I am sad to not learn NLP"]
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': '5 stars', 'score': 0.6437772512435913},
 {'label': '2 stars', 'score': 0.3674001097679138}]

The **AutoClass** automatically gets the model given the name of the model.

In [11]:
# autoTokenizer to get the tokenizer of our choice

from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
encoding = tokenizer("I am very happy to learn NLP")
print(encoding)

{'input_ids': [101, 151, 10345, 12495, 19308, 10114, 34990, 19848, 10373, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [13]:
# tokenizing a batch of text

pt_batch = tokenizer(
    ["I am very happy to learn NLP", "I am sad to not learn NLP"],
    padding=True,
    truncation=True,
    max_length=20,
    return_tensors="pt"
)

pt_batch

{'input_ids': tensor([[  101,   151, 10345, 12495, 19308, 10114, 34990, 19848, 10373,   102],
        [  101,   151, 10345, 14628, 10114, 10497, 34990, 19848, 10373,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [14]:
tokenizer(
  "Hi", # first setence
  "Hello there what are you doing", # second one
  # "Hey there you", ? this will act like label
  padding=True,
  truncation=True,
  max_length=5, # including CLS and SEP token
  return_tensors="pt"
)
# [CLS Hi SEP HELLO THERE SEP] 5 max length so what are you doing truncate

{'input_ids': tensor([[  101, 11463,   102, 29155,   102]]), 'token_type_ids': tensor([[0, 0, 0, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [15]:
# AutoModel

from transformers import AutoModelForSequenceClassification

pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [16]:
output = pt_model(**pt_batch)
output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.3199, -2.3144, -0.3738,  1.7923,  2.4240],
        [ 0.8103,  1.0855,  0.7776, -0.7419, -1.6248]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [17]:
import torch
from torch import nn

tensor_ = torch.tensor(
    [[1,5],
     [6,1]],
    dtype=torch.float32
)

tensor2_ = torch.tensor(
    [[1,6],
     [5,1]],
    dtype=torch.float32
)

preds_0 = nn.functional.softmax(tensor_, dim=0) # looks col for 2d
preds_1 = nn.functional.softmax(tensor_, dim=1) # looks for each row in 2d
# preds_1_ = nn.functional.softmax(tensor_, dim=-1) # looks for each row in 2d

preds_2_0 = nn.functional.softmax(tensor2_, dim=0) # looks col for 2d
preds_2_1 = nn.functional.softmax(tensor2_, dim=1) # looks for each row in 2d
# preds2_1_ = nn.functional.softmax(tensor_, dim=-1) # looks for each row in 2d


print("For tensor 1 and b with dim 0 and 1")
print(preds_0)
print(preds_1)

print("\n")

print("For tensor 2 with dim 0 and 1")
print(preds_2_0)
print(preds_2_1)

# so for 2d softmax will apply to each col if dim=0 and to each row if dim=1

For tensor 1 and b with dim 0 and 1
tensor([[0.0067, 0.9820],
        [0.9933, 0.0180]])
tensor([[0.0180, 0.9820],
        [0.9933, 0.0067]])


For tensor 2 with dim 0 and 1
tensor([[0.0180, 0.9933],
        [0.9820, 0.0067]])
tensor([[0.0067, 0.9933],
        [0.9820, 0.0180]])


In [18]:
from torch import nn

pt_predictions = nn.functional.softmax(
    output.logits,
    dim=1 # apply to each row
)
pt_predictions

tensor([[0.0054, 0.0054, 0.0379, 0.3302, 0.6211],
        [0.2790, 0.3674, 0.2700, 0.0591, 0.0244]], grad_fn=<SoftmaxBackward0>)

In [19]:
id2label = model.config.id2label

In [20]:
for pred in pt_predictions:
  print(f"Sentiment level: {id2label[torch.argmax(pred).tolist()]}")

Sentiment level: 5 stars
Sentiment level: 2 stars


In [21]:
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_pe

## Saving a model

In [22]:
MODEL_PATH = "./saved_model"

In [23]:
tokenizer.save_pretrained(MODEL_PATH)
pt_model.save_pretrained(MODEL_PATH)

## Loading a saved model

In [24]:
loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [25]:
classifier_loaded = pipeline(
    "sentiment-analysis",
    model=loaded_model,
    tokenizer=loaded_tokenizer
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [26]:
classifier_loaded("I am very happy to learn Deep Learning")

[{'label': '5 stars', 'score': 0.6557235717773438}]

## Load in another framework
We can save model in one format and load in another format

In [27]:
from transformers import TFAutoModel

tf_model = TFAutoModel.from_pretrained(MODEL_PATH, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [28]:
tf_model

<transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7ae3a0224940>

In [29]:
tf_model.config

BertConfig {
  "_name_or_path": "./saved_model",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embeddi

In [30]:
loaded_tokenizer

BertTokenizerFast(name_or_path='./saved_model', vocab_size=105879, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Custom Model Build

In [31]:
from transformers import AutoConfig

my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [32]:
my_config

DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.2",
  "vocab_size": 30522
}

In [34]:
from transformers import AutoModel

my_model = AutoModel.from_config(my_config) # load a model from the config

In [35]:
my_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [37]:
from transformers import TFAutoModel

my_model_tf = TFAutoModel.from_config(my_config)

my_model_tf

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel at 0x7ae3a020c100>

## Trainer
We can use trainer to create a training loop with additional features like `mixed precision`, `distributed training` and many more.

In [38]:
from transformers import AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./trained_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [41]:
from datasets import load_dataset

# loading dataset for training
dataset = load_dataset(
    "rotten_tomatoes"
)

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [42]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [43]:
def tokenize(dataset):
  return tokenizer(dataset["text"])

In [44]:
dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [45]:
# data collator to create a batch of examples

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [46]:
# using trainer

from transformers import Trainer

trainer = Trainer(
    model=model, # our model
    args=training_args, # training config
    train_dataset=dataset["train"], # train dataset
    eval_dataset=dataset["test"], # evaluation dataset
    processing_class=tokenizer, # processor
    data_collator=data_collator, # batcher
)

In [47]:
trainer.train() # need wandb api key to train

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.4495
1000,0.388
1500,0.2597
2000,0.2708


TrainOutput(global_step=2134, training_loss=0.3359757041752394, metrics={'train_runtime': 178.1232, 'train_samples_per_second': 95.776, 'train_steps_per_second': 11.98, 'total_flos': 195974132394480.0, 'train_loss': 0.3359757041752394, 'epoch': 2.0})

For tasks - like translation or summarization - that use a sequence-to-sequence model, use the Seq2SeqTrainer and Seq2SeqTrainingArguments classes instead.

And for training we need to login to weightandbiases to get the wandb api key during training

In [52]:
dataset["train"][0]["text"]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'

In [53]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

We can subclass the method inside trainer to customize for feature like optimizer, scheduler etc.

In [55]:
trainer

<transformers.trainer.Trainer at 0x7ae32258dba0>

In [59]:
training_arguments = trainer.args

In this way we can train

In [60]:
training_arguments

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=F

In [63]:
training_arguments.learning_rate

2e-05

We can get the config from trainig arg