# Hugging Face Pipeline

In [1]:
from transformers import pipeline

In [3]:
# Sentiment Analysis
classifier = pipeline('sentiment-analysis')
result = classifier("I love using Hugging Face transformers!")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9971315860748291}]


In [5]:
# Named Entity Recognition (NER)
ner = pipeline('ner')
result = ner("My name is Nikita and I live in Guwahati.")
print(result)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:  82%|########1 | 1.09G/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER', 'score': 0.9986278, 'index': 4, 'word': 'Nik', 'start': 11, 'end': 14}, {'entity': 'I-PER', 'score': 0.99777955, 'index': 5, 'word': '##ita', 'start': 14, 'end': 17}, {'entity': 'I-LOC', 'score': 0.9988055, 'index': 10, 'word': 'G', 'start': 32, 'end': 33}, {'entity': 'I-LOC', 'score': 0.9780768, 'index': 11, 'word': '##u', 'start': 33, 'end': 34}, {'entity': 'I-LOC', 'score': 0.97291934, 'index': 12, 'word': '##wa', 'start': 34, 'end': 36}, {'entity': 'I-LOC', 'score': 0.9935022, 'index': 13, 'word': '##hat', 'start': 36, 'end': 39}, {'entity': 'I-LOC', 'score': 0.9986638, 'index': 14, 'word': '##i', 'start': 39, 'end': 40}]


In [6]:
# Question Answering
question_answerer = pipeline('question-answering')
result = question_answerer(question="What is Hugging Face?", context="Hugging Face is a company that provides open-source NLP technologies.")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.6646777987480164, 'start': 16, 'end': 68, 'answer': 'a company that provides open-source NLP technologies'}


In [7]:
# Text Generation
generator = pipeline('text-generation', model='gpt2')
result = generator("Once upon a time,")
print(result)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Once upon a time, men from the country had fought with the people of the west to defend their homeland and uphold their common values in an age of unparalleled civil war. It was in those times that the nation of Great Britain emerged victorious from a long'}]


# AutoTokenizer Class

In [8]:
from transformers import AutoTokenizer

# Specify the model checkpoint
model_checkpoint = 'bert-base-uncased'

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Example text
text = ["Transformers are incredibly powerful.","Transformers are awesome"]

# Tokenize text
tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')  # 'pt' for PyTorch tensors ,'tf' for tensorflow tensors.
print(tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': tensor([[  101, 19081,  2024, 11757,  3928,  1012,   102],
        [  101, 19081,  2024, 12476,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0]])}


In [9]:
# Example text
text = "tokenization is all you need ?"

# Tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)

Tokens: ['token', '##ization', 'is', 'all', 'you', 'need', '?']
Input IDs: [19204, 3989, 2003, 2035, 2017, 2342, 1029]


In [10]:
# Convert tokens to token IDs
token_ids = tokenizer(text, return_tensors='pt')
print("Token IDs:", token_ids)
print(tokenizer.decode(input_ids))

Token IDs: {'input_ids': tensor([[  101, 19204,  3989,  2003,  2035,  2017,  2342,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tokenization is all you need?


In [11]:
# To get the token_id for the PAD
print(tokenizer.pad_token_id)
print(tokenizer.pad_token)

0
[PAD]


# Model-Specific Tokenizers

In [12]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize a sample text
inputs = tokenizer("Hello, this is an example using BertTokenizer.", padding=True, truncation=True, return_tensors='pt')

print(inputs)

{'input_ids': tensor([[  101,  7592,  1010,  2023,  2003,  2019,  2742,  2478, 14324, 18715,
         18595,  6290,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [16]:
from transformers import GPT2Tokenizer

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize a sample text
inputs = tokenizer("Hello, this is an example using GPT2Tokenizer.", padding=True, truncation=True, return_tensors='pt')

print(inputs)

{'input_ids': tensor([[15496,    11,   428,   318,   281,  1672,  1262,   402, 11571,    17,
         30642,  7509,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}




In [15]:
from transformers import RobertaTokenizer

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize a sample text
inputs = tokenizer("Hello, this is an example using RobertaTokenizer.", padding=True, truncation=True, return_tensors='pt')

print(inputs)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

{'input_ids': tensor([[    0, 31414,     6,    42,    16,    41,  1246,   634,  1738,   102,
         45643,  6315,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


# AutoModel Class

In [17]:
from transformers import AutoTokenizer, AutoModel

model_checkpoint = 'bert-base-uncased'

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize a sample text
inputs = tokenizer("Hello, this is an example using AutoModel.", return_tensors='pt')

# Initialize the model
model = AutoModel.from_pretrained(model_checkpoint)

# Perform a forward pass
outputs = model(**inputs)

# Print the outputs
print(outputs)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2771, -0.5205, -0.0631,  ..., -0.9065,  0.1194,  1.0039],
         [-0.2234, -0.5760,  0.0866,  ..., -0.4757,  0.5685,  0.6233],
         [-0.5544, -0.1714, -0.0568,  ..., -0.9153,  0.1127,  0.5515],
         ...,
         [ 0.8083, -0.1716,  0.1858,  ..., -0.3737, -0.3537, -0.1281],
         [-0.3825, -0.9431, -0.2828,  ..., -0.1046,  0.1665,  0.0370],
         [ 0.5245,  0.0810, -0.3756,  ...,  0.3192, -0.7704, -0.0090]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.5531e-01, -5.2481e-01, -7.9953e-01,  6.8851e-01,  6.4076e-01,
         -3.5061e-01,  7.5902e-01,  3.2545e-01, -7.1094e-01, -9.9998e-01,
         -3.2064e-01,  7.9689e-01,  9.7916e-01,  4.0093e-01,  8.7427e-01,
         -6.5004e-01, -1.1536e-01, -5.7262e-01,  4.2141e-01, -2.1894e-01,
          6.5455e-01,  9.9998e-01,  1.9717e-01,  2.9857e-01,  5.5143e-01,
          9.4572e-01, -7.0518e-01,  9.1952e-01,  9.4771e-01,  7.832

In [18]:
outputs.last_hidden_state.shape

torch.Size([1, 13, 768])

In [19]:
from transformers import AutoModel

bert_model = AutoModel.from_pretrained('bert-base-uncased')
print(type(bert_model))
print(bert_model)


gpt_model = AutoModel.from_pretrained('gpt2')
print(type(gpt_model))
print(gpt_model)

bart_model = AutoModel.from_pretrained('facebook/bart-large-cnn')
print(type(bart_model))
print(bart_model)

<class 'transformers.models.bert.modeling_bert.BertModel'>
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

<class 'transformers.models.bart.modeling_bart.BartModel'>
BartModel(
  (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-11): 12 x BartEncoderLayer(
        (self_attn): BartSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_lay

# Custom Classification Model Using AutoModel

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Define a custom classification model
class CustomClassificationModel(nn.Module):
    def __init__(self, model_checkpoint, num_labels):
        super(CustomClassificationModel, self).__init__()
        self.automodel = AutoModel.from_pretrained(model_checkpoint)
        self.classifier = nn.Linear(self.automodel.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.automodel(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Get the hidden state of the [CLS] token
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

# Model checkpoint and number of labels
model_checkpoint = 'bert-base-uncased'
num_labels = 2  # Example for binary classification

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize a sample text
inputs = tokenizer("Hello, this is an example using a custom classification head.", return_tensors='pt')

# Initialize the custom model
model = CustomClassificationModel(model_checkpoint, num_labels)

# Perform a forward pass
logits = model(**inputs)

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=-1)

# Convert probabilities to predicted class labels
predictions = torch.argmax(probabilities, dim=-1)

# Print the probabilities and the predicted class
print(probabilities)
print(predictions)

tensor([[0.3616, 0.6384]], grad_fn=<SoftmaxBackward0>)
tensor([1])


# AutoModelFor** Classes

In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch

# Initialize the tokenizer and model
model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Tokenize input text
inputs = tokenizer("Hello, this is an example for sequence classification.", return_tensors='pt')

# Perform a forward pass
outputs = model(**inputs)
logits = outputs.logits

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=-1)
predictions = torch.argmax(probabilities, dim=-1)

# Print the probabilities and predicted class
print(probabilities)
print(predictions)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[0.1450, 0.8550]], grad_fn=<SoftmaxBackward0>)
tensor([1])


In [22]:
# AutoModelForCausalLM

from transformers import AutoTokenizer, AutoModelForCausalLM

# Initialize the tokenizer and model
model_checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Tokenize input text
inputs = tokenizer("Once upon a time", return_tensors='pt')

# Generate text
outputs = model.generate(inputs['input_ids'], max_length=50, num_return_sequences=1)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a


In [24]:
# AutoModelForTokenClassification

from transformers import AutoTokenizer, AutoModelForTokenClassification

# Initialize the tokenizer and model
model_checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Tokenize input text
inputs = tokenizer("John lives in New York City.", return_tensors='pt')

# Perform a forward pass
outputs = model(**inputs)
logits = outputs.logits

# Get the predictions
predictions = torch.argmax(logits, dim=-1)

# Print the predictions
predicted_tokens = [model.config.id2label[prediction.item()] for prediction in predictions[0]]
print(predicted_tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['O', 'I-PER', 'O', 'O', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O']


# Model-Specific Classes

In [25]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer and model
model_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
model = BertForSequenceClassification.from_pretrained(model_checkpoint)

# Define labels (these are examples; adjust based on your actual model's training)
labels = ["Negative", "Positive"]

# Input sentences
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this"
]

# Tokenize and encode the sentences
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Perform a forward pass and get logits
outputs = model(**inputs).logits

# Apply softmax to get probabilities
probabilities = torch.nn.functional.softmax(outputs, dim=-1)

# Get the predicted class
predictions = torch.argmax(probabilities, dim=-1)

# Print the probabilities and predicted classes
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    print(f"Probabilities: {probabilities[i].tolist()}")
    print(f"Predicted Class: {labels[predictions[i]]}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentence: I've been waiting for a HuggingFace course my whole life.
Probabilities: [0.4392324984073639, 0.5607675313949585]
Predicted Class: Positive
Sentence: I hate this
Probabilities: [0.5435839891433716, 0.4564160406589508]
Predicted Class: Negative


In [26]:
# GPT2LMHeadModel

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Input prompt
prompt = "Once upon a time"

# Tokenize and encode the prompt
inputs = tokenizer(prompt, return_tensors='pt')

# Generate text
outputs = model.generate(inputs['input_ids'], max_length=50, num_return_sequences=1)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a


# AutoConfig Class

In [27]:
from transformers import AutoConfig

# Load configuration for a specific model checkpoint
model_checkpoint = 'bert-base-uncased'
config = AutoConfig.from_pretrained(model_checkpoint)

# Print the configuration
print(config)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [29]:
# Modify the configuration
config.num_labels = 5  # Change the number of labels for classification

# Print the modified configuration
print(config)
config.hidden_act

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



'gelu'

In [30]:
# Using Configuration to Initialize a Model

from transformers import AutoConfig, AutoModelForSequenceClassification

# Load and modify the configuration
model_checkpoint = 'bert-base-uncased'
config = AutoConfig.from_pretrained(model_checkpoint)
config.num_labels = 5  # Change the number of labels for classification

# Initialize the model with the modified configuration
model = AutoModelForSequenceClassification.from_config(config)

# Print the model
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [31]:
# BERT Configuration
from transformers import BertConfig, BertForSequenceClassification

# Load and modify the BERT configuration
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 5  # Change the number of labels for classification

# Initialize the BERT model with the modified configuration
model = BertForSequenceClassification(config)

# Print the model
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [32]:
# GPT-2 Configuration

from transformers import GPT2Config, GPT2LMHeadModel

# Load and modify the GPT-2 configuration
config = GPT2Config.from_pretrained('gpt2')
config.output_hidden_states = True  # Change the configuration to output hidden states

# Initialize the GPT-2 model with the modified configuration
model = GPT2LMHeadModel(config)

# Print the model
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [33]:
# DistilBERT Configuration

from transformers import DistilBertConfig, DistilBertForTokenClassification

# Load and modify the DistilBERT configuration
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
config.num_labels = 9  # Change the number of labels for NER

# Initialize the DistilBERT model with the modified configuration
model = DistilBertForTokenClassification(config)

# Print the model
print(model)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

# Dataset Class

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

# Load a dataset from the Hugging Face Hub
dataset = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
train_subset = dataset['train'].select(range(10000))

In [5]:
train_subset

Dataset({
    features: ['text', 'label'],
    num_rows: 10000
})

In [6]:
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [7]:
dataset['train'][0:3]

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

In [8]:
train_subset = dataset['train'].select(range(10000))

In [9]:
train_subset

Dataset({
    features: ['text', 'label'],
    num_rows: 10000
})

In [10]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [11]:
# Split the dataset into training and test sets
split_dataset = dataset['train'].train_test_split(test_size=0.1)
train_data = split_dataset['train']
test_data = split_dataset['test']

In [13]:
# Define a preprocessing function
!pip install transformers datasets
from transformers import AutoTokenizer
from datasets import load_dataset

# Assuming you're using 'bert-base-uncased'
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define a preprocessing function
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)





Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [14]:
from torch.utils.data import DataLoader

# Create a DataLoader for the training data
train_dataloader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)

In [15]:
# Print the first batch
for batch in train_dataloader:
    print(batch)
    break

{'text': ['After viewing this film, I felt the compelling need to vent a bit of my frustration. Selma Blair is a fabulous, currently underrated actress and Max Beesley was rather charming in "Kill Me Later". The story, while not exactly original, certainly showed some promise. None of that mattered though...at all.<br /><br />I don\'t know what her deal is, but director Dana Lustig has virtually no talent whatsoever as a director. She slowed footage down, sped footage up, reversed footage, used awkward camera angles, used annoying color filters, made a zillion quick cuts, jumped back and forth in the timeline and topped it all off with an obnoxious "modern" soundtrack of blaring junk. I can\'t remember the last time I saw such an incompetent job of directing a film. Her ego must be huge to toss out the acting and story and put her direction front and center for the audience members to take notice of. It is crammed down their throats.<br /><br />There are a couple of good scenes in "Kil

# Dynamic Padding

In [16]:
from transformers import DataCollatorWithPadding, AutoTokenizer
from torch.utils.data import DataLoader

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define a simple dataset
dataset = [
    {"text": "I've been waiting for a HuggingFace course my whole life."},
    {"text": "I hate this"}
]

# Tokenize the dataset
tokenized_dataset = [tokenizer(data['text'], truncation=True) for data in dataset]

# Initialize the DataCollator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create a DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=2, collate_fn=data_collator)

# Print the batch
for batch in dataloader:
    print(batch)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}




In [17]:
# DataCollatorForLanguageModeling
# Prepares data for language modeling tasks by masking tokens.

from transformers import DataCollatorForLanguageModeling, AutoTokenizer
from torch.utils.data import DataLoader

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define a simple dataset
dataset = [
    {"text": "I've been waiting for a HuggingFace course my whole life."},
    {"text": "I hate this"}
]

# Tokenize the dataset
tokenized_dataset = [tokenizer(data['text'], truncation=True) for data in dataset]

# Initialize the DataCollator with masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Create a DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=2, collate_fn=data_collator)

# Print the batch
for batch in dataloader:
    print(batch)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,   103,  2005,  1037, 17662,   103,
           103,  2026,   103,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  3403,  -100,  -100,  -100, 12172,
          2607,  -100,  2878,  2166,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100]])}


In [18]:
# DataCollatorForSeq2Seq
# Prepares data for sequence-to-sequence tasks such as translation and summarization.

from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

# Define a simple dataset
dataset = [
    {"text": "translate English to French: HuggingFace is a great library."},
    {"text": "translate English to French: I love programming."}
]

# Tokenize the dataset
tokenized_dataset = [tokenizer(data['text'], truncation=True) for data in dataset]

# Initialize the DataCollator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create a DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=2, collate_fn=data_collator)

# Print the batch
for batch in dataloader:
    print(batch)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

{'input_ids': tensor([[13959,  1566,    12,  2379,    10, 11560,  3896,   371,  3302,    19,
             3,     9,   248,  3595,     5,     1],
        [13959,  1566,    12,  2379,    10,    27,   333,  6020,     5,     1,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]), 'labels': None}


# TrainingArguments Class

In [19]:
from transformers import TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save the model checkpoints
    evaluation_strategy='epoch',     # Evaluate at the end of every epoch
    learning_rate=2e-5,              # Learning rate for the optimizer
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for the optimizer
    logging_dir='./logs',            # Directory to save the logs
    logging_steps=10,                # Log training metrics every 10 steps
    save_steps=500,                  # Save model checkpoint every 500 steps
    save_total_limit=2,              # Limit the total number of checkpoints
)

