In [3]:
## Build a BERT model from a default configuration

# Architecture only: this cell does not load a trained checkpoint.
from transformers import BertConfig, BertModel

# Default configuration object; its fields are printed in the next cell.
config = BertConfig()

# Instantiate the model from that configuration.
model = BertModel(config=config)

In [4]:
# Show a heading line followed by the printed form of the config.
print("Bert configuration", config, sep="\n")

Bert configuration
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [6]:
# Load the "bert-base-cased" checkpoint. This rebinds `model`, replacing the
# config-built model created in the earlier cell.
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-cased")

In [7]:
# Bare last expression: the notebook displays the model's module tree.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

The model loaded above, `bert-base-cased`, is a pretrained checkpoint: it has already been trained on a large dataset, so we can use it directly for different purposes or fine-tune it on our own dataset.

In [8]:
### saving the model
# Writes the model into ./bert; the directory listing in the next cell shows
# config.json and model.safetensors.
model.save_pretrained("./bert")

In [9]:
# Shell escape: list the files that were written to ./bert.
!ls ./bert

config.json  model.safetensors


In [11]:
### inference

# Three example inputs.
sequences = ["Hello", "Hi", "Hey"]

# Stand-in token ids: one identical [101, 7592, 102] row per example, so these
# are not the real encodings of three different words.
# NOTE(review): the tokenizer cell further down maps "Hello" to 8667 for
# bert-base-cased, so 7592 looks like it came from another vocabulary -- confirm.
encoded_sequences = [[101, 7592, 102] for _ in sequences]

In [13]:
import torch

# Convert the nested list of token ids into a tensor, one row per sequence.
model_inputs = torch.tensor(encoded_sequences)

In [14]:
# Forward pass of the model on the batch of token ids.
# NOTE(review): the displayed result carries grad_fn, so autograd is tracking
# this call; consider torch.no_grad() if this is inference only.
output = model(model_inputs)

In [15]:
# Bare last expression: display the model output object
# (last_hidden_state and pooler_output).
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1346,  0.2425,  0.0835,  ...,  0.0392,  0.1245, -0.2297],
         [-0.1441, -0.3777,  0.3432,  ..., -0.2696,  0.5613, -0.1820],
         [ 1.3870,  0.2144, -0.5577,  ...,  0.4530,  0.4656,  0.2042]],

        [[ 0.1346,  0.2425,  0.0835,  ...,  0.0392,  0.1245, -0.2297],
         [-0.1441, -0.3777,  0.3432,  ..., -0.2696,  0.5613, -0.1820],
         [ 1.3870,  0.2144, -0.5577,  ...,  0.4530,  0.4656,  0.2042]],

        [[ 0.1346,  0.2425,  0.0835,  ...,  0.0392,  0.1245, -0.2297],
         [-0.1441, -0.3777,  0.3432,  ..., -0.2696,  0.5613, -0.1820],
         [ 1.3870,  0.2144, -0.5577,  ...,  0.4530,  0.4656,  0.2042]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.2460,  0.2466,  0.9985,  ...,  0.9997, -0.8470,  0.9821],
        [-0.2460,  0.2466,  0.9985,  ...,  0.9997, -0.8470,  0.9821],
        [-0.2460,  0.2466,  0.9985,  ...,  0.9997, -0.8470,  0.9821]],
       grad_fn=<TanhBac

## Tokenizer
The main purpose of a tokenizer is to convert text into numerical data that the model can process.

## Some tokenization algorithms
- **Word-based**: split on spaces
- **Character-based**: split into characters
- **Subword**: frequently used words should not be split into smaller subwords, but rarely used words should be decomposed into meaningful subwords.
- Some others are `Byte-level BPE`, `WordPiece`, and `SentencePiece`.

In [17]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [19]:
from transformers import AutoTokenizer

# AutoTokenizer picks the tokenizer that matches the named checkpoint.
# This rebinds `tokenizer` from the previous cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [20]:
# Calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask; the input_ids start with 101 and end with 102.
tokenizer("Hello this is really beautiful")

{'input_ids': [101, 8667, 1142, 1110, 1541, 2712, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [21]:
### saving tokenizer
# Writes the tokenizer files under ./tokenizer and returns their paths.
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

## Tokenization
- text -> split into tokens -> add the special tokens `[CLS]` (id 101) at the start and `[SEP]` (id 102) at the end -> convert the tokens to ids.
- **Encoding**: tokenization followed by conversion to input ids, i.e. first split the text into tokens and then map each token to a number using the `vocabulary`.


In [24]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Hello this is really beautiful"

# tokenize() only splits the text into token strings; ids are produced in a
# separate step (convert_tokens_to_ids, used in a later cell).
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Hello', 'this', 'is', 'really', 'beautiful']


In [27]:
# Tokenize a sentence whose words get broken into subword pieces
# (see the printed result in the next cell).
tokenize_nepali = tokenizer.tokenize("nepal sundar desh ho.")

In [31]:
# A subword tokenizer keeps splitting a word until every piece is in its
# vocabulary; continuation pieces are printed with a "##" prefix.
print(tokenize_nepali)

['ne', '##pal', 'sun', '##dar', 'des', '##h', 'ho', '.']


In [32]:
# Tokenize an English sentence for comparison with the previous example.
tokenize_another = tokenizer.tokenize("Environmental education should be given to everyone.")

In [33]:
# Bare last expression: display the token list.
tokenize_another

['Environmental', 'education', 'should', 'be', 'given', 'to', 'everyone', '.']

In [35]:
# Tokenize a short sentence and display the resulting tokens.
# NOTE(review): the variable shares its name with the tokenizer method whose
# result it stores; a name such as `tokens_short` would read more clearly.
tokenize = tokenizer.tokenize("Learning by doing.")
tokenize

['Learning', 'by', 'doing', '.']

In [37]:
### token to number
# Map each token string to its integer id in the tokenizer's vocabulary.
# `tokens` comes from the earlier tokenize() cell.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Two separate statements: writing `print(tokens), print(ids)` builds a tuple
# of the two return values, which the notebook then echoes as `(None, None)`.
print(tokens)
print(ids)

['Hello', 'this', 'is', 'really', 'beautiful']
[8667, 1142, 1110, 1541, 2712]


(None, None)

## Decoding

In [40]:
# decode() turns a list of ids back into text.
# NOTE(review): these are the same values printed for `ids` in the previous
# cell; passing `ids` would avoid the hard-coded copy.
decoded_string = tokenizer.decode([8667, 1142, 1110, 1541, 2712])
print(f"Decoded text: {decoded_string}")

Decoded text: Hello this is really beautiful


### Handling multiple sentences

In [43]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name shared by the tokenizer and the classification model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sentence = "I am happy"

# Text -> token strings -> vocabulary ids. The printed tensor holds only the
# three word ids, i.e. no [CLS]/[SEP] ids are added on this path.
tokens = tokenizer.tokenize(sentence)
token_id = tokenizer.convert_tokens_to_ids(tokens)

# The model expects a batch of sentences, so add a leading batch dimension:
# shape (3,) becomes (1, 3).
input_ids = torch.tensor(token_id).unsqueeze(0)
print(input_ids)

output = model(input_ids)
print(output)

tensor([[1045, 2572, 3407]])
SequenceClassifierOutput(loss=None, logits=tensor([[-3.4687,  3.6865]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [44]:
## Batching means sending several sentences through the model at once; with
## only one sentence we can still build a batch by repeating it.

batched_ids = [token_id, token_id]

In [45]:
# Bare last expression: display the two-row batch.
batched_ids

[[1045, 2572, 3407], [1045, 2572, 3407]]

In [46]:
## batching two sequences of different lengths

# A ragged batch: the rows hold 3 and 2 ids respectively.
batched_ids = [
    [200, 200, 200],
    [200, 200]
]

# Padding makes every row the same length by appending a special padding
# token to the shorter rows until they are as long as the longest row.
# Example: with 20 sentences of 20 words and 1 sentence of 10 words, the short
# sentence receives 10 padding tokens so that all 21 rows have 20 entries.
# 100 is simply the placeholder value chosen here; a later cell uses
# tokenizer.pad_token_id instead.
padding_id = 100

# Extend each row with padding_id up to the length of the longest row.
batched_ids = [
    row + [padding_id] * (max(map(len, batched_ids)) - len(row))
    for row in batched_ids
]

In [47]:
# Load the sequence-classification model for `checkpoint`.
# NOTE(review): this repeats the call from the earlier cell that first defined
# `model` for this checkpoint, so it is likely redundant.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [48]:
# Each sequence as its own one-row batch: three ids and two ids.
sequence1_ids = [[200] * 3]
sequence2_ids = [[200] * 2]

In [49]:
# Both sequences in one batch; the shorter row is padded with the tokenizer's
# own pad token id so the rows have equal length.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

In [51]:
# Logits for each sequence on its own, then for the padded batch.
# In the printed result the batch's second row differs from sequence2 run
# alone: no attention mask is passed here, which the next cell addresses.
print(
    *(
        model(torch.tensor(ids_batch)).logits
        for ids_batch in (sequence1_ids, sequence2_ids, batched_ids)
    ),
    sep="\n",
)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [52]:
### Attention mask
# The mask tells the model which positions to attend to, so that a padded row
# gives the same logits as the same sequence without padding.
#   1: real token -> attend to it
#   0: padding    -> ignore it
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

output = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


### Longer sequences

In [59]:
## Models accept only a limited sequence length, and the limit differs from
## model to model; longer inputs are truncated to a maximum length.
## NOTE: the slice below cuts the *string* to max_length characters as a simple
## illustration. Truncating to max_length tokens is done by the tokenizer,
## e.g. tokenizer(sentence, truncation=True, max_length=max_length).

sentence = "This is a really long sentence"
max_length = 10

# Keep only the first max_length characters of the sentence.
truncate = sentence[:max_length]

# Two separate statements: writing `print(a), print(b)` builds a tuple of the
# two return values, which the notebook then echoes as `(None, None)`.
print(f"Original: {sentence}")
print(f"After: {truncate}")

Original: This is a really long sentence
After: This is a 


(None, None)