In [18]:
from transformers import BertConfig, BertModel

In [19]:
config = BertConfig()

In [20]:
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [21]:
config.hidden_size # defines the size of the hidden_states vector

768

In [22]:
config.num_hidden_layers # defines the number of layers the Transformer model has.

12

In [23]:
model = BertModel(config)

In [24]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [26]:
print(f"Number of Model Parameters: {round(model.num_parameters()/10e6, 2)} M")

Number of Model Parameters: 10.95 M


In [29]:
model.pooler.dense.weight.detach().numpy()

array([[-4.7761323e-03,  3.3261843e-02,  1.3391490e-03, ...,
        -1.0093682e-02, -2.0984944e-02,  3.0956868e-02],
       [-8.6294618e-03,  1.4971571e-02, -5.3771334e-03, ...,
        -2.3622787e-02,  5.3849813e-05, -1.2659279e-02],
       [ 6.2763658e-03,  1.2280456e-03,  4.6687871e-03, ...,
        -5.6032054e-03,  4.2445972e-03, -1.0656831e-02],
       ...,
       [ 1.3440725e-02, -1.1845819e-02, -1.6712217e-02, ...,
         2.1449083e-02, -5.7968069e-03, -1.6273479e-03],
       [-2.2355940e-04, -3.1497553e-02,  1.5658418e-02, ...,
         7.9985969e-03,  2.1920078e-02,  2.9532006e-02],
       [-4.8434441e-03,  8.3913207e-03, -8.5673193e-03, ...,
        -8.5522952e-03,  2.7002890e-02,  8.5937018e-03]], dtype=float32)

In [30]:
model = BertModel.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
model.pooler.dense.weight.detach().numpy()

array([[ 0.01198541,  0.00904225, -0.01089565, ..., -0.01370323,
        -0.02773503, -0.01926875],
       [ 0.00070473, -0.00884226,  0.01250864, ...,  0.00190787,
         0.007036  ,  0.0337044 ],
       [ 0.04902944,  0.02762233, -0.00163483, ..., -0.039266  ,
         0.0475751 ,  0.04069884],
       ...,
       [ 0.04361719,  0.03217682, -0.00913559, ..., -0.0532724 ,
         0.04549621,  0.03823399],
       [-0.02214781, -0.00306864, -0.04136439, ...,  0.00384682,
        -0.01343189,  0.01965056],
       [ 0.0129369 ,  0.0239422 , -0.02681959, ..., -0.07378349,
         0.04688236, -0.02581599]], dtype=float32)

## Tokenizer Alogorithms

### Word-based Tokenizer

In [34]:
sentence = "Md Abul Hayat was a student."
tokenized_text = sentence.split()
tokenized_text

['Md', 'Abul', 'Hayat', 'was', 'a', 'student.']

### Character-based Tokenizer

In [40]:
tokenized_text = [*sentence]
tokenized_text

['M',
 'd',
 ' ',
 'A',
 'b',
 'u',
 'l',
 ' ',
 'H',
 'a',
 'y',
 'a',
 't',
 ' ',
 'w',
 'a',
 's',
 ' ',
 'a',
 ' ',
 's',
 't',
 'u',
 'd',
 'e',
 'n',
 't',
 '.']

### BERT

In [44]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [45]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [46]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [47]:
tokenizer("a")

{'input_ids': [101, 170, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [48]:
tokenizer("is")

{'input_ids': [101, 1110, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [67]:
from transformers import AutoTokenizer

In [68]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokens = tokenizer.tokenize("Using a Transformer network is simple")

In [69]:
tokens

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']

In [70]:
# tokens = tokenizer.tokenize("Annoyingly")

In [71]:
# tokens

In [72]:
# tokens = tokenizer.tokenize("Happening")

In [73]:
# tokens

In [74]:
ids = tokenizer.convert_tokens_to_ids(tokens)

In [75]:
ids

[7993, 170, 13809, 23763, 2443, 1110, 3014]

In [76]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [80]:
decoded_string = tokenizer.decode([7993, 170, 13809, 23763, 2443, 1110, 3014])
decoded_string

'Using a Transformer network is simple'

In [82]:
decoded_string = tokenizer.decode([101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102])
decoded_string

'[CLS] Using a Transformer network is simple [SEP]'

In [83]:
decoded_string = tokenizer.decode(tokenizer("Using a Transformer network is simple")['input_ids'])
decoded_string

'[CLS] Using a Transformer network is simple [SEP]'