# Training Bert model

### Importing libraries

In [1]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json

2022-06-26 10:26:50.955929: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Selecting dataset

In [2]:
dataset = load_dataset('text',data_files={'train':['full_dataset.txt'],'test':'full_dataset.txt'})

d = dataset

d['train'], d['test']

Using custom data configuration default-be8aa9a43e6a8c69
Reusing dataset text (/Users/nitin/.cache/huggingface/datasets/text/default-be8aa9a43e6a8c69/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)
100%|██████████| 2/2 [00:00<00:00, 126.40it/s]


(Dataset({
     features: ['text'],
     num_rows: 44954
 }),
 Dataset({
     features: ['text'],
     num_rows: 44954
 }))

### Training tokenizer

In [3]:
special_tokens = [
    '[PAD]','[UNK]','[CLS]','[SEP]','[MASK]','<S>','<T>'
]

files = ['full_dataset.txt']

vocab_size = 30_522
max_length = 512
truncate_longer_samples = False

tokenizer = BertWordPieceTokenizer()

tokenizer.train(files=files,vocab_size=vocab_size,special_tokens=special_tokens)

tokenizer.enable_truncation(max_length=max_length)






### Saving the model

In [4]:
model_path = 'custom-bert'

if not os.path.isdir(model_path):
    os.mkdir(model_path)

tokenizer.save_model(model_path)

tokenizer.save_model(model_path)
with open(os.path.join(model_path, "config.json"), "w") as f:
  tokenizer_cfg = {
      "do_lower_case": True,
      "unk_token": "[UNK]",
      "sep_token": "[SEP]",
      "pad_token": "[PAD]",
      "cls_token": "[CLS]",
      "mask_token": "[MASK]",
      "model_max_length": max_length,
      "max_len": max_length,
  }

  #in json format
  json.dump(tokenizer_cfg, f)

### Tokenizing the dataset

In [5]:
tokenizer = BertTokenizerFast.from_pretrained(model_path)

def encode_with_truncation(examples):
  #Mapping function to tokenize the sentences passed with truncation
  return tokenizer(examples["text"], truncation=True, padding="max_length",
                   max_length=max_length, return_special_tokens_mask=True)

def encode_without_truncation(examples):
  #Mapping function to tokenize the sentences passed without truncation
  return tokenizer(examples["text"], return_special_tokens_mask=True)

# the encode function will depend on the truncate_longer_samples variable
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation
# tokenizing the train dataset
train_dataset = d["train"].map(encode, batched=True)
# tokenizing the testing dataset
test_dataset = d["test"].map(encode, batched=True)
if truncate_longer_samples:
  # remove other columns and set input_ids and attention_mask as PyTorch tensors
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
  test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
  # remove other columns, and remain them as Python lists
  test_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
  train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

Didn't find file custom-bert/tokenizer.json. We won't load it.
Didn't find file custom-bert/added_tokens.json. We won't load it.
Didn't find file custom-bert/special_tokens_map.json. We won't load it.
Didn't find file custom-bert/tokenizer_config.json. We won't load it.
loading file custom-bert/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file custom-bert/config.json
Model config BertConfig {
  "_name_or_path": "custom-bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model_max_length": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  

In [6]:
from itertools import chain
import torch

print(torch.__version__)

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

if not truncate_longer_samples:
  train_dataset = train_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_length}")
  test_dataset = test_dataset.map(group_texts, batched=True,
                                  desc=f"Grouping texts in chunks of {max_length}")
  # convert them from lists to torch tensors
  train_dataset.set_format("torch")
  test_dataset.set_format("torch")

1.11.0


Grouping texts in chunks of 512: 100%|██████████| 45/45 [00:01<00:00, 25.90ba/s]
Grouping texts in chunks of 512: 100%|██████████| 45/45 [00:01<00:00, 22.95ba/s]


In [8]:
len(train_dataset), len(test_dataset)

(1541, 1541)

### Loading the model

In [9]:
model_config = BertConfig(vocab_size=vocab_size,max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

### Training

In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=10, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1541
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 190
100%|██████████| 190/190 [22:20:58<00:00, 416.98s/it]   

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 190/190 [22:20:58<00:00, 423.46s/it]

{'train_runtime': 80458.1518, 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.002, 'train_loss': 7.815504214638158, 'epoch': 9.98}





TrainOutput(global_step=190, training_loss=7.815504214638158, metrics={'train_runtime': 80458.1518, 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.002, 'train_loss': 7.815504214638158, 'epoch': 9.98})

### Using the model

In [22]:
BertForMaskedLM.save_pretrained(model,'custom-bert')

Configuration saved in custom-bert/config.json
Model weights saved in custom-bert/pytorch_model.bin


In [40]:

model = BertModel.from_pretrained('/Users/nitin/Code/PyCode/Bert/custom-bert')

tokenizer = AutoTokenizer.from_pretrained('/Users/nitin/Code/PyCode/Bert/custom-bert/')


sentences = [
  "The sky was cloudy today, but later today it's rainy",
  "The sky was cloudy yesterday, but today it's rainy.",
]

tokens = {'input_ids':[],'attention_mask':[]}

for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

outputs = model(**tokens)
outputs.keys()
embeddings = outputs.last_hidden_state

attention_mask = tokens['attention_mask']
attention_mask.shape

mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

masked_embeddings = embeddings * mask
masked_embeddings.shape

summed = torch.sum(masked_embeddings, 1)
summed.shape

summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

mean_pooled = summed / summed_mask


loading configuration file /Users/nitin/Code/PyCode/Bert/custom-bert/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /Users/nitin/Code/PyCode/Bert/custom-bert/pytorch_model.bin
Some weights of the model checkpoint at /Users/nitin/Code/PyCode/Bert/custom-bert were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNo

In [41]:
from sklearn.metrics.pairwise import cosine_similarity


mean_pooled = mean_pooled.detach().numpy()

cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.98094076]], dtype=float32)