# Training From Scratch

## Fetching Data

In [2]:
import pandas as pd

# Load the IMDB reviews and dump the `review` column to a plain-text corpus file.
imdb_df = pd.read_csv("IMDB-Dataset.csv")

# NOTE(review): `Series.to_string` renders the column the way pandas displays it, so long
# reviews appear to be cut short (see the "wh..." ending in the samples printed further down).
# Left unchanged here because later cells tokenize the corpus without truncation.
reviews = imdb_df.review.to_string(index=None)

# Pin the encoding so the file content does not depend on the platform's default locale.
with open("corpus.txt", "w", encoding="utf-8") as f:
    # `reviews` is a single string, so a plain write is the right call
    # (`writelines` would iterate over it character by character).
    f.write(reviews)

## Training the Tokenizer

In [3]:
from tokenizers import BertWordPieceTokenizer

# Start from an empty WordPiece tokenizer and learn its vocabulary from corpus.txt.
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(files=["corpus.txt"])






To see the tokenized words along with their index:

In [7]:
# bert_wordpiece_tokenizer.get_vocab() # for all items

In [5]:
list(bert_wordpiece_tokenizer.get_vocab().items())[:4]

[('stow', 11786), ('carrie', 11020), ('disbel', 8959), ('astronauts', 17309)]

### Saving/Loading the Tokenizer

We have to save the tokenizer so we can reuse it later for training/inference.

In [8]:
# To save
import os

# Create the target directory from Python instead of `!mkdir`: this is a no-op when the
# directory already exists, so the cell can be re-run without an error.
os.makedirs("tokenizer", exist_ok=True)
bert_wordpiece_tokenizer.save_model("tokenizer")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['tokenizer/vocab.txt']

In [2]:
# To load
from tokenizers import BertWordPieceTokenizer

# Build the tokenizer directly from the saved vocabulary file. No empty
# BertWordPieceTokenizer() is created first, so the trained
# `bert_wordpiece_tokenizer` from the training cell is not overwritten.
mytokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

### Using the tokenizer (just for testing purpose)

In [3]:
tokenized_sentence = mytokenizer.encode("Oh it works just fine")

In [4]:
tokenized_sentence.tokens

['[CLS]', 'oh', 'it', 'works', 'just', 'fine', '[SEP]']

Similarly, words that are not in the vocabulary are broken into subwords:

In [5]:
tokenized_sentence = mytokenizer.encode("ohoh i thougt it might be workingg well")

In [6]:
print(tokenized_sentence.tokens)

['[CLS]', 'oh', '##o', '##h', 'i', 'thoug', '##t', 'it', 'might', 'be', 'working', '##g', 'well', '[SEP]']


### Saving Config

Omitting this step will give a warning.

In [None]:
# AutoConfig was not imported anywhere before this cell, so import it here.
from transformers import AutoConfig

# NOTE(review): this only loads a config object into memory; nothing is written to disk.
# If the intent is to avoid the "tokenizer/config.json not found" message printed when the
# tokenizer is loaded, the config presumably has to be saved into that directory — confirm.
config = AutoConfig.from_pretrained('distilroberta-base')

## Tokenization (i.e. processing input for model)

In [5]:
from transformers import BertTokenizerFast

# Load the vocabulary saved under ./tokenizer into a transformers fast tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path="tokenizer")

file tokenizer/config.json not found
file tokenizer/config.json not found


## Creating the Dataset for MLM

In [9]:
# OLD WAY
from transformers import LineByLineTextDataset

# Build the dataset from corpus.txt with the custom tokenizer and a block size of 128.
dataset_old = LineByLineTextDataset(
    file_path="corpus.txt",
    tokenizer=tokenizer,
    block_size=128,
)



In [17]:
len(dataset_old)

50022

In [21]:
dataset_old[10]

{'input_ids': tensor([   2, 2129,  136, 2237,  148,  195,  146,  508, 3868,  394,  169,   18,
           18,   18,    3])}

In [25]:
' '.join(tokenizer.convert_ids_to_tokens(dataset_old[10]['input_ids']))

'[CLS] phil the alien is one of those quirky films wh . . . [SEP]'

In [20]:
# NEW WAY
from datasets import load_dataset

# The generic 'text' loader gives a DatasetDict with a 'train' split and a single
# 'text' column (see the printed structure below).
dataset = load_dataset(path='text', data_files='corpus.txt')

Using custom data configuration default-1fabf752ad1491af
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-1fabf752ad1491af/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 50028
    })
})

In [22]:
dataset["train"][10]

{'text': ' Phil the Alien is one of those quirky films wh...'}

## Tokenization

In [23]:
# Tokenize in batches. Cap every example at 128 tokens, the same limit the
# LineByLineTextDataset cell uses (block_size=128), so that no sequence can grow past
# the 512 position embeddings of the default BertConfig.
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch['text'], truncation=True, max_length=128),
    batched=True,
)

  0%|          | 0/51 [00:00<?, ?ba/s]

In [24]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text', 'token_type_ids'],
        num_rows: 50028
    })
})

In [25]:
from pprint import pprint

In [26]:
pprint(tokenized_dataset["train"][10])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               6316,
               1996,
               7344,
               2003,
               2028,
               1997,
               2216,
               21864,
               15952,
               3152,
               1059,
               2232,
               1012,
               1012,
               1012,
               102],
 'text': ' Phil the Alien is one of those quirky films wh...',
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [27]:
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

In [28]:
pprint(tokenized_dataset["train"][10])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               6316,
               1996,
               7344,
               2003,
               2028,
               1997,
               2216,
               21864,
               15952,
               3152,
               1059,
               2232,
               1012,
               1012,
               1012,
               102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [29]:
# For cpu training and demo purposes: keep only the first 1000 rows of the train split.
# Alternative that draws a seeded random subset instead of the first rows:
# tokenized_train_dataset_1k = tokenized_dataset['train'].shuffle(seed=42).select(range(1000))
tokenized_train_dataset_1k = tokenized_dataset['train'].select(range(1000))

In [30]:
tokenized_train_dataset_1k

Dataset({
    features: ['attention_mask', 'input_ids', 'token_type_ids'],
    num_rows: 1000
})

In [31]:
tokenized_train_dataset_1k[10]

{'input_ids': [101,
  6316,
  1996,
  7344,
  2003,
  2028,
  1997,
  2216,
  21864,
  15952,
  3152,
  1059,
  2232,
  1012,
  1012,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [32]:
# Switch the dataset's output format in place: indexing now yields torch tensors
# instead of the Python lists shown in the previous cell.
tokenized_train_dataset_1k.set_format("torch")
tokenized_train_dataset_1k[10]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([  101,  6316,  1996,  7344,  2003,  2028,  1997,  2216, 21864, 15952,
          3152,  1059,  2232,  1012,  1012,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

## Data Collator

This is like a pre-processing function for the input. In this case, it will mask the input with a probability of 15%.

Note that any preprocessing will be done on the fly as the data is passed to the model.

In [44]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator using the custom tokenizer and a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

## Training Arguments

In [45]:
from transformers import TrainingArguments

# One epoch with batches of 128 per device; checkpoints go to ./BERT and may overwrite it.
training_args = TrainingArguments(
    output_dir="BERT",
    per_device_train_batch_size=128,
    num_train_epochs=1,
    overwrite_output_dir=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Model Configuration

Here, we will provide the configurations for the model. We will inherit from the BERT model.

In [50]:
from transformers import BertConfig 
BertConfig() 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

To change the configuration, set the appropriate values:

In [51]:
# tiny_bert_config = BertConfig(max_position_embeddings=512, hidden_size=128, num_attention_heads=2, num_hidden_layers=2, intermediate_size=512) 
# tiny_bert_config 

In [52]:
# tiny_bert = BertForMaskedLM(tiny_bert_config) 
# trainer = Trainer(model=tiny_bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 
# trainer.train() 

But we will use the default settings for now.

In [46]:
from transformers import BertConfig, BertForMaskedLM

# Randomly initialised masked-LM model built from the default BertConfig shown above.
bert = BertForMaskedLM(config=BertConfig())

## Trainer Object

In [47]:
from transformers import Trainer

# Wire together the model, the 1k-row demo dataset, the masking collator and the arguments.
trainer = Trainer(
    model=bert,
    train_dataset=tokenized_train_dataset_1k,
    data_collator=data_collator,
    args=training_args,
)

In [48]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 8


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8, training_loss=8.57533073425293, metrics={'train_runtime': 154.6716, 'train_samples_per_second': 6.465, 'train_steps_per_second': 0.052, 'total_flos': 12284262122400.0, 'train_loss': 8.57533073425293, 'epoch': 1.0})

In [49]:
trainer.save_model("MyBERT")

Saving model checkpoint to MyBERT
Configuration saved in MyBERT/config.json
Model weights saved in MyBERT/pytorch_model.bin


# Training for NLP Tasks - Without Trainer

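Now that we have seen how to train a model from scratch, we can train a model for other tasks. Let's train one for a sentiment classification task on the same IMDB dataset; this time the output will be either 0 (negative) or 1 (positive). Note that the code below loads the pretrained `bert-base-uncased` checkpoint rather than the `MyBERT` model saved above.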

## Loading Model

In [1]:
from transformers import BertModel, BertTokenizerFast

# Tokenizer and encoder both come from the same pretrained checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
# bert.layers

Some weights of the model checkpoint at ../../models/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Feeding a Single Input

In [2]:
# Encode two sentences as PyTorch tensors, each padded/truncated to exactly 256 tokens.
tokenized_text = tokenizer.batch_encode_plus(
    ["hello how is it going with you", "lets test it"],
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors="pt",
)
# tokenized_text

In [3]:
bert(**tokenized_text)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 1.0047e-01,  6.7703e-02, -8.3360e-02,  ..., -4.9330e-01,
           1.1654e-01,  2.2665e-01],
         [ 3.2362e-01,  3.7072e-01,  6.1469e-01,  ..., -6.2727e-01,
           3.7908e-01,  7.0531e-02],
         [ 1.9953e-01, -8.7551e-01, -6.4786e-02,  ..., -1.2808e-02,
           3.0765e-01, -2.0732e-02],
         ...,
         [-6.5330e-02,  1.1905e-01,  5.7685e-01,  ..., -2.9546e-01,
           2.4974e-02,  1.1396e-01],
         [-2.6472e-01, -7.8638e-02,  5.4728e-01,  ..., -1.3752e-01,
          -5.9469e-02, -5.1793e-02],
         [-2.4496e-01, -1.1480e-01,  5.9217e-01,  ..., -1.5688e-01,
          -3.3976e-02, -8.4614e-02]],

        [[ 2.9457e-02,  2.3087e-01,  2.9265e-01,  ..., -1.3042e-01,
           1.8966e-01,  4.6843e-01],
         [ 1.7052e+00,  6.9136e-01,  7.3151e-01,  ...,  2.8930e-01,
           5.3676e-01, -1.5455e-01],
         [ 1.0460e-01,  9.6368e-02,  6.9966e-02,  ..., -4.1592e-01,
          -1.

Our model has two outputs: `last_hidden_state` and `pooler_output`.  

The `last_hidden_state` provides all token embeddings from BERT with additional [CLS] and [SEP] tokens at the start and end, respectively.
    
We will add layers to these outputs to complete our classification model

In [29]:
bert(**tokenized_text)[1].size()

torch.Size([2, 768])

## Creating the Complete Model

In [2]:
import torch
import torch.nn as nn

from torchsummary import summary

In [3]:
class Model(nn.Module):
    """Pretrained BERT encoder followed by a linear layer that outputs two class scores."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Size the head from this module's own encoder rather than from a notebook-level
        # `bert` variable, so the class does not depend on another cell having been run.
        self.out = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch with BERT and return one row of two scores per input sequence."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Alternative head input: outputs.pooler_output
        # Position 0 holds the [CLS] token that the tokenizer puts at the start of every
        # sequence; position 1 would be the first ordinary token of the text instead.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return self.out(cls_embedding)

In [4]:
model = Model()

Some weights of the model checkpoint at ../../models/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
# summary(model, [(768, 1), (768, 1), (768, 1)])

In [66]:
print(model)

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      

In [52]:
# Total number of parameters (only those with requires_grad set are counted)
sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)

109483778

## Feeding a Single Input to the (Complete) Model

In [68]:
# The same sentence twice, so both rows of the model output should be identical.
tokenized_text = tokenizer.batch_encode_plus(
    ["hello how is it going with you", "hello how is it going with you"],
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors="pt",
)
# tokenized_text

In [69]:
tokenized_text["input_ids"].size(), tokenized_text["attention_mask"].size(), tokenized_text["token_type_ids"].size()

(torch.Size([2, 256]), torch.Size([2, 256]), torch.Size([2, 256]))

In [70]:
# Pass the three encoded tensors by name to the classifier's forward().
outputs = model(
    input_ids=tokenized_text["input_ids"],
    attention_mask=tokenized_text["attention_mask"],
    token_type_ids=tokenized_text["token_type_ids"],
)

In [71]:
outputs.shape

torch.Size([2, 2])

In [72]:
outputs

tensor([[-0.6137, -0.1417],
        [-0.6137, -0.1417]], grad_fn=<AddmmBackward>)

## Creating Dataset

In [5]:
from transformers import BertModel, BertTokenizerFast 
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 

In [6]:
import numpy as np
import pandas as pd

# imdb_df = pd.read_csv("IMDB-Dataset.csv")
# Only the first 100 rows are used here.
imdb_df = pd.read_csv("IMDB-Dataset.csv").head(100)
reviews = list(imdb_df.review)
tokenized_reviews = tokenizer.batch_encode_plus(reviews, return_tensors="pt", max_length=256, truncation=True, padding='max_length')

# Positional split without shuffling: the first 80% of rows train, the rest validate.
train_split = int(0.8 * len(tokenized_reviews["attention_mask"]))

train_tokens = tokenized_reviews["input_ids"][:train_split]
val_tokens = tokenized_reviews["input_ids"][train_split:]
train_masks = tokenized_reviews["attention_mask"][:train_split]
val_masks = tokenized_reviews["attention_mask"][train_split:]
train_ids = tokenized_reviews["token_type_ids"][:train_split]
val_ids = tokenized_reviews["token_type_ids"][train_split:]

# Map the sentiment strings to class indices: "positive" -> 1, anything else -> 0.
sentiments = list(imdb_df.sentiment)
labels = np.array([1 if sentiment == "positive" else 0 for sentiment in sentiments])

# Force int64 targets: numpy's default integer width depends on the platform, and the
# labels are later fed to CrossEntropyLoss as class indices.
train_labels = torch.tensor(labels[:train_split], dtype=torch.long)
val_labels = torch.tensor(labels[train_split:], dtype=torch.long)

In [7]:
train_split

80

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [9]:
# train_seq = torch.tensor(tokens_train['input_ids'])
# train_mask = torch.tensor(tokens_train['attention_mask'])
# train_y = torch.tensor(train_pd['label'], dtype=torch.long)

# test_seq = torch.tensor(tokens_test['input_ids'])
# test_mask = torch.tensor(tokens_test['attention_mask'])
# test_y = torch.tensor(test_pd['label'], dtype=torch.long)

In [10]:
batch_size=32

In [11]:
# Training batches are drawn in random order.
train_dataset = TensorDataset(train_tokens, train_masks, train_ids, train_labels)
train_sampler = RandomSampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

In [12]:
# Validation batches are read in their original order.
val_dataset = TensorDataset(val_tokens, val_masks, val_ids, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
)

## Training Loop

In [13]:

import time
import datetime


import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import tableprint as tp
import torchmetrics



In [14]:
class Model_TL(pl.LightningModule):
    """Lightning wrapper that trains a classifier and prints one table row of metrics per epoch.

    Args:
        model: Module called as ``model(input_ids, masks, type_ids)`` that returns
            per-class scores for each example in the batch.
        lr: Learning rate handed to Adam. Defaults to 0.0005, the value that was
            previously hard-coded.
    """

    def __init__(self, model, lr=0.0005):
        super().__init__()
        self.model = model
        self.lr = lr
        self.loss_fn = nn.CrossEntropyLoss()
        # Running accuracies: updated on every batch, read and reset at epoch end.
        self.train_accm = torchmetrics.Accuracy()
        self.valid_accm = torchmetrics.Accuracy()
        # Most recent per-epoch values, used to fill the table row.
        self.avg_train_loss = 0.
        self.avg_valid_loss = 0.
        self.train_acc = 0.
        self.valid_acc = 0.
        # Wall-clock bounds of the current training epoch.
        self.start_time = 0
        self.end_time = 0
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self.epoch_mins = 0
        self.epoch_secs = 0
        # tableprint context; opened lazily when the first epoch finishes.
        self.table_context = None

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def _step(self, batch, accuracy_metric):
        """Run one batch, update ``accuracy_metric`` and return loss, predictions and labels."""
        input_ids, masks, type_ids, labels = batch
        output = self.model(input_ids, masks, type_ids)
        # Predicted class = index of the largest score along dimension 1.
        _, predictions = torch.max(output, 1)
        accuracy_metric(predictions, labels)
        loss = self.loss_fn(output, labels)
        return {"loss": loss, "p": predictions, "y": labels}

    def training_step(self, batch, batch_idx):
        """Training batch: returns a dict with ``loss``, predictions ``p`` and labels ``y``."""
        return self._step(batch, self.train_accm)

    def validation_step(self, batch, batch_idx):
        """Validation batch: same dict layout as ``training_step``."""
        return self._step(batch, self.valid_accm)

    def on_train_epoch_start(self):
        """Record when the training epoch started."""
        self.start_time = time.time()

    def validation_epoch_end(self, outputs):
        """Aggregate validation loss/accuracy for the epoch and log them.

        Does nothing while the trainer is running its sanity check.
        """
        if self.trainer.sanity_checking:
            return

        self.avg_valid_loss = torch.stack([x['loss'] for x in outputs]).mean().item()
        self.valid_acc = (self.valid_accm.compute() * 100).item()
        self.valid_accm.reset()
        # logger=False: these values are logged for in-process use only and are not
        # forwarded to the attached logger by these calls.
        self.log("epoch_num", int(self.current_epoch+1), on_step=False, on_epoch=True, prog_bar=False, logger=False)
        self.log("val_loss", self.avg_valid_loss, on_step=False, on_epoch=True, prog_bar=False, logger=False)
        self.log("val_acc", self.valid_acc, on_step=False, on_epoch=True, prog_bar=False, logger=False)

#         if self.current_epoch == self.trainer.max_epochs - 1:
#             y = torch.cat([x['y'] for x in outputs])
#             p = torch.cat([x['p'] for x in outputs])

    def training_epoch_end(self, outputs):
        """Aggregate training loss/accuracy for the epoch and reset the running accuracy."""
        self.avg_train_loss = torch.stack([x['loss'] for x in outputs]).mean().item()
        self.train_acc = (self.train_accm.compute() * 100).item()
        self.train_accm.reset()

    def on_train_epoch_end(self):
        """Print this epoch's table row and send the same metrics to the logger."""
        self.end_time = time.time()
        time_int = self.format_time(self.start_time, self.end_time)

        metrics = {'epoch': self.current_epoch+1, 'Train Acc': self.train_acc, 'Train Loss': self.avg_train_loss,  'Valid Acc': self.valid_acc, 'Valid Loss': self.avg_valid_loss}
        if self.table_context is None:
            self.table_context = tp.TableContext(headers=['epoch', 'Train Acc', 'Train Loss', 'Valid Acc', 'Valid Loss', 'Time'])
            self.table_context.__enter__()
        self.table_context([self.current_epoch+1, self.train_acc, self.avg_train_loss, self.valid_acc, self.avg_valid_loss, time_int])
        self.logger.log_metrics(metrics)

        if self.current_epoch == self.trainer.max_epochs - 1:
            # Last epoch: close the table and forget the context so that a later
            # fit() on the same object opens a fresh table instead of reusing a closed one.
            self.table_context.__exit__()
            self.table_context = None

    def format_time(self, start_time, end_time):
        """Return the elapsed time between two timestamps as an ``H:MM:SS`` string, rounded to whole seconds."""
        elapsed_time = end_time - start_time
        elapsed_rounded = int(round((elapsed_time)))
        return str(datetime.timedelta(seconds=elapsed_rounded))

In [15]:
plmodel = Model_TL(model)

In [16]:
# Checkpoint written as ./model, selected by the highest logged `val_acc`.
checkpoint_callback = ModelCheckpoint(
    dirpath='./',
    filename='model',
    monitor='val_acc',
    mode='max',
)
# CSV metrics go under csv_logs/Ch3/version_0.
csvlogger = CSVLogger('csv_logs', name='Ch3', version=0)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


In [None]:
# Single CPU epoch, no sanity-check validation pass, logging on every step.
trainer = pl.Trainer(
    gpus=0,
    max_epochs=1,
    num_sanity_val_steps=0,
    log_every_n_steps=1,
    logger=csvlogger,
    callbacks=[checkpoint_callback],
)
trainer.fit(plmodel, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(

  | Name       | Type             | Params
------------------------------------------------
0 | model      | Model            | 109 M 
1 | loss_fn    | CrossEntropyLoss | 0     
2 | train_accm | Accuracy         | 0     
3 | valid_accm | Accuracy         | 0     
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

# Other autoencoding models

## BERT-BASE

In [1]:
from transformers import BertConfig, BertModel

# Default BertConfig -> randomly initialised model; report its size in millions of parameters.
bert_base = BertConfig()
model = BertModel(bert_base)
print("{} million parameters".format(model.num_parameters() / 10**6))

109.48224 million parameters


## Albert-base

In [3]:
from transformers import AlbertConfig, AlbertModel

# ALBERT with hidden size, head count and feed-forward size overridden to base-sized values.
albert_base = AlbertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072)
model = AlbertModel(albert_base)
print("{} million parameters".format(model.num_parameters() / 10**6))

11.683584 million parameters


## BERT-LARGE

In [4]:
from transformers import BertConfig, BertModel

# BERT with a wider hidden size, twice the layers and more heads than the default config.
bert_large = BertConfig(
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=4096,
)
model = BertModel(bert_large)
print("{} million parameters".format(model.num_parameters() / 10**6))

335.141888 million parameters


## ALBERT-xxlarge

In [6]:
from transformers import AlbertConfig, AlbertModel

# ALBERT built from the library's default AlbertConfig values.
albert_xxlarge = AlbertConfig()
model = AlbertModel(albert_xxlarge)
print("{} million parameters".format(model.num_parameters() / 10**6))

222.595584 million parameters


## Roberta

In [7]:
from transformers import RobertaConfig, RobertaModel

# RoBERTa built from the library's default RobertaConfig values.
conf = RobertaConfig()
model = RobertaModel(conf)
print("{} million parameters".format(model.num_parameters() / 10**6))

109.48224 million parameters


# Tokenization

## Loading a Turkish Pre-trained Tokenizer

In [9]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer of a Turkish BERT checkpoint and report its vocabulary size and class.
tokenizerTUR = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
print("VOC size is: {}".format(tokenizerTUR.vocab_size))
print("The model is {}".format(type(tokenizerTUR)))

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256k [00:00<?, ?B/s]

VOC size is: 32000
The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


## Loading an English Pre-trained Tokenizer

In [10]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer of the English bert-base-uncased checkpoint and report the same details.
tokenizerEN = AutoTokenizer.from_pretrained("bert-base-uncased")
print("VOC size is: {}".format(tokenizerEN.vocab_size))
print("The model is {}".format(type(tokenizerEN)))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

VOC size is: 30522
The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


## Tokenizing a Word with Tokenizers of Different Languages

In [11]:
# Check whether the English word exists as a whole entry in each vocabulary.
word_en = "telecommunications"
print("is in Turkish Model ? {}".format(word_en in tokenizerTUR.vocab))
print("is in English Model ? {}".format(word_en in tokenizerEN.vocab))

is in Turkish Model ? False
is in English Model ? True


In [12]:
tokens=tokenizerTUR.tokenize(word_en)
tokens

['tel', '##eco', '##mm', '##un', '##ica', '##tions']

In [13]:
tokens= tokenizerEN.tokenize(word_en)
tokens

['telecommunications']

# The tokenizers library

## Steps

- Modeling
- Normalizer
- PreTokenizer
- Post-Processor
- Decoding

## Obtaining Data for subsequent training

In [14]:
import nltk
from nltk.corpus import gutenberg

nltk.download('gutenberg')
nltk.download('punkt')

# Three plays from the Gutenberg corpus, flattened into a list of sentences;
# each sentence's tokens are joined with single spaces.
plays = [
    'shakespeare-macbeth.txt',
    'shakespeare-hamlet.txt',
    'shakespeare-caesar.txt',
]
shakespeare = [
    " ".join(sentence)
    for play in plays
    for sentence in gutenberg.sents(play)
]

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Training BPE from Scratch

### Modeling

We have to choose the type of model we want to train. Examples include WordLevel, BPE, WordPiece, and Unigram.

In [18]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
# Tokenizer built around a BPE model created with no arguments; it is trained on the
# Shakespeare sentences in the "Training!" step below.
tokenizer = Tokenizer(BPE())

### Normalizer

It is responsible for pre-processing the input string in order to normalize it as relevant for a given use case. 

In [19]:
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
# Chain three normalizers, applied in the listed order: NFD, Lowercase, StripAccents.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

### PreTokenizer

This is responsible for splitting the input according to a set of rules. 

In [20]:
from tokenizers.pre_tokenizers import Whitespace
# Use the library's Whitespace pre-tokenizer to split the normalized text before the model runs.
tokenizer.pre_tokenizer = Whitespace() 

### Post Processing

This defines how to add special tokens to our sentences.

In [15]:
from tokenizers.processors import TemplateProcessing

# The position of each token in this list is used as its id in the template below.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Wrap a single sequence as "[CLS] A [SEP]" and a pair as "[CLS] A [SEP] B [SEP]".
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (token, special_tokens.index(token)) for token in ("[CLS]", "[SEP]")
    ],
)

In [23]:
tokenizer.post_processor=temp_proc

### Decoder

This lets the tokenizer convert the token IDs back to readable text by processing the special characters or identifiers used. 

In [24]:
from tokenizers.decoders import BPEDecoder
# Attach the library's BPE decoder, used when token ids are converted back to text.
tokenizer.decoder = BPEDecoder()

### Training!

In [25]:
from tokenizers.trainers import BpeTrainer

# Train on the Shakespeare sentences with a 5000-entry vocabulary target;
# the special tokens are handed to the trainer as well.
trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=5000)
tokenizer.train_from_iterator(shakespeare, trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))




Trained vocab size: 5000


## Using the tokenizer

In [26]:
# Encode a line that uses vocabulary from the training corpus.
sen = "Is this a dagger which I see before me, the handle toward my hand?"
sen_enc = tokenizer.encode(sen)
print(f"Output: {sen_enc.tokens}")

Output: ['[CLS]', 'is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', 'le', 'toward', 'my', 'hand', '?', '[SEP]']


In [27]:
# A sentence mixing a corpus word with words the corpus is unlikely to contain as wholes.
sen_enc2 = tokenizer.encode("Macbeth and Hugging Face")
print(f"Output: {sen_enc2.tokens}")

Output: ['[CLS]', 'macbeth', 'and', 'hu', 'gg', 'ing', 'face', '[SEP]']


In [28]:
# Two sequences encoded together, which exercises the `pair` template of the post-processor.
two_enc = tokenizer.encode("I like Hugging Face!", "He likes Macbeth!")
print(f"Output: {two_enc.tokens}")

Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', '!', '[SEP]', 'he', 'likes', 'macbeth', '!', '[SEP]']


## Saving the tokenizer

### Just the model

In [31]:
import os

# Pure-Python replacement for `!mkdir token_trained`: succeeds even when the directory
# already exists, so the cell can be re-run.
os.makedirs("token_trained", exist_ok=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
tokenizer.model.save('token_trained')

['token_trained/vocab.json', 'token_trained/merges.txt']

In [36]:
!wc -l ./token_trained/merges.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
4948 ./token_trained/merges.txt


In [37]:
!head -6 ./token_trained/merges.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
#version: 0.2 - Trained by `huggingface/tokenizers`
t h
o u
a n
th e
r e


In [38]:
!head -1000 ./token_trained/merges.txt| tail -5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ch ance
si g
your s
ti a
po int


### Entire pipeline

In [33]:
import os

# Pure-Python replacement for `!mkdir token_pipeline`: succeeds even when the directory
# already exists, so the cell can be re-run.
os.makedirs("token_pipeline", exist_ok=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
# Save the whole tokenizer to one JSON file, load it back, and check that it still encodes.
tokenizer.save("token_pipeline/MyBPETokenizer.json")
tokenizerFromFile = Tokenizer.from_file("token_pipeline/MyBPETokenizer.json")

sen_enc3 = tokenizerFromFile.encode("I like HuggingFace and Macbeth")
print(f"Output: {sen_enc3.tokens}")

Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', 'and', 'macbeth', '[SEP]']


## Training WordPiece from Scratch

### Modeling

We have to choose the type of model we want to train. Examples include WordLevel, BPE, WordPiece, and Unigram.

In [40]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
# Rebinds `tokenizer` (previously the BPE tokenizer) to a new tokenizer around a
# WordPiece model created with no arguments.
tokenizer = Tokenizer(WordPiece())

### Normalizer

It is responsible for pre-processing the input string in order to normalize it as relevant for a given use case. 

In [41]:
from tokenizers.normalizers import BertNormalizer 
# Use the library's BertNormalizer with its default settings.
tokenizer.normalizer=BertNormalizer()

### PreTokenizer

This is responsible for splitting the input according to a set of rules. 

In [42]:
from tokenizers.pre_tokenizers import Whitespace
# Same Whitespace pre-tokenizer as in the BPE pipeline above.
tokenizer.pre_tokenizer = Whitespace()

### Post Processing

This defines how to add special tokens to our sentences. (Not needed here!)

### Decoder

This lets the tokenizer convert the token IDs back to readable text by processing the special characters or identifiers used. 

In [24]:
# Imported under an alias because `WordPiece` already names the model class imported earlier.
from tokenizers.decoders import WordPiece as WordPieceDecoder
tokenizer.decoder= WordPieceDecoder()

### Training!

In [45]:
from tokenizers.trainers import WordPieceTrainer

# Train on the same Shakespeare sentences with a 5000-entry vocabulary target.
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=5000,
)
tokenizer.train_from_iterator(shakespeare, trainer=trainer)

# Re-encode the sentence `sen` defined in the BPE section with the new tokenizer.
output = tokenizer.encode(sen)
print(output.tokens)




['is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', '##le', 'toward', 'my', 'hand', '?']


In [46]:
tokenizer.decode(output.ids)

'is this a dagger which i see before me , the hand ##le toward my hand ?'

In [47]:
tokenizer.encode("Kralsın aslansın Macbeth!").tokens

['[UNK]', '[UNK]', 'macbeth', '!']

# Untrained ready-to-use Tokenizer Pipelines

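These classes come with the pipeline components described above (normalizer, pre-tokenizer, decoder) already configured, but they are untrained: you still have to train them on your own corpus before use.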

* CharBPETokenizer: The original BPE
* ByteLevelBPETokenizer: The byte level version of the BPE
* SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
* BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

In [51]:
from tokenizers import (ByteLevelBPETokenizer,
                            CharBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

In [52]:
# Show which components the ready-made SentencePiece BPE pipeline comes with, one per line.
tokenizer = SentencePieceBPETokenizer()
print(
    tokenizer.normalizer,
    tokenizer.pre_tokenizer,
    tokenizer.decoder,
    tokenizer.post_processor,
    sep="\n",
)

<tokenizers.normalizers.NFKC object at 0x7f4530d96970>
<tokenizers.pre_tokenizers.Metaspace object at 0x7f4530d96970>
<tokenizers.decoders.Metaspace object at 0x7f4530e02960>
None


In [53]:
# Same inspection for the ready-made BERT WordPiece pipeline, one component per line.
tokenizer = BertWordPieceTokenizer()
print(
    tokenizer.normalizer,
    tokenizer.pre_tokenizer,
    tokenizer.decoder,
    tokenizer.post_processor,
    sep="\n",
)

<tokenizers.normalizers.BertNormalizer object at 0x7f4530d964b0>
<tokenizers.pre_tokenizers.BertPreTokenizer object at 0x7f4530d964b0>
<tokenizers.decoders.WordPiece object at 0x7f4530e02390>
None
