In [2]:
!pip install transformers
!pip install bertviz
from transformers import GPT2Tokenizer, BertTokenizer, T5Tokenizer
import torch
import matplotlib.pyplot as plt
from bertviz import head_view, model_view

# Load the pre-trained BPE tokenizer from the 'gpt2' checkpoint.
# This is a module-level instance shared by the bpe_* helper functions.
bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Register '<PAD>' as the pad token so that bpe_tokenizer.pad_token_id is
# available when bpe_tokenize_and_pad pads short sequences.
bpe_tokenizer.add_special_tokens({'pad_token': '<PAD>'})

# BPE Tokenization and Padding Function
def bpe_tokenize_and_pad(sentences, vocab, n):
    """Encode each sentence with the BPE tokenizer and pad it to ``n`` ids.

    Args:
        sentences: Iterable of strings to encode.
        vocab: Not used by this function.
        n: Target sequence length. It is passed to the tokenizer as
            ``max_length`` with truncation enabled; encodings shorter than
            ``n`` are extended on the right with the pad token id.

    Returns:
        A list containing one list of token ids per input sentence.
    """
    def _encode_fixed_length(text):
        ids = bpe_tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=n,
            truncation=True,
        )
        shortfall = n - len(ids)
        if shortfall > 0:
            # Right-pad only when the encoding came out shorter than n.
            ids.extend([bpe_tokenizer.pad_token_id] * shortfall)
        return ids

    return [_encode_fixed_length(text) for text in sentences]

def bpe_decode_tokens(token_ids):
    """Convert token ids to their BPE token strings.

    Returns the result of ``bpe_tokenizer.convert_ids_to_tokens`` for
    ``token_ids``; the tokens are not joined back into text.
    """
    return bpe_tokenizer.convert_ids_to_tokens(token_ids)

# Load the pre-trained WordPiece tokenizer from the 'bert-base-uncased' checkpoint.
# This is a module-level instance shared by the wordpiece_* helper functions.
wordpiece_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# WordPiece Tokenization and Padding Function
def wordpiece_tokenize_and_pad(sentences, vocab, n):
    """Encode each sentence with the WordPiece tokenizer and pad it to ``n`` ids.

    Args:
        sentences: Iterable of strings to encode.
        vocab: Not used by this function.
        n: Target sequence length. It is passed to the tokenizer as
            ``max_length`` with truncation enabled; encodings shorter than
            ``n`` are extended on the right with the pad token id.

    Returns:
        A list containing one list of token ids per input sentence.
    """
    def _encode_fixed_length(text):
        ids = wordpiece_tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=n,
            truncation=True,
        )
        shortfall = n - len(ids)
        if shortfall > 0:
            # Right-pad only when the encoding came out shorter than n.
            ids.extend([wordpiece_tokenizer.pad_token_id] * shortfall)
        return ids

    return [_encode_fixed_length(text) for text in sentences]

def wordpiece_decode_tokens(token_ids):
    """Convert token ids to their WordPiece token strings.

    Returns the result of ``wordpiece_tokenizer.convert_ids_to_tokens`` for
    ``token_ids``; the tokens are not joined back into text.
    """
    return wordpiece_tokenizer.convert_ids_to_tokens(token_ids)

# Load the pre-trained SentencePiece tokenizer from the 't5-base' checkpoint.
# This is a module-level instance shared by the sentencepiece_* helper functions.
sentencepiece_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# SentencePiece Tokenization and Padding Function
def sentencepiece_tokenize_and_pad(sentences, vocab, n):
    """Encode each sentence with the SentencePiece tokenizer and pad it to ``n`` ids.

    Args:
        sentences: Iterable of strings to encode.
        vocab: Not used by this function.
        n: Target sequence length. It is passed to the tokenizer as
            ``max_length`` with truncation enabled; encodings shorter than
            ``n`` are extended on the right with the pad token id.

    Returns:
        A list containing one list of token ids per input sentence.
    """
    def _encode_fixed_length(text):
        ids = sentencepiece_tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=n,
            truncation=True,
        )
        shortfall = n - len(ids)
        if shortfall > 0:
            # Right-pad only when the encoding came out shorter than n.
            ids.extend([sentencepiece_tokenizer.pad_token_id] * shortfall)
        return ids

    return [_encode_fixed_length(text) for text in sentences]

def sentencepiece_decode_tokens(token_ids):
    """Convert token ids to their SentencePiece token strings.

    Returns the result of ``sentencepiece_tokenizer.convert_ids_to_tokens``
    for ``token_ids``; the tokens are not joined back into text.
    """
    return sentencepiece_tokenizer.convert_ids_to_tokens(token_ids)

# Example sentences
if __name__ == "__main__":
    # Demo input: five sentences, each encoded to a fixed length of n ids.
    # NOTE: the second sentence starts with a space; it is kept verbatim
    # because the input text is passed to the tokenizers unchanged.
    sentences = [
        "As the aircraft becomes lighter, it flies higher in air of lower density to maintain the same airspeed.",
        " When the engine heats up, it operates more efficiently, consuming less fuel to maintain speed.",
        "As the sun sets, the temperature drops, causing the lake to cool down and lose its warm surface layer.",
        "When the ice melts, it becomes water, occupying more volume in its liquid state.",
        "As the river flows downstream, it slows down in wider sections to maintain a constant volume of water.",
    ]
    n = 20

    # --- BPE: print each id list, then its token strings, then blank lines ---
    print("BPE Tokenization and Decoding")
    bpe_vocab = bpe_tokenizer.get_vocab()
    bpe_tokenized_sentences = bpe_tokenize_and_pad(sentences, bpe_vocab, n)
    for ts in bpe_tokenized_sentences:
        print(ts)
        # The end string folds in the separator that would otherwise be a
        # separate print("\n").
        print(bpe_decode_tokens(ts), end="\n\n\n")
    print("\n")

    # --- WordPiece ---
    print("WordPiece Tokenization and Decoding")
    wordpiece_vocab = wordpiece_tokenizer.get_vocab()
    wordpiece_tokenized_sentences = wordpiece_tokenize_and_pad(sentences, wordpiece_vocab, n)
    for ts in wordpiece_tokenized_sentences:
        print(ts)
        print(wordpiece_decode_tokens(ts), end="\n\n\n")
    print("\n")

    # --- SentencePiece (no extra trailing separator after the last section) ---
    print("SentencePiece Tokenization and Decoding")
    sentencepiece_vocab = sentencepiece_tokenizer.get_vocab()
    sentencepiece_tokenized_sentences = sentencepiece_tokenize_and_pad(sentences, sentencepiece_vocab, n)
    for ts in sentencepiece_tokenized_sentences:
        print(ts)
        print(sentencepiece_decode_tokens(ts), end="\n\n\n")

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.35.0-py3-none-any.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0->bertviz)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.0->bertviz)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Coll

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


BPE Tokenization and Decoding
[1722, 262, 6215, 4329, 14871, 11, 340, 17607, 2440, 287, 1633, 286, 2793, 12109, 284, 5529, 262, 976, 1633, 12287]
['As', 'Ġthe', 'Ġaircraft', 'Ġbecomes', 'Ġlighter', ',', 'Ġit', 'Ġflies', 'Ġhigher', 'Ġin', 'Ġair', 'Ġof', 'Ġlower', 'Ġdensity', 'Ġto', 'Ġmaintain', 'Ġthe', 'Ġsame', 'Ġair', 'speed']


[1649, 262, 3113, 37876, 510, 11, 340, 14051, 517, 18306, 11, 18587, 1342, 5252, 284, 5529, 2866, 13, 50257, 50257]
['ĠWhen', 'Ġthe', 'Ġengine', 'Ġheats', 'Ġup', ',', 'Ġit', 'Ġoperates', 'Ġmore', 'Ġefficiently', ',', 'Ġconsuming', 'Ġless', 'Ġfuel', 'Ġto', 'Ġmaintain', 'Ġspeed', '.', '<PAD>', '<PAD>']


[1722, 262, 4252, 5621, 11, 262, 5951, 10532, 11, 6666, 262, 13546, 284, 3608, 866, 290, 4425, 663, 5814, 4417]
['As', 'Ġthe', 'Ġsun', 'Ġsets', ',', 'Ġthe', 'Ġtemperature', 'Ġdrops', ',', 'Ġcausing', 'Ġthe', 'Ġlake', 'Ġto', 'Ġcool', 'Ġdown', 'Ġand', 'Ġlose', 'Ġits', 'Ġwarm', 'Ġsurface']


[2215, 262, 4771, 48813, 11, 340, 4329, 1660, 11, 30876, 517, 6115, 287, 66