# Task 2 & 3: Text Tokenization and Encoding

This notebook demonstrates text tokenization and encoding using the BERT tokenizer from Hugging Face Transformers. Each example prints the tokens and their token IDs, then plots the IDs as a bar chart.


In [None]:
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import numpy as np


In [None]:
# Load the pretrained BERT tokenizer; the checkpoint name is kept in one
# place so it is easy to spot and change.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded successfully!")


## Example 1: Basic Tokenization


In [None]:
text = "A beautiful sunset over the mountains"

# Encode the sentence. The result is indexed as a batch, so row 0 holds the
# IDs for this single input; extract that row once and reuse it.
encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
input_ids = encoded['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
token_ids = input_ids.tolist()

print("Original Text:", text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

# One bar per token, labelled with the token string. The explicit
# figure/axes objects avoid relying on pyplot's implicit "current figure".
positions = list(range(len(token_ids)))
fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(positions, token_ids)
ax.set_xticks(positions)
ax.set_xticklabels(tokens, rotation=45, ha='right')
ax.set_title("Token IDs for Input Text")
ax.set_xlabel("Tokens")
ax.set_ylabel("Token IDs")
fig.tight_layout()
plt.show()


## Example 2: Multiple Text Samples


In [None]:
text_samples = [
    "Generate an image of a cat",
    "A red car driving on a highway",
    "Beautiful landscape with mountains and lake"
]

# Tokenize each sample on its own, print its tokens and IDs, and draw one
# bar chart per sample (sample numbering starts at 1).
for i, text in enumerate(text_samples, 1):
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # The result is indexed as a batch; row 0 is this sample's ID sequence.
    input_ids = encoded['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    token_ids = input_ids.tolist()

    print(f"\nSample {i}: {text}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {token_ids}")

    positions = list(range(len(token_ids)))
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(positions, token_ids)
    ax.set_xticks(positions)
    ax.set_xticklabels(tokens, rotation=45, ha='right')
    ax.set_title(f"Token IDs for Sample {i}")
    ax.set_xlabel("Tokens")
    ax.set_ylabel("Token IDs")
    fig.tight_layout()
    plt.show()
    # Release the figure once shown so figures do not pile up across
    # iterations of the loop.
    plt.close(fig)
