In [None]:
# Imports

import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer

# Chosen model: DistilBERT. This checkpoint identifier is the value handed to
# AutoTokenizer.from_pretrained when the tokenizer is initialized.
MODEL_NAME = "distilbert-base-uncased"


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load dataset

# Fetch the "imdb" dataset and keep the returned object for the later cells.
raw_datasets = load_dataset("imdb")

# Show the overall split layout first, then one raw training record.
print("--- Dataset Structure ---", raw_datasets, sep="\n")
print("\n--- Example Training Review ---", raw_datasets["train"][0], sep="\n")

--- Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

--- Example Training Review ---
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race iss

In [None]:
# Initialize tokenizer

# Load the tokenizer belonging to the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("\nModel:", MODEL_NAME)


Model: distilbert-base-uncased


In [None]:
# Special tokens

# Report each special token with its vocabulary ID. The attribute names follow
# the pattern <name>_token / <name>_token_id, so one template covers all three.
print("\n".join(
    f"{name.upper()} token: {getattr(tokenizer, name + '_token')} | ID: {getattr(tokenizer, name + '_token_id')}"
    for name in ("cls", "sep", "pad")
))

CLS token: [CLS] | ID: 101
SEP token: [SEP] | ID: 102
PAD token: [PAD] | ID: 0


In [14]:
# Tokenization function

# Every encoded sequence is truncated and padded to exactly this many tokens.
MAX_LENGTH = 512


def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the notebook's tokenizer.

    Args:
        example: Mapping with a "text" entry; its value is handed to the
            tokenizer unchanged.

    Returns:
        Whatever the tokenizer returns for that text, requested with
        truncation enabled and padding to ``MAX_LENGTH`` tokens.
    """
    texts = example["text"]
    encoded = tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )
    return encoded
    

In [15]:
# Apply tokenization to the dataset
# batched=True lets the tokenizer process multiple samples per call, which
# significantly accelerates the pass over the data.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("\n--- Tokenized Dataset Structure ---", tokenized_datasets, sep="\n")

Map: 100%|██████████| 25000/25000 [00:05<00:00, 4469.46 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5072.18 examples/s]
Map: 100%|██████████| 50000/50000 [00:10<00:00, 4896.05 examples/s]


--- Tokenized Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})





In [17]:
# First processed sample
sample_index = 0

# One print call, newline-separated: the section header, the first 150
# characters of the raw review text, and the label stored alongside the
# tokenized version of the same row.
print(
    "\n--- First Tokenized Training Sample ---",
    raw_datasets["train"][sample_index]["text"][:150],
    tokenized_datasets["train"][sample_index]["label"],
    sep="\n",
)


--- First Tokenized Training Sample ---
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard th
0


In [24]:
# Generated model inputs
# Fetch the row once and reuse it, rather than re-indexing the dataset for
# every print. sample_index comes from the "First processed sample" cell.
sample_row = tokenized_datasets['train'][sample_index]

# input_ids: The numerical IDs of the tokens (including [CLS] and [SEP])
# NOTE: the "Shape" label below reports the sequence length obtained via len().
print(f"Input IDs Shape: {len(sample_row['input_ids'])}")
print(f"Input IDs (First 10): {sample_row['input_ids'][:10]}")

# attention_mask: A sequence of 1s and 0s. 1 indicates a real token, 0 indicates a [PAD] token.
print(f"Attention Mask (Snippet): {sample_row['attention_mask'][:10]}")


Input IDs Shape: 512
Input IDs (First 10): [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026]
Attention Mask (Snippet): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
