In [1]:
# Imports

import pandas as pd

from datasets import load_dataset
from transformers import AutoTokenizer

# Checkpoint name of the chosen model (DistilBERT, uncased variant);
# used further down to load the matching tokenizer.
MODEL_NAME = "distilbert-base-uncased"


In [2]:
# Load the IMDB dataset, then show its splits and the first training record

raw_datasets = load_dataset("imdb")

# One print call; sep="\n" puts each item on its own line, so the output
# is the same as printing the four items one after another.
print(
    "--- Dataset Structure ---",
    raw_datasets,
    "\n--- Example Training Review ---",
    raw_datasets["train"][0],
    sep="\n",
)

--- Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

--- Example Training Review ---
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race iss

In [3]:
# Initialize the tokenizer from the pretrained checkpoint named by MODEL_NAME

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"\nModel: {MODEL_NAME}")


Model: distilbert-base-uncased


In [4]:
# Special tokens: show each token string next to its vocabulary ID

special_token_lines = [
    f"CLS token: {tokenizer.cls_token} | ID: {tokenizer.cls_token_id}",
    f"SEP token: {tokenizer.sep_token} | ID: {tokenizer.sep_token_id}",
    f"PAD token: {tokenizer.pad_token} | ID: {tokenizer.pad_token_id}",
]
print("\n".join(special_token_lines))

CLS token: [CLS] | ID: 101
SEP token: [SEP] | ID: 102
PAD token: [PAD] | ID: 0


In [5]:
# Tokenization function

# Every encoded review is truncated or padded to exactly this many tokens.
MAX_LENGTH = 512


def tokenize_function(example):
    """Tokenize the "text" entry of ``example`` with the notebook's tokenizer.

    Longer sequences are truncated to MAX_LENGTH tokens and shorter ones are
    padded up to MAX_LENGTH, so all encoded sequences share one length.

    Args:
        example: Mapping with a "text" entry. When this function is used with
            ``map(..., batched=True)`` the entry holds a batch of texts.

    Returns:
        The tokenizer's encoding of ``example["text"]``.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(example["text"], **encoding_options)
    

In [11]:
# Apply tokenization to every split of the dataset.
# batched=True passes a batch of samples to tokenize_function per call instead
# of one sample at a time, which significantly accelerates the process.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("\n--- Tokenized Dataset Structure ---", tokenized_datasets, sep="\n")


--- Tokenized Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [12]:
# First processed sample: the start of the raw review text, then its label
sample_index = 0  # also used by the following cell
review_preview = raw_datasets["train"][sample_index]["text"][:150]
sample_label = tokenized_datasets["train"][sample_index]["label"]

print("\n--- First Tokenized Training Sample ---")
print(review_preview)
print(sample_label)


--- First Tokenized Training Sample ---
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard th
0


In [13]:
# Generated encodings for the sample (plain Python lists at this stage)
first_sample = tokenized_datasets["train"][sample_index]

# input_ids: the numerical IDs of the tokens (including [CLS] and [SEP]).
# The "Shape" printed here is the sequence length, i.e. the number of IDs.
print(f"Input IDs Shape: {len(first_sample['input_ids'])}")
print(f"Input IDs (First 10): {first_sample['input_ids'][:10]}")

# attention_mask: 1s and 0s; 1 indicates a real token, 0 indicates a [PAD] token.
print(f"Attention Mask (Snippet): {first_sample['attention_mask'][:10]}")


Input IDs Shape: 512
Input IDs (First 10): [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026]
Attention Mask (Snippet): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
# Selecting and renaming columns, then switching the output format to PyTorch.
# NOTE: this cell reassigns tokenized_datasets, so running it a second time
# without re-running the tokenization cell will fail: the "text" and "label"
# columns no longer exist after the first run.
tokenized_datasets = (
    tokenized_datasets
    # The raw review text is dropped once it has been tokenized.
    .remove_columns(["text"])
    # The Hugging Face Trainer API commonly requires this for sequence
    # classification: it looks for a column named 'labels' to calculate the loss.
    .rename_column("label", "labels")
    # Indexing now returns PyTorch tensors. This applies to every column
    # (labels, input_ids, attention_mask), not only to the labels.
    .with_format("torch")
)

print("\n--- Final Tokenized Dataset Structure ---")
print(tokenized_datasets)
print(tokenized_datasets["train"].features)


--- Final Tokenized Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})
{'labels': ClassLabel(names=['neg', 'pos']), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}


In [21]:
# Inspect the token IDs of the first test sample. With the "torch" format set
# earlier, indexing returns a tensor; the whole sequence is displayed.
tokenized_datasets['test'][0]['input_ids']


tensor([  101,  1045,  2293, 16596,  1011, 10882,  1998,  2572,  5627,  2000,
         2404,  2039,  2007,  1037,  2843,  1012, 16596,  1011, 10882,  5691,
         1013,  2694,  2024,  2788,  2104, 11263, 25848,  1010,  2104,  1011,
        12315,  1998, 28947,  1012,  1045,  2699,  2000,  2066,  2023,  1010,
         1045,  2428,  2106,  1010,  2021,  2009,  2003,  2000,  2204,  2694,
        16596,  1011, 10882,  2004, 17690,  1019,  2003,  2000,  2732, 10313,
         1006,  1996,  2434,  1007,  1012, 10021,  4013,  3367, 20086,  2015,
         1010, 10036, 19747,  4520,  1010, 25931,  3064, 22580,  1010,  1039,
         2290,  2008,  2987,  1005,  1056,  2674,  1996,  4281,  1010,  1998,
        16267,  2028,  1011,  8789,  3494,  3685,  2022,  9462,  2007,  1037,
         1005, 16596,  1011, 10882,  1005,  4292,  1012,  1006,  1045,  1005,
         1049,  2469,  2045,  2024,  2216,  1997,  2017,  2041,  2045,  2040,
         2228, 17690,  1019,  2003,  2204, 16596,  1011, 10882, 

In [None]:
# Defining Training and Evaluation Sets

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Optional: small shuffled subsets (1000 examples each, fixed seed) for quick
# testing. For full training, use train_dataset / eval_dataset instead.
train_subset = train_dataset.shuffle(seed=42).select(range(1000))
eval_subset = eval_dataset.shuffle(seed=42).select(range(1000))

print(f"\nTraining Subset Size: {len(train_subset)}")
print(f"Evaluation Subset Size: {len(eval_subset)}")


Training Subset Size: 1000
Evaluation Subset Size: 1000
