# A7: Training Distillation vs LoRA

In [2]:
# !pip install datasets --upgrade
import datasets
import transformers
import torch
datasets.__version__, transformers.__version__, torch.__version__

  from .autonotebook import tqdm as notebook_tqdm


('3.4.1', '4.49.0', '2.5.1+cu121')

In [3]:
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random, math, time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda


# 1. Load HateXplain Dataset from Hugging Face

In [None]:
from datasets import load_dataset

# Load the HateXplain dataset from Hugging Face
dataset = load_dataset("hate_speech_offensive")

# # Check dataset structure
# print(dataset)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 24783/24783 [00:00<00:00, 461769.01 examples/s]


In [5]:
# Label Mapping
label_list = ["Non-Hate", "Offensive", "Hate"]
label2id = {v: i for i, v in enumerate(label_list)}
id2label = {i: v for v, i in label2id.items()}

In [6]:
# Assign feature key
task_to_keys = {"hatexplain": "tweet"}
task_name = "hatexplain"
sentence_key = task_to_keys[task_name]

In [9]:
# Print dataset overview
print(dataset)
print("Example:", dataset["train"][0][sentence_key])
print("Label2ID:", label2id)
print("ID2Label:", id2label)

DatasetDict({
    train: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],
        num_rows: 24783
    })
})
Example: !!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
Label2ID: {'Non-Hate': 0, 'Offensive': 1, 'Hate': 2}
ID2Label: {0: 'Non-Hate', 1: 'Offensive', 2: 'Hate'}


# 2. Tokenization and Data Preprocessing

In [13]:
# Check number of unique labels
num_labels = len(label_list)
num_labels

3

In [14]:
import numpy as np  
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load BERT tokenizer
teacher_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
# Load Teacher Model (BERT)
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id, 
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Tokenization Function (Modified for HateXplain)
def tokenize_function(examples):
    return tokenizer(examples["tweet"], max_length=128, truncation=True, padding="max_length")

In [17]:
# Apply Tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["count", "hate_speech_count", "offensive_language_count", "neither_count", "tweet"])

# Rename "class" column to "labels" for PyTorch compatibility
tokenized_datasets = tokenized_datasets.rename_column("class", "labels")

# Set dataset format for PyTorch
tokenized_datasets.set_format("torch")

# Print an example tokenized input
print(tokenized_datasets["train"][0]["input_ids"])
print(tokenizer.decode(tokenized_datasets["train"][0]["input_ids"]))

Map: 100%|██████████| 24783/24783 [00:02<00:00, 10907.61 examples/s]

tensor([  101,   999,   999,   999, 19387,  1030,  9815, 19454, 21818,  2135,
         1024,  2004,  1037,  2450,  2017,  5807,  1005,  1056, 17612,  2055,
         9344,  2039,  2115,  2160,  1012,  1004, 23713,  1025,  2004,  1037,
         2158,  2017,  2323,  2467,  2202,  1996, 11669,  2041,  1012,  1012,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])




# 3. Preparing Dataloader

In [18]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Data Collator (Handles Dynamic Padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# Subset dataset for efficiency
small_train_dataset = tokenized_datasets["train"].shuffle(seed=1150).select(range(10000))  # 10K samples
small_eval_dataset = tokenized_datasets["train"].shuffle(seed=1150).select(range(1000))    # 1K samples (same train split)
small_test_dataset = tokenized_datasets["train"].shuffle(seed=1150).select(range(1000))    # 1K samples (same train split)

In [20]:
# Create Dataloaders
train_dataloader = DataLoader(
    small_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator
)
test_dataloader = DataLoader(
    small_test_dataset, batch_size=32, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    small_eval_dataset, batch_size=32, collate_fn=data_collator
)

In [21]:
# Check first batch
for batch in train_dataloader:
    break

batch['labels'].shape, batch['input_ids'].shape, batch['attention_mask'].shape

(torch.Size([32]), torch.Size([32, 128]), torch.Size([32, 128]))

# 4. Model Training for Even and Odd Layers