In [None]:
# Transformer Model From Scratch

In [None]:
Install required datasets, including accelerate library for doing the training

In [6]:
!pip install transformers datasets
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading d

In [None]:
Load the datasets into memory

In [8]:
from datasets import load_dataset
dataset = load_dataset('jeffnyman/emotions')

README.md:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

emotions.py:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

The repository for jeffnyman/emotions contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/jeffnyman/emotions.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


ModuleNotFoundError: No module named 'datasets.tasks'

In [None]:
Define the tokenizer function/strategy, based on the Bert-Base-Uncased

In [None]:
from transformers import AutoTokenizer

def tokenize_function(examples):
  return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
Verify unique labels in the dataset to prevent GPU-related errors

In [None]:
unique_labels = set(tokenized_datasets['train']['label'])
print(f"Unique labels in the training set: {unique_labels}")

def check_labels(dataset):
  for label in dataset['train']['label']:
    if label not in unique_labels:
      print(f"Found invalid label: {label}")

check_labels(tokenized_datasets)

In [None]:
Specify Hyperparameters and creates the Model Config

In [None]:
from transformers import BertConfig
from transformers import BertForSequenceClassification

config = BertConfig(
vocab_size=tokenizer.vocab_size,
hidden_size=512,
num_hidden_layers=6,
num_attention_heads=8,
intermediate_size=2048,
max_position_embeddings=512,
num_labels=len(unique_labels)
)

model = BertForSequenceClassification(config)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  output_dir='./results',
  evaluation_strategy="epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  num_train_epochs=3,
  weight_decay=0.01,
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_datasets["train"],
  eval_dataset=tokenized_datasets["test"],
)

In [None]:
Now run the train function for this model

In [None]:
trainer.train()