Hugging Face is a company with a strong open-source philosophy. Its libraries make pretrained transformers readily available, so you don't have to repeat what we did before (building and training a model by hand) for every application.

## Prep


In [None]:
!pip install -U datasets evaluate transformers transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.37.1-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 k

In [None]:

import torch
import numpy as np
import random
import os

def set_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU and all CUDA devices).

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic mode with autotuning disabled.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe to call on CPU-only machines
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick convolution algorithms by timing them,
    # which can vary between runs; disable it for reproducibility.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, using the default seed (42).
set_seeds()


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# Download (or reuse the local cache of) the IMDB movie-review dataset.
# The result is a DatasetDict keyed by split name.
raw_datasets = load_dataset("imdb")
raw_datasets

In [None]:
# Peek at the first training example to see what a raw record looks like.
raw_datasets['train'][0]

In [None]:
# Show the `features` attribute of the train split (its column schema).
raw_datasets['train'].features

In [None]:

from transformers import AutoModelForSequenceClassification

# Miniature BERT checkpoint (L-2 / H-128 / A-2 in the name): small enough to
# fine-tune quickly.
model_name = "google/bert_uncased_L-2_H-128_A-2"  # Example model
# Pretrained encoder plus a newly initialised classification head.
# IMDB sentiment is a binary task, hence two output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [None]:

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.optim import AdamW

# Ensure correct tokenization: the tokenizer must come from the same checkpoint
# as the model so that the vocabulary and special tokens match.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples, max_length=512):
    """Tokenize a batch of reviews for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict from the dataset; its ``"text"`` entry holds the
            raw review strings.
        max_length: Maximum number of tokens kept per review. Defaults to 512
            (assumed to be the checkpoint's position-embedding limit —
            confirm against its config).

    Returns:
        The tokenizer output for the batch; ``map`` adds its keys as new
        columns.
    """
    # Pass max_length explicitly: if the checkpoint ships no tokenizer length
    # limit, truncation=True alone would leave long reviews untruncated.
    # Padding is deferred to the data collator so each batch is padded only
    # as far as it needs.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize every split in batches, drop the raw "text" column, and rename the
# target column from "label" to "labels".
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # hand back torch tensors when indexed

# Data collation: the collator pads the examples of a batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DataLoader setup
# Training batches: 32 examples each, reshuffled every epoch, padded by the
# data collator.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)
# Evaluation batches come from the "test" split with the same batch size and
# collator. No shuffling: the average loss does not depend on batch order.
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=32,
    collate_fn=data_collator,
)


In [None]:
# Sanity check: report the shape of every entry in the first training batch.
for batch in train_dataloader:
    print("Batch shapes:")
    for name, entry in batch.items():
        # Anything without a `.shape` is reported by its type instead.
        description = (
            f"{name}: {entry.shape}"
            if hasattr(entry, 'shape')
            else f"{name}: {type(entry)} - shape attribute not found"
        )
        print(description)
    break  # Only the first batch; drop this `break` to walk through more


In [None]:
from torch.optim import AdamW
from tqdm.auto import tqdm

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 15  # full passes over the training set

for epoch in range(num_epochs):
    # Training pass
    model.train()
    for batch in tqdm(train_dataloader):
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries a "labels" entry, so the model output includes
        # the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next batch

    # Evaluation pass
    model.eval()
    total_eval_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # no gradients needed for evaluation
            outputs = model(**batch)
        # .item() converts the loss tensor to a Python float, so nothing is
        # kept alive on the device between batches.
        total_eval_loss += outputs.loss.item()
    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    print(f"Average evaluation loss: {avg_eval_loss}")
