## Bert & GPT-2

### ECE590 Homework assignment 5
Name: Javier Cervantes

net id: jc1010


1. We will use the BERT model to perform sentiment analysis. The sentiment is performed via a linear model applied to the output vector above the [CLS] input in BERT (that output vector is referred to as C).

    Use a pretrained BERT model from:
    https://github.com/google-research/bert

    And use sentiment data from:
    https://huggingface.co/datasets/yelp_polarity

    Build a sentiment-analysis model based on BERT, using the above data. Do a detailed analysis of performance, and compare the accuracy of this model to results you achieved with the simpler baseline model from the prior homework.

    Implement the model two ways:

    (a) Leave all BERT parameters unchanged, and just learn the linear model at the output.
    
    (b) Fine-tune all BERT parameters, while also learning the linear layer at the output.

    In your solution, provide all code and also a detailed summary of the analysis of the results.


In [24]:
from transformers import BertTokenizerFast, BertModel

# existing bert models
bert_models = {
    "base": "google/bert-base-uncased",
    "small": "google/bert_uncased_L-4_H-512_A-8",
    "medium": "google/bert_uncased_L-8_H-512_A-8",
}

# pick a bert model
bert_model_name = "medium"

# load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained(bert_models[bert_model_name])
bert_model = BertModel.from_pretrained(bert_models[bert_model_name])

In [26]:
from torch.utils.data import DataLoader
import torch
from datasets import load_dataset
import numpy as np

seed = 257

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# load the yelp_polarity dataset
train_data, test_data = load_dataset("yelp_polarity", split=["train", "test"])

max_length = 256


# define a function to tokenize the dataset
def tokenize(batch):
    return tokenizer(
        batch["text"][:max_length], padding="max_length", max_length=max_length
    )


# tokenize the dataset
train_data = train_data.map(tokenize)
test_data = test_data.map(tokenize)

# convert the dataset to pytorch tensors
train_data.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# validation data
train_valid_data = train_data.train_test_split(test_size=0.25)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

# create a dataloader
train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=256)
test_loader = DataLoader(test_data, batch_size=256)

Map: 100%|██████████| 560000/560000 [04:27<00:00, 2089.85 examples/s]
Map: 100%|██████████| 38000/38000 [00:18<00:00, 2090.18 examples/s]


In [27]:
from torch import nn
from torch import optim
import tqdm


# Define a new model with BERT and a linear layer on top for classification
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert_model = bert_model
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)
        return logits


In [28]:
def train(model, criterion, optimizer, train_loader, device):
    model.train()
    epoch_losses = []
    epoch_accuracies = []
    for i, batch in tqdm.tqdm(
        enumerate(train_loader), desc="Training ...", total=len(train_loader)
    ):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        accuracy = (outputs.argmax(1) == labels).float().mean()
        epoch_losses.append(loss.item())
        epoch_accuracies.append(accuracy.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return np.mean(epoch_losses), np.mean(epoch_accuracies)



def validation(model, criterion, valid_loader, device):
    model.eval()
    epoch_losses = []
    epoch_accuracies = []
    for i, batch in tqdm.tqdm(
        enumerate(valid_loader), desc="Validation ...", total=len(valid_loader)
    ):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            accuracy = (outputs.argmax(1) == labels).float().mean()
            epoch_losses.append(loss.item())
            epoch_accuracies.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accuracies)



def train_model(
    model, criterion, optimizer, train_loader, valid_loader, num_epochs, device
):
    best_valid_loss = float("inf")
    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(
            model, criterion, optimizer, train_loader, device
        )
        valid_loss, valid_accuracy = validation(model, criterion, valid_loader, device)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Training loss: {train_loss:.4f}")
        print(f"Training accuracy: {train_accuracy:.4f}")
        print(f"Validation loss: {valid_loss:.4f}")
        print(f"Validation accuracy: {valid_accuracy:.4f}")
        print("-" * 50)
        params_grad = [param.requires_grad for param in model.bert_model.parameters()]
        if False in params_grad:
            full_freeze = "freeze"
        else:
            full_freeze = "full"
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(
                model.state_dict(), f"../models/bert_{bert_model_name}_{full_freeze}_{max_length}max.pt"
            )

> a) Leave all BERT parameters unchanged, and just learn the linear model at the output:

In [29]:

# create the model
model = BertClassifier(bert_model, num_classes=2)

# freeze the BERT parameters so that we only learn the linear classifier
for param in model.bert_model.parameters():
    param.requires_grad = False

In [30]:
# define the hyperparameters
lr = 2e-5
num_epochs = 5

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# move the model and loss function to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
criterion = criterion.to(device)

cuda


In [31]:
# train the model
train_model(model, criterion, optimizer, train_loader, valid_loader, num_epochs=num_epochs, device=device)


Training ...: 100%|██████████| 1641/1641 [10:36<00:00,  2.58it/s]
Validation ...: 100%|██████████| 547/547 [03:21<00:00,  2.72it/s]


Epoch 1/5
Training loss: 0.6224
Training accuracy: 0.6697
Validation loss: 0.5650
Validation accuracy: 0.7306
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [10:37<00:00,  2.57it/s]
Validation ...: 100%|██████████| 547/547 [03:21<00:00,  2.72it/s]


Epoch 2/5
Training loss: 0.5518
Training accuracy: 0.7322
Validation loss: 0.5223
Validation accuracy: 0.7497
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [10:38<00:00,  2.57it/s]
Validation ...: 100%|██████████| 547/547 [03:21<00:00,  2.72it/s]


Epoch 3/5
Training loss: 0.5242
Training accuracy: 0.7454
Validation loss: 0.5025
Validation accuracy: 0.7582
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [10:38<00:00,  2.57it/s]
Validation ...: 100%|██████████| 547/547 [03:21<00:00,  2.72it/s]


Epoch 4/5
Training loss: 0.5099
Training accuracy: 0.7523
Validation loss: 0.4908
Validation accuracy: 0.7643
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [10:38<00:00,  2.57it/s]
Validation ...: 100%|██████████| 547/547 [03:21<00:00,  2.72it/s]


Epoch 5/5
Training loss: 0.5009
Training accuracy: 0.7570
Validation loss: 0.4831
Validation accuracy: 0.7682
--------------------------------------------------


> b) Fine-tune all BERT parameters, while also learning the linear layer at the output.

In [22]:
# create the model
model = BertClassifier(bert_model, num_classes=2)

# define the hyperparameters
lr = 2e-5
num_epochs = 3

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# move the model and loss function to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
criterion = criterion.to(device)

cuda


In [23]:
# fine-tune the model
train_model(model, criterion, optimizer, train_loader, valid_loader, num_epochs=num_epochs, device=device)

Training ...: 100%|██████████| 1641/1641 [06:51<00:00,  3.99it/s]
Validation ...: 100%|██████████| 547/547 [00:52<00:00, 10.39it/s]


Epoch 1/5
Training loss: 0.3747
Training accuracy: 0.8213
Validation loss: 0.3177
Validation accuracy: 0.8525
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [06:48<00:00,  4.02it/s]
Validation ...: 100%|██████████| 547/547 [00:52<00:00, 10.47it/s]


Epoch 2/5
Training loss: 0.3082
Training accuracy: 0.8579
Validation loss: 0.3003
Validation accuracy: 0.8631
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [06:48<00:00,  4.02it/s]
Validation ...: 100%|██████████| 547/547 [00:52<00:00, 10.50it/s]


Epoch 3/5
Training loss: 0.2811
Training accuracy: 0.8724
Validation loss: 0.2918
Validation accuracy: 0.8677
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [06:48<00:00,  4.02it/s]
Validation ...: 100%|██████████| 547/547 [00:52<00:00, 10.47it/s]


Epoch 4/5
Training loss: 0.2596
Training accuracy: 0.8836
Validation loss: 0.2913
Validation accuracy: 0.8703
--------------------------------------------------


Training ...: 100%|██████████| 1641/1641 [06:48<00:00,  4.02it/s]
Validation ...: 100%|██████████| 547/547 [00:52<00:00, 10.42it/s]

Epoch 5/5
Training loss: 0.2407
Training accuracy: 0.8932
Validation loss: 0.2957
Validation accuracy: 0.8701
--------------------------------------------------





In [None]:
# train the model
train_model(model, criterion, optimizer, train_loader, valid_loader, num_epochs=num_epochs, device=device)