In [1]:
## Import packages
import torch
import time
import wandb
import numpy as np
from tqdm import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

In [2]:
## Set the device to train your model.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device: ", device)

Device:  mps


In [3]:
## Load the dataset
dataset = load_dataset("stanfordnlp/sst2")

## Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)

## Apply the tokenizer to the dataset.
dataset = dataset.map(lambda x: tokenizer(x['sentence'], truncation=True), batched=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [4]:
## Create batch of data using DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
## Create the model.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Train the Model.

In [6]:
## Create a trainer class.
class CustomTrainer(Trainer):
    def _inner_training_loop(
            self,
            batch_size = None, 
            args = None,
            resume_from_checkpoint = None,
            trial = None,
            ignore_keys_for_eval = None
    ):
        number_of_epochs = args.num_train_epochs
        start_time = time.time()

        criterion = torch.nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.9)

        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            train_loss_per_epoch = 0
            train_acc_per_epoch = 0
            with tqdm(train_dataloader, unit = 'batch') as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for step, inputs in enumerate(training_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels'].to(device)

                    ## Forward pass
                    self.optimizer.zero_grad()
                    model_outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                    ## Compute the loss
                    loss = criterion(model_outputs['logits'], labels)
                    train_loss_per_epoch += loss.item()

                    ## Calculate gradients
                    loss.backward()

                    ## Update weights
                    self.optimizer.step()
                    train_accuracy = (model_outputs['logits'].argmax(1) == labels).sum().item()
                    train_acc_per_epoch += train_accuracy

                    wandb.log({"Step Training Loss":loss.item(),
                                "Step Training Accuracy":train_acc_per_epoch,
                                "Step":step})

            ## Change the learning rate.
            self.scheduler.step()

            ## Compute the average loss and accuracy over all of the batches.
            train_loss_per_epoch /= len(train_dataloader)
            train_acc_per_epoch /= (len(train_dataloader) * batch_size)

            wandb.log({"Epoch Train Loss":train_loss_per_epoch,
                        "Epoch Train Accuracy":train_acc_per_epoch,
                        "Epoch":epoch})

            ## Run the Model on Evaluation Dataset
            eval_loss_per_epoch = 0
            eval_acc_per_epoch = 0
            with tqdm(eval_dataloader, unit='batch') as eval_epoch:
                eval_epoch.set_description(f"Evaluation Epoch {epoch}")
                with torch.no_grad():
                    for step, inputs in enumerate(eval_epoch):
                        inputs = inputs.to(device)
                        labels = inputs['labels'].to(device)

                        ## Foward pass
                        model_outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
                        
                        ## Compute loss
                        loss = criterion(model_outputs['logits'], labels)
                        eval_loss_per_epoch += loss.item()

                        ## Compute accuracy
                        eval_accuracy = (model_outputs['logits'].argmax(1) == labels).sum().item()
                        eval_acc_per_epoch += eval_accuracy
            
            eval_loss_per_epoch /= len(eval_dataloader)
            eval_acc_per_epoch /= (len(eval_dataloader) * batch_size)

            wandb.log({"Eval Loss": eval_loss_per_epoch, 
                        "Eval Accuracy": eval_acc_per_epoch})

            print(f'\tTrain Loss: {train_loss_per_epoch :.3f} | Train Acc: {train_acc_per_epoch*100:.2f}%')
            print(f'\tEval Loss: {eval_loss_per_epoch :.3f} | Eval Acc: {eval_acc_per_epoch*100:.2f}%')
        print(f'Time: {(time.time()-start_time)/60:.3f} minutes ')

In [7]:
## Set wandb parallelist to false.
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['WANDB_NOTEBOOK_NAME'] = 'hw2.ipynb'

## WandB setup to get plots and metrics.
config = dict(
    epochs = 5,
    classes = 2,
    batch_size = 64,
    learning_rate = 2e-5,
    dataset = 'sst2',
    architecture = 'bert'
)

## Setting up Training Pipeline inside WandB.
with wandb.init(project='NPL HW2', name='First Run', config=config):

    ## Define Training Arguments.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=wandb.config.learning_rate,
        num_train_epochs=wandb.config.epochs,
        per_device_train_batch_size=wandb.config.batch_size,
        per_device_eval_batch_size=wandb.config.batch_size
    )

    ## Initialize the trainer
    trainer = CustomTrainer(
        model=model,
        args = training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    ## Train the model
    trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmohitydv09[0m ([33mmohitydv09-university-of-minnesota5275[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Epoch 0: 100%|██████████| 1/1 [00:01<00:00,  1.54s/batch]
Evaluation Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  6.18batch/s]


	Train Loss: 0.667 | Train Acc: 100.00%
	Eval Loss: 0.607 | Eval Acc: 50.00%


Training Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  4.66batch/s]
Evaluation Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 41.25batch/s]


	Train Loss: 0.461 | Train Acc: 100.00%
	Eval Loss: 0.600 | Eval Acc: 50.00%


Training Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  4.78batch/s]
Evaluation Epoch 2: 100%|██████████| 1/1 [00:00<00:00, 41.05batch/s]


	Train Loss: 0.347 | Train Acc: 100.00%
	Eval Loss: 0.609 | Eval Acc: 50.00%


Training Epoch 3: 100%|██████████| 1/1 [00:00<00:00,  3.98batch/s]
Evaluation Epoch 3: 100%|██████████| 1/1 [00:00<00:00, 27.28batch/s]


	Train Loss: 0.264 | Train Acc: 100.00%
	Eval Loss: 0.625 | Eval Acc: 50.00%


Training Epoch 4: 100%|██████████| 1/1 [00:00<00:00,  3.89batch/s]
Evaluation Epoch 4: 100%|██████████| 1/1 [00:00<00:00, 42.44batch/s]


	Train Loss: 0.225 | Train Acc: 100.00%
	Eval Loss: 0.639 | Eval Acc: 50.00%


Training Epoch 5: 100%|██████████| 1/1 [00:00<00:00,  3.51batch/s]
Evaluation Epoch 5: 100%|██████████| 1/1 [00:00<00:00, 32.39batch/s]


	Train Loss: 0.197 | Train Acc: 100.00%
	Eval Loss: 0.651 | Eval Acc: 50.00%


Training Epoch 6: 100%|██████████| 1/1 [00:00<00:00,  4.92batch/s]
Evaluation Epoch 6: 100%|██████████| 1/1 [00:00<00:00, 43.38batch/s]


	Train Loss: 0.176 | Train Acc: 100.00%
	Eval Loss: 0.661 | Eval Acc: 50.00%


Training Epoch 7: 100%|██████████| 1/1 [00:00<00:00,  4.89batch/s]
Evaluation Epoch 7: 100%|██████████| 1/1 [00:00<00:00, 39.01batch/s]


	Train Loss: 0.159 | Train Acc: 100.00%
	Eval Loss: 0.670 | Eval Acc: 50.00%


Training Epoch 8: 100%|██████████| 1/1 [00:00<00:00,  4.81batch/s]
Evaluation Epoch 8: 100%|██████████| 1/1 [00:00<00:00, 40.98batch/s]


	Train Loss: 0.149 | Train Acc: 100.00%
	Eval Loss: 0.678 | Eval Acc: 50.00%


Training Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  4.87batch/s]
Evaluation Epoch 9: 100%|██████████| 1/1 [00:00<00:00, 16.38batch/s]


	Train Loss: 0.142 | Train Acc: 100.00%
	Eval Loss: 0.686 | Eval Acc: 50.00%


Training Epoch 10: 100%|██████████| 1/1 [00:00<00:00,  4.78batch/s]
Evaluation Epoch 10: 100%|██████████| 1/1 [00:00<00:00, 42.00batch/s]


	Train Loss: 0.133 | Train Acc: 100.00%
	Eval Loss: 0.692 | Eval Acc: 50.00%


Training Epoch 11: 100%|██████████| 1/1 [00:00<00:00,  4.44batch/s]
Evaluation Epoch 11: 100%|██████████| 1/1 [00:00<00:00, 33.34batch/s]


	Train Loss: 0.124 | Train Acc: 100.00%
	Eval Loss: 0.698 | Eval Acc: 50.00%


Training Epoch 12: 100%|██████████| 1/1 [00:00<00:00,  4.85batch/s]
Evaluation Epoch 12: 100%|██████████| 1/1 [00:00<00:00, 41.59batch/s]


	Train Loss: 0.116 | Train Acc: 100.00%
	Eval Loss: 0.703 | Eval Acc: 50.00%


Training Epoch 13: 100%|██████████| 1/1 [00:00<00:00,  4.35batch/s]
Evaluation Epoch 13: 100%|██████████| 1/1 [00:00<00:00, 47.05batch/s]


	Train Loss: 0.110 | Train Acc: 100.00%
	Eval Loss: 0.707 | Eval Acc: 50.00%


Training Epoch 14: 100%|██████████| 1/1 [00:00<00:00,  4.92batch/s]
Evaluation Epoch 14: 100%|██████████| 1/1 [00:00<00:00, 40.39batch/s]


	Train Loss: 0.104 | Train Acc: 100.00%
	Eval Loss: 0.711 | Eval Acc: 50.00%


Training Epoch 15: 100%|██████████| 1/1 [00:00<00:00,  4.78batch/s]
Evaluation Epoch 15: 100%|██████████| 1/1 [00:00<00:00, 40.18batch/s]


	Train Loss: 0.100 | Train Acc: 100.00%
	Eval Loss: 0.714 | Eval Acc: 50.00%


Training Epoch 16: 100%|██████████| 1/1 [00:00<00:00,  4.74batch/s]
Evaluation Epoch 16: 100%|██████████| 1/1 [00:00<00:00, 40.17batch/s]


	Train Loss: 0.096 | Train Acc: 100.00%
	Eval Loss: 0.716 | Eval Acc: 50.00%


Training Epoch 17: 100%|██████████| 1/1 [00:00<00:00,  4.61batch/s]
Evaluation Epoch 17: 100%|██████████| 1/1 [00:00<00:00, 13.61batch/s]


	Train Loss: 0.093 | Train Acc: 100.00%
	Eval Loss: 0.718 | Eval Acc: 50.00%


Training Epoch 18: 100%|██████████| 1/1 [00:00<00:00,  4.68batch/s]
Evaluation Epoch 18: 100%|██████████| 1/1 [00:00<00:00, 39.65batch/s]


	Train Loss: 0.091 | Train Acc: 100.00%
	Eval Loss: 0.720 | Eval Acc: 50.00%


Training Epoch 19: 100%|██████████| 1/1 [00:00<00:00,  4.20batch/s]
Evaluation Epoch 19: 100%|██████████| 1/1 [00:00<00:00, 38.40batch/s]


	Train Loss: 0.088 | Train Acc: 100.00%
	Eval Loss: 0.721 | Eval Acc: 50.00%
Time: 0.115 minutes 


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Epoch Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Epoch Train Loss,█▆▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
Eval Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Eval Loss,▁▁▂▂▃▄▅▅▅▆▆▇▇▇▇█████
Step,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Step Training Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Step Training Loss,█▆▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
Epoch,19.0
Epoch Train Accuracy,1.0
Epoch Train Loss,0.08842
Eval Accuracy,0.5
Eval Loss,0.72141
Step,0.0
Step Training Accuracy,2.0
Step Training Loss,0.08842
