In [1]:
## Import packages
import torch
import time
import wandb
from tqdm import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

In [2]:
## Set the device to train your model.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device: ", device)

Device:  cuda


In [3]:
## Load the dataset
dataset = load_dataset("stanfordnlp/sst2")

## Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)

## Apply the tokenizer to the dataset.
dataset = dataset.map(lambda x: tokenizer(x['sentence'], truncation=True), batched=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [4]:
## Create batch of data using DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
## Create the model.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Train the Model.

In [6]:
## Create a trainer class.
class CustomTrainer(Trainer):
    def _inner_training_loop(
            self,
            batch_size = None, 
            args = None,
            resume_from_checkpoint = None,
            trial = None,
            ignore_keys_for_eval = None
    ):
        number_of_epochs = args.num_train_epochs
        start_time = time.time()

        criterion = torch.nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.9)

        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            train_loss_per_epoch = 0
            train_acc_per_epoch = 0
            with tqdm(train_dataloader, unit = 'batch') as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for step, inputs in enumerate(training_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels'].to(device)

                    ## Forward pass
                    self.optimizer.zero_grad()
                    model_outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                    ## Compute the loss
                    loss = criterion(model_outputs['logits'], labels)
                    train_loss_per_epoch += loss.item()

                    ## Calculate gradients
                    loss.backward()

                    ## Update weights
                    self.optimizer.step()
                    train_accuracy = (model_outputs['logits'].argmax(1) == labels).sum().item()
                    train_acc_per_epoch += train_accuracy

                    wandb.log({"Step Training Loss":loss.item()})

            ## Change the learning rate.
            self.scheduler.step()

            ## Compute the average loss and accuracy over all of the batches.
            train_loss_per_epoch /= len(train_dataloader)
            train_acc_per_epoch /= (len(train_dataloader) * batch_size)

            wandb.log({"Epoch Train Loss":train_loss_per_epoch,
                        "Epoch Train Accuracy":train_acc_per_epoch,
                        "Epoch":epoch})

            ## Run the Model on Evaluation Dataset
            eval_loss_per_epoch = 0
            eval_acc_per_epoch = 0
            with tqdm(eval_dataloader, unit='batch') as eval_epoch:
                eval_epoch.set_description(f"Evaluation Epoch {epoch}")
                with torch.no_grad():
                    for step, inputs in enumerate(eval_epoch):
                        inputs = inputs.to(device)
                        labels = inputs['labels'].to(device)

                        ## Foward pass
                        model_outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
                        
                        ## Compute loss
                        loss = criterion(model_outputs['logits'], labels)
                        eval_loss_per_epoch += loss.item()

                        ## Compute accuracy
                        eval_accuracy = (model_outputs['logits'].argmax(1) == labels).sum().item()
                        eval_acc_per_epoch += eval_accuracy
            
            eval_loss_per_epoch /= len(eval_dataloader)
            eval_acc_per_epoch /= (len(eval_dataloader) * batch_size)

            wandb.log({"Eval Loss": eval_loss_per_epoch, 
                        "Eval Accuracy": eval_acc_per_epoch})

            print(f'\tTrain Loss: {train_loss_per_epoch :.3f} | Train Acc: {train_acc_per_epoch*100:.2f}%')
            print(f'\tEval Loss: {eval_loss_per_epoch :.3f} | Eval Acc: {eval_acc_per_epoch*100:.2f}%')
        print(f'Time: {(time.time()-start_time)/60:.3f} minutes ')

In [7]:
## Set wandb parallelist to false.
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['WANDB_NOTEBOOK_NAME'] = 'hw2.ipynb'

## WandB setup to get plots and metrics.
config = dict(
    epochs = 3,
    classes = 2,
    batch_size = 128,
    learning_rate = 2e-5,
    dataset = 'sst2',
    architecture = 'bert'
)

## Setting up Training Pipeline inside WandB.
with wandb.init(project='NPL HW2', name='Full Data Training', config=config):

    ## Define Training Arguments.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=wandb.config.learning_rate,
        num_train_epochs=wandb.config.epochs,
        per_device_train_batch_size=wandb.config.batch_size,
        per_device_eval_batch_size=wandb.config.batch_size
    )

    ## Initialize the trainer
    trainer = CustomTrainer(
        model=model,
        args = training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    ## Train the model
    trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmohitydv09[0m ([33mmohitydv09-university-of-minnesota5275[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Epoch 0: 100%|██████████| 527/527 [01:35<00:00,  5.53batch/s]
Evaluation Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 14.54batch/s]


	Train Loss: 0.207 | Train Acc: 91.66%
	Eval Loss: 0.210 | Eval Acc: 89.40%


Training Epoch 1: 100%|██████████| 527/527 [01:35<00:00,  5.50batch/s]
Evaluation Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 14.47batch/s]


	Train Loss: 0.088 | Train Acc: 96.94%
	Eval Loss: 0.232 | Eval Acc: 89.62%


Training Epoch 2: 100%|██████████| 527/527 [01:36<00:00,  5.48batch/s]
Evaluation Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 13.34batch/s]


	Train Loss: 0.048 | Train Acc: 98.19%
	Eval Loss: 0.230 | Eval Acc: 89.96%
Time: 4.815 minutes 


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Epoch,▁▅█
Epoch Train Accuracy,▁▇█
Epoch Train Loss,█▃▁
Eval Accuracy,▁▄█
Eval Loss,▁█▇
Step Training Loss,█▅▃▃▄▂▃▃▄▂▃▃▂▂▁▂▁▂▂▂▃▃▁▂▂▃▂▁▁▁▁▁▁▂▁▂▁▁▂▂

0,1
Epoch,2.0
Epoch Train Accuracy,0.98194
Epoch Train Loss,0.04805
Eval Accuracy,0.89955
Eval Loss,0.23041
Step Training Loss,0.02899


In [55]:
## Get Test accuracy and wrong items
import pandas as pd
from torch.utils.data import DataLoader

incorrect_pred_df = pd.DataFrame(columns=['text', 'ground truth', 'prediction','confidence score', 
                                          'chatGPT prediction', 'error type','error cause','potential solution'])
# ['idx', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask']

## Validation set is used here as the test data in the dataset doesn't have labels so we won't be able to calculate accuracy by code.
test_dataset = dataset['validation'].remove_columns(['sentence','token_type_ids' ])
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        batch = batch.to(device)

        predictions = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # print(len(predictions['logits']))
        for prediction, label, idx in zip(predictions['logits'],batch['labels'], batch['idx']):
            if(prediction.argmax() != label):
                new_row = {'text':dataset['validation']['sentence'][idx],
                           'ground truth':label.item(),
                           'prediction':prediction.argmax().item(),
                           'confidence score': torch.nn.functional.softmax(prediction, dim=-1).max().item(),
                           'error type': 'false positive' if (label.item() == 0) else 'false negetive'
                           }
                incorrect_pred_df = pd.concat([incorrect_pred_df, pd.DataFrame([new_row])], ignore_index=True)
        if len(incorrect_pred_df) > 20:

            break

  incorrect_pred_df = pd.concat([incorrect_pred_df, pd.DataFrame([new_row])], ignore_index=True)


In [56]:
incorrect_pred_df

Unnamed: 0,text,ground truth,prediction,confidence score,chatGPT prediction,error type,error cause,potential solution
0,vera 's technical prowess ends up selling his ...,0,1,0.803044,,false positive,,
1,"a full world has been presented onscreen , not...",1,0,0.739702,,false negetive,,
2,you 'll gasp appalled and laugh outraged and p...,1,0,0.682118,,false negetive,,
3,a solid film ... but more conscientious than i...,1,0,0.690323,,false negetive,,
4,though it 's become almost redundant to say so...,1,0,0.848193,,false negetive,,
5,every nanosecond of the the new guy reminds yo...,0,1,0.976213,,false positive,,
6,something akin to a japanese alice through the...,1,0,0.639032,,false negetive,,
7,the experience of going to a film festival is ...,0,1,0.882621,,false positive,,
8,"it 's inoffensive , cheerful , built to inspir...",0,1,0.765365,,false positive,,
9,"not far beneath the surface , this reconfigure...",1,0,0.521828,,false negetive,,


In [5]:
### Get ChatGPT responces on the data.
from openai import OpenAI
from openAIkey import OPENAI_API_KEY

client = OpenAI(api_key=OPENAI_API_KEY)

def get_chatgpt_responce(text):
    prompt = f'Classify the sentiment of the following text as 0 for negative or 1 for positive: {text}'

    responce = client.chat.completions.create(
        model = 'gpt-3.5-turbo',
        messages = [
            {
                'role': 'user',
                'content': prompt
            }
        ]
    )
    return responce.choices[0].message['content']


print(get_chatgpt_responce('that loves its characters and communicates something rather beautiful about human nature'))

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}