In [1]:
# Install libraries for NLP and ML tasks
!pip install transformers datasets evaluate scikit-learn

# Install transformers with PyTorch support
!pip install transformers[torch]

# Install Weights & Biases for experiment tracking
!pip install wandb

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Importing necessary libraries
from huggingface_hub import notebook_login
import time
import math
from tqdm import tqdm
import wandb

In [3]:
import torch

# Check if CUDA (GPU support) is available, otherwise use CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Print the device being used (GPU or CPU)
print(f'device: {device}')

device: cuda:0


In [4]:
# Logging into Hugging Face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
# Importing dataset sst2
from datasets import load_dataset

dataset = load_dataset("sst2")

In [20]:
# Checking the dataset
dataset["validation"][0]

{'idx': 0,
 'sentence': "it 's a charming and often affecting journey . ",
 'label': 1}

In [21]:
from transformers import AutoTokenizer

# Initialize a tokenizer for the 'distilbert-base-uncased' model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [22]:
# Define a preprocessing function to tokenize input sentences
def preprocess_function(examples):
    # Tokenize the sentence(s), truncating them if they exceed the maximum length allowed by the model
    return tokenizer(examples["sentence"], truncation=True)

In [23]:
# Apply the preprocess_function to the entire dataset, processing in batches for efficiency
tokenized_review = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [24]:
from transformers import DataCollatorWithPadding

# Initialize a data collator that will dynamically pad the batched samples to the maximum length in each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
import evaluate

# Load the accuracy metric from the 'evaluate' library to evaluate model performance
accuracy = evaluate.load("accuracy")

In [26]:
import numpy as np

# Define a function to compute metrics for evaluation
def compute_metrics(eval_pred):
    # Unpack the predictions and labels from eval_pred
    predictions, labels = eval_pred
    # Convert logits to predicted class indices
    predictions = np.argmax(predictions, axis=1)
    # Compute accuracy between predictions and actual labels
    return accuracy.compute(predictions=predictions, references=labels)

In [27]:
labels = ['NEGATIVE', 'POSITIVE']

# Create a dictionary mapping from index to label
id2label = {i: label for i, label in enumerate(labels)}
# Create a dictionary mapping from label to index
label2id = {label: i for i, label in id2label.items()}

# Print the mappings
print('id2label:', id2label)
print('label2id:', label2id)

id2label: {0: 'NEGATIVE', 1: 'POSITIVE'}
label2id: {'NEGATIVE': 0, 'POSITIVE': 1}


In [28]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Initialize a model for sequence classification based on 'distilbert-base-uncased',
# specifying the number of labels and the label mappings
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Define training arguments for the Trainer
training_args = TrainingArguments(
    output_dir="hw1",  # Directory where the model checkpoints will be saved
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=45,  # Training batch size per device (GPU/CPU)
    per_device_eval_batch_size=45,  # Evaluation batch size per device (GPU/CPU)
    num_train_epochs=3,  # Total number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model in terms of metrics at the end of training
    push_to_hub=True,  # Push the model to the Hugging Face Model Hub
)

In [30]:
# Setting up custom trainer
class CustomTrainer(Trainer):
    def _inner_training_loop(
        self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
    ):
        wandb.init(
        # Set the wandb project where this run will be logged
        project="NLP_hw1",

        # Track hyperparameters and run metadata
        config={
        "learning_rate": args.learning_rate,
        "architecture": "NN",
        "dataset": "SST2",
        "epochs": args.num_train_epochs,
        }
        )
        number_of_epochs = args.num_train_epochs
        start = time.time()
        train_loss =[]
        train_acc =[]
        eval_acc =[]
        criterion = torch.nn.BCEWithLogitsLoss ().to(device) # to make all the computations happen through cuda
        self.optimizer = torch.optim.Adam(model.parameters(),lr=args.learning_rate )
        self.scheduler = torch.optim.lr_scheduler.StepLR (self.optimizer,1,gamma=0.9)
        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()
        max_steps = math . ceil(args.num_train_epochs * len ( train_dataloader ))
        for epoch in range(number_of_epochs):
          train_loss_per_epoch = 0
          train_acc_per_epoch = 0
          with tqdm (train_dataloader,unit=" batch ") as training_epoch :
            training_epoch.set_description(f"Training Epoch { epoch }")
            for step,inputs in enumerate(training_epoch):
              inputs = inputs.to (device)
              #return(inputs)
              labels = inputs['labels'].float()

              # forward pass
              self.optimizer.zero_grad ()
              output = model(**inputs)# TODO Implement by yourself

              # get the loss
              loss = criterion(output[1][:,1],labels) # TODO Implement by yourself
              curr_acc=(output ['logits'].argmax(1)==labels).sum().item()

              # print(f'\tTrain Loss per loop: {loss:.3f} | Train Acc per loop: {curr_acc*100:.2f}%') ---- CHECK THIS
              train_loss_per_epoch += loss.item ()

              # calculate gradients
              loss.backward ()

              # update weights
              self.optimizer.step()
              train_acc_per_epoch += (output ['logits'].argmax(1)==labels).sum().item()

          # adjust the learning rate
          self.scheduler.step ()
          train_loss_per_epoch /= len ( train_dataloader )
          train_acc_per_epoch /= (len ( train_dataloader )* batch_size )
          eval_loss_per_epoch = 0
          eval_acc_per_epoch = 0
          with tqdm ( eval_dataloader , unit =" batch ") as eval_epoch :
            eval_epoch . set_description (f" Evaluation Epoch { epoch }")
            for step,inputs in enumerate(eval_epoch):
              inputs = inputs.to (device)
              labels = inputs['labels'].float()
              # forward pass
              output = model(**inputs)
              loss = criterion(output[1][:,1],labels)
              eval_loss_per_epoch += loss.item ()
              eval_acc_per_epoch += (output['logits'].argmax(1)==labels).sum().item()
            # ... TODO Implement by yourself
          eval_loss_per_epoch /= (len ( eval_dataloader ))
          eval_acc_per_epoch /= ( len ( eval_dataloader )* batch_size )
          print (f'\n\t Train Loss:{train_loss_per_epoch:.3f} | Train Acc : {train_acc_per_epoch * 100 :.2f}% ') # These give the accuracy after each epoch ( epoch is traning through entire training set)
          print (f'\t Eval Loss:{eval_loss_per_epoch:.3f} | Eval Acc : {eval_acc_per_epoch * 100 :.2f}%')
          wandb.log({"val_acc": eval_acc_per_epoch, "loss": train_loss_per_epoch,"train_acc":train_acc_per_epoch})
        print (f'Time :{(time.time()-start)/60:.3f} minutes ')
        wandb.finish()


In [31]:
# Initialize the custom trainer with model, training arguments, datasets, tokenizer, data collator, and metric computation function
cust_trainer = CustomTrainer(
    model=model,  # The model to be trained
    args=training_args,  # Training arguments
    train_dataset=tokenized_review["train"],  # Training dataset
    eval_dataset=tokenized_review["validation"],  # Validation dataset for evaluation
    tokenizer=tokenizer,  # Tokenizer for preprocessing data
    data_collator=data_collator,  # Data collator for dynamically padding batches
    compute_metrics=compute_metrics,  # Function to compute metrics for evaluation
)

In [32]:
# Start training the model using the custom trainer
cust_trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Training Epoch 0:   0%|          | 0/1497 [00:00<?, ? batch /s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Training Epoch 0: 100%|██████████| 1497/1497 [04:03<00:00,  6.14 batch /s]
 Evaluation Epoch 0: 100%|██████████| 20/20 [00:01<00:00, 16.13 batch /s]



	 Train Loss:0.207 | Train Acc : 91.74% 
	 Eval Loss:0.232 | Eval Acc : 87.78%


Training Epoch 1: 100%|██████████| 1497/1497 [04:01<00:00,  6.20 batch /s]
 Evaluation Epoch 1: 100%|██████████| 20/20 [00:01<00:00, 16.25 batch /s]



	 Train Loss:0.093 | Train Acc : 96.86% 
	 Eval Loss:0.263 | Eval Acc : 86.89%


Training Epoch 2: 100%|██████████| 1497/1497 [04:02<00:00,  6.17 batch /s]
 Evaluation Epoch 2: 100%|██████████| 20/20 [00:01<00:00, 16.32 batch /s]



	 Train Loss:0.051 | Train Acc : 98.20% 
	 Eval Loss:0.283 | Eval Acc : 87.33%
Time :12.189 minutes 


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▃▁
train_acc,▁▇█
val_acc,█▁▅

0,1
loss,0.05144
train_acc,0.98199
val_acc,0.87333


In [33]:
  # Push the trained model and its tokenizer to the Hugging Face Model Hub
cust_trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mudit1903/hw1/commit/8646293bd7b1e2cea17638ce8550e6db65062eb1', commit_message='End of training', commit_description='', oid='8646293bd7b1e2cea17638ce8550e6db65062eb1', pr_url=None, pr_revision=None, pr_num=None)

In [34]:
# Extract the validation dataset from the dataset object
validation = dataset['validation']

In [35]:
# Evaluate the model on the tokenized validation dataset
cust_trainer.evaluate(tokenized_review["validation"])

[34m[1mwandb[0m: Currently logged in as: [33mjantw002[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 0.2817418575286865,
 'eval_accuracy': 0.9013761467889908,
 'eval_runtime': 1.3375,
 'eval_samples_per_second': 651.968,
 'eval_steps_per_second': 14.953}

In [36]:
from transformers import AutoModelForSequenceClassification

# Assuming 'model' is already defined and initialized earlier
# Directly evaluate the model on the tokenized validation dataset
evaluation_results_trainer = cust_trainer.evaluate(tokenized_review["validation"])

# Output the evaluation results
evaluation_results_trainer

{'eval_loss': 0.2817418575286865,
 'eval_accuracy': 0.9013761467889908,
 'eval_runtime': 1.3052,
 'eval_samples_per_second': 668.074,
 'eval_steps_per_second': 15.323}

In [37]:
# Print the evaluation results stored in the variable
print(evaluation_results_trainer)

{'eval_loss': 0.2817418575286865, 'eval_accuracy': 0.9013761467889908, 'eval_runtime': 1.3052, 'eval_samples_per_second': 668.074, 'eval_steps_per_second': 15.323}


In [38]:
# Accessing the hidden size of the model
model.config.hidden_size

768

In [39]:
# Accessing the dropout rate of the model
model.config.dropout

0.1

In [40]:
# Print the full configuration of the model
print(model.config)

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}



Analysis of model:

In [41]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Initialize counters for tracking mismatches and indexing
count = 0
i = 0

# Model name from Hugging Face's Model Hub
model_name = 'mudit1903/hw1'

# Load tokenizer and model from the specified model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Loop until 10 mismatches between predicted and actual labels are found
while count != 10:
    # Extract the sentence to be classified
    text = validation['sentence'][i]

    # Tokenize the text and prepare it for the model
    inputs = tokenizer(text, return_tensors="pt")

    # Perform inference without computing gradients for efficiency
    with torch.no_grad():
        logits = model(**inputs).logits

    # Determine the predicted class ID based on the logits
    predicted_class_id = logits.argmax().item()

    # Check if the prediction matches the actual label
    if predicted_class_id != validation['label'][i]:
        # Print the sentence and confidence scores for incorrect predictions
        print(text)
        print('Confidence score:', torch.nn.functional.softmax(logits, dim=1))
        print('Predicted:', model.config.id2label[predicted_class_id], "The actual is:", model.config.id2label[validation['label'][i]])

        # Increment the mismatch counter
        count += 1

    # Move to the next sentence
    i += 1

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . 
Confidence score: tensor([[0.5861, 0.4139]])
Predicted: NEGATIVE The actual is: POSITIVE
holden caulfield did it better . 
Confidence score: tensor([[0.0029, 0.9971]])
Predicted: POSITIVE The actual is: NEGATIVE
if the movie succeeds in instilling a wary sense of ` there but for the grace of god , ' it is far too self-conscious to draw you deeply into its world . 
Confidence score: tensor([[0.4880, 0.5120]])
Predicted: POSITIVE The actual is: NEGATIVE
it offers little beyond the momentary joys of pretty and weightless intellectual entertainment . 
Confidence score: tensor([[0.4096, 0.5904]])
Predicted: POSITIVE The actual is: NEGATIVE
the primitive force of this film seems to bubble up from the vast collective memory of the combatants . 
Confidence score: tensor([[0.5442, 0.4558]])
Predicted: NEGATIVE The actual is: POSITIVE
though it 's become almost redundant to say so , major kudos go