# Sentiment Analysis with DistilBERT and SST2 Dataset

## Install, import and setup

In [1]:
%pip install transformers datasets torch



In [2]:
%pip install --upgrade datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [3]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'N/A'
print(f"Using device: {device}")
print(f"GPU: {gpu_name}")

Using device: cuda
GPU: Tesla T4


## Load the SST2 dataset

In [6]:
# Load SST-2 and print the first training record to check its fields.
dataset = load_dataset("stanfordnlp/sst2")

first_train_example = dataset['train'][0]
print(first_train_example)

{'idx': 0, 'sentence': 'hide new secretions from the parental units ', 'label': 0}


## Load the DistilBERT Tokenizer and Tokenize the dataset

In [7]:
# Tokenizer that belongs to the distilbert-base-uncased checkpoint.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of SST-2 records with the notebook's tokenizer.

    Args:
        examples: Batch of records; its ``"sentence"`` entry is passed to
            the tokenizer.

    Returns:
        The tokenizer output, with every sequence truncated and padded to
        a fixed length of 128 tokens.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(
        sentences,
        max_length=128,
        truncation=True,
        padding='max_length',
    )
    return encoded


# batched=True passes many records per call to tokenize_function.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

## Convert to PyTorch Dataset

In [9]:
train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]

# Expose only the model inputs and the label, returned as torch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, validation_dataset):
    split.set_format("torch", columns=model_columns)

# Sanity check: print the tensor shapes of the first training example.
first_example = train_dataset[0]
for column in model_columns:
    print(first_example[column].shape)

torch.Size([128])
torch.Size([128])
torch.Size([])


## Load and Configure the DistilBERT Model

In [10]:
# Load pre-trained DistilBERT with a sequence-classification head that has
# two output labels.
checkpoint_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Freeze DistilBERT Layers

In [11]:
# Turn off gradients for every parameter under model.distilbert; parameters
# outside that submodule are left trainable.
model.distilbert.requires_grad_(False)
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Configure Training: Metric, Arguments, and Trainer

In [13]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation result.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            score per class along axis 1; ``labels`` holds the true class ids.

    Returns:
        dict with the single key ``"accuracy"``: the fraction of rows whose
        highest-scoring class equals the label.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    is_correct = predicted_classes == labels
    return {"accuracy": is_correct.mean()}

# Hyperparameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch",   # evaluate at the end of every epoch
    save_strategy="epoch",   # write a checkpoint at the end of every epoch
    logging_dir="./logs",
    # The *string* "none" disables every logging integration. Python None
    # means "not set" and, in transformers releases where the default is
    # "all", silently enables whatever is installed (e.g. wandb).
    report_to="none",
)

# Wire the model, both dataset splits and the accuracy metric into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=validation_dataset,
    train_dataset=train_dataset,
)

## Train the Model

In [14]:
# Run training; the bare expression on the last line shows the returned
# TrainOutput as the cell result.
train_output = trainer.train()
train_output

[34m[1mwandb[0m: Currently logged in as: [33mlucas-le[0m ([33mlucas-le-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.395,0.384372,0.830275
2,0.3717,0.375362,0.831422
3,0.3694,0.374796,0.832569


TrainOutput(global_step=3159, training_loss=0.39559492743366237, metrics={'train_runtime': 775.1791, 'train_samples_per_second': 260.646, 'train_steps_per_second': 4.075, 'total_flos': 6691160124062208.0, 'train_loss': 0.39559492743366237, 'epoch': 3.0})

## Save the model

In [15]:
# Save the model weights/config together with the tokenizer files so the
# directory can be reloaded on its own with from_pretrained().
save_dir = "./distilbert-sst2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir);  # trailing ';' hides the returned file list

## Evaluate the Model on the Validation Set

In [16]:
# trainer.evaluate() scores the Trainer's eval_dataset, which this notebook
# sets to the SST-2 validation split, so the numbers are labelled as
# validation metrics rather than test metrics.
results = trainer.evaluate()
print(results)
print(f"Validation loss: {results['eval_loss']}")
print(f"Validation accuracy: {results['eval_accuracy']}")

{'eval_loss': 0.3747960329055786, 'eval_accuracy': 0.8325688073394495, 'eval_runtime': 3.1058, 'eval_samples_per_second': 280.764, 'eval_steps_per_second': 4.508, 'epoch': 3.0}
Test loss: 0.3747960329055786
Test accuracy: 0.8325688073394495
