<a href="https://colab.research.google.com/github/pedrofuentes79/RNNs/blob/master/Sentiment-Analysis/sentAnalysis_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this notebook I will use a transformer-based model to solve the same problem: determining whether a review is positive or negative.
I will use the BERT model.

# Imports and dataset

In [1]:
!pip install --upgrade transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install pytorch-lightning

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.7 MB/s[0m eta [36m0:00:0

In [2]:
import tensorflow as tf
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.optimizers import AdamW, Adam

import torch
from torch.utils.data import IterableDataset, TensorDataset, DataLoader
import matplotlib.pyplot as plt
import nltk
# Fetch the NLTK stopwords corpus into the local nltk_data directory.
# NOTE(review): no cell in the visible notebook uses this corpus — confirm the download is still needed.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import drive

# Make Google Drive visible to the Colab runtime so the CSV can be read from it.
drive.mount("/content/drive")

# Load the reviews and keep a fixed 1000-row random sample (seed 27).
df = pd.read_csv(
    "/content/drive/MyDrive/ColabProjects/amazonreviews_downsampled.csv"
).sample(n=1000, random_state=27)

Mounted at /content/drive


In [4]:
# Constants
# Token budget per review: handed to the tokenizer as max_length with padding
# and truncation enabled, so every encoded sequence ends up this long.
MAXLEN = 128

In [6]:
# Raw review text; tokenized in a later cell.
text = df["Text"]

# Binary sentiment target derived from the star rating: 4-5 -> 1, 1-3 -> 0.
labels = torch.tensor(
    df["Score"]
    .map({1: 0, 2: 0, 3: 0, 4: 1, 5: 1})
    .astype("int32")
    .to_numpy()
)

# Tokenize

In [7]:
# WordPiece tokenizer matching the pretrained checkpoint loaded later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# batch_encode_plus is documented to take a list of strings, so pass a plain
# Python list rather than the pandas Series itself (list-only tokenizer
# implementations reject other containers).
# Every review is padded or truncated to exactly MAXLEN tokens.
encoding = tokenizer.batch_encode_plus(
    text.tolist(),
    max_length=MAXLEN,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Token ids and the matching padding mask, both returned as PyTorch tensors.
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Sanity check: both tensors should have one row per review and MAXLEN columns.
print(input_ids.shape, attention_mask.shape, sep="\n")

torch.Size([1000, 128])
torch.Size([1000, 128])


In [9]:
# First cut: 70% of the examples for training, 30% held back.
(
    train_inputs, temp_inputs,
    train_attention_masks, temp_attention_masks,
    train_labels, temp_labels,
) = train_test_split(
    input_ids, attention_mask, labels,
    test_size=0.3, random_state=27,
)

# Second cut: divide the held-back 30% evenly into validation and test sets.
(
    val_inputs, test_inputs,
    val_attention_masks, test_attention_masks,
    val_labels, test_labels,
) = train_test_split(
    temp_inputs, temp_attention_masks, temp_labels,
    test_size=0.5, random_state=27,
)

In [10]:
# Bundle (token ids, attention mask, label) per split so each dataset item is
# a ready-to-use triple.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(split_inputs, split_masks, split_labels)
    for split_inputs, split_masks, split_labels in (
        (train_inputs, train_attention_masks, train_labels),
        (val_inputs, val_attention_masks, val_labels),
        (test_inputs, test_attention_masks, test_labels),
    )
)

In [11]:
# Mini-batch size shared by all three loaders; tune to the available memory.
batch_size = 32

# Only the training split is shuffled; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (val_dataset, test_dataset)
)

In [12]:
from transformers import BertForSequenceClassification

# num_labels=1 gives the classification head a single output logit per review;
# the Lightning module defined further down applies a sigmoid to that logit
# and trains it with binary cross-entropy.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Loop

In [None]:
import pytorch_lightning as pl

# Define your model as a PyTorch Lightning module
class SentimentClassifier(pl.LightningModule):
    """Lightning wrapper that fine-tunes a sequence classifier on binary labels.

    The wrapped model must produce one logit per example (this notebook loads
    ``BertForSequenceClassification`` with ``num_labels=1``). The loss is the
    binary cross-entropy between that logit and the 0/1 label.

    Args:
        model: The model to fine-tune; called as
            ``model(input_ids, attention_mask=..., labels=...)``.
        learning_rate: Learning rate handed to AdamW.
    """

    def __init__(self, model, learning_rate=2e-5):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output unchanged."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch, stage):
        """Compute and log the loss for one batch.

        Args:
            batch: ``(input_ids, attention_mask, labels)`` as yielded by the
                notebook's ``TensorDataset`` loaders.
            stage: Prefix for the logged metric (``"train"``, ``"val"`` or
                ``"test"``), giving ``"<stage>_loss"``.

        Returns:
            The scalar loss tensor for the batch.
        """
        input_ids, attention_mask, labels = batch
        # No labels are passed to the model, so it computes no loss of its own
        # and the first element of its output is the logits.
        logits = self(input_ids, attention_mask)[0]
        # The logits variant fuses sigmoid and BCE in a numerically stable
        # form, so saturated logits cannot produce log(0).
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits.view(-1), labels.float().view(-1)
        )
        self.log(f"{stage}_loss", loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for ``batch`` (logged as ``val_loss``)."""
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        """Return the test loss for ``batch`` (logged as ``test_loss``)."""
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at ``self.learning_rate``."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)



# Hyperparameters for the run. Only num_train_epochs and learning_rate are
# consumed below; the remaining fields are Hugging Face Trainer settings that
# PyTorch Lightning does not read.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate after each epoch
    save_total_limit=2,  # Limit the number of checkpoints saved
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=3e-5,
)

# Pass the configured learning rate through; otherwise the classifier silently
# uses its own default (2e-5) and the value above has no effect.
classifier = SentimentClassifier(model, learning_rate=training_args.learning_rate)

# Cap the run at the configured number of epochs. A bare pl.Trainer() ignores
# training_args and keeps training until Lightning's own default epoch limit.
trainer = pl.Trainer(max_epochs=int(training_args.num_train_epochs))

# Train the model, validating on val_loader after each epoch.
trainer.fit(classifier, train_loader, val_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

## Validation and testing

In [None]:
# Evaluate the fine-tuned model on the validation split.
model.eval()
device = next(model.parameters()).device  # batches must live where the weights are
val_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    # Batch-local names keep the notebook-level `input_ids`, `attention_mask`
    # and `labels` tensors from being overwritten by the last mini-batch.
    for batch_input_ids, batch_attention_mask, batch_labels in val_loader:
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # The model has a single output logit per review: flatten (batch, 1) -> (batch,).
        logits = outputs.logits.view(-1)

        # Same loss as in training; summed here so that dividing by `total`
        # below gives a true per-example mean even if the last batch is short.
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, batch_labels.float(), reduction="sum"
        )
        val_loss += loss.item()

        # sigmoid(logit) > 0.5 is equivalent to logit > 0. An argmax over the
        # single-column logits would always return class 0.
        predicted = (logits > 0).to(batch_labels.dtype)
        total += batch_labels.size(0)
        correct += predicted.eq(batch_labels).sum().item()

if total == 0:
    raise ValueError("val_loader produced no examples")

val_loss /= total
val_accuracy = 100 * correct / total
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")