https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from datasets import load_dataset
import seaborn as sns
import plotly.express as ex

## Helper funtions

In [2]:
def check_available_device():
    if torch.cuda.is_available():       
        device = torch.device("cuda")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        print('Device name:', torch.cuda.get_device_name(0))

    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

## Global variables

In [3]:
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
DEVICE = check_available_device()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(DEVICE)

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3070 Ti Laptop GPU


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Prepare datasets

In [4]:
dataset_train = load_dataset("imdb", split="train")
dataset_test = load_dataset("imdb", split="test")
train_data = pd.DataFrame(dataset_train)
test_df = pd.DataFrame(dataset_test)

Found cached dataset imdb (/home/erthax/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/erthax/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [5]:
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=666)

In [6]:
train_df

Unnamed: 0,text,label
17785,"If I'm going to watch a porn movie, I prefer i...",1
14913,"That magical moment in life, that point betwee...",1
9323,I'm among millions who consider themselves Car...,0
18761,I was lucky enough to catch this film finally ...,1
20738,If you want Scream or anything like the big-st...,1
...,...,...
8262,"Hiya folks,<br /><br />Well, this movie sucks ...",0
2878,This movie was so ridiculous i never even fini...,0
7597,This is the kind of movie which shows the pauc...,0
10114,"The documentary revolves around Eva Mozes Kor,...",0


## Text preprocessing

In [7]:
def preprocess_data(data, tokenizer):
    input_ids = []
    attention_masks = []

    for text in data['text']:
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(data['label'].values)

    return input_ids, attention_masks, labels

In [8]:
train_input_ids, train_attention_masks, train_labels = preprocess_data(train_df, TOKENIZER)
val_input_ids, val_attention_masks, val_labels = preprocess_data(val_df, TOKENIZER)
test_input_ids, test_attention_masks, test_labels = preprocess_data(test_df, TOKENIZER)

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [9]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(5):
    model.train()

    for batch in tqdm(train_dataloader, desc="Training step"):
        input_ids, attention_masks, labels = batch
        input_ids = input_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)

        loss = outputs[0]
        loss.backward()

        optimizer.step()

    print(f'Training - Epoch: {epoch}, Loss: {loss.item()}')

    model.eval()

    with torch.no_grad():
        all_preds = []
        all_labels = []

        for batch in tqdm(val_dataloader, desc="Validation step"):
            input_ids, attention_masks, labels = batch
            input_ids = input_ids.to(DEVICE)
            attention_masks = attention_masks.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_masks)
            _, preds = torch.max(outputs[0], dim=1)

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

        print(f'Validation - Epoch: {epoch}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

Training steps: 2813it [18:23,  2.55it/s]


Training - Epoch: 0, Step: 2812, Loss: 0.12774477899074554


Validation steps: 100%|███████████████████████| 313/313 [00:41<00:00,  7.52it/s]


Validation - Epoch: 0, Accuracy: 0.9284, Precision: 0.9045112781954887, Recall: 0.9585657370517928, F1 Score: 0.9307543520309477


Training steps: 2813it [18:25,  2.54it/s]


Training - Epoch: 1, Step: 2812, Loss: 0.008139469660818577


Validation steps: 100%|███████████████████████| 313/313 [00:41<00:00,  7.50it/s]


Validation - Epoch: 1, Accuracy: 0.9404, Precision: 0.9273570324574961, Recall: 0.9561752988047809, F1 Score: 0.9415457041977247


Training steps: 2813it [18:26,  2.54it/s]


Training - Epoch: 2, Step: 2812, Loss: 0.0007470625569112599


Validation steps: 100%|███████████████████████| 313/313 [00:41<00:00,  7.50it/s]


Validation - Epoch: 2, Accuracy: 0.9364, Precision: 0.9328593996840442, Recall: 0.9410358565737051, F1 Score: 0.9369297897659659


Training steps: 2813it [18:28,  2.54it/s]


Training - Epoch: 3, Step: 2812, Loss: 0.21905921399593353


Validation steps: 100%|███████████████████████| 313/313 [00:41<00:00,  7.48it/s]


Validation - Epoch: 3, Accuracy: 0.9344, Precision: 0.914828897338403, Recall: 0.9585657370517928, F1 Score: 0.9361867704280156


Training steps: 2813it [18:29,  2.54it/s]


Training - Epoch: 4, Step: 2812, Loss: 0.00019843250629492104


Validation steps: 100%|███████████████████████| 313/313 [00:41<00:00,  7.47it/s]

Validation - Epoch: 4, Accuracy: 0.934, Precision: 0.9373996789727127, Recall: 0.9306772908366534, F1 Score: 0.9340263894442224





In [11]:
model.eval()

with torch.no_grad():
    all_preds = []
    all_labels = []

    for batch in tqdm(test_dataloader, desc="Testing step"):
        input_ids, attention_masks, labels = batch

        input_ids = input_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)

        outputs = model(input_ids, attention_mask=attention_masks)
        _, preds = torch.max(outputs[0], dim=1)

        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    print(f'Testing - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')


Testing step: 100%|█████████████████████████| 3125/3125 [06:57<00:00,  7.48it/s]

Testing - Accuracy: 0.9318, Precision: 0.9375050660614412, Recall: 0.92528, F1 Score: 0.9313524177638202



