In [2]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel, AdamW
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

In [3]:
# Load Dataset
df = pd.read_csv('IMDB-Dataset.csv')

# Convert Sentiment to Binary Labels
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"used device: {device}")

used device: cuda


In [5]:
# prompt: split the whole dataset and use 80% for training/fine-tuning, 10% for testing, and 10% for validation

from sklearn.model_selection import train_test_split

# Split the data into training and a temporary set (test + validation)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split the temporary set into testing and validation sets
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training data size: {len(train_df)}")
print(f"Testing data size: {len(test_df)}")
print(f"Validation data size: {len(val_df)}")

Training data size: 40000
Testing data size: 5000
Validation data size: 5000


In [6]:
# prompt: use BERT tokenizer to tokenize the dataset

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_dataset(df):
  tokenized_reviews = tokenizer(
      df['review'].tolist(),
      padding='max_length',
      truncation=True,
      max_length=512,
      return_tensors='pt'
  )
  return tokenized_reviews

train_tokenized = tokenize_dataset(train_df)
test_tokenized = tokenize_dataset(test_df)
val_tokenized = tokenize_dataset(val_df)

print(train_tokenized.keys())
# Now you have tokenized inputs for your training, testing, and validation sets.

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [7]:
train_labels = train_df['sentiment'].tolist()
val_labels = val_df['sentiment'].tolist()
test_labels = test_df['sentiment'].tolist()

In [8]:
# Convert the labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

In [9]:
print(train_tokenized['input_ids'].shape)
print(train_tokenized['attention_mask'].shape)
print(train_labels.shape)

torch.Size([40000, 512])
torch.Size([40000, 512])
torch.Size([40000])


In [10]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Create TensorDataset for training data
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Create TensorDataset for validation data
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# Create TensorDataset for test data
test_dataset = TensorDataset(test_tokenized['input_ids'], test_tokenized['attention_mask'], test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

In [11]:
!nvidia-smi

Thu Oct 17 22:35:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           On  | 00000000:3B:00.0 Off |                    0 |
| N/A   29C    P0              24W / 250W |      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # I have two labels: positive and negative
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Set up the AdamW optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Set the number of epochs
epochs = 3

# Total number of training steps = number of batches * number of epochs
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Typically warm-up steps are set as a percentage of the total steps
                                            num_training_steps=total_steps)


In [15]:
import time
import datetime
import numpy as np

# Function to format elapsed time
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Training loop
for epoch in range(epochs):
    print(f'\n======== Epoch {epoch + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    model.train()  # Put model in training mode

    # Iterate over each batch
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step}  of  {len(train_dataloader)}.    Elapsed: {elapsed}.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()  # Zero any previously calculated gradients

        # Perform a forward pass (calculate predictions)
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits

        # Accumulate the training loss for this batch
        total_train_loss += loss.item()

        # Backward pass to calculate the gradients
        loss.backward()

        # Clip gradients to avoid "exploding gradients" problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters
        optimizer.step()

        # Update the learning rate
        scheduler.step()

    # Calculate average training loss for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f"Average Training Loss: {avg_train_loss}")
    print(f"Training Epoch took: {training_time}")

    # Validation Phase
    print("\nRunning Validation...")

    t0 = time.time()
    model.eval()  # Put model in evaluation mode

    total_eval_loss = 0
    total_eval_accuracy = 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():  # Do not calculate gradients (to save memory and speed up validation)
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits

        total_eval_loss += loss.item()

        # Move logits and labels to CPU to calculate accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate accuracy
        preds = np.argmax(logits, axis=1)
        total_eval_accuracy += np.sum(preds == label_ids) / len(label_ids)

    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)
    validation_time = format_time(time.time() - t0)

    print(f"Validation Loss: {avg_val_loss}")
    print(f"Validation Accuracy: {avg_val_accuracy:.2f}")
    print(f"Validation took: {validation_time}")

print("\nTraining complete!")


Training...
  Batch 40  of  2500.    Elapsed: 0:00:19.
  Batch 80  of  2500.    Elapsed: 0:00:38.
  Batch 120  of  2500.    Elapsed: 0:00:56.
  Batch 160  of  2500.    Elapsed: 0:01:15.
  Batch 200  of  2500.    Elapsed: 0:01:33.
  Batch 240  of  2500.    Elapsed: 0:01:52.
  Batch 280  of  2500.    Elapsed: 0:02:10.
  Batch 320  of  2500.    Elapsed: 0:02:29.
  Batch 360  of  2500.    Elapsed: 0:02:47.
  Batch 400  of  2500.    Elapsed: 0:03:06.
  Batch 440  of  2500.    Elapsed: 0:03:24.
  Batch 480  of  2500.    Elapsed: 0:03:43.
  Batch 520  of  2500.    Elapsed: 0:04:01.
  Batch 560  of  2500.    Elapsed: 0:04:20.
  Batch 600  of  2500.    Elapsed: 0:04:38.
  Batch 640  of  2500.    Elapsed: 0:04:57.
  Batch 680  of  2500.    Elapsed: 0:05:15.
  Batch 720  of  2500.    Elapsed: 0:05:34.
  Batch 760  of  2500.    Elapsed: 0:05:52.
  Batch 800  of  2500.    Elapsed: 0:06:11.
  Batch 840  of  2500.    Elapsed: 0:06:29.
  Batch 880  of  2500.    Elapsed: 0:06:48.
  Batch 920  of  2500

In [16]:
print("\nRunning Test Set Evaluation...")

model.eval()
total_test_accuracy = 0
total_test_loss = 0

for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

    loss = outputs.loss
    logits = outputs.logits

    total_test_loss += loss.item()

    # Move logits and labels to CPU to calculate accuracy
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate accuracy
    preds = np.argmax(logits, axis=1)
    total_test_accuracy += np.sum(preds == label_ids) / len(label_ids)

avg_test_accuracy = total_test_accuracy / len(test_dataloader)
avg_test_loss = total_test_loss / len(test_dataloader)

print(f"Test Loss: {avg_test_loss}")
print(f"Test Accuracy: {avg_test_accuracy:.2f}")


Running Test Set Evaluation...
Test Loss: 0.2392246308466851
Test Accuracy: 0.95
