# Cloud Resources for NLP

## Sentiment Analysis on IMDB Movie reviews using BERT

*Lorna Aine, Fall 2023*


In [1]:
#Load all necessary libraries

!pip install -r requirements.txt
import torch
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

## Load dataset

In [2]:
# Read the train/test splits from CSV files in the working directory;
# later cells use their "text" and "label" columns.
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in ("train_imdb.csv", "test_imdb.csv")
)

# Tokenizer loaded from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# function to tokenize each example
def tokenize_example(example):
    """Encode one text with the module-level ``tokenizer``.

    The call enables truncation, pads with ``padding="max_length"`` and asks
    for PyTorch tensors (``return_tensors="pt"``). The tokenizer's output is
    returned unchanged.
    """
    return tokenizer(
        example,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

def _stack_input_ids(encodings):
    """Concatenate the ``input_ids`` tensor of every encoding along dim 0."""
    return torch.cat([encoding["input_ids"] for encoding in encodings], dim=0)


# Tokenize every text; each element is the tokenizer output for one example.
tokenized_train_texts = train_set["text"].apply(tokenize_example)
tokenized_test_texts = test_set["text"].apply(tokenize_example)

# Label tensors built from the "label" column of each split.
train_labels = torch.tensor(train_set["label"].values)
test_labels = torch.tensor(test_set["label"].values)

# Token-id tensors. Only "input_ids" is kept from each tokenizer output.
train_inputs = _stack_input_ids(tokenized_train_texts)
test_inputs = _stack_input_ids(tokenized_test_texts)

# Pair inputs with labels.
train_dataset = TensorDataset(train_inputs, train_labels)
test_dataset = TensorDataset(test_inputs, test_labels)

# batch size 32, 64, etc.
batch_size = 32

# Only the training loader shuffles its data.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Number of distinct label values in the training split
# (dropna=False keeps the count equal to len(unique())).
num_labels = train_set["label"].nunique(dropna=False)

## Checking device: probably the most important step 

### Best practices
-   Checking which GPU is available
-   Monitoring utilization

In [3]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
# cpu or gpu, so how do we get gpu?


cpu


## Load Model

In [None]:
# Load the pre-trained "bert-base-uncased" checkpoint as a sequence classifier
# configured for `num_labels` output classes.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

# Attention: move the model onto the selected device before using it.
model.to(device)

## Training process

In [5]:
# Define the optimizer and loss function.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# weight_decay=0.0 keeps the deprecated class's default (torch's default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
loss_fn = nn.CrossEntropyLoss()

# Start training.
# num_epochs can be 3, 10, 15, ...
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}", leave=False):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        # The inputs were padded to a fixed length, so mark the [PAD]
        # positions; without a mask the model attends to padding as if it
        # were real text.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss to report an epoch average below.
        train_loss += loss.item()

    train_loss /= len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {train_loss:.4f}")

                                              

KeyboardInterrupt: 

## Evaluation

In [None]:
# Evaluate on the test loader: average loss per batch and overall accuracy.
model.eval()
val_loss = 0.0
correct_preds = 0
total_preds = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Validation", leave=False):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        # The inputs were padded to a fixed length, so mark the [PAD]
        # positions; without a mask the model attends to padding as if it
        # were real text.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        val_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)
        total_preds += labels.size(0)
        correct_preds += (predicted == labels).sum().item()

val_loss /= len(test_loader)
val_accuracy = correct_preds / total_preds * 100

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.2f}%")