# BERT Classifier - Fine-tuned and Pre-trained

Foundations of Data Science, La Sapienza University of Rome, Autumn 2024

Group 27

Team Members:
- Oskar Nesheim
- August Nyheim
- Magnus Ouren


## Imports

In [None]:
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from sklearn.metrics import accuracy_score


## Constants:

In [2]:
# Directory that holds the input JSON files.
DATA_LOCATION = "./data"
# Movie records; read into the main DataFrame.
DATA_FILE = "movies_balanced.json"
# Genre names; used to filter the movie records.
GENRES_FILE = "popular_genres.json"

### Load and process data

In [None]:
# Read the movie records into a DataFrame.
movies_path = f'{DATA_LOCATION}/{DATA_FILE}'
df = pd.read_json(movies_path)

# Read the genre file and collapse it into a flat 1-D NumPy array.
genres_path = f'{DATA_LOCATION}/{GENRES_FILE}'
genres = pd.read_json(genres_path).to_numpy().flatten()

# Preview the first rows (shown as the cell output).
df.head()


In [4]:

# Text normalisation applied to each overview
def preprocess_text(s):
    """Return *s* lowercased, without punctuation.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is removed first; the remainder is then lowercased.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', s)
    return without_punctuation.lower()


# Keep only the movies whose genre is in the loaded genre list. The explicit
# .copy() makes `df` an independent frame instead of a slice of the original,
# so the columns assigned to it afterwards (input_ids, attention_mask, labels)
# do not trigger pandas' SettingWithCopyWarning.
df = df[df['genre'].isin(genres)].copy()

# Normalise the overview text of the remaining rows only; rows that were just
# filtered out no longer need to be processed.
df['overview'] = df['overview'].apply(preprocess_text)




### Initialize BERT tokenizer

In [5]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(texts):
    """Tokenize *texts* with the module-level ``tokenizer``.

    Special tokens are added, every sequence is padded or truncated to
    120 tokens, and the encodings are returned as PyTorch tensors.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=120,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return tokenizer(texts, **tokenizer_options)


### Tokenize all descriptions
Also insert input_ids and attention_mask into the dataset.

In [6]:
# Tokenize every overview in a single call.
encoded_batch = tokenize_function(df['overview'].tolist())

# Store one tensor per row for each encoding, so a DataFrame row carries
# everything the model needs for that sample.
for column in ('input_ids', 'attention_mask'):
    df[column] = [row.squeeze() for row in encoded_batch[column]]

### Encode labels
Encodes the genre labels, such as 'Action' or 'Comedy', as integers.

In [7]:
# Fit the encoder on the genre column, then store the integer code of each
# row's genre in a new 'labels' column.
label_encoder = LabelEncoder().fit(df['genre'])
df['labels'] = label_encoder.transform(df['genre'])

### Split data into training, validation, and test sets
Here we are going for an 80 percent training, 10 percent validation and 10 percent testing split.

In [8]:
# Two-stage split: hold out 20 % of the rows first, then cut that hold-out in
# half to obtain the validation and test sets. Both stages share one seed.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=SPLIT_SEED)


### Create PyTorch dataset

In [9]:
class MovieDataset(Dataset):
    """Dataset backed by a DataFrame of already-tokenized movie overviews.

    Every row of ``entries`` must provide the columns ``input_ids``,
    ``attention_mask`` and ``labels``.
    """

    def __init__(self, entries):
        self.entries = entries

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        """Return the sample at position *idx* as a dict of three entries."""
        row = self.entries.iloc[idx]
        # The two encodings are passed through exactly as stored in the row.
        sample = {key: row[key] for key in ('input_ids', 'attention_mask')}
        # The label is wrapped in a long tensor.
        sample['labels'] = torch.tensor(row['labels'], dtype=torch.long)
        return sample


# Wrap each split in a MovieDataset.
train_dataset, val_dataset, test_dataset = (
    MovieDataset(split) for split in (train_df, val_df, test_df)
)

# One loader per split; only the training loader shuffles its samples.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


### Load BERT model for sequence classification

In [None]:
# BERT sequence classifier initialised from the 'bert-base-uncased'
# checkpoint, with one output label per encoded genre.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

lr = 5e-5  # 0.00005
optimizer = AdamW(model.parameters(), lr=lr)

# The linear schedule decays the learning rate to zero after
# `num_training_steps` optimizer steps, so it has to span every epoch of the
# training loop (`for epoch in range(5)`). It was previously sized for 3
# epochs, which left the last two epochs training with a learning rate of 0.
num_training_epochs = 5
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_training_epochs)

# Filled in by the training loop with the weights of the best epoch.
best_model = None
best_val_loss = float('inf')


In [None]:
# Second classifier loaded from the same checkpoint; no training code in this
# notebook touches it, so it serves as the baseline for the comparison.
num_genres = len(label_encoder.classes_)
use_cuda = torch.cuda.is_available()
naked_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_genres)
naked_model.to(torch.device("cuda") if use_cuda else torch.device("cpu"))

### Training and validation

In [None]:
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Lowest validation loss seen so far; decides which epoch's weights are kept.
best_val_loss = float('inf')

for epoch in range(5):
    # === Training Loop ===
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    current_lr = scheduler.get_last_lr()[0]  # Get current learning rate

    # === Validation Loop ===
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            batch = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**batch)

            # Extract predictions and true labels
            logits = outputs.logits

            # Get predicted class for each sample
            predictions = torch.argmax(logits, dim=-1)

            # Assuming labels are stored under 'labels' in the batch
            labels = batch['labels']

            # Store predictions and labels for F1 score calculation
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            loss = outputs.loss
            total_val_loss += loss.item()

    val_loss = total_val_loss / len(val_loader)

    # Calculate F1, precision, recall, and accuracy
    # 'weighted' accounts for class imbalance
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    accuracy = accuracy_score(all_labels, all_preds)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() hands back references to the live parameter tensors,
        # which later epochs keep updating in place. Clone them so that
        # `best_model` really holds the weights of *this* epoch. The snapshot
        # is kept on the CPU so it does not occupy accelerator memory.
        best_model = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    # Print out the metrics for the current epoch
    print(f"Epoch {epoch+1} Metrics:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    # Two adjacent literals instead of a line break inside a replacement
    # field, which is a SyntaxError on Python versions before 3.12.
    print(f"F1 Score: {f1:.4f} | Precision: {precision:.4f} | "
          f"Recall: {recall:.4f} | Accuracy: {accuracy:.4f}")
    print(f"Current Learning Rate: {current_lr:.6f}\n")


# Load best model for evaluation

In [None]:

# Restore the weights stored in `best_model` and switch to inference mode.
model.load_state_dict(best_model)
model.eval()

# Predicted and true class ids for the test set, in loader order.
predictions = []
actuals = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {name: tensor.to(model.device)
                 for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # The highest logit along the last dimension is the predicted class.
        predictions += logits.argmax(dim=-1).tolist()
        actuals += batch['labels'].tolist()


In [None]:
# Run the test set through `naked_model` in inference mode.
naked_model.eval()

# Predicted and true class ids for the test set, in loader order.
predictions_naked = []
actuals_naked = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {name: tensor.to(naked_model.device)
                 for name, tensor in batch.items()}
        outputs = naked_model(**batch)
        logits = outputs.logits
        # The highest logit along the last dimension is the predicted class.
        predictions_naked += logits.argmax(dim=-1).tolist()
        actuals_naked += batch['labels'].tolist()


# Calculate accuracy
Fine-tuned model:

In [None]:

# Fraction of test samples for which `predictions` matches `actuals`.
accuracy = accuracy_score(y_true=actuals, y_pred=predictions)
print(f'Accuracy: {accuracy:.4f}')  # four decimal places

Naked model

In [None]:
# Fraction of test samples for which `predictions_naked` matches
# `actuals_naked`.
accuracy_naked = accuracy_score(y_true=actuals_naked, y_pred=predictions_naked)
print(f'Accuracy: {accuracy_naked:.4f}')  # four decimal places


### Generate classification report and modify index to show genre names

In [None]:
# Build each report as a dict, load it into a DataFrame and transpose it so
# that every class / summary entry becomes a row and every metric a column.
report_dict = classification_report(actuals, predictions, output_dict=True)
report = pd.DataFrame(report_dict).T

report_naked_dict = classification_report(
    actuals_naked, predictions_naked, output_dict=True)
report_naked = pd.DataFrame(report_naked_dict).T

### Remove the 'accuracy' row if it's present

In [53]:

# report.drop(['accuracy'], inplace=True)  # TODO: why do we do this?
# report['support'] = report['support'].apply(int)


### Map numeric labels back to string names using LabelEncoder

In [54]:

# Rows whose index is a digit string are per-class rows: translate the encoded
# class id back to its genre name. All other rows (the non-numeric summary
# entries) keep their index unchanged, which also makes the cell safe to
# re-run.
report.index = [
    label_encoder.inverse_transform([int(idx)])[0] if idx.isdigit() else idx
    for idx in report.index
]
# Build the naked model's index from its *own* report. It was previously
# derived from `report.index`, which only works when both reports happen to
# contain exactly the same rows in the same order.
report_naked.index = [
    label_encoder.inverse_transform([int(idx)])[0] if idx.isdigit() else idx
    for idx in report_naked.index
]


### Visualization of the Classification Report

In [None]:
# Horizontal bar chart of precision, recall and F1 per report row, with the
# overall accuracy written above the axes.
metric_columns = ['precision', 'recall', 'f1-score']
fig, ax = plt.subplots(figsize=(8, 5))
report[metric_columns].plot(kind='barh', ax=ax)
ax.set_title('Classification Report')
ax.set_xlim([0, 1])
ax.text(0.45, 1.1, f'Accuracy: {accuracy:.2f}', transform=ax.transAxes)
plt.show()

In [None]:

# Horizontal bar chart of precision, recall and F1 per row of the naked
# model's report, with its overall accuracy written above the axes.
metric_columns = ['precision', 'recall', 'f1-score']
fig, ax = plt.subplots(figsize=(8, 5))
report_naked[metric_columns].plot(kind='barh', ax=ax)
ax.set_title('Classification Report')
ax.set_xlim([0, 1])
ax.text(0.45, 1.1, f'Accuracy: {accuracy_naked:.2f}', transform=ax.transAxes)
plt.show()


### Confusion Matrix with genre names

In [None]:

# Pin the matrix to every encoded class id. Without `labels`,
# confusion_matrix only covers ids that occur in the data, so a genre missing
# from both `actuals` and `predictions` would shrink the matrix and shift it
# against the tick labels below.
class_ids = list(range(len(label_encoder.classes_)))
conf_mat = confusion_matrix(actuals, predictions, labels=class_ids)
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# TODO: update this so that we get the same data as in the presentation.
text_report = classification_report(y_true=actuals, y_pred=predictions)
print(text_report)

In [None]:

# Pin the matrix to every encoded class id. Without `labels`,
# confusion_matrix only covers ids that occur in the data, so a genre missing
# from both `actuals_naked` and `predictions_naked` would shrink the matrix
# and shift it against the tick labels below.
class_ids = list(range(len(label_encoder.classes_)))
conf_mat = confusion_matrix(actuals_naked, predictions_naked, labels=class_ids)
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Plain-text classification report for the naked model's test predictions.
text_report_naked = classification_report(
    y_true=actuals_naked, y_pred=predictions_naked)
print(text_report_naked)