## 4. Transformer model 1: BERT

In [1]:
from google.colab import files
import io
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')



df = pd.read_csv('/content/drive/MyDrive/w266_final_project/source/lyrics_df_cleaned.csv')
df.head()

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,track_name,track_artist,valence,lyrics_snippet,track_popularity,track_album_id,track_album_name,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,tempo,duration_ms
0,0,Dance Monkey,Tones and I,0.513,"They say, ""Oh my god, I see the way you shine ...",100,0UywfDKYlyiu1b38DRrzYD,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,0.824,0.588,6,-6.4,0,0.0924,0.692,0.000104,0.149,98.027,209438
1,32,ROXANNE,Arizona Zervas,0.457,"All for the 'Gram Bitches love the 'Gram Oh, w...",99,6HJDrXs0hpebaRFKA1sF90,ROXANNE,2019-10-10,0.621,0.601,6,-5.616,0,0.148,0.0522,0.0,0.46,116.735,163636
2,1056,The Box,Roddy Ricch,0.642,Pullin' out the coupe at the lot Told 'em fuck...,98,52u4anZbHd6UInnmHRFzba,Please Excuse Me For Being Antisocial,2019-12-06,0.896,0.586,10,-6.687,0,0.0559,0.104,0.0,0.79,116.971,196653
3,33824,Blinding Lights,The Weeknd,0.345,Yeah I've been tryna call I've been on my own...,98,2ZfHkwHuoAZrlz7RMj0PDz,Blinding Lights,2019-11-29,0.513,0.796,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,171.017,201573
4,66592,Memories,Maroon 5,0.575,Here's to the ones that we got Cheers to the w...,98,3nR9B40hYLKLcR0Eph3Goc,Memories,2019-09-20,0.764,0.32,11,-7.209,1,0.0546,0.837,0.0,0.0822,91.019,189486


**step 1**: prepare & tokenize lyrics to generate embedding arrays


In [2]:
from transformers import BertTokenizer

# though the dataframe has been cleaned, we still replace NaN with an empty string or suitable placeholder before tokenization
lyrics_series = df['lyrics_snippet'].fillna('')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_lyrics_tokenized_bert = bert_tokenizer(
    lyrics_series.tolist(),
    padding=True,
    truncation=True,
    max_length=512, # BERT's max sequence length is typically 512
    return_tensors='pt' # Return PyTorch tensors
)

print(f"Keys in X_lyrics_tokenized_bert: {X_lyrics_tokenized_bert.keys()}")
print(f"Shape of input_ids: {X_lyrics_tokenized_bert['input_ids'].shape}")
print(f"Shape of attention_mask: {X_lyrics_tokenized_bert['attention_mask'].shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Keys in X_lyrics_tokenized_bert: KeysView({'input_ids': tensor([[ 101, 2027, 2360,  ...,    0,    0,    0],
        [ 101, 2035, 2005,  ...,    0,    0,    0],
        [ 101, 4139, 2378,  ...,    0,    0,    0],
        ...,
        [ 101, 2065, 1045,  ...,    0,    0,    0],
        [ 101, 2028, 2154,  ...,    0,    0,    0],
        [ 101, 2123, 1005,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})
Shape of input_ids: torch.Size([3717, 512])
Shape of attention_mask: torch.Size([3717, 512])


**step 2**: isolate target variable, audio features of interest


In [3]:
y = df['valence']

audio_features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'tempo']

X_audio = df[audio_features]

print(f"Shape of X_audio: {X_audio.shape}")
print(f"Shape of y: {y.shape}")

Shape of X_audio: (3717, 10)
Shape of y: (3717,)


**step 3**: generate test & training sets by splitting data


In [4]:
import torch
from sklearn.model_selection import train_test_split

# convert target variabel to a tensor
y_tensor_bert = torch.tensor(y.values, dtype=torch.float32)

# first split: 70% train, 30% to validation and test
X_audio_train, X_audio_temp, \
X_lyrics_input_ids_train_bert, X_lyrics_input_ids_temp_bert, \
X_lyrics_attention_mask_train_bert, X_lyrics_attention_mask_temp_bert, \
y_train_bert, y_temp_bert = train_test_split(
    X_audio, X_lyrics_tokenized_bert['input_ids'], X_lyrics_tokenized_bert['attention_mask'], y_tensor_bert,
    test_size=0.3, random_state=42
)

# divide 30% into 50% validation and 50% test
X_audio_val, X_audio_test, \
X_lyrics_input_ids_val_bert, X_lyrics_input_ids_test_bert, \
X_lyrics_attention_mask_val_bert, X_lyrics_attention_mask_test_bert, \
y_val_bert, y_test_bert = train_test_split(
    X_audio_temp, X_lyrics_input_ids_temp_bert, X_lyrics_attention_mask_temp_bert, y_temp_bert,
    test_size=0.5, random_state=42 # 0.5 of 0.3 is 0.15
)

print("Shape of training sets (BERT):")
print(f"  X_audio_train: {X_audio_train.shape}")
print(f"  X_lyrics_input_ids_train_bert: {X_lyrics_input_ids_train_bert.shape}")
print(f"  X_lyrics_attention_mask_train_bert: {X_lyrics_attention_mask_train_bert.shape}")
print(f"  y_train_bert: {y_train_bert.shape}")

print("\nShape of validation sets (BERT):")
print(f"  X_audio_val: {X_audio_val.shape}")
print(f"  X_lyrics_input_ids_val_bert: {X_lyrics_input_ids_val_bert.shape}")
print(f"  X_lyrics_attention_mask_val_bert: {X_lyrics_attention_mask_val_bert.shape}")
print(f"  y_val_bert: {y_val_bert.shape}")

print("\nShape of test sets (BERT):")
print(f"  X_audio_test: {X_audio_test.shape}")
print(f"  X_lyrics_input_ids_test_bert: {X_lyrics_input_ids_test_bert.shape}")
print(f"  X_lyrics_attention_mask_test_bert: {X_lyrics_attention_mask_test_bert.shape}")
print(f"  y_test_bert: {y_test_bert.shape}")

Shape of training sets (BERT):
  X_audio_train: (2601, 10)
  X_lyrics_input_ids_train_bert: torch.Size([2601, 512])
  X_lyrics_attention_mask_train_bert: torch.Size([2601, 512])
  y_train_bert: torch.Size([2601])

Shape of validation sets (BERT):
  X_audio_val: (558, 10)
  X_lyrics_input_ids_val_bert: torch.Size([558, 512])
  X_lyrics_attention_mask_val_bert: torch.Size([558, 512])
  y_val_bert: torch.Size([558])

Shape of test sets (BERT):
  X_audio_test: (558, 10)
  X_lyrics_input_ids_test_bert: torch.Size([558, 512])
  X_lyrics_attention_mask_test_bert: torch.Size([558, 512])
  y_test_bert: torch.Size([558])


**step 4:** create an off the shelf bert regression model to run our first rev of the lyric analysis

In [5]:

import torch
import torch.nn as nn
from transformers import BertForSequenceClassification

class BertRegressionModel(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=1, freeze_bert=False):
        super(BertRegressionModel, self).__init__()
        self.bert_for_regression = BertForSequenceClassification.from_pretrained(
            bert_model_name, num_labels=num_labels
        )

        if freeze_bert:
            for param in self.bert_for_regression.bert.parameters():
                param.requires_grad = False
            print("BERT layers frozen - only training regression head")

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_for_regression(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits.view(-1)

**step 5:** dataset creation & instatiation

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

# create a custom Dataset for BERT lyrics-only analysis
class BertLyricsDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# populate the datasets
train_lyrics_dataset_bert = BertLyricsDataset(
    X_lyrics_input_ids_train_bert,
    X_lyrics_attention_mask_train_bert,
    y_train_bert
)
val_lyrics_dataset_bert = BertLyricsDataset(
    X_lyrics_input_ids_val_bert,
    X_lyrics_attention_mask_val_bert,
    y_val_bert
)
test_lyrics_dataset_bert = BertLyricsDataset(
    X_lyrics_input_ids_test_bert,
    X_lyrics_attention_mask_test_bert,
    y_test_bert
)


batch_size = 16

# create dataloaders to batch and shuffle songs
train_lyrics_dataloader_bert = DataLoader(train_lyrics_dataset_bert, batch_size=batch_size, shuffle=True)
val_lyrics_dataloader_bert = DataLoader(val_lyrics_dataset_bert, batch_size=batch_size, shuffle=False)
test_lyrics_dataloader_bert = DataLoader(test_lyrics_dataset_bert, batch_size=batch_size, shuffle=False)


**step 6:** initial lyrics-only model: frozen/off-the-shelf BERT

In [7]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from tqdm.notebook import tqdm

#set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

#instantiate the model & select the optimizer, params
model = BertRegressionModel(bert_model_name='bert-base-uncased', num_labels=1, freeze_bert = True)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5) # Common learning rate for fine-tuning BERT
num_epochs = 3
criterion = nn.MSELoss()

# model evaluation function
def evaluate_model(model, dataloader, device):
    model.eval() # Set model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad(): # Disable gradient calculation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Corrected: Added the line to make predictions
            predictions = model(input_ids, attention_mask)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())


    r2 = r2_score(all_labels, all_predictions)
    mse = mean_squared_error(all_labels, all_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(all_labels, all_predictions)

    return r2, mse, rmse, mae

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_lyrics_dataloader_bert, desc=f"Epoch {epoch + 1}/{num_epochs} Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        predictions = model(input_ids, attention_mask)
        loss = criterion(predictions, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_lyrics_dataloader_bert)
    print(f"\nEpoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # validation
    val_r2, val_mse, val_rmse, val_mae = evaluate_model(model, val_lyrics_dataloader_bert, device)
    print(f"Validation - R2: {val_r2:.4f}, MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")

# test best run of the model on test set
print("\nEvaluating on Test Set...")
test_r2, test_mse, test_rmse, test_mae = evaluate_model(model, test_lyrics_dataloader_bert, device)
print(f"Test Set Results - R2: {test_r2:.4f}, MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}")

Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT layers frozen - only training regression head


Epoch 1/3 Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 1/3, Average Training Loss: 0.3379
Validation - R2: -0.5099, MSE: 0.0756, RMSE: 0.2749, MAE: 0.2227


Epoch 2/3 Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 2/3, Average Training Loss: 0.0780
Validation - R2: -0.2981, MSE: 0.0650, RMSE: 0.2549, MAE: 0.2092


Epoch 3/3 Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 3/3, Average Training Loss: 0.0731
Validation - R2: -0.2044, MSE: 0.0603, RMSE: 0.2455, MAE: 0.2026

Evaluating on Test Set...
Test Set Results - R2: -0.2344, MSE: 0.0547, RMSE: 0.2339, MAE: 0.1909


**step 7:** audio + lyric analysis model class creation

In [18]:
import torch
import torch.nn as nn
from transformers import BertModel

class CombinedBertRegressionModel(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_audio_features=10,
                 audio_feature_hidden_size=64, num_labels=1):
        super(CombinedBertRegressionModel, self).__init__()

        # Load and freeze BERT. once again, this initial run will just be an execution of off-the-shelf bert
        self.bert = BertModel.from_pretrained(bert_model_name, add_pooling_layer=False)
        for param in self.bert.parameters():
            param.requires_grad = False
        print("BERT layers frozen - only training audio processor and regression head")

        # add a processor for audio features
        self.audio_feature_processor = nn.Sequential(
            nn.Linear(num_audio_features, audio_feature_hidden_size), #create a layer to handle variable scales of audio features
            nn.ReLU(), ## enable nonlinear analysis
            nn.Dropout(0.1)
        )

        # apply regression head to top of model to enable prediction across output layers
        combined_input_size = self.bert.config.hidden_size + audio_feature_hidden_size
        self.regressor = nn.Sequential(
            nn.Linear(combined_input_size, combined_input_size // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(combined_input_size // 2, num_labels)
        )

    def forward(self, input_ids, attention_mask, audio_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token_representation = outputs.last_hidden_state[:, 0, :]

        processed_audio_features = self.audio_feature_processor(audio_features)
        combined_features = torch.cat((cls_token_representation, processed_audio_features), dim=1)

        return self.regressor(combined_features).view(-1)

print("CombinedBertRegressionModel class defined successfully.")

CombinedBertRegressionModel class defined successfully.


**step 8:** dataset, data loader creation

In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

# define a custom Dataset for the combined model
class CombinedBertDataset(Dataset):
    def __init__(self, input_ids, attention_mask, audio_features, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.audio_features = torch.tensor(audio_features.values, dtype=torch.float32) # Convert audio_features DataFrame to tensor
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'audio_features': self.audio_features[idx],
            'labels': self.labels[idx]
        }

# instantiate the datasets
train_combined_dataset_bert = CombinedBertDataset(
    X_lyrics_input_ids_train_bert,
    X_lyrics_attention_mask_train_bert,
    X_audio_train,
    y_train_bert
)
val_combined_dataset_bert = CombinedBertDataset(
    X_lyrics_input_ids_val_bert,
    X_lyrics_attention_mask_val_bert,
    X_audio_val,
    y_val_bert
)
test_combined_dataset_bert = CombinedBertDataset(
    X_lyrics_input_ids_test_bert,
    X_lyrics_attention_mask_test_bert,
    X_audio_test,
    y_test_bert
)



# DataLoaders
batch_size = 16
train_combined_dataloader_bert = DataLoader(train_combined_dataset_bert, batch_size=batch_size, shuffle=True)
val_combined_dataloader_bert = DataLoader(val_combined_dataset_bert, batch_size=batch_size, shuffle=False)
test_combined_dataloader_bert = DataLoader(test_combined_dataset_bert, batch_size=batch_size, shuffle=False)

print("Combined BERT + Audio Datasets and DataLoaders created successfully.")

Combined BERT + Audio Datasets and DataLoaders created successfully.


**step 9:** training and evaluation of combined model


In [20]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from tqdm.notebook import tqdm # for progress bar

#make sure we're using a GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

#instantiate combined model,se loss/epochs parameters
combined_model = CombinedBertRegressionModel(
    bert_model_name='bert-base-uncased',
    num_audio_features=10,
    audio_feature_hidden_size=64,
    num_labels=1
)
combined_model.to(device)

optimizer_combined = AdamW(combined_model.parameters(), lr=2e-5)
num_epochs = 3
criterion_combined = nn.MSELoss()

# write function to evaluate the combined model
def evaluate_combined_model(model, dataloader, device):
    model.eval() # Set model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad(): # Disable gradient calculation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            audio_features = batch['audio_features'].to(device)
            labels = batch['labels'].to(device)

            predictions = model(input_ids, attention_mask, audio_features)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    r2 = r2_score(all_labels, all_predictions)
    mse = mean_squared_error(all_labels, all_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(all_labels, all_predictions)

    return r2, mse, rmse, mae


print("Starting training for Combined BERT + Audio Model...")
for epoch in range(num_epochs):
    combined_model.train()
    total_loss = 0
    for batch in tqdm(train_combined_dataloader_bert, desc=f"Epoch {epoch + 1}/{num_epochs} Combined Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_features = batch['audio_features'].to(device)
        labels = batch['labels'].to(device)

        optimizer_combined.zero_grad() # Clear previous gradients

        predictions = combined_model(input_ids, attention_mask, audio_features)
        loss = criterion_combined(predictions, labels)

        loss.backward() #calculate loss
        optimizer_combined.step() # Update model parameters

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_combined_dataloader_bert)
    print(f"\nEpoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

    val_r2, val_mse, val_rmse, val_mae = evaluate_combined_model(combined_model, val_combined_dataloader_bert, device)
    print(f"Combined Validation - R2: {val_r2:.4f}, MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")

# final evaluation of the model on test set data
print("\nEvaluating Combined Model on Test Set...")
test_r2, test_mse, test_rmse, test_mae = evaluate_combined_model(combined_model, test_combined_dataloader_bert, device)
print(f"Combined Test Set Results - R2: {test_r2:.4f}, MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}")

Using device: cuda
BERT layers frozen - only training audio processor and regression head
Starting training for Combined BERT + Audio Model...


Epoch 1/3 Combined Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 1/3, Average Training Loss: 0.2555
Combined Validation - R2: -0.0533, MSE: 0.0527, RMSE: 0.2296, MAE: 0.1913


Epoch 2/3 Combined Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 2/3, Average Training Loss: 0.1921
Combined Validation - R2: -0.0593, MSE: 0.0530, RMSE: 0.2303, MAE: 0.1905


Epoch 3/3 Combined Training:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 3/3, Average Training Loss: 0.1566
Combined Validation - R2: -0.0840, MSE: 0.0543, RMSE: 0.2329, MAE: 0.1928

Evaluating Combined Model on Test Set...
Combined Test Set Results - R2: -0.1357, MSE: 0.0503, RMSE: 0.2243, MAE: 0.1824


**step 10:** audio-only model


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# first step: scale data to normalize for broadly different variances & scales among audio features
scaler = StandardScaler()

X_audio_train_scaled = scaler.fit_transform(X_audio_train)
X_audio_val_scaled = scaler.transform(X_audio_val)
X_audio_test_scaled = scaler.transform(X_audio_test)

# convert tensors to numpy arrays(sklearn models can only use arrays)
y_train_np = y_train_bert.cpu().numpy() # need to run .cpu as we're using a GPU to perform the transformer modeling
y_val_np = y_val_bert.cpu().numpy()
y_test_np = y_test_bert.cpu().numpy()

# initialize & train linreg model
linear_model = LinearRegression()
linear_model.fit(X_audio_train_scaled, y_train_np)


val_predictions_audio_only = linear_model.predict(X_audio_val_scaled)
test_predictions_audio_only = linear_model.predict(X_audio_test_scaled)

# calculate validation metrics
val_r2 = r2_score(y_val_np, val_predictions_audio_only)
val_mse = mean_squared_error(y_val_np, val_predictions_audio_only)
val_rmse = np.sqrt(val_mse)
val_mae = mean_absolute_error(y_val_np, val_predictions_audio_only)

print("Linear Regression (Audio-Only) - Validation Set Metrics:")
print(f"  R2: {val_r2:.4f}")
print(f"  MSE: {val_mse:.4f}")
print(f"  RMSE: {val_rmse:.4f}")
print(f"  MAE: {val_mae:.4f}")

# test model & print metrics
test_r2 = r2_score(y_test_np, test_predictions_audio_only)
test_mse = mean_squared_error(y_test_np, test_predictions_audio_only)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test_np, test_predictions_audio_only)

print("\nLinear Regression (Audio-Only) - Test Set Metrics:")
print(f"  R2: {test_r2:.4f}")
print(f"  MSE: {test_mse:.4f}")
print(f"  RMSE: {test_rmse:.4f}")
print(f"  MAE: {test_mae:.4f}")

Linear Regression (Audio-Only) - Validation Set Metrics:
  R2: 0.2233
  MSE: 0.0389
  RMSE: 0.1972
  MAE: 0.1616

Linear Regression (Audio-Only) - Test Set Metrics:
  R2: 0.2187
  MSE: 0.0346
  RMSE: 0.1860
  MAE: 0.1487


**step 11**: fine tune BERT model to optimize performance for the task


*   remove pooling layer
*   install new decoder and directly feed CLS results/strings to generate predicted valence




In [26]:
## using previously defined BERT class, we 'unfreeze' the layers to enable fine tuning

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# unfreeze
finetuned_model = BertRegressionModel(
    bert_model_name='bert-base-uncased',
    num_labels=1,
    freeze_bert=False
)
finetuned_model.to(device)

optimizer_ft = AdamW(finetuned_model.parameters(), lr=2e-5)
criterion_ft = nn.MSELoss()

print(f"\nTotal parameters: {sum(p.numel() for p in finetuned_model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in finetuned_model.parameters() if p.requires_grad):,}")

print('Fine-tuned model successfully created')



Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Total parameters: 109,483,009
Trainable parameters: 109,483,009
Fine-tuned model successfully created


**step 12:** training and testing of fine tuned model

In [36]:
## because we're fine tuning a much bigger model (110M parameters), adjustments need to be made to:
#    1. run this for more epochs
#    2. implement best model selection to identify the most performant weight mix, as overfitting is possible with the extended runtime
num_epochs = 10
best_val_r2 = -float('inf')
best_epoch = 0

print("=" * 60)
print("FINE-TUNING BERT MODEL")
print("=" * 60)


for epoch in range(num_epochs):
    finetuned_model.train()
    total_loss = 0

    for batch in tqdm(train_lyrics_dataloader_bert, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_ft.zero_grad()
        predictions = finetuned_model(input_ids, attention_mask) #make predictions
        loss = criterion_ft(predictions, labels)
        loss.backward()
        optimizer_ft.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_lyrics_dataloader_bert)

    val_r2, val_mse, val_rmse, val_mae = evaluate_model(
        finetuned_model, val_lyrics_dataloader_bert, device
    )

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val R2: {val_r2:.4f}, Val RMSE: {val_rmse:.4f}, Val MAE: {val_mae:.4f}")

    if val_r2 > best_val_r2:
        best_val_r2 = val_r2
        best_epoch = epoch + 1
        torch.save(finetuned_model.state_dict(), 'best_finetuned_bert.pt')
        print(f"#### New best model saved! ####")

print("\n" + "=" * 60)
print(f"training complete! Best model from epoch {best_epoch} with an R-squared value of: {best_val_r2:.4f})")
print("=" * 60)

# test of best model from training evaluation
finetuned_model.load_state_dict(torch.load('best_finetuned_bert.pt'))
test_r2_ft, test_mse_ft, test_rmse_ft, test_mae_ft = evaluate_model(
    finetuned_model, test_lyrics_dataloader_bert, device
)

print(f"\nFine-tuned BERT - Test Set Results:")
print(f"  R2: {test_r2_ft:.4f}")
print(f"  MSE: {test_mse_ft:.4f}")
print(f"  RMSE: {test_rmse_ft:.4f}")
print(f"  MAE: {test_mae_ft:.4f}")

FINE-TUNING BERT MODEL


Epoch 1/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 1/10
  Train Loss: 0.0041
  Val R2: -0.1151, Val RMSE: 0.2363, Val MAE: 0.1925
#### New best model saved! ####


Epoch 2/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 2/10
  Train Loss: 0.0036
  Val R2: -0.1110, Val RMSE: 0.2358, Val MAE: 0.1929
#### New best model saved! ####


Epoch 3/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 3/10
  Train Loss: 0.0033
  Val R2: -0.0643, Val RMSE: 0.2308, Val MAE: 0.1892
#### New best model saved! ####


Epoch 4/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 4/10
  Train Loss: 0.0028
  Val R2: -0.0845, Val RMSE: 0.2330, Val MAE: 0.1915


Epoch 5/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 5/10
  Train Loss: 0.0028
  Val R2: -0.0316, Val RMSE: 0.2272, Val MAE: 0.1865
#### New best model saved! ####


Epoch 6/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 6/10
  Train Loss: 0.0030
  Val R2: -0.0637, Val RMSE: 0.2307, Val MAE: 0.1893


Epoch 7/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 7/10
  Train Loss: 0.0027
  Val R2: -0.1532, Val RMSE: 0.2403, Val MAE: 0.1960


Epoch 8/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 8/10
  Train Loss: 0.0028
  Val R2: -0.0667, Val RMSE: 0.2311, Val MAE: 0.1904


Epoch 9/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 9/10
  Train Loss: 0.0027
  Val R2: -0.0329, Val RMSE: 0.2274, Val MAE: 0.1856


Epoch 10/10:   0%|          | 0/163 [00:00<?, ?it/s]


Epoch 10/10
  Train Loss: 0.0028
  Val R2: -0.0408, Val RMSE: 0.2283, Val MAE: 0.1868

training complete! Best model from epoch 5 with an R-squared value of: -0.0316)

Fine-tuned BERT - Test Set Results:
  R2: -0.0394
  MSE: 0.0460
  RMSE: 0.2146
  MAE: 0.1732
