## 4. Transformer model 1: RoBERTa

**context** : RoBERTa

In [18]:
from google.colab import files
import io
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')



df = pd.read_csv('/content/drive/MyDrive/w266_final_project/source/lyrics_df_cleaned.csv')
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,track_name,track_artist,valence,lyrics_snippet,track_popularity,track_album_id,track_album_name,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,tempo,duration_ms
0,0,Dance Monkey,Tones and I,0.513,"They say, ""Oh my god, I see the way you shine ...",100,0UywfDKYlyiu1b38DRrzYD,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,0.824,0.588,6,-6.4,0,0.0924,0.692,0.000104,0.149,98.027,209438
1,32,ROXANNE,Arizona Zervas,0.457,"All for the 'Gram Bitches love the 'Gram Oh, w...",99,6HJDrXs0hpebaRFKA1sF90,ROXANNE,2019-10-10,0.621,0.601,6,-5.616,0,0.148,0.0522,0.0,0.46,116.735,163636
2,1056,The Box,Roddy Ricch,0.642,Pullin' out the coupe at the lot Told 'em fuck...,98,52u4anZbHd6UInnmHRFzba,Please Excuse Me For Being Antisocial,2019-12-06,0.896,0.586,10,-6.687,0,0.0559,0.104,0.0,0.79,116.971,196653
3,33824,Blinding Lights,The Weeknd,0.345,Yeah I've been tryna call I've been on my own...,98,2ZfHkwHuoAZrlz7RMj0PDz,Blinding Lights,2019-11-29,0.513,0.796,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,171.017,201573
4,66592,Memories,Maroon 5,0.575,Here's to the ones that we got Cheers to the w...,98,3nR9B40hYLKLcR0Eph3Goc,Memories,2019-09-20,0.764,0.32,11,-7.209,1,0.0546,0.837,0.0,0.0822,91.019,189486


**step 1**: import necessary libraries and ID audio features


In [19]:
from transformers import RobertaTokenizer

y = df['valence']

audio_features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'tempo']

X_audio = df[audio_features]

print(f"Shape of X_audio: {X_audio.shape}")
print(f"Shape of y: {y.shape}")

Shape of X_audio: (3717, 10)
Shape of y: (3717,)


**step 2:**: tokenize lyrics

In [20]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Replace NaN with an empty string or suitable placeholder before tokenization
lyrics_series = df['lyrics_snippet'].fillna('')

X_lyrics_tokenized = tokenizer(
    lyrics_series.tolist(),
    padding=True,
    truncation=True,
    max_length=512, # RoBERTa's max sequence length
    return_tensors='pt' # Return PyTorch tensors
)

print(f"Keys in X_lyrics_tokenized: {X_lyrics_tokenized.keys()}")
print(f"Shape of input_ids: {X_lyrics_tokenized['input_ids'].shape}")
print(f"Shape of attention_mask: {X_lyrics_tokenized['attention_mask'].shape}")

Keys in X_lyrics_tokenized: KeysView({'input_ids': tensor([[    0,  1213,   224,  ...,     1,     1,     1],
        [    0,  3684,    13,  ...,     1,     1,     1],
        [    0, 45233,   179,  ...,     1,     1,     1],
        ...,
        [    0,  1106,    38,  ...,     1,     1,     1],
        [    0,  3762,   183,  ...,     1,     1,     1],
        [    0,  6766,    75,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})
Shape of input_ids: torch.Size([3717, 512])
Shape of attention_mask: torch.Size([3717, 512])


**step 3**: train-test split

In [22]:
import torch
from sklearn.model_selection import train_test_split

# convert y (valence) Series to a PyTorch tensor
y_tensor = torch.tensor(y.values, dtype=torch.float32)

# first split: 70% train, 30% temp (for validation and test)
X_audio_train, X_audio_temp, X_lyrics_input_ids_train, X_lyrics_input_ids_temp, X_lyrics_attention_mask_train, X_lyrics_attention_mask_temp, y_train, y_temp = train_test_split(
    X_audio, X_lyrics_tokenized['input_ids'], X_lyrics_tokenized['attention_mask'], y_tensor,
    test_size=0.3, random_state=42
)

# second split: divide temp into 50% validation and 50% test
X_audio_val, X_audio_test, X_lyrics_input_ids_val, X_lyrics_input_ids_test, X_lyrics_attention_mask_val, X_lyrics_attention_mask_test, y_val, y_test = train_test_split(
    X_audio_temp, X_lyrics_input_ids_temp, X_lyrics_attention_mask_temp, y_temp,
    test_size=0.5, random_state=42 # 0.5 of 0.3 is 0.15
)

print("Shape of training sets:")
print(f"  X_audio_train: {X_audio_train.shape}")
print(f"  X_lyrics_input_ids_train: {X_lyrics_input_ids_train.shape}")
print(f"  X_lyrics_attention_mask_train: {X_lyrics_attention_mask_train.shape}")
print(f"  y_train: {y_train.shape}")

print("\nShape of validation sets:")
print(f"  X_audio_val: {X_audio_val.shape}")
print(f"  X_lyrics_input_ids_val: {X_lyrics_input_ids_val.shape}")
print(f"  X_lyrics_attention_mask_val: {X_lyrics_attention_mask_val.shape}")
print(f"  y_val: {y_val.shape}")

print("\nShape of test sets:")
print(f"  X_audio_test: {X_audio_test.shape}")
print(f"  X_lyrics_input_ids_test: {X_lyrics_input_ids_test.shape}")
print(f"  X_lyrics_attention_mask_test: {X_lyrics_attention_mask_test.shape}")
print(f"  y_test: {y_test.shape}")

Shape of training sets:
  X_audio_train: (2601, 10)
  X_lyrics_input_ids_train: torch.Size([2601, 512])
  X_lyrics_attention_mask_train: torch.Size([2601, 512])
  y_train: torch.Size([2601])

Shape of validation sets:
  X_audio_val: (558, 10)
  X_lyrics_input_ids_val: torch.Size([558, 512])
  X_lyrics_attention_mask_val: torch.Size([558, 512])
  y_val: torch.Size([558])

Shape of test sets:
  X_audio_test: (558, 10)
  X_lyrics_input_ids_test: torch.Size([558, 512])
  X_lyrics_attention_mask_test: torch.Size([558, 512])
  y_test: torch.Size([558])


**step 4**: add classification head to enable prediction of a (0,1) variable, instead of a class label



In [40]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class RobertaRegressionModel(nn.Module):
    def __init__(self, roberta_model_name='roberta-base', num_labels=1):
        super(RobertaRegressionModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name, add_pooling_layer=False)

        self.regressor = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, self.roberta.config.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.roberta.config.hidden_size, num_labels) # Output a single valence score
        )

    def forward(self, input_ids, attention_mask):
        # pull the last hidden state from roberta
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # generate the representation for the [CLS] token (first token)
        cls_token_representation = last_hidden_state[:, 0, :]

        # pass the cls token representation through the regression head
        valence_prediction = self.regressor(cls_token_representation)

        return valence_prediction


**step 6**: create an array of input_ids (tracking word embeddings), attention (to help place emphasis on appropriate phrase) and labels (valence)


In [41]:
import torch
from torch.utils.data import Dataset, DataLoader

# define a custom dataset class to allow for storage of token ids, attention values, and labels/outcome variables
class LyricsDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# create instances
batch_size = 32

# training Dataset and DataLoader
train_dataset = LyricsDataset(
    X_lyrics_input_ids_train,
    X_lyrics_attention_mask_train,
    y_train
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation Dataset and DataLoader
val_dataset = LyricsDataset(
    X_lyrics_input_ids_val,
    X_lyrics_attention_mask_val,
    y_val
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Test Dataset and DataLoader
test_dataset = LyricsDataset(
    X_lyrics_input_ids_test,
    X_lyrics_attention_mask_test,
    y_test
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"# training batches: {len(train_dataloader)}")
print(f"# validation batches: {len(val_dataloader)}")
print(f"# of test batches: {len(test_dataloader)}")

# training batches: 82
# validation batches: 18
# of test batches: 18


**step 7**: device selection & model initialization

*to ensure that our compute resources can quickly train a transformer-based model, it's critical that we're using a GPU. running this regression on a CPU would take significantly longer.*




In [48]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# initialize the model
model = RobertaRegressionModel(num_labels=1)# for regression
model.to(device)

# set optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

Using device: cuda


**step 8**: train and evaluate lyrics-only model

In [None]:
import numpy as np

num_epochs = 3

print("initiating training loop (lyrics-only)...")

for epoch in range(num_epochs):
    model.train()

    total_train_loss = 0
    for batch_idx, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        predictions = model(input_ids, attention_mask).squeeze(1) # Squeeze to match labels shape
        loss = loss_fn(predictions, labels)

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation phase
    model.eval() # Set model to evaluation mode
    total_val_loss = 0
    with torch.no_grad(): # Disable gradient calculations during validation
        for batch_idx, batch in enumerate(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            predictions = model(input_ids, attention_mask).squeeze(1)
            loss = loss_fn(predictions, labels)

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

print("Training complete.")

initiating training loop (lyrics-only)...


In [27]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print("Starting evaluation on the test set...")

model.eval()

predictions_list = []
labels_list = []

with torch.no_grad():
    for batch_idx, batch in enumerate(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        predictions = model(input_ids, attention_mask).squeeze(1)

        predictions_list.extend(predictions.cpu().numpy())
        labels_list.extend(labels.cpu().numpy())

# Convert lists to numpy arrays
predictions_np = np.array(predictions_list)
labels_np = np.array(labels_list)

# Calculate evaluation metrics
r2 = r2_score(labels_np, predictions_np)
mse = mean_squared_error(labels_np, predictions_np)
rmse = np.sqrt(mse)
mae = mean_absolute_error(labels_np, predictions_np)

print(f"\nTest Set Evaluation:")
print(f"  R2 Score: {r2:.4f}")
print(f"  Mean Squared Error (MSE): {mse:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"  Mean Absolute Error (MAE): {mae:.4f}")

print("Evaluation complete.")

Starting evaluation on the test set...

Test Set Evaluation:
  R2 Score: 0.0571
  Mean Squared Error (MSE): 0.0418
  Root Mean Squared Error (RMSE): 0.2044
  Mean Absolute Error (MAE): 0.1666
Evaluation complete.


**step 9**: train and evaluate audio only model


In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print("### Audio Features Only Model Training and Evaluation ###")

# 1. Initialize StandardScaler
scaler = StandardScaler()

# 2. Fit the scaler on X_audio_train and then transform all audio feature sets
X_audio_train_scaled = scaler.fit_transform(X_audio_train)
X_audio_val_scaled = scaler.transform(X_audio_val)
X_audio_test_scaled = scaler.transform(X_audio_test)

print("Audio features scaled successfully.")

# 3. Convert y_train, y_val, and y_test (PyTorch tensors) into NumPy arrays
y_train_np = y_train.cpu().numpy()
y_val_np = y_val.cpu().numpy()
y_test_np = y_test.cpu().numpy()

print("Target variables converted to NumPy arrays.")

# 4. Initialize a LinearRegression model
linear_model = LinearRegression()

# 5. Train the LinearRegression model
linear_model.fit(X_audio_train_scaled, y_train_np)

print("Linear Regression model trained successfully.")

# 6. Make predictions on the scaled X_audio_test
predictions_audio_only = linear_model.predict(X_audio_test_scaled)

# 7. Calculate and print evaluation metrics
r2_audio_only = r2_score(y_test_np, predictions_audio_only)
mse_audio_only = mean_squared_error(y_test_np, predictions_audio_only)
rmse_audio_only = np.sqrt(mse_audio_only)
mae_audio_only = mean_absolute_error(y_test_np, predictions_audio_only)

print(f"\nTest Set Evaluation (Audio Features Only Model):")
print(f"  R2 Score: {r2_audio_only:.4f}")
print(f"  Mean Squared Error (MSE): {mse_audio_only:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_audio_only:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_audio_only:.4f}")

print("Audio Features Only Model evaluation complete.")

### Audio Features Only Model Training and Evaluation ###
Audio features scaled successfully.
Target variables converted to NumPy arrays.
Linear Regression model trained successfully.

Test Set Evaluation (Audio Features Only Model):
  R2 Score: 0.2187
  Mean Squared Error (MSE): 0.0346
  Root Mean Squared Error (RMSE): 0.1860
  Mean Absolute Error (MAE): 0.1487
Audio Features Only Model evaluation complete.


**step 10:** as we did for the lyrics-only model, we need to create a new class to store our features of interest for the combined model. in this case, we store audio features, lyrics information, and labels

In [29]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class CombinedRegressionModel(nn.Module):
    def __init__(self, roberta_model_name='roberta-base', num_audio_features=10, num_labels=1):
        super(CombinedRegressionModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name, add_pooling_layer=False)

        # RoBERTa-base hidden size is 768
        roberta_output_size = self.roberta.config.hidden_size
        combined_feature_size = roberta_output_size + num_audio_features

        self.regressor = nn.Sequential(
            nn.Linear(combined_feature_size, combined_feature_size),
            nn.ReLU(),
            nn.Dropout(0.1), # Dropout for regularization
            nn.Linear(combined_feature_size, num_labels) # Output a single valence score
        )

    def forward(self, input_ids, attention_mask, audio_features):
        # Get the last hidden state from RoBERTa
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Extract the representation for the [CLS] token (first token)
        cls_token_representation = last_hidden_state[:, 0, :]

        # Concatenate RoBERTa [CLS] token representation with audio features
        combined_features = torch.cat((cls_token_representation, audio_features), dim=1)

        # Pass the combined features through the regression head
        valence_prediction = self.regressor(combined_features)

        return valence_prediction

print("CombinedRegressionModel class defined successfully.")

CombinedRegressionModel class defined successfully.


In [30]:
import torch
from torch.utils.data import Dataset, DataLoader

class CombinedDataset(Dataset):
    def __init__(self, input_ids, attention_mask, audio_features_scaled, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        # Convert numpy array to torch tensor if it's not already
        if isinstance(audio_features_scaled, np.ndarray):
            self.audio_features_scaled = torch.tensor(audio_features_scaled, dtype=torch.float32)
        else:
            self.audio_features_scaled = audio_features_scaled
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'audio_features': self.audio_features_scaled[idx],
            'labels': self.labels[idx]
        }

# 3. Create DataLoader instances for the training, validation, and test sets
batch_size = 16 # Keep the same batch size as the lyrics-only model

# Training Dataset and DataLoader
train_combined_dataset = CombinedDataset(
    X_lyrics_input_ids_train,
    X_lyrics_attention_mask_train,
    X_audio_train_scaled, # This is already a numpy array
    y_train
)
train_combined_dataloader = DataLoader(train_combined_dataset, batch_size=batch_size, shuffle=True)

# Validation Dataset and DataLoader
val_combined_dataset = CombinedDataset(
    X_lyrics_input_ids_val,
    X_lyrics_attention_mask_val,
    X_audio_val_scaled, # This is already a numpy array
    y_val
)
val_combined_dataloader = DataLoader(val_combined_dataset, batch_size=batch_size, shuffle=False)

# Test Dataset and DataLoader
test_combined_dataset = CombinedDataset(
    X_lyrics_input_ids_test,
    X_lyrics_attention_mask_test,
    X_audio_test_scaled, # This is already a numpy array
    y_test
)
test_combined_dataloader = DataLoader(test_combined_dataset, batch_size=batch_size, shuffle=False)

print(f"CombinedDataset and DataLoader instances created successfully.")
print(f"Number of training batches (combined): {len(train_combined_dataloader)}")
print(f"Number of validation batches (combined): {len(val_combined_dataloader)}")
print(f"Number of test batches (combined): {len(test_combined_dataloader)}")

CombinedDataset and DataLoader instances created successfully.
Number of training batches (combined): 163
Number of validation batches (combined): 35
Number of test batches (combined): 35


In [31]:
import torch.optim as optim

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize the CombinedRegressionModel
# num_audio_features should match the number of features in X_audio (10)
model_combined = CombinedRegressionModel(num_audio_features=X_audio_train_scaled.shape[1], num_labels=1)
model_combined.to(device)

# Define optimizer and loss function for the combined model
optimizer_combined = optim.AdamW(model_combined.parameters(), lr=2e-5)
loss_fn_combined = nn.MSELoss()

print("Combined Model, optimizer, and loss function initialized successfully.")

Using device: cuda
Combined Model, optimizer, and loss function initialized successfully.


**Step 11:** train and evaluate combined model

In [32]:
import numpy as np

# Training loop parameters
num_epochs_combined = 3
print("initiating combined model training loop...")

for epoch in range(num_epochs_combined):
    model_combined.train() # Set model to training mode
    total_train_loss_combined = 0
    for batch_idx, batch in enumerate(train_combined_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_features = batch['audio_features'].to(device)
        labels = batch['labels'].to(device)

        optimizer_combined.zero_grad()

        predictions = model_combined(input_ids, attention_mask, audio_features).squeeze(1) # Squeeze to match labels shape
        loss = loss_fn_combined(predictions, labels)

        loss.backward()
        optimizer_combined.step()

        total_train_loss_combined += loss.item()

    avg_train_loss_combined = total_train_loss_combined / len(train_combined_dataloader)

    # Validation phase for combined model
    model_combined.eval()
    total_val_loss_combined = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_combined_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            audio_features = batch['audio_features'].to(device)
            labels = batch['labels'].to(device)

            predictions = model_combined(input_ids, attention_mask, audio_features).squeeze(1)
            loss = loss_fn_combined(predictions, labels)

            total_val_loss_combined += loss.item()

    avg_val_loss_combined = total_val_loss_combined / len(val_combined_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs_combined}, Combined Train Loss: {avg_train_loss_combined:.4f}, Combined Val Loss: {avg_val_loss_combined:.4f}")

print("Combined model training complete.")

Starting combined model training loop...
Epoch 1/3, Combined Train Loss: 0.0623, Combined Val Loss: 0.0426
Epoch 2/3, Combined Train Loss: 0.0493, Combined Val Loss: 0.0404
Epoch 3/3, Combined Train Loss: 0.0413, Combined Val Loss: 0.0458
Combined model training complete.


In [33]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print("Starting evaluation of the combined model on the test set...")

model_combined.eval()

predictions_combined_list = []
labels_combined_list = []

with torch.no_grad(): # Disable gradient calculations during testing
    for batch_idx, batch in enumerate(test_combined_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_features = batch['audio_features'].to(device)
        labels = batch['labels'].to(device)

        predictions = model_combined(input_ids, attention_mask, audio_features).squeeze(1)

        predictions_combined_list.extend(predictions.cpu().numpy())
        labels_combined_list.extend(labels.cpu().numpy())

# Convert lists to numpy arrays
predictions_combined_np = np.array(predictions_combined_list)
labels_combined_np = np.array(labels_combined_list)

# Calculate evaluation metrics for the combined model
r2_combined = r2_score(labels_combined_np, predictions_combined_np)
mse_combined = mean_squared_error(labels_combined_np, predictions_combined_np)
rmse_combined = np.sqrt(mse_combined)
mae_combined = mean_absolute_error(labels_combined_np, predictions_combined_np)

print(f"\nTest Set Evaluation (Combined Model):")
print(f"  R2 Score: {r2_combined:.4f}")
print(f"  Mean Squared Error (MSE): {mse_combined:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_combined:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_combined:.4f}")

print("Combined model evaluation complete.")

Starting evaluation of the combined model on the test set...

Test Set Evaluation (Combined Model):
  R2 Score: 0.1335
  Mean Squared Error (MSE): 0.0384
  Root Mean Squared Error (RMSE): 0.1959
  Mean Absolute Error (MAE): 0.1569
Combined model evaluation complete.


**step 12**: model comparison

In [39]:
import pandas as pd

#pull the R2, RMSE, and MAE scores from each model
metrics = {
    "Lyrics Only Model": {
        "R2 Score": r2,
        "Mean Squared Error (MSE)": mse,
        "Root Mean Squared Error (RMSE)": rmse,
        "Mean Absolute Error (MAE)": mae
    },
    "Audio Features Only Model": {
        "R2 Score": r2_audio_only,
        "Mean Squared Error (MSE)": mse_audio_only,
        "Root Mean Squared Error (RMSE)": rmse_audio_only,
        "Mean Absolute Error (MAE)": mae_audio_only
    },
    "Combined Model": {
        "R2 Score": r2_combined,
        "Mean Squared Error (MSE)": mse_combined,
        "Root Mean Squared Error (RMSE)": rmse_combined,
        "Mean Absolute Error (MAE)": mae_combined
    },
    "Fine-tuned RoBERTa Model": {
        "R2 Score": r2_ft,
        "Mean Squared Error (MSE)": mse_ft,
        "Root Mean Squared Error (RMSE)": rmse_ft,
        "Mean Absolute Error (MAE)": mae_ft
    }
}

# create & print a df
comparison_df = pd.DataFrame.from_dict(metrics, orient='index')
print("\n### Model Performance Comparison ###")
print(comparison_df.round(4))



### Model Performance Comparison ###
                           R2 Score  Mean Squared Error (MSE)  \
Lyrics Only Model            0.0571                    0.0418   
Audio Features Only Model    0.2187                    0.0346   
Combined Model               0.1335                    0.0384   
Fine-tuned RoBERTa Model     0.0979                    0.0400   

                           Root Mean Squared Error (RMSE)  \
Lyrics Only Model                                  0.2044   
Audio Features Only Model                          0.1860   
Combined Model                                     0.1959   
Fine-tuned RoBERTa Model                           0.1999   

                           Mean Absolute Error (MAE)  
Lyrics Only Model                             0.1666  
Audio Features Only Model                     0.1487  
Combined Model                                0.1569  
Fine-tuned RoBERTa Model                      0.1634  


# Fine tuning RoBERTa

In [36]:
import torch.optim as optim
from transformers import RobertaForSequenceClassification
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize RobertaForSequenceClassification for regression
fine_tune_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)
fine_tune_model.to(device)
print("RobertaForSequenceClassification model initialized for fine-tuning.")

optimizer_ft = optim.AdamW(fine_tune_model.parameters(), lr=2e-5)
loss_fn_ft = nn.MSELoss()

num_epochs_ft = 3

print("Starting fine-tuning training loop...")

for epoch in range(num_epochs_ft):
    fine_tune_model.train()
    total_train_loss_ft = 0
    for batch_idx, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_ft.zero_grad()


        outputs = fine_tune_model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.squeeze(1)
        loss = loss_fn_ft(predictions, labels)

        loss.backward()
        optimizer_ft.step()

        total_train_loss_ft += loss.item()

    avg_train_loss_ft = total_train_loss_ft / len(train_dataloader)

    # Validation phase for fine-tuned model
    fine_tune_model.eval()
    total_val_loss_ft = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = fine_tune_model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.squeeze(1)
            loss = loss_fn_ft(predictions, labels)

            total_val_loss_ft += loss.item()

    avg_val_loss_ft = total_val_loss_ft / len(val_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs_ft}, Fine-tune Train Loss: {avg_train_loss_ft:.4f}, Fine-tune Val Loss: {avg_val_loss_ft:.4f}")

print("Fine-tuning complete.")

Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification model initialized for fine-tuning.
Starting fine-tuning training loop...
Epoch 1/3, Fine-tune Train Loss: 0.0687, Fine-tune Val Loss: 0.0536
Epoch 2/3, Fine-tune Train Loss: 0.0566, Fine-tune Val Loss: 0.0477
Epoch 3/3, Fine-tune Train Loss: 0.0530, Fine-tune Val Loss: 0.0447
Fine-tuning complete.


In [37]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print("Starting evaluation of the fine-tuned model on the test set...")

fine_tune_model.eval()

predictions_ft_list = []
labels_ft_list = []

with torch.no_grad():
    for batch_idx, batch in enumerate(test_dataloader): # Reusing test_dataloader from iitial lyrics-only evaluation
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = fine_tune_model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.squeeze(1)

        predictions_ft_list.extend(predictions.cpu().numpy())
        labels_ft_list.extend(labels.cpu().numpy())

# Convert lists to numpy arrays
predictions_ft_np = np.array(predictions_ft_list)
labels_ft_np = np.array(labels_ft_list)

# Calculate evaluation metrics for the fine-tuned model
r2_ft = r2_score(labels_ft_np, predictions_ft_np)
mse_ft = mean_squared_error(labels_ft_np, predictions_ft_np)
rmse_ft = np.sqrt(mse_ft)
mae_ft = mean_absolute_error(labels_ft_np, predictions_ft_np)

print(f"\nTest Set Evaluation (Fine-tuned RoBERTa Model):")
print(f"  R2 Score: {r2_ft:.4f}")
print(f"  Mean Squared Error (MSE): {mse_ft:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_ft:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_ft:.4f}")

print("Fine-tuned model evaluation complete.")

Starting evaluation of the fine-tuned model on the test set...

Test Set Evaluation (Fine-tuned RoBERTa Model):
  R2 Score: 0.0979
  Mean Squared Error (MSE): 0.0400
  Root Mean Squared Error (RMSE): 0.1999
  Mean Absolute Error (MAE): 0.1634
Fine-tuned model evaluation complete.
