### 3. Baseline Modeling: Multimodal valence prediction using Bag-of-Words and Audio features

In [1]:
import io

import pandas as pd
from google.colab import drive, files

# Mount Google Drive so the project folder is readable from this Colab runtime.
drive.mount('/content/drive')

# Location of the cleaned lyrics table on the mounted drive.
LYRICS_CSV_PATH = '/content/drive/MyDrive/w266_final_project/source/lyrics_df_cleaned.csv'

# Load the table into `df` (used by the cells below) and preview the first rows.
df = pd.read_csv(LYRICS_CSV_PATH)
df.head()

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,track_name,track_artist,valence,lyrics_snippet,track_popularity,track_album_id,track_album_name,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,tempo,duration_ms
0,0,Dance Monkey,Tones and I,0.513,"They say, ""Oh my god, I see the way you shine ...",100,0UywfDKYlyiu1b38DRrzYD,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,0.824,0.588,6,-6.4,0,0.0924,0.692,0.000104,0.149,98.027,209438
1,32,ROXANNE,Arizona Zervas,0.457,"All for the 'Gram Bitches love the 'Gram Oh, w...",99,6HJDrXs0hpebaRFKA1sF90,ROXANNE,2019-10-10,0.621,0.601,6,-5.616,0,0.148,0.0522,0.0,0.46,116.735,163636
2,1056,The Box,Roddy Ricch,0.642,Pullin' out the coupe at the lot Told 'em fuck...,98,52u4anZbHd6UInnmHRFzba,Please Excuse Me For Being Antisocial,2019-12-06,0.896,0.586,10,-6.687,0,0.0559,0.104,0.0,0.79,116.971,196653
3,33824,Blinding Lights,The Weeknd,0.345,Yeah I've been tryna call I've been on my own...,98,2ZfHkwHuoAZrlz7RMj0PDz,Blinding Lights,2019-11-29,0.513,0.796,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,171.017,201573
4,66592,Memories,Maroon 5,0.575,Here's to the ones that we got Cheers to the w...,98,3nR9B40hYLKLcR0Eph3Goc,Memories,2019-09-20,0.764,0.32,11,-7.209,1,0.0546,0.837,0.0,0.0822,91.019,189486


# Initial Model: Bag-of-Words

### data preparation

**step 1**: build Bag-of-Words count features from `lyrics_snippet` with scikit-learn's `CountVectorizer`, and select the audio feature columns


In [8]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-Words features: token counts built from the 'lyrics_snippet' column,
# one row per song and one column per vocabulary term.
# NOTE(review): the vocabulary is fitted on every row of `df`, i.e. before the
# train/test split performed in a later cell -- confirm this is intended.
vectorizer = CountVectorizer()
lyrics_features = vectorizer.fit_transform(df['lyrics_snippet'])

# Audio descriptor columns used as the second modality.
audio_feature_cols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
]
audio_features = df[audio_feature_cols]

# Sanity check: both feature sets must have the same number of rows.
print(f"Lyrics features shape: {lyrics_features.shape}")
print(f"Audio features shape: {audio_features.shape}")

Lyrics features shape: (3717, 18950)
Audio features shape: (3717, 7)


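**step 2**:
now that the lyrics are vectorized, split the lyrics features, the audio features, and the `valence` target into training (80%) and test (20%) sets with a single shared split, so that the lyrics-only, audio-only, and combined models are all trained and evaluated on the same songs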

In [10]:
# Split the three row-aligned inputs in a single call so that lyrics rows, audio
# rows and valence targets stay matched; 20% of the rows are held out for testing
# and the seed is fixed so the partition is reproducible.
(
    X_lyrics_train, X_lyrics_test,
    X_audio_train, X_audio_test,
    y_train, y_test,
) = train_test_split(
    lyrics_features,
    audio_features,
    df['valence'],
    test_size=0.2,
    random_state=42,
)

# Report the shape of every partition.
for _label, _part in (
    ("X_lyrics_train", X_lyrics_train),
    ("X_lyrics_test", X_lyrics_test),
    ("X_audio_train", X_audio_train),
    ("X_audio_test", X_audio_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_label} shape: {_part.shape}")

X_lyrics_train shape: (2973, 18950)
X_lyrics_test shape: (744, 18950)
X_audio_train shape: (2973, 7)
X_audio_test shape: (744, 7)
y_train shape: (2973,)
y_test shape: (744,)


### model training & evaluation: lyrics-only

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np # Import numpy for sqrt

# Lyrics-only baseline: ordinary least squares on the Bag-of-Words counts.
# NOTE(review): the training matrix has far more columns (18950) than rows
# (2973), so this unregularized fit is underdetermined and can fit the training
# targets almost exactly; expect poor held-out scores. A regularized linear
# model would be a fairer lyrics baseline -- left unchanged here so the
# reported numbers stay reproducible.
lyrics_model = LinearRegression()
lyrics_model.fit(X_lyrics_train, y_train)

# Predict valence for the held-out songs.
lyrics_predictions = lyrics_model.predict(X_lyrics_test)

# Held-out metrics, stored under model-specific names (matching the
# `*_audio` / `*_combined` variables of the other models) so that a later cell
# cannot silently overwrite or misread them.
r2_lyrics = r2_score(y_test, lyrics_predictions)
rmse_lyrics = np.sqrt(mean_squared_error(y_test, lyrics_predictions))
mae_lyrics = mean_absolute_error(y_test, lyrics_predictions)

# Generic aliases kept so any existing cell that reads `r2` / `rmse` / `mae`
# keeps working.
r2, rmse, mae = r2_lyrics, rmse_lyrics, mae_lyrics

# Print the metrics
print("Lyrics Only Model Performance:")
print(f"R2 Score: {r2_lyrics:.4f}")
print(f"RMSE: {rmse_lyrics:.4f}")
print(f"MAE: {mae_lyrics:.4f}")

Lyrics Only Model Performance:
R2 Score: -17.3491
RMSE: 0.9220
MAE: 0.6864


### model training & evaluation: audio features-only

In [14]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Audio-only baseline: ordinary least squares on the selected audio columns.
# `fit` returns the estimator itself, so construction and training are chained.
audio_model = LinearRegression().fit(X_audio_train, y_train)

# Predict valence for the held-out songs.
audio_predictions = audio_model.predict(X_audio_test)

# Held-out metrics (RMSE is the square root of the mean squared error).
r2_audio = r2_score(y_test, audio_predictions)
rmse_audio = np.sqrt(mean_squared_error(y_test, audio_predictions))
mae_audio = mean_absolute_error(y_test, audio_predictions)

# Emit the four-line report in one call.
print(
    "\n".join(
        [
            "Audio Features Only Model Performance:",
            f"R2 Score: {r2_audio:.4f}",
            f"RMSE: {rmse_audio:.4f}",
            f"MAE: {mae_audio:.4f}",
        ]
    )
)

Audio Features Only Model Performance:
R2 Score: 0.2320
RMSE: 0.1886
MAE: 0.1533


### model training & evaluation: multimodal model

In [16]:
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Combine the lyrics and audio features column-wise for the training and test
# sets. The audio block is converted to a plain ndarray explicitly instead of
# relying on scipy to coerce a pandas DataFrame, and CSR is requested because
# the default COO result cannot be row-indexed or sliced.
X_combined_train = hstack([X_lyrics_train, np.asarray(X_audio_train)], format="csr")
X_combined_test = hstack([X_lyrics_test, np.asarray(X_audio_test)], format="csr")

# Multimodal baseline: ordinary least squares on lyrics counts + audio columns.
# NOTE(review): the combined matrix still has far more columns than rows
# (the lyrics block alone is 18950 wide for 2973 training rows), so this fit
# is underdetermined.
combined_model = LinearRegression()

# Train the model using X_combined_train and y_train
combined_model.fit(X_combined_train, y_train)

# Make predictions on X_combined_test
combined_predictions = combined_model.predict(X_combined_test)

# Calculate evaluation metrics on the held-out test set
r2_combined = r2_score(y_test, combined_predictions)
rmse_combined = np.sqrt(mean_squared_error(y_test, combined_predictions))
mae_combined = mean_absolute_error(y_test, combined_predictions)

# Print the metrics
print("Combined Features Model Performance:")
print(f"R2 Score: {r2_combined:.4f}")
print(f"RMSE: {rmse_combined:.4f}")
print(f"MAE: {mae_combined:.4f}")

Combined Features Model Performance:
R2 Score: -2.1558
RMSE: 0.3824
MAE: 0.2911


## summary:

The R2, RMSE, and MAE metrics obtained from all three models are summarized below, along with a comparison of the individual and combined impact of lyrics and audio features on valence prediction.

*   **Lyrics Only Model**:
    *   R2 Score: -17.3491
    *   RMSE: 0.9220
    *   MAE: 0.6864
*   **Audio Features Only Model**:
    *   R2 Score: 0.2320
    *   RMSE: 0.1886
    *   MAE: 0.1533
*   **Combined Features Model**:
    *   R2 Score: -2.1558
    *   RMSE: 0.3824
    *   MAE: 0.2911