In [264]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

import optuna

## Sample Data

In [None]:
song_df_with_lyrics = pd.read_csv('/song_df_with_lyrics.csv')
song_df_with_lyrics = song_df_with_lyrics.drop_duplicates(subset='spotify_id').reset_index(drop=True)

audio_features_df = pd.read_csv('/song_df_with_audio_features.csv')
audio_features_df = audio_features_df.drop_duplicates(subset='spotify_id').reset_index(drop=True)

merge_df = pd.merge(song_df_with_lyrics, audio_features_df, on='spotify_id', how='inner')

In [None]:
# Assign target label
label_mapping = {'High Valence High Arousal': 0, 'Low Valence High Arousal': 1, 'Low Valence Low Arousal': 2, 'High Valence Low Arousal': 3}
merge_df['target'] = merge_df['Mood Quadrant'].map(label_mapping)

In [None]:
merge_df.head()

Unnamed: 0,spotify_id,normalized_lyrics,Mood Quadrant,danceability,energy,loudness,acousticness,liveness,tempo,duration_ms,...,key_7,key_8,key_9,key_10,key_11,mode_0,mode_1,valence,arousal,target
0,32xfcxu2gKRVmDopzlmnUc,cool side cool side cool side cool side the co...,Low Valence High Arousal,0.462436,0.249437,0.645942,0.830321,0.236994,0.328553,0.101148,...,0,0,0,0,1,0,1,-0.15,0.075,1
1,1ocORq8GJBUAIJdi8QPgme,yeah yeah im a virgin yeah i stay lurkin stay ...,Low Valence Low Arousal,0.627085,0.20718,0.63147,0.788153,0.374672,0.251345,0.105012,...,0,0,0,0,0,1,0,-0.45,-0.25,2
2,2bhwPUsgts9pmZIbMvlHZV,hey kid wan na hear something bug when the kni...,High Valence Low Arousal,0.540944,0.858137,0.808191,0.047892,0.06516,0.488848,0.136614,...,0,1,0,0,0,0,1,0.225,-0.175,3
3,4xIlIbQDXw9BXVFcBdPrvW,a day in falsettoland dr mendel at work you go...,Low Valence Low Arousal,0.544215,0.487886,0.696775,0.036747,0.4598,0.209176,0.115004,...,0,0,1,0,0,0,1,-0.075,-0.3,2
4,1X5DB8JnEDPiPskaxLIbfk,one one two three four every second i have ill...,High Valence High Arousal,0.58347,0.849082,0.813579,0.003052,0.056647,0.485716,0.107991,...,0,0,0,0,0,1,0,0.4,0.075,0


In [None]:
# Split data

# Input Features
X_lyrics = merge_df['normalized_lyrics']
X_audio = np.array(merge_df[['danceability', 'energy', 'loudness', 'acousticness', 'liveness', 'tempo', 'duration_ms', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'mode_0', 'mode_1']])

# Target Feature
y = merge_df['target']

## Model

### 1. Only Audio Features

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_audio, y, test_size=0.2, random_state=13)

# Define models
models = [
    ("SVM", SVC(random_state=13, class_weight='balanced', kernel='rbf')),
    ("Random Forest", RandomForestClassifier(random_state=13)),
    ("XGBoost", XGBClassifier(random_state=13))
]

# Train and evaluate each model
for name, model in models:

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model   
    f1 = f1_score(y_test, y_pred, average='weighted') 
    print(f"\nModel: {name}")
    print(f"F1-score average: {f1}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Model: SVM
F1-score average: 0.5164544406994128
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.73      0.57      0.64       485
 Low Valence High Arousal       0.19      0.46      0.27       110
  Low Valence Low Arousal       0.61      0.36      0.45       253
 High Valence Low Arousal       0.30      0.45      0.36       108

                 accuracy                           0.49       956
                macro avg       0.46      0.46      0.43       956
             weighted avg       0.59      0.49      0.52       956


Model: Random Forest
F1-score average: 0.7489490788075812
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.77      0.91      0.83       485
 Low Valence High Arousal       0.77      0.48      0.59       110
  Low Valence Low Arousal       0.76      0.76      0.76       253
 High Valence Low Arousal       0.64  

### 2. Only Lyrics

In [None]:
# Define n-grams configurations
ngram_configs = [
    (1, 1),  # Unigram
    (2, 2),  # Bigram
    (3, 3)  # Trigram
]

##### 2.1 Support Vector Machine

In [None]:
# SVM
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
    # Use TfidfVectorizer for feature extraction
    if ngram_range != (3, 3):
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.09, min_df=88)
        text_features = vectorizer.fit_transform(X_lyrics).toarray()

    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.5, min_df=32)
        text_features = vectorizer.fit_transform(X_lyrics).toarray()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(text_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), SVC(max_iter=1000, random_state=13, class_weight='balanced', kernel='rbf'))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7184591099851467
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.90      0.62      0.74       485
 Low Valence High Arousal       0.61      0.87      0.72       110
  Low Valence Low Arousal       0.72      0.72      0.72       253
 High Valence Low Arousal       0.48      0.94      0.63       108

                 accuracy                           0.71       956
                macro avg       0.68      0.79      0.70       956
             weighted avg       0.77      0.71      0.72       956


Training with n-grams: (2, 2)
F1-score average: 0.7703806350853118
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.93      0.69      0.79       485
 Low Valence High Arousal       0.64      0.94      0.76       110
  Low Valence Low Arousal       0.79      0.76      0.77       253
 High Vale

##### 2.2 Random Forest

In [None]:
# SVM
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
    # Use TfidfVectorizer for feature extraction
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.95, min_df=30)
    text_features = vectorizer.fit_transform(X_lyrics).toarray()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(text_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), RandomForestClassifier(random_state=13))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7763712755522691
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.76      0.98      0.85       485
 Low Valence High Arousal       0.78      0.56      0.65       110
  Low Valence Low Arousal       0.85      0.68      0.75       253
 High Valence Low Arousal       0.98      0.44      0.61       108

                 accuracy                           0.79       956
                macro avg       0.84      0.67      0.72       956
             weighted avg       0.81      0.79      0.78       956


Training with n-grams: (2, 2)
F1-score average: 0.7600093442135
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.75      0.98      0.85       485
 Low Valence High Arousal       0.69      0.60      0.64       110
  Low Valence Low Arousal       0.87      0.61      0.72       253
 High Valence

##### 2.3 XGBoost

In [None]:
# XGBoost
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
    # Use TfidfVectorizer for feature extraction
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.95, min_df=30)
    text_features = vectorizer.fit_transform(X_lyrics).toarray()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(text_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), XGBClassifier(random_state=13))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7599003267575425
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.72      0.99      0.83       485
 Low Valence High Arousal       0.97      0.51      0.67       110
  Low Valence Low Arousal       0.88      0.63      0.74       253
 High Valence Low Arousal       0.96      0.42      0.58       108

                 accuracy                           0.78       956
                macro avg       0.88      0.64      0.70       956
             weighted avg       0.82      0.78      0.76       956


Training with n-grams: (2, 2)
F1-score average: 0.6709326101819102
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.66      1.00      0.79       485
 Low Valence High Arousal       0.90      0.25      0.40       110
  Low Valence Low Arousal       0.84      0.52      0.64       253
 High Vale

### 3. Audio + Lyrics

In [None]:
# Define n-grams configurations
ngram_configs = [
    (1, 1),  # Unigram
    (2, 2),  # Bigram
    (3, 3)  # Trigram
]

##### 3.1 Support Vector Machine

In [None]:
# SVM
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
       # Use TfidfVectorizer for feature extraction
    if ngram_range != (3, 3):
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.09, min_df=92)
        text_features = vectorizer.fit_transform(X_lyrics).toarray()

    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.5, min_df=32)
        text_features = vectorizer.fit_transform(X_lyrics).toarray()
    
    # Combine Audio and Text Features
    combined_features = np.hstack([X_audio, text_features])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), SVC(random_state=13, class_weight='balanced', kernel='rbf'))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7617930931025326
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.87      0.77      0.82       485
 Low Valence High Arousal       0.55      0.85      0.67       110
  Low Valence Low Arousal       0.82      0.66      0.73       253
 High Valence Low Arousal       0.58      0.81      0.67       108

                 accuracy                           0.76       956
                macro avg       0.70      0.77      0.72       956
             weighted avg       0.79      0.76      0.76       956


Training with n-grams: (2, 2)
F1-score average: 0.7774082425131245
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.87      0.77      0.81       485
 Low Valence High Arousal       0.53      0.85      0.65       110
  Low Valence Low Arousal       0.83      0.70      0.76       253
 High Vale

##### 3.2 Random Forest

In [None]:
# RF
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
    # Use TfidfVectorizer for feature extraction
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.95, min_df=80)
    text_features = vectorizer.fit_transform(X_lyrics).toarray()
    
    # Combine Audio and Text Features
    combined_features = np.hstack([X_audio, text_features])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), RandomForestClassifier(n_estimators=1000, random_state=13))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7799180706612663
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.78      0.98      0.87       485
 Low Valence High Arousal       0.67      0.56      0.61       110
  Low Valence Low Arousal       0.83      0.69      0.75       253
 High Valence Low Arousal       1.00      0.44      0.62       108

                 accuracy                           0.79       956
                macro avg       0.82      0.67      0.71       956
             weighted avg       0.81      0.79      0.78       956


Training with n-grams: (2, 2)
F1-score average: 0.7875925613728345
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.79      0.96      0.87       485
 Low Valence High Arousal       0.63      0.53      0.57       110
  Low Valence Low Arousal       0.83      0.73      0.78       253
 High Vale

##### 3.3 XGBoost

In [None]:
# XGBoost
for ngram_range in ngram_configs:
    print(f"\nTraining with n-grams: {ngram_range}")
    
    # Use TfidfVectorizer for feature extraction
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.95, min_df=80)
    text_features = vectorizer.fit_transform(X_lyrics).toarray()
    
    # Combine Audio and Text Features
    combined_features = np.hstack([X_audio, text_features])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=13)

    # Create a pipeline with vectorizer and LinearSVC
    model = make_pipeline(StandardScaler(with_mean=False), XGBClassifier(objective='multi:softprob', n_estimators=1000, random_state=13))
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))


Training with n-grams: (1, 1)
F1-score average: 0.7388014980377501
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.76      0.94      0.84       485
 Low Valence High Arousal       0.95      0.36      0.53       110
  Low Valence Low Arousal       0.71      0.77      0.74       253
 High Valence Low Arousal       0.92      0.33      0.49       108

                 accuracy                           0.76       956
                macro avg       0.84      0.60      0.65       956
             weighted avg       0.79      0.76      0.74       956


Training with n-grams: (2, 2)
F1-score average: 0.762871017334203
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.77      0.94      0.85       485
 Low Valence High Arousal       0.90      0.43      0.58       110
  Low Valence Low Arousal       0.74      0.79      0.76       253
 High Valen

#### **Summarize**
A Random Forest model utilizing Audio + Lyrics features, transformed using trigram TF-IDF, achieved the highest performance

### 4. Improve Model

In [None]:
# Use TfidfVectorizer for feature extraction
vectorizer = TfidfVectorizer(ngram_range=(3, 3), max_df=0.95, min_df=80)
text_features = vectorizer.fit_transform(X_lyrics).toarray()
    
# Combine Audio and Text Features
combined_features = np.hstack([X_audio, text_features])

# Split data
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=13)

#### 4.1 SMOTE Over Sampling

In [None]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=13, sampling_strategy='not majority')
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

#### 4.2 Using Optuna for Hyperparameter Tuning

In [None]:
def objective(trial):
    # Define hyperparameters to be tuned
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)

    # Create Random Forest model with suggested hyperparameters
    model = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf)

    # Evaluate model using cross-validation
    score = cross_val_score(model, X_train_resampled, y_train_resampled, n_jobs=-1, cv=5).mean()

    return -score  # Negative because Optuna minimizes

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

In [None]:
# Fit the best model
best_params = study.best_params
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_resampled, y_train_resampled)

 # Predict on the test set
y_pred = best_model.predict(X_test)
    
# Evaluate the model
print(f"F1-score average: {f1_score(y_test, y_pred, average='weighted')}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['High Valence High Arousal', 'Low Valence High Arousal', 'Low Valence Low Arousal', 'High Valence Low Arousal']))

F1-score average: 0.8566650671135793
Classification Report:
                           precision    recall  f1-score   support

High Valence High Arousal       0.90      0.89      0.89       485
 Low Valence High Arousal       0.81      0.92      0.86       110
  Low Valence Low Arousal       0.85      0.79      0.82       253
 High Valence Low Arousal       0.77      0.81      0.79       108

                 accuracy                           0.86       956
                macro avg       0.83      0.85      0.84       956
             weighted avg       0.86      0.86      0.86       956

