In [3]:
import pylast
import os
import sys
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import mean_squared_error, r2_score

sys.path.append(os.path.abspath('..'))
from music_sentiment.lastfm_api import get_lastfm_network, get_song_tags
from music_sentiment.tag_utils import process_tag_weights

#load the environment variables from the .env file
load_dotenv()


True

In [4]:
# load the cleaned dataset

df = pd.read_csv("../data/cleaned/muse_cleaned.csv")


In [5]:
# create network for last.fm
network = get_lastfm_network()

# start with a smaller subset of the dataset
sample_size = 200
df_sample = df.sample(sample_size, random_state=42)

# Fetch tags for each song in the sample
print("Fetching tags for each song in the sample...")
tags_text_list = []
tags_with_weights = []

for idx, row in df_sample.iterrows():
    print(f"Processing {idx+1}/{len(df_sample)}: {row['artist']} - {row['track']}")
    song_tags = get_song_tags(network, row['artist'], row['track'])
    
    # store tag data with weight for each tag
    tags_with_weights.append(song_tags)
    tags_text_list.append(process_tag_weights(song_tags))
    
    
# Add the tags to the DataFrame
df_sample['tags_with_weights'] = tags_with_weights
df_sample['tag_text'] = tags_text_list
# Save the DataFrame with tags to a CSV file
df_sample.to_csv("../data/cleaned/muse_with_tags_valence.csv", index=False)

Fetching tags for each song in the sample...
Processing 53009/200: Blackstreet - Falling In Love Again
Processing 12457/200: Lucifers Crossing - Blasphemer
Processing 53406/200: Madonna - Justify My Love (porno mix) Rare
Processing 64177/200: Aimee Mann - One
Processing 66341/200: Linkin Park - Roads Untraveled
Processing 1967/200: Nits - Two Skaters
Processing 66701/200: Frank Sinatra - I'll be seeing you
Processing 8342/200: Frankie Lymon and The Teenagers - Teen Angel
Processing 71666/200: Vanessa-Mae - City Theme
Processing 6607/200: Narsilion - Pedraforca, Terra De Bruixes
Processing 23823/200: T-Bone Walker - Blues For Marili ( LP Version )
Processing 57286/200: The Harvest Ministers - When You Have A Faint Heart
Processing 55139/200: Poison - OnceBittenTwiceShy
Processing 39377/200: The Faint - Desperate Guys
Processing 54512/200: Beautiful Creatures - 1 A.M.
Processing 35870/200: Puressence - The Feeling
Processing 29661/200: Marillion - Beyond You
Processing 20476/200: Stereol

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
# ComplementNB works with sparse matrices directly

# Your existing code for feature extraction
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)
X = vectorizer.fit_transform(df_sample['tag_text'])

# Bin the valence scores into discrete categories
bins = [0, 3, 5, 7, 10]  # You can adjust these bins
labels = [0, 1, 2, 3]    # Numeric labels for the bins
df_sample['valence_binned'] = pd.cut(df_sample['valence_tags'], bins=bins, labels=labels)

# Use the binned values as target
y = df_sample['valence_binned']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use ComplementNB which works well with sparse features
model = ComplementNB()
model.fit(X_train, y_train)  # No need for .toarray()

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))



Accuracy: 0.4500
              precision    recall  f1-score   support

           0       0.33      0.33      0.33         6
           1       0.45      0.60      0.51        15
           2       0.60      0.38      0.46        32
           3       0.29      0.57      0.38         7

    accuracy                           0.45        60
   macro avg       0.42      0.47      0.42        60
weighted avg       0.50      0.45      0.45        60



In [7]:
# evaluate model on the training data
y_train_pred = model.predict(X_train)

# print evaluation metrics with the same format as the test set
from sklearn.metrics import accuracy_score, classification_report
print(f"Training Data Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(classification_report(y_train, y_train_pred))

Training Data Accuracy: 0.9143
              precision    recall  f1-score   support

           0       1.00      0.62      0.76        13
           1       0.85      0.95      0.90        55
           2       0.94      0.94      0.94        48
           3       1.00      0.96      0.98        24

    accuracy                           0.91       140
   macro avg       0.95      0.86      0.89       140
weighted avg       0.92      0.91      0.91       140

