In [41]:
import pylast
import os
import sys
from dotenv import load_dotenv
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import mean_squared_error, r2_score

sys.path.append(os.path.abspath('..'))
from music_sentiment.lastfm_api import get_lastfm_network, get_song_tags
from music_sentiment.tag_utils import process_tag_weights

#load the environment variables from the .env file
load_dotenv()


True

In [25]:
# load the cleaned dataset

df = pd.read_csv("../data/cleaned/muse_cleaned.csv")


In [None]:
# create network for last.fm
network = get_lastfm_network()

# start with a smaller subset of the dataset
sample_size = 2000
df_sample = df.sample(sample_size, random_state=42)

# Fetch tags for each song in the sample
print("Fetching tags for each song in the sample...")
tags_text_list = []
tags_with_weights = []

for idx, row in df_sample.iterrows():
    print(f"Processing {idx+1}/{len(df_sample)}: {row['artist']} - {row['track']}")
    song_tags = get_song_tags(network, row['artist'], row['track'])
    
    # store tag data with weight for each tag
    tags_with_weights.append(song_tags)
    tags_text_list.append(process_tag_weights(song_tags))
    
    
# Add the tags to the DataFrame
df_sample['tags_with_weights'] = tags_with_weights
df_sample['tag_text'] = tags_text_list
# Save the DataFrame with tags to a CSV file
df_sample.to_csv("../data/cleaned/muse_with_tags_valence.csv", index=False)

In [27]:
from sklearn.model_selection import train_test_split, cross_val_score

# Load the dataset with tags
df_sample = pd.read_csv("../data/cleaned/muse_with_tags_valence.csv")

# ComplementNB works with sparse matrices directly

# Your existing code for feature extraction
vectorizer = TfidfVectorizer(min_df=1, max_df=0.7)
X = vectorizer.fit_transform(df_sample['tag_text'])

# Bin the valence scores into discrete categories
bins = [0, 2.5, 5, 7.5, 10]  # You can adjust these bins
labels = [0, 1, 2, 3]    # Numeric labels for the bins
df_sample['valence_binned'] = pd.cut(df_sample['valence_tags'], bins=bins, labels=labels)

# Use the binned values as target
y = df_sample['valence_binned']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use ComplementNB which works well with sparse features
model = ComplementNB()
model.fit(X_train, y_train)  # No need for .toarray()

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))



Accuracy: 0.6467
              precision    recall  f1-score   support

           0       0.05      0.08      0.06        12
           1       0.71      0.47      0.57       207
           2       0.67      0.84      0.74       337
           3       0.41      0.16      0.23        44

    accuracy                           0.65       600
   macro avg       0.46      0.39      0.40       600
weighted avg       0.65      0.65      0.63       600



In [28]:
from joblib import dump
import os

# evaluate model on the training data
y_train_pred = model.predict(X_train)

# print evaluation metrics with the same format as the test set
from sklearn.metrics import accuracy_score, classification_report
print(f"Training Data Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(classification_report(y_train, y_train_pred))



Training Data Accuracy: 0.7914
              precision    recall  f1-score   support

           0       0.50      0.30      0.37        44
           1       0.89      0.66      0.76       485
           2       0.76      0.95      0.84       777
           3       0.92      0.38      0.54        94

    accuracy                           0.79      1400
   macro avg       0.77      0.57      0.63      1400
weighted avg       0.81      0.79      0.78      1400

Model and vectorizer saved successfully!


In [None]:

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save the model
dump(model, '../models/valence_classifier.joblib')
# Save the vectorizer
dump(vectorizer, '../models/tfidf_vectorizer.joblib')

print("Model and vectorizer saved successfully!")

## Trying a new approach, using the warriner list to map Last.fm tags to words tied with emotion

In [56]:
# In your exploration.ipynb or a new notebook

# Load the cleaned Warriner dataset
warriner_df = pd.read_csv('../data/cleaned/warriner_clean.csv')

# Create a dictionary for faster lookups
warriner_dict = dict(zip(warriner_df['word'], zip(
    warriner_df['valence_score'], 
    warriner_df['arousal_score'], 
    warriner_df['dominance_score']
)))

# Function to extract Warriner features from tags
def extract_warriner_features(tag_text, warriner_df):
    # Split the tag text into individual words
    words = set(tag_text.lower().split())
    
    # Initialize feature dictionary
    features = {
        'warriner_valence_mean': 0.0,
        'warriner_arousal_mean': 0.0,
        'warriner_dominance_mean': 0.0,
        'warriner_coverage': 0.0
    }
    
    # Match words with the Warriner lexicon
    matched_words = []
    for word in words:
        if word in warriner_df.index:
            matched_words.append(word)
    
    # Calculate coverage
    if len(words) > 0:
        features['warriner_coverage'] = len(matched_words) / len(words)
    
    # Calculate emotion scores
    if matched_words:
        valence_scores = [warriner_df.loc[word, 'V.Mean.Sum'] for word in matched_words]
        arousal_scores = [warriner_df.loc[word, 'A.Mean.Sum'] for word in matched_words]
        dominance_scores = [warriner_df.loc[word, 'D.Mean.Sum'] for word in matched_words]
        
        features['warriner_valence_mean'] = sum(valence_scores) / len(valence_scores)
        features['warriner_arousal_mean'] = sum(arousal_scores) / len(arousal_scores)
        features['warriner_dominance_mean'] = sum(dominance_scores) / len(dominance_scores)
    
    return features

In [63]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
# Load Warriner wordlist
warriner_df = pd.read_csv('../data/cleaned/warriner_clean.csv', index_col='word')

# Apply Warriner features to your dataset
for idx, row in df_sample.iterrows():
    if pd.notna(row['tag_text']):
        warriner_features = extract_warriner_features(row['tag_text'], warriner_df)
        for feature, value in warriner_features.items():
            df_sample.at[idx, feature] = value

# Get your numeric features and scale them
warriner_cols = ['warriner_valence_mean', 'warriner_arousal_mean', 'warriner_dominance_mean', 'warriner_coverage']
X_warriner = df_sample[warriner_cols].fillna(0).values
scaler = StandardScaler()
X_warriner_scaled = scaler.fit_transform(X_warriner)

# Get your text features
X_tfidf = vectorizer.fit_transform(df_sample['tag_text'].fillna(''))

# Combine features
X_combined = hstack([X_tfidf, X_warriner_scaled])

# Split for training/testing
X_train, X_test, y_train, y_test = train_test_split(X_combined, df_sample['emotion'], test_size=0.3, random_state=42)

# Train your model
model = ComplementNB()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6833
              precision    recall  f1-score   support

       happy       0.79      0.69      0.74        39
         sad       0.54      0.67      0.60        21

    accuracy                           0.68        60
   macro avg       0.67      0.68      0.67        60
weighted avg       0.70      0.68      0.69        60

