In [1]:
# Course : CS 513 - Knowledge Discovery and Data Mining
# Group Members: Branden Bulatao, Joseph Faustino, Natalie Fortes, Isabel Sutedjo
# Id : 20006007, 20006618
# Purpose : Music Happiness Predictor - Predicts the happiness of music tracks based on various features.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Import dataset
df = pd.read_csv('./dataset.csv')

df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True) # drop unnamed column
df.dropna(inplace=True) # drop rows with null values

df['valence'] = df['valence'].astype('category')
df['valence'] = df['valence'].map(lambda x: 0 if x < 0.5 else 1) # sets valence to 0 if sad, 1 if happy

# Convert 'explicit' boolean to integer (0/1)
df['explicit'] = df['explicit'].astype(int)

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['key', 'time_signature', 'track_genre'], drop_first=True)

scaler = StandardScaler()
X = df.drop(['track_id', 'artists', 'album_name', 'track_name', 'valence'], axis=1)
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
y = df['valence']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,track_genre_spanish,track_genre_study,track_genre_swedish,track_genre_synth-pop,track_genre_tango,track_genre_techno,track_genre_trance,track_genre_trip-hop,track_genre_turkish,track_genre_world-music
0,1.782624,0.024557,-0.305769,0.629239,-0.717147,0.300825,-1.326297,0.551843,-0.850193,-0.504111,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073
1,0.975625,-0.730889,-0.305769,-0.845908,-1.889974,-1.784739,0.753979,-0.078995,1.831744,-0.504097,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073
2,1.065291,-0.160353,-0.305769,-0.742187,-1.122667,-0.293289,0.753979,-0.273827,-0.315489,-0.504115,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073
3,1.692957,-0.243236,-0.305769,-1.733301,-2.312987,-2.039246,0.753979,-0.457309,1.774605,-0.503886,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073
4,2.186123,-0.271942,-0.305769,0.295026,-0.788709,-0.282751,0.753979,-0.303146,0.463409,-0.504115,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113994,-0.548707,1.462948,-0.305769,-2.274956,-1.615652,-1.617321,0.753979,-0.401507,0.977663,2.493742,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,10.630099
113995,-0.503873,1.462957,-0.305769,-2.263432,-2.084782,-2.000075,-1.326297,-0.421369,2.042258,2.648803,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,10.630099
113996,-0.503873,0.404815,-0.305769,0.358411,-1.241937,-0.524135,-1.326297,-0.403399,1.660327,-0.504115,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,10.630099
113997,0.347959,0.520635,-0.305769,0.116395,-0.538241,-0.522942,0.753979,-0.519731,0.198764,-0.504115,...,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,-0.094073,10.630099


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=8)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy: 86.40%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88     12336
           1       0.86      0.84      0.85     10464

    accuracy                           0.86     22800
   macro avg       0.86      0.86      0.86     22800
weighted avg       0.86      0.86      0.86     22800

Confusion Matrix:
[[10915  1421]
 [ 1680  8784]]
