In [None]:
# Course : CS 513 - Knowledge Discovery and Data Mining
# Group Members: Branden Bulatao, Joseph Faustino, Natalie Fortes, Isabel Sutedjo
# Id : 20005971
# Purpose : Music Happiness Predictor - Predicts the happiness of music tracks based on various features.

# Main Author: Branden Bulatao

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
def clip_outliers(df, feature):
    """Clip one column of ``df`` to its 1.5*IQR fences, in place.

    Values below ``Q1 - 1.5*IQR`` are raised to that fence and values above
    ``Q3 + 1.5*IQR`` are lowered to it.

    Args:
        df: DataFrame holding the column; it is modified in place.
        feature: Name of the column to clip.

    Returns:
        The same ``df`` object, with ``feature`` clipped.
    """
    first_quartile, third_quartile = (
        df[feature].quantile(q) for q in (0.25, 0.75)
    )
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # Overwrite the column with its clipped version
    df[feature] = df[feature].clip(lower=fence_low, upper=fence_high)
    return df

def clip_outliers_strict(df, feature):
    """Clip one column of ``df`` to its 0.5th-99.5th percentile range, in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        feature: Name of the column to clip.

    Returns:
        The same ``df`` object, with ``feature`` clipped.
    """
    column = df[feature]
    df[feature] = column.clip(
        lower=column.quantile(0.005),
        upper=column.quantile(0.995),
    )
    return df

def calculate_outlier_percentage(df, feature):
    """Return the percentage of rows whose ``feature`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR`` of the column.

    Args:
        df: DataFrame holding the column (not modified).
        feature: Name of the column to inspect.

    Returns:
        Percentage in the range 0-100 as a float. An empty frame yields
        ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If ``feature`` is not a column of ``df``.
    """
    values = df[feature]  # looked up first so a missing column still raises
    total_rows = len(df)
    if total_rows == 0:
        return 0.0

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count via the boolean mask rather than building a filtered DataFrame
    outlier_mask = (values < lower_bound) | (values > upper_bound)
    return int(outlier_mask.sum()) / total_rows * 100

In [None]:
# Import dataset
df = pd.read_csv("./dataset.csv")

df.drop(
    df.columns[df.columns.str.contains("unnamed", case=False)], axis=1, inplace=True
)  # drop unnamed column

df.dropna(inplace=True)  # drop rows with null values (leaves gaps in df's index)

# Binarize the target: 0 if sad (valence < 0.5), 1 if happy (valence >= 0.5).
# A vectorized comparison replaces the categorical cast + per-row lambda.
df["valence"] = (df["valence"] >= 0.5).astype(int)

# Convert 'explicit' boolean to integer (0/1)
df["explicit"] = df["explicit"].astype(int)

# One-hot encode categorical features
df = pd.get_dummies(
    df, columns=["key", "time_signature", "track_genre"], drop_first=True
)

# Feature matrix: everything except identifiers/text columns, the target and duration_ms
X = df.drop(
    ["track_id", "artists", "album_name", "track_name", "valence", "duration_ms"],
    axis=1,
)
y = df["valence"]

# Numeric features that are passed through unscaled by the ColumnTransformer below
numeric_features = [
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
]

# Numeric features that are min-max scaled
numeric_high_value_features = [
    "popularity",
    "tempo",
    "loudness",
]

# All other features (binary or one-hot) are left as-is
explicitly_routed = set(numeric_features + numeric_high_value_features)
non_scaled_features = [col for col in X.columns if col not in explicitly_routed]

# Optional experiment (disabled): clip features whose IQR-outlier share exceeds 5%
# threshold = 5
# for col in numeric_features + numeric_high_value_features:
#     percent = calculate_outlier_percentage(X, col)
#     if percent > threshold:
#         X = clip_outliers(X, col)
#         print(f"Clipped {col} (outliers were {percent:.2f}%)")

# ColumnTransformer for selective scaling
preprocessor = ColumnTransformer(
    transformers=[
        # ("num", StandardScaler(), numeric_features),
        ("num", "passthrough", numeric_features),
        ("num2", MinMaxScaler(), numeric_high_value_features),
        ("pass", "passthrough", non_scaled_features),
    ]
)

# Fit and transform the data
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set min/max values leak into the scaling. Consider fitting on
# the training portion only.
X_processed = preprocessor.fit_transform(X)

# Reconstruct a DataFrame in the transformer's output column order.
# index=X.index keeps the row labels identical to y. Without it the frame gets
# a fresh 0..n-1 index while y keeps df's post-dropna labels, and any later
# index-aligned assignment of target_test onto attr_test pairs the wrong rows.
# astype(float) gives every column a numeric dtype: stacking float columns with
# passed-through int/bool columns can otherwise produce an object-dtype frame.
final_features = numeric_features + numeric_high_value_features + non_scaled_features
X_processed = pd.DataFrame(
    X_processed, columns=final_features, index=X.index
).astype(float)

# Split into train (70%) and test (30%) sets
attr_train, attr_test, target_train, target_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42
)

In [None]:
# print(X_processed[numeric_features].describe())
# Summary statistics of the min-max scaled feature columns
print(X_processed.loc[:, numeric_high_value_features].describe())

Accuracy by outlier-handling strategy:

- no clipping: 0.7279824561403508
- regular (IQR) clipping: 0.725906432748538
- strict (percentile) clipping: 0.7272222222222222

In [None]:
from sklearn.model_selection import cross_val_score
# K-nearest neighbors
# k_values = [3, 5, 7, 10, 15, 20, 30]
k_values = [20]
test_accuracies = []

# NOTE: `knn` and `target_pred` from the LAST k in k_values stay in scope and
# are what the evaluation cells below use.
for k in k_values:
    # Distance-weighted vote over the k nearest training points
    knn = KNeighborsClassifier(n_neighbors=k, weights="distance")
    knn.fit(attr_train, target_train)
    target_pred = knn.predict(attr_test)

    # Score the predictions already in hand. knn.score(attr_test, target_test)
    # returns the same accuracy but re-runs prediction on the whole test set.
    accuracy = accuracy_score(target_test, target_pred)
    test_accuracies.append(accuracy)

    print(f"Accuracy of model with k = {k}: {accuracy}")
    print("")

    # 5-fold cross-validation on the training portion only
    scores = cross_val_score(knn, attr_train, target_train, cv=5)
    print("Cross-validated Score:", scores.mean())


# Plot
# plt.figure(figsize=(10, 6))
# plt.plot(k_values, test_accuracies, label='Test Accuracy', marker='s')
# plt.xlabel('Number of Neighbors (k)')
# plt.ylabel('Accuracy')
# plt.title('KNN Accuracy vs Number of Neighbors')
# plt.legend()
# plt.grid(True)
# plt.show()

In [None]:
# Evaluate model
accuracy = accuracy_score(target_test, target_pred)
conf_matrix = confusion_matrix(target_test, target_pred)
class_report = classification_report(target_test, target_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# Work on a real copy: plain assignment only aliases attr_test, so the label
# columns added below would end up in the feature matrix and break any re-run
# of the KNN cell (feature-count mismatch).
test_actual = attr_test.copy()

# Assign as plain arrays so values are matched by row position. Assigning the
# target_test Series directly would align on index labels, which need not match
# attr_test's labels.
test_actual["target_pred"] = np.asarray(target_pred)  # predicted labels
test_actual["test_actual"] = np.asarray(target_test)  # actual labels

misclassified = (
    test_actual["target_pred"] != test_actual["test_actual"]
).sum()  # number of misclassified cases
total_tests = len(test_actual)  # total number of cases

error_rate = misclassified / total_tests

# Error rate percentage
print(f"Error rate: {error_rate:.2%}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions
cm = confusion_matrix(target_test, target_pred)

# Render it as an annotated heatmap on a dedicated 6x4 figure
ax = plt.figure(figsize=(6, 4)).add_subplot()
sns.heatmap(cm, annot=True, fmt="g", cmap="Blues", ax=ax)

# Axis labels, title and class names on the ticks
ax.set(
    xlabel="Predicted Labels",
    ylabel="True Labels",
    title="KNN Confusion Matrix for Valence",
)
ax.xaxis.set_ticklabels(["Sad (0)", "Happy (1)"])
ax.yaxis.set_ticklabels(["Sad (0)", "Happy (1)"])

plt.show()

In [None]:
# Split the test set into the four confusion-matrix outcome groups
results = pd.DataFrame(X)  # make sure this matches your data

# Real copy, not an alias: adding columns to attr_test itself would pollute the
# feature matrix used by the KNN cell.
test_actual = attr_test.copy()

# Plain arrays are matched by row position; assigning the target_test Series
# directly would align on index labels instead.
test_actual["target_pred"] = np.asarray(target_pred)  # predicted labels
test_actual["test_actual"] = np.asarray(target_test)  # actual labels

predicted_happy = test_actual["target_pred"] == 1
predicted_sad = test_actual["target_pred"] == 0
actually_happy = test_actual["test_actual"] == 1
actually_sad = test_actual["test_actual"] == 0

false_positives = test_actual[predicted_happy & actually_sad]
false_negatives = test_actual[predicted_sad & actually_happy]
true_positives = test_actual[predicted_happy & actually_happy]
true_negatives = test_actual[predicted_sad & actually_sad]

print(false_positives)
print(false_negatives)
# false_positives.describe()

In [None]:
# # Code given from the ML02_EDA.ipynb
# # Summarize each column
# summary_p = false_positives.describe().loc[["min", "max", "mean"]]
# summary_n = false_negatives.describe().loc[["min", "max", "mean"]]

# # Print the summary
# print(summary_p)
# print(summary_n)

# print("\n min, max, mean only")
# # Min_Max_Mean=df.describe(include=[float, int]).loc[["min", "max", "mean"]]
# Min_Max_Mean = X.describe(include=[float, int]).loc[["min", "max", "mean"]]
# print(Min_Max_Mean)

In [None]:
# print("False Negatives Summary vs Full Dataset:")
# print(false_negatives.describe().loc[["mean"]])
# print("\nFull Dataset:")
# print(df.describe().loc[["mean"]])

# delta_n = false_negatives.describe().loc["mean"] - df.describe().loc["mean"]
# print("\nDifference in Mean (False Negatives - Full Dataset):")
# print(delta_n)

# delta_p = false_positives.describe().loc["mean"] - df.describe().loc["mean"]
# print("\nDifference in Mean (False Positives - Full Dataset):")
# print(delta_p)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-feature KDE plots comparing the four prediction-outcome groups
df_no_genres = df.loc[:, ~df.columns.str.contains("track_genre")]

# (display label, legend prefix, frame, colour) per outcome group.
# The order here drives the legend order and the order of the printed stats.
outcome_groups = [
    ("False Negatives", "FN", false_negatives, "r"),
    ("False Positives", "FP", false_positives, "g"),
    ("True Positives", "TP", true_positives, "royalblue"),
    ("True Negatives", "TN", true_negatives, "gold"),
]

numeric_cols = df_no_genres.select_dtypes(include=["float", "int"]).columns
print(numeric_cols)

for col in numeric_cols:
    # The target and duration_ms are not columns of the outcome frames
    if col in ("valence", "duration_ms"):
        continue

    plt.figure(figsize=(6, 3))

    # KDE plots
    for group_label, legend_prefix, group_df, group_color in outcome_groups:
        sns.kdeplot(group_df[col], label=group_label, fill=True, color=group_color)

    # Vertical lines for means (drawn after all KDEs to keep the legend grouped)
    for group_label, legend_prefix, group_df, group_color in outcome_groups:
        plt.axvline(
            group_df[col].mean(),
            color=group_color,
            linestyle="--",
            label=f"{legend_prefix} Mean",
        )

    plt.title(f"Distribution of {col}")
    plt.legend()
    plt.tight_layout()
    plt.show()

    print(f"Stats for '{col}':")
    for group_label, legend_prefix, group_df, group_color in outcome_groups:
        # Label is left-padded to 15 characters so the columns line up
        print(
            f"  {group_label:<15} - mean: {group_df[col].mean():.4f}, "
            f"std: {group_df[col].std():.4f}"
        )
    print("-" * 60)

In [None]:
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

df_no_genres = df.loc[:, ~df.columns.str.contains("track_genre")]

# Non-genre feature columns: skip identifiers/text, the target, bookkeeping
# columns and duration_ms (adjust as needed)
excluded_cols = {
    "target_actual",
    "target_pred",
    "track_id",
    "artists",
    "album_name",
    "track_name",
    "valence",
    "duration_ms",
}
feature_cols = [col for col in df_no_genres.columns if col not in excluded_cols]

print(feature_cols)

# Mean of every feature within each outcome group (features as rows here)
summary_df = pd.DataFrame(
    {
        "True Positives": true_positives[feature_cols].mean(),
        "True Negatives": true_negatives[feature_cols].mean(),
        "False Positives": false_positives[feature_cols].mean(),
        "False Negatives": false_negatives[feature_cols].mean(),
        # "All Data": df[feature_cols].mean(),
    }
)

# Transpose for easier plotting: outcome groups as rows, features as columns
summary_df = summary_df.astype(float).T

# Min-max normalize each feature across the groups for the radar plots.
# A feature with identical means in all groups has zero range; divide by 1 in
# that case so it normalizes to 0 instead of NaN (0/0).
feature_range = summary_df.max() - summary_df.min()
summary_norm = (summary_df - summary_df.min()) / feature_range.replace(0, 1)

# --- Plot 1: Heatmap of Means per Outcome Type ---
print("-" * 60)
print("1) Heatmap of feature means per outcome")
print("-" * 60)

plt.figure(figsize=(12, 6))
sns.heatmap(summary_df.T, annot=True, fmt=".2f", cmap="YlGnBu")
plt.title("Feature Means by Prediction Outcome")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Plot 2: Comparison Bar Plot for One Feature at a Time ---
# Choose the 10 most differing features (by std deviation of the group means)
print("-" * 60)
print("2) Bar Plots of each feature")
print("-" * 60)

top_diff_features = summary_df.std().sort_values(ascending=False).head(10).index

for feature in top_diff_features:
    print(feature + ' std deviation: ', summary_df[feature].std())

    # hue labels follow summary_df's row order: TP, TN, FP, FN
    ax = sns.barplot(
        x=summary_df.index,
        y=summary_df[feature].values,
        hue=["True +", "True -", "False +", "False -"],
        palette=["green", "red", "gray", "gray"],
    )

    # Add value labels above each bar
    for i, value in enumerate(summary_df[feature]):
        ax.text(i, value + 0.01, f"{value:.2f}", ha="center", va="bottom", fontsize=9)

    plt.xticks(rotation=45)
    plt.title(f"{feature} across Prediction Groups")
    plt.grid(axis="y")
    plt.tight_layout()
    plt.show()

# --- Plot 3: Radar Plot (optional, fancier) ---
# (The former try/except ImportError wrapper was dropped: nothing in this
# section can raise ImportError, so its fallback branch was unreachable.)
categories = list(summary_norm.columns)
groups = summary_norm.index

# One spoke per feature; the angles are the same for every group, so compute
# them once. The first angle is repeated to close the polygon.
angles = [n / float(len(categories)) * 2 * np.pi for n in range(len(categories))]
angles += angles[:1]

for group in groups:
    values = summary_norm.loc[group].tolist()
    values += values[:1]  # repeat the first value to close the circle

    plt.figure(figsize=(6, 6))
    ax = plt.subplot(111, polar=True)
    plt.xticks(angles[:-1], categories, color="grey", size=8)

    ax.plot(angles, values, linewidth=2, linestyle="solid", label=group)
    ax.fill(angles, values, alpha=0.2)

    plt.title(f"Radar Plot for {group}", size=14, y=1.1)
    plt.legend(loc="upper right", bbox_to_anchor=(0.1, 0.1))
    plt.tight_layout()
    plt.show()

pop-film     59.283000
k-pop        56.952953
chill        53.651000
sad          52.379000
grunge       49.594000
indian       49.539000
anime        48.772000
emo          48.128000
sertanejo    47.866000
pop          47.576000

["pop-film", "k-pop", "chill", "sad", "grunge", "indian", "anime", "emo", "sertanejo", "pop"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# --- Genre Counts per Prediction Outcome ---

# One-hot genre indicator columns
genre_cols = list(filter(lambda name: name.startswith("track_genre"), df.columns))

# Prediction outcome groups, keyed by display label
groups = dict(
    [
        ("True Positives (H)", true_positives),
        ("True Negatives (S)", true_negatives),
        ("False Positives", false_positives),
        ("False Negatives", false_negatives),
    ]
)

# Number of tracks of each genre inside each group
genre_counts = {}
for group_name, group_df in groups.items():
    indicator_block = group_df[genre_cols]
    genre_counts[group_name] = indicator_block.astype(int).sum()

# Outcomes as rows, genres as columns
genre_counts_df = pd.DataFrame(genre_counts).transpose()
# genre_counts_df.to_csv('output.csv', index=False)

# Genre columns referenced by get_top_10_popular()
top_10_popular_genres = [
    "track_genre_pop-film",
    "track_genre_k-pop",
    "track_genre_chill",
    "track_genre_sad",
    "track_genre_grunge",
    "track_genre_indian",
    "track_genre_anime",
    "track_genre_emo",
    "track_genre_sertanejo",
    "track_genre_pop",
    "track_genre_indie",
]

# The 20 genres with the most test-set tracks overall
# top_genres = genre_counts_df.sum(axis=0).sort_values(ascending=False).index
total_per_genre = genre_counts_df.sum(axis=0)
ranked_genres = total_per_genre.sort_values(ascending=False)
top_genres = ranked_genres.head(20).index

def get_top_10_popular(genre):
    """Return True when ``genre`` is NOT listed in ``top_10_popular_genres``.

    Despite the name, this is a skip-filter: True means the genre is absent
    from the module-level ``top_10_popular_genres`` list, False means present.
    """
    return genre not in top_10_popular_genres

# One bar chart per top genre: track counts in each prediction outcome
for genre in top_genres:
    # if get_top_10_popular(genre):
    #     continue

    counts = genre_counts_df[genre]

    plt.figure(figsize=(8, 4))
    ax = sns.barplot(
        x=genre_counts_df.index,
        y=counts.values,
        hue=["True +", "True -", "False +", "False -"],
        palette=["green", "red", "lightgreen", "lightcoral"],
    )

    # Write each count just above its bar
    for position, count in enumerate(counts.values):
        ax.text(
            position,
            count + 0.5,
            str(int(count)),
            ha="center",
            va="bottom",
            fontsize=9,
        )

    ax.set(
        title=f"Counts of {genre} by Prediction Outcome",
        ylabel="Number of Tracks",
        xlabel="Prediction Outcome",
    )
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print(counts)

In [None]:
# Count of genres in true positives
# print(true_negatives.columns.values)
# genre_columns = [col for col in df.columns if col.startswith("track_genre_")]
# print(genre_columns)

# # Step 2: Use idxmax to get the column name with value 1
# df["track_genre"] = df[genre_columns].idxmax(axis=1)

# # Step 3: Remove the "track_genre_" prefix
# df["track_genre"] = df["track_genre"].str.replace("track_genre_", "")

# # Optional: Drop one-hot columns
# df = df.drop(columns=genre_columns)

# mood_counts_by_genre = (
#     df.groupby("track_genre")["valence"].value_counts().unstack().fillna(0)
# )
# true_negatives.to_csv("true_negatives.csv", index=False)

In [None]:
# Count of genres in true positives
# print(true_negatives.columns.values)
# genre_columns = [col for col in df.columns if col.startswith("track_genre_")]
# print(genre_columns)

# # Step 2: Use idxmax to get the column name with value 1
# df["track_genre"] = df[genre_columns].idxmax(axis=1)

# # Step 3: Remove the "track_genre_" prefix
# df["track_genre"] = df["track_genre"].str.replace("track_genre_", "")

# # Optional: Drop one-hot columns
# df = df.drop(columns=genre_columns)

# mood_counts_by_genre = (
#     df.groupby("track_genre")["valence"].value_counts().unstack().fillna(0)
# )
# true_negatives.to_csv("true_negatives.csv", index=False)

In [None]:
# sns.set(style="whitegrid")
# plt.rcParams["figure.figsize"] = (10, 5)

# # df_no_genres = df.loc[:, ~df.columns.str.contains("track_genre")]

# # Features only (adjust as needed)
# feature_cols = [
#     col
#     for col in df.columns
#     if col
#     not in [
#         "target_actual",
#         "target_pred",
#         "track_id",
#         "artists",
#         "album_name",
#         "track_name",
#         "valence",
#     ]
# ]

# print(feature_cols)

# # Mean summary per category
# summary_df = pd.DataFrame(
#     {
#         "False Negatives": false_negatives[feature_cols].mean(),
#         "False Positives": false_positives[feature_cols].mean(),
#         "True Positives": true_positives[feature_cols].mean(),
#         "True Negatives": true_negatives[feature_cols].mean(),
#         # "All Data": df[feature_cols].mean(),
#     }
# )

# # Transpose for easier plotting
# summary_df = summary_df.T

# # Optional: Normalize columns for radar/spider-style plots
# summary_norm = (summary_df - summary_df.min()) / (summary_df.max() - summary_df.min())


# # --- Comparison Bar Plot for of each genre  ---
# # Choose top 10 most differing features that are ONLY GENRES (by std deviation across groups)
# top_diff_features = (
#     summary_df.filter(like="track_genre")
#     .std()
#     .sort_values(ascending=False)
#     # .head(10)
#     .index
# )


# for feature in top_diff_features:
#     summary_df[feature].plot(
#         kind="bar", title=f"{feature} across Prediction Groups", ylabel="Mean Value"
#     )
#     plt.xticks(rotation=45)
#     plt.grid(axis="y")
#     plt.tight_layout()
#     plt.show()

# # --- Radar Plot (optional, fancier) ---
# try:
#     from math import pi

#     categories = list(summary_norm.columns)
#     groups = summary_norm.index

#     for group in groups:
#         values = summary_norm.loc[group].tolist()
#         values += values[:1]  # repeat the first value to close the circle

#         angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
#         angles += angles[:1]

#         plt.figure(figsize=(6, 6))
#         ax = plt.subplot(111, polar=True)
#         plt.xticks(angles[:-1], categories, color="grey", size=8)

#         ax.plot(angles, values, linewidth=2, linestyle="solid", label=group)
#         ax.fill(angles, values, alpha=0.2)

#         plt.title(f"Radar Plot for {group}", size=14, y=1.1)
#         plt.legend(loc="upper right", bbox_to_anchor=(0.1, 0.1))
#         plt.tight_layout()
#         plt.show()
# except ImportError:
#     print("Radar plot skipped (requires polar plotting support).")

Using K-Nearest Neighbors with audio features to predict song valence, our model achieved a classification accuracy of 73.6%. The model performs slightly better at identifying low-valence (sadder) songs, with higher recall (79%) compared to high-valence (happier) songs (67%). Precision is fairly balanced across both classes. The findings suggest that features like danceability, energy, and acousticness have predictive value for estimating a song’s mood, but future work could explore better handling of class imbalance or using more sophisticated models to improve recall for high-valence tracks.