In [None]:
# Course : CS 513 - Knowledge Discovery and Data Mining
# Group Members: Branden Bulatao, Joseph Faustino, Natalie Fortes, Isabel Sutedjo
# Id : 20005971, 20006114, 20006007, 20006618
# Purpose : Music Happiness Predictor - Predicts the happiness of music tracks based on various features.

In [2]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("dataset.csv")

# Filter for simplicity - top 5 most common genres
top_genres = df['track_genre'].value_counts().nlargest(5).index
df = df[df['track_genre'].isin(top_genres)]

# Select relevant features
features = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'valence', 'tempo']
# Take an explicit copy: df[features] is derived from the filtered frame above,
# and assigning into such a slice raises SettingWithCopyWarning (and may not
# stick) on pandas versions without copy-on-write.
X = df[features].copy()
y = df['track_genre']

# Discretize continuous features into quartile bin codes.
# duplicates='drop' merges bins whose edges coincide, so a heavily tied column
# can end up with fewer than four distinct codes.
# NOTE(review): the bin edges are computed on the full data set, i.e. before the
# train/test split performed later, so test rows influence the binning.
for col in features:
    X.loc[:, col] = pd.qcut(X[col], q=4, labels=False, duplicates='drop')

# Combine for easier handling (df now holds the binned features plus the label)
df = X.copy()
df['label'] = y

# --- C4.5 Functions ---
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Args:
        labels: Sized iterable of hashable labels (for example a list or a
            pandas Series).

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct label. An empty collection yields 0.
    """
    n_labels = len(labels)
    frequencies = Counter(labels).values()
    # Relative frequency of each distinct label, in first-seen order.
    proportions = [freq / n_labels for freq in frequencies]
    return -sum(p * math.log2(p) for p in proportions)

def info_gain_ratio(df, attr, target):
    """Compute the gain ratio obtained by splitting ``df`` on ``attr``.

    The gain ratio is the information gain (entropy of ``target`` before the
    split minus the size-weighted entropy after it) divided by the split
    information (entropy of the partition sizes themselves).

    Args:
        df: DataFrame containing both the ``attr`` and ``target`` columns.
        attr: Name of the column to split on.
        target: Name of the label column.

    Returns:
        The gain ratio, or 0 when the split information is zero (for example
        when ``attr`` takes a single value or ``df`` is empty).
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0

    total_entropy = entropy(df[target])
    split_info = 0
    weighted_entropy = 0

    # One grouped pass instead of one boolean scan of the whole frame per
    # value. sort=False keeps groups in first-appearance order, so the floating
    # point accumulation order matches a loop over df[attr].unique(). Rows whose
    # attr is missing are left out by groupby; they contribute nothing.
    for _, subset_labels in df.groupby(attr, sort=False)[target]:
        if len(subset_labels) == 0:
            # Possible for unobserved categories of a categorical column.
            continue
        prob = len(subset_labels) / n_rows
        weighted_entropy += prob * entropy(subset_labels)
        split_info -= prob * math.log2(prob)

    gain = total_entropy - weighted_entropy
    return gain / split_info if split_info != 0 else 0

def build_tree(df, target, features):
    """Recursively build a decision tree that splits on the best gain ratio.

    Args:
        df: DataFrame holding the feature columns and the ``target`` column.
        target: Name of the label column.
        features: List of column names still available for splitting.

    Returns:
        A leaf, which is a single label, or a nested dict of the form
        ``{feature: {value: subtree, ...}}``. ``None`` is returned for an empty
        ``df`` because there is no label to predict.
    """
    labels = df[target]
    if len(labels) == 0:
        # No rows means no label; without this guard labels.mode()[0] below
        # would fail on an empty Series once the features are exhausted.
        return None
    if len(set(labels)) == 1:
        # Pure node: every row carries the same label.
        return labels.iloc[0]
    if len(features) == 0:
        # Nothing left to split on: fall back to the most frequent label.
        return labels.mode()[0]

    gains = {feature: info_gain_ratio(df, feature, target) for feature in features}
    best_feature = max(gains, key=gains.get)
    # The same reduced feature list is passed to every child, so build it once.
    remaining_features = [f for f in features if f != best_feature]
    tree = {best_feature: {}}

    # Skip missing values: `column == NaN` never matches, so a NaN branch would
    # always receive an empty subset.
    for val in df[best_feature].dropna().unique():
        subset = df[df[best_feature] == val]
        tree[best_feature][val] = build_tree(subset, target, remaining_features)

    return tree

def predict(tree, sample):
    """Classify ``sample`` by walking ``tree`` from the root to a leaf.

    Args:
        tree: A leaf label or a nested dict ``{feature: {value: subtree}}``.
        sample: Mapping-like object indexable by feature name (a dict or a
            pandas Series row).

    Returns:
        The label stored at the leaf that is reached, or ``None`` when the
        sample's value for a tested feature has no branch in the tree.
    """
    node = tree
    # Descend until the current node is a leaf (anything that is not a dict).
    while isinstance(node, dict):
        feature = next(iter(node))
        observed = sample[feature]
        node = node[feature].get(observed)
        if node is None:
            # This value has no branch at this node.
            return None
    return node

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df[features], df['label'], test_size=0.2, random_state=42)
train_df = X_train.copy()
train_df['label'] = y_train

# Train tree
decision_tree = build_tree(train_df, 'label', features)

# Predict (predict() yields None when a row's feature value has no branch in the tree)
y_pred = [predict(decision_tree, row) for _, row in X_test.iterrows()]

# Remove Nones from unknowns
valid_idx = [i for i, val in enumerate(y_pred) if val is not None]
n_unknown = len(y_pred) - len(valid_idx)

if valid_idx:
    accuracy = accuracy_score(y_test.iloc[valid_idx], [y_pred[i] for i in valid_idx])
else:
    # Nothing was predicted, so accuracy over an empty selection is undefined.
    accuracy = float('nan')
print("C4.5-style Decision Tree Accuracy:", round(accuracy * 100, 2), "%")

# The figure above ignores rows the tree could not classify. Report how many
# were dropped and what the accuracy is when they are counted as errors, so the
# headline number is not read as covering the whole test set.
coverage = len(valid_idx) / len(y_pred) if y_pred else 0.0
overall_accuracy = accuracy * coverage if valid_idx else 0.0
print("Rows without a prediction (excluded above):", n_unknown, "of", len(y_pred))
print("Accuracy counting unpredicted rows as errors:", round(overall_accuracy * 100, 2), "%")


C4.5-style Decision Tree Accuracy: 53.17 %
