In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
df = pd.read_csv("dataset.csv")


# Drop unnamed columns
df.drop(
    df.columns[df.columns.str.contains("unnamed", case=False)], axis=1, inplace=True
)  # drop unnamed column
df.dropna(inplace=True)  # drop rows with null values


# Convert 'explicit' boolean to integer (0/1)
df["explicit"] = df["explicit"].astype(int)

# One-hot encode categorical features
df = pd.get_dummies(
    df, columns=["key", "time_signature", "track_genre"], drop_first=True
)

# Make valence binary for classification
df["valence"] = df["valence"].astype("category")
df["valence"] = df["valence"].map(
    lambda x: 0 if x < 0.5 else 1
)  # sets valence to 0 if sad, 1 if happy

# Drop unneeded features and make valence our target feature
X = df.drop(["track_id", "artists", "album_name", "track_name", "valence"], axis=1)
features = df.columns.tolist()
print(features)
y = df["valence"]

# Normalize Data
scalar = StandardScaler()
X = pd.DataFrame(scalar.fit_transform(X), columns=X.columns)

# Discretize continuous features (e.g. using quartiles)
for col in X.columns:
    X.loc[:, col] = pd.qcut(X[col], q=4, labels=False, duplicates='drop')

# Combine for easier handling
df = X.copy()
df['label'] = y

['track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5', 'track_genre_afrobeat', 'track_genre_alt-rock', 'track_genre_alternative', 'track_genre_ambient', 'track_genre_anime', 'track_genre_black-metal', 'track_genre_bluegrass', 'track_genre_blues', 'track_genre_brazil', 'track_genre_breakbeat', 'track_genre_british', 'track_genre_cantopop', 'track_genre_chicago-house', 'track_genre_children', 'track_genre_chill', 'track_genre_classical', 'track_genre_club', 'track_genre_comedy', 'track_genre_country', 'track_genre_dance', 'track_genre_dancehall', 'track_genre_death-metal', 'track_genre_deep-house', 'track_genre_detroit-techno', 'track_genre_disco

In [4]:
# --- C4.5 Functions ---
def entropy(labels):
    total = len(labels)
    counts = Counter(labels)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())


def info_gain_ratio(df, attr, target):
    total_entropy = entropy(df[target])
    values = df[attr].unique()
    splits = []
    split_info = 0
    weighted_entropy = 0

    for val in values:
        subset = df[df[attr] == val]
        prob = len(subset) / len(df)
        weighted_entropy += prob * entropy(subset[target])
        split_info -= prob * math.log2(prob) if prob > 0 else 0

    gain = total_entropy - weighted_entropy
    return gain / split_info if split_info != 0 else 0


def build_tree(df, target, features):
    labels = df[target]
    if len(set(labels)) == 1:
        return labels.iloc[0]
    if len(features) == 0:
        return labels.mode()[0]

    gains = {feature: info_gain_ratio(df, feature, target) for feature in features}
    best_feature = max(gains, key=gains.get)
    tree = {best_feature: {}}

    for val in df[best_feature].unique():
        subset = df[df[best_feature] == val]
        subtree = build_tree(subset, target, [f for f in features if f != best_feature])
        tree[best_feature][val] = subtree

    return tree


def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attr = next(iter(tree))
    val = sample[attr]
    subtree = tree[attr].get(val)
    if subtree is None:
        return None
    return predict(subtree, sample)

In [7]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
train_df = X_train.copy()
train_df["valence"] = y_train

# Train tree
decision_tree = build_tree(train_df, "valence", features)

# Predict
y_pred = [predict(decision_tree, row) for _, row in X_test.iterrows()]

# Remove Nones from unknowns
valid_idx = [i for i, val in enumerate(y_pred) if val is not None]
accuracy = accuracy_score(y_test.iloc[valid_idx], [y_pred[i] for i in valid_idx])
print("C4.5-style Decision Tree Accuracy:", round(accuracy * 100, 2), "%")

KeyError: 'track_id'