In [None]:
import numpy as np
import pandas as pd

# 0) Load the data
#
# Prefer the local copy of the dataset; if it is not present on disk, fall
# back to the copy hosted in the project's GitHub repository.

CSV_PATH = "dataset/spotify_songs_with_genre_int.csv"
URL = "https://raw.githubusercontent.com/hyeonx3/ML_Project/main/dataset/spotify_songs_with_genre_int.csv"

try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    # Only a missing local file triggers the remote fallback. Any other
    # failure (corrupt CSV, interrupt, ...) is surfaced instead of being
    # silently masked by a network download.
    print(f"[Data] '{CSV_PATH}' not found; loading from {URL}")
    df = pd.read_csv(URL)

In [2]:
import os

# Expose no CUDA device to this process. This is set before TensorFlow is
# imported further down, presumably so that training runs on the CPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})


In [3]:

# 1) Choose features / target (numeric columns only)

target_col = "genre_int"
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration_ms"
]

# Numeric filter. "number" accepts every numeric dtype (int32, float32,
# nullable ints, ...) instead of only int64/float64, so a requested feature
# is not dropped merely because of its storage width.
numeric_cols = df[features].select_dtypes(include="number").columns.tolist()

# Make it visible when a requested feature is filtered out, rather than
# letting the feature count shrink silently.
_non_numeric = [col for col in features if col not in numeric_cols]
if _non_numeric:
    print(f"[Data] warning: non-numeric features ignored: {_non_numeric}")

# Drop rows with a missing value in any used feature or in the target.
df = df.dropna(subset=numeric_cols + [target_col]).reset_index(drop=True)

X_raw = df[numeric_cols].astype(float).values
y = df[target_col].values
class_names = np.unique(y)

print(f"[Data] n={len(y)}, d={X_raw.shape[1]}, classes={class_names.size}")


[Data] n=32833, d=12, classes=6


In [4]:
# 2) Split: Train / Val / Test = 70 / 15 / 15

from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_raw,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)

# Stage 2: cut the held-out 30% in half -> validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)

print("train:", X_train.shape, "val:", X_val.shape, "test:", X_test.shape)


train: (22983, 12) val: (4925, 12) test: (4925, 12)


In [5]:
# 3) Scaling (fit on Train only, then transform Train / Val / Test)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # statistics come from the training split only

Xs_train, Xs_val, Xs_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [6]:
# 4) Dimensionality reducers (fit on Train only): PCA, LDA, AE

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# 4-1) PCA (keep components up to 95% cumulative explained variance)
pca = PCA(n_components=0.95, random_state=42)
pca.fit(Xs_train)
Xp_train, Xp_val, Xp_test = (
    pca.transform(split) for split in (Xs_train, Xs_val, Xs_test)
)

# 4-2) LDA (supervised; at most n_classes - 1 components, clamped to [1, 10])
n_classes = class_names.size
lda_dim = min(10, max(1, n_classes - 1))
lda = LDA(n_components=lda_dim)
lda.fit(Xs_train, y_train)
Xl_train, Xl_val, Xl_test = (
    lda.transform(split) for split in (Xs_train, Xs_val, Xs_test)
)

# 4-3) Autoencoder (Keras) - trained on Train only, monitored on Val.
#      The encoder output (z) is used as the embedding.
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling - and therefore the AE embeddings and every score computed
# from them - are repeatable across runs, in line with the random_state=42
# used by the other steps of this notebook.
tf.keras.utils.set_random_seed(42)

input_dim   = Xs_train.shape[1]
encoding_dim = 8  # (tuning candidates: 6~16)

# Symmetric dense autoencoder: input -> 16 -> bottleneck -> 16 -> input.
inp = layers.Input(shape=(input_dim,))
h1  = layers.Dense(16, activation='relu')(inp)
z   = layers.Dense(encoding_dim, activation='relu', name="bottleneck")(h1)
h2  = layers.Dense(16, activation='relu')(z)
out = layers.Dense(input_dim, activation='linear')(h2)

autoencoder = models.Model(inputs=inp, outputs=out)
autoencoder.compile(optimizer='adam', loss='mse')

# Stop once validation reconstruction loss has not improved for 10 epochs and
# roll back to the best weights seen.
es = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
autoencoder.fit(
    Xs_train, Xs_train,
    validation_data=(Xs_val, Xs_val),
    epochs=200, batch_size=64, shuffle=True, verbose=0, callbacks=[es]
)

# The encoder shares its layers with the trained autoencoder; its output is
# the bottleneck activation.
encoder = models.Model(inputs=inp, outputs=z)
Xa_train = encoder.predict(Xs_train, verbose=0)
Xa_val   = encoder.predict(Xs_val,   verbose=0)
Xa_test  = encoder.predict(Xs_test,  verbose=0)


2025-11-04 08:25:59.292967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762244759.305117   12594 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762244759.308777   12594 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-04 08:25:59.320922: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-04 08:26:01.093361: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call t

In [7]:
# 5) Build the combined input sets (one Train/Val/Test triple per combination)

_base_splits = (Xs_train, Xs_val, Xs_test)
_embeddings = {
    "pca": (Xp_train, Xp_val, Xp_test),
    "lda": (Xl_train, Xl_val, Xl_test),
    "ae":  (Xa_train, Xa_val, Xa_test),
}


def _concat_with_base(*extras):
    """Column-stack the scaled base features with the named embeddings, per split."""
    groups = [_base_splits] + [_embeddings[name] for name in extras]
    return tuple(np.c_[parts] for parts in zip(*groups))


# "base" reuses the scaled arrays as they are; every other entry appends the
# listed embeddings to the right of the base columns, in the listed order.
Xsets = {"base": _base_splits}
for _extras in (
    ("pca",),
    ("lda",),
    ("ae",),
    ("pca", "lda"),
    ("pca", "ae"),
    ("lda", "ae"),
    ("pca", "lda", "ae"),
):
    Xsets["_".join(("base",) + _extras)] = _concat_with_base(*_extras)

ACTIVE_SET = "base_lda_ae"  # <- change only this to run another experiment (e.g. 'base', 'base_pca', ...)

Z_train, Z_val, Z_test = Xsets[ACTIVE_SET]
print(f"[Xset] {ACTIVE_SET}: train={Z_train.shape}, val={Z_val.shape}, test={Z_test.shape}")

[Xset] base_lda_ae: train=(22983, 25), val=(4925, 25), test=(4925, 25)


In [8]:
# --------------------------------
# 6) Model training / tuning (this report stops at Validation)  <- SVM (RBF, manual search)
# --------------------------------
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# Small grid for a quick search (add or remove candidates as needed)
C_list     = [0.1, 1, 10]
gamma_list = ["scale", 0.01, 0.001]   # 'scale' is the data-dependent default

best_cfg, best_val_acc, best_model = None, -1.0, None

# Walk the grid in C-major order; the strict ">" keeps the first configuration
# on ties.
for C, gamma in [(c, g) for c in C_list for g in gamma_list]:
    svc = SVC(kernel="rbf", C=C, gamma=gamma, cache_size=1000)
    svc.fit(Z_train, y_train)
    val_acc = svc.score(Z_val, y_val)
    print(f"C={C}, gamma={gamma} → val acc={val_acc:.4f}")
    if val_acc > best_val_acc:
        best_cfg, best_val_acc, best_model = (C, gamma), val_acc, svc

print(f"\n[{ACTIVE_SET}]")
print(f"[선택] 제일 좋은 조합: C={best_cfg[0]}, gamma={best_cfg[1]} (val acc={best_val_acc:.4f})")

# ---- Validation metrics / confusion matrix / report ----
val_pred = best_model.predict(Z_val)
val_acc  = accuracy_score(y_val, val_pred)
val_f1   = f1_score(y_val, val_pred, average="macro")
print(f"[Validation] acc={val_acc:.4f}, macroF1={val_f1:.4f}")

# np.unique already returns the distinct labels in ascending order.
labels = np.unique(np.concatenate([y_train, y_val]))
cm = confusion_matrix(y_val, val_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{lab}" for lab in labels],
    columns=[f"pred_{lab}" for lab in labels],
)
print("\n=== Confusion matrix (Validation, SVM-RBF) ===")
print(cm_df)

print("\n=== Classification report (Validation) ===")
print(classification_report(y_val, val_pred, digits=4))

C=0.1, gamma=scale → val acc=0.4973
C=0.1, gamma=0.01 → val acc=0.4904
C=0.1, gamma=0.001 → val acc=0.4575
C=1, gamma=scale → val acc=0.5149
C=1, gamma=0.01 → val acc=0.5076
C=1, gamma=0.001 → val acc=0.4743
C=10, gamma=scale → val acc=0.5316
C=10, gamma=0.01 → val acc=0.5226
C=10, gamma=0.001 → val acc=0.4899

[base_lda_ae]
[선택] 제일 좋은 조합: C=10, gamma=scale (val acc=0.5316)
[Validation] acc=0.5316, macroF1=0.5253

=== Confusion matrix (Validation, SVM-RBF) ===
        pred_0  pred_1  pred_2  pred_3  pred_4  pred_5
true_0     311      73     141      96      85     120
true_1      57     575      15      71      78      66
true_2      75      12     524      67      21      44
true_3     113     166      79     344      78      34
true_4     150     145      35      76     289      79
true_5     151      65      37      37      41     575

=== Classification report (Validation) ===
              precision    recall  f1-score   support

           0     0.3629    0.3765    0.3696       8

In [9]:

# --------------------------------
# (선택) 최종 평가: 보고서 범위 밖, 마지막에 한 번만 Test 사용
# --------------------------------
# test_pred = best_model.predict(Z_test)
# cm = confusion_matrix(y_test, test_pred)
# print("\n=== Confusion matrix (Test) ===")
# print(cm)
# cm_df = pd.DataFrame(cm,
#     index=[f"true_{c}" for c in np.unique(y)],
#     columns=[f"pred_{c}" for c in np.unique(y)]
# )
# print(cm_df)