In [None]:
import numpy as np
import pandas as pd

# 0) Load the data

CSV_PATH = "dataset/spotify_songs_with_genre_int.csv"
URL = "https://raw.githubusercontent.com/hyeonx3/ML_Project/main/dataset/spotify_songs_with_genre_int.csv"

# Prefer the local copy and fall back to the hosted CSV only when the local
# file cannot be opened (missing, unreadable, ...). Catching OSError instead of
# using a bare `except:` keeps KeyboardInterrupt and genuine parse errors from
# being silently masked by the fallback.
try:
    df = pd.read_csv(CSV_PATH)
except OSError:
    df = pd.read_csv(URL)

In [None]:

# 1) Choose the feature columns and the target (numeric columns only)

target_col = "genre_int"
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration_ms"
]
# Numeric filter: "number" matches every numeric dtype, so a feature stored as
# int32/float32 is not dropped just because of its bit width.
numeric_cols = df[features].select_dtypes(include="number").columns.tolist()

# Make any feature rejected by the filter visible instead of losing it silently.
skipped_features = [col for col in features if col not in numeric_cols]
if skipped_features:
    print(f"[Data] skipped non-numeric features: {skipped_features}")

# Drop rows with a missing value in any used feature or in the target.
df = df.dropna(subset=numeric_cols + [target_col]).reset_index(drop=True)

X_raw = df[numeric_cols].astype(float).values
y = df[target_col].values
class_names = np.unique(y)

print(f"[Data] n={len(y)}, d={X_raw.shape[1]}, classes={class_names.size}")


[Data] n=32833, d=12, classes=6


In [None]:
# 2) Split: Train / Val / Test = 70 / 15 / 15

from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the rows (stratified by class) as a temporary pool.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_raw,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)

# Stage 2: cut that pool in half (again stratified) into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)

print("train:", X_train.shape, "val:", X_val.shape, "test:", X_test.shape)


train: (22983, 12) val: (4925, 12) test: (4925, 12)


In [None]:
# 3) Scaling (fit on Train, then only transform Val/Test)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # scaling statistics come from the training split only

# Apply the same fitted scaler to each split.
Xs_train, Xs_val, Xs_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [None]:
# 4) 차원 축소기 (Train에만 fit) : PCA, LDA, AE

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# 4-1) PCA (95% cumulative explained variance), fitted on the scaled Train split
pca = PCA(n_components=0.95, random_state=42)
pca.fit(Xs_train)

# Project every split with the Train-fitted PCA.
Xp_train, Xp_val, Xp_test = (
    pca.transform(split) for split in (Xs_train, Xs_val, Xs_test)
)

# 4-2) LDA (supervised; max components = number of classes - 1)
n_classes = class_names.size

# Clamp the requested dimension into the range [1, 10].
lda_dim = min(10, n_classes - 1)
if lda_dim < 1:
    lda_dim = 1

lda = LDA(n_components=lda_dim)
lda.fit(Xs_train, y_train)  # labels come from the training split only

# Project every split with the Train-fitted LDA.
Xl_train, Xl_val, Xl_test = (
    lda.transform(split) for split in (Xs_train, Xs_val, Xs_test)
)

# 4-3) Autoencoder (Keras) — trained on Train only, monitored on Val.
#      The encoder output (z) is used as the embedding.
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable across "Restart & Run All", matching the
# random_state=42 used by the other steps of this notebook.
# NOTE(review): requires a TensorFlow version that ships
# tf.keras.utils.set_random_seed; bit-exact results on GPU may additionally
# need deterministic ops — confirm for the target environment.
tf.keras.utils.set_random_seed(42)

input_dim   = Xs_train.shape[1]
encoding_dim = 8  # (tuning candidates: 6~16)

# Symmetric dense autoencoder: input -> 16 -> bottleneck -> 16 -> input.
inp = layers.Input(shape=(input_dim,))
h1  = layers.Dense(16, activation='relu')(inp)
z   = layers.Dense(encoding_dim, activation='relu', name="bottleneck")(h1)
h2  = layers.Dense(16, activation='relu')(z)
out = layers.Dense(input_dim, activation='linear')(h2)

autoencoder = models.Model(inputs=inp, outputs=out)
autoencoder.compile(optimizer='adam', loss='mse')

# Stop once the validation reconstruction loss has not improved for 10 epochs
# and roll back to the best weights seen.
es = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
autoencoder.fit(
    Xs_train, Xs_train,
    validation_data=(Xs_val, Xs_val),
    epochs=200, batch_size=64, shuffle=True, verbose=0, callbacks=[es]
)

# The encoder shares the trained layers up to the bottleneck.
encoder = models.Model(inputs=inp, outputs=z)
Xa_train = encoder.predict(Xs_train, verbose=0)
Xa_val   = encoder.predict(Xs_val,   verbose=0)
Xa_test  = encoder.predict(Xs_test,  verbose=0)


In [None]:
# 5) Build the combined input sets (separately for Train / Val / Test)

# Each entry holds one representation as a (train, val, test) triple.
_base_splits = (Xs_train, Xs_val, Xs_test)
_extra_splits = {
    "pca": (Xp_train, Xp_val, Xp_test),
    "lda": (Xl_train, Xl_val, Xl_test),
    "ae":  (Xa_train, Xa_val, Xa_test),
}

# Which extra representations are appended (column-wise, in this order) to the
# scaled base features for every named set.
_recipes = [
    (),
    ("pca",),
    ("lda",),
    ("ae",),
    ("pca", "lda"),
    ("pca", "ae"),
    ("lda", "ae"),
    ("pca", "lda", "ae"),
]

Xsets = {}
for _recipe in _recipes:
    _set_name = "_".join(("base",) + _recipe)
    if not _recipe:
        # Plain base set: reuse the scaled arrays as they are.
        Xsets[_set_name] = _base_splits
        continue
    Xsets[_set_name] = tuple(
        np.c_[(_base_part,) + tuple(_extra_splits[_tag][_pos] for _tag in _recipe)]
        for _pos, _base_part in enumerate(_base_splits)
    )

ACTIVE_SET = "base_lda_ae"  # <- change only this to experiment (e.g. 'base', 'base_pca', ...)

Z_train, Z_val, Z_test = Xsets[ACTIVE_SET]
print(f"[Xset] {ACTIVE_SET}: train={Z_train.shape}, val={Z_val.shape}, test={Z_test.shape}")

[Xset] base_lda_ae: train=(22983, 25), val=(4925, 25), test=(4925, 25)


In [None]:
# 6) Model training / tuning (this report goes up to Validation only)
#    * Z_* is already scaled/combined, so no additional scaler is applied here
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# (A) Multinomial Logistic Regression
# The explicit `multi_class="multinomial"` argument is deprecated in recent
# scikit-learn releases; with the lbfgs solver the default setting already fits
# a multinomial model for a multiclass target, so the argument is dropped.
# NOTE(review): assumes a scikit-learn version whose default is "auto" — confirm.
clf = LogisticRegression(max_iter=1000, solver="lbfgs")
param_grid = {"C": [0.01, 0.1, 1, 3, 10, 100]}

# NOTE(review): the LDA / autoencoder columns inside Z_train were fitted on the
# whole training split, so the inner CV folds are not strictly unseen whenever
# ACTIVE_SET includes them; CV scores may be optimistic.
gs = GridSearchCV(clf, param_grid, scoring="f1_macro", cv=cv, n_jobs=-1, refit=True, verbose=0)
gs.fit(Z_train, y_train)

# refit=True means best_estimator_ has been retrained on all of Z_train.
best_clf = gs.best_estimator_
val_pred = best_clf.predict(Z_val)
val_acc  = accuracy_score(y_val, val_pred)
val_f1   = f1_score(y_val, val_pred, average="macro")

print(f"\n[Validation] set={ACTIVE_SET}")
print(f"  best C = {gs.best_params_['C']}")
# Report the headline validation metrics that were previously computed but never shown.
print(f"  accuracy = {val_acc:.4f}")
print(f"  macro-F1 = {val_f1:.4f}")

# Every class seen in Train or Val; np.unique already returns sorted values.
labels = np.unique(np.concatenate([y_train, y_val]))
cm = confusion_matrix(y_val, val_pred, labels=labels)

# Label rows with the true class and columns with the predicted class.
row_names = [f"true_{c}" for c in labels]
col_names = [f"pred_{c}" for c in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)

print("\n=== Confusion matrix (Validation, table) ===")
print(cm_df)

# --- Classification Report (Validation) ---
print("\n=== Classification report (Validation) ===")
val_report = classification_report(y_val, val_pred, digits=4)
print(val_report)





[Validation] set=base_lda_ae
  best C = 10

=== Confusion matrix (Validation, table) ===
        pred_0  pred_1  pred_2  pred_3  pred_4  pred_5
true_0     208      66     167      94     130     161
true_1      59     490      18      89     105     101
true_2      44       9     481      81      37      91
true_3      95     173     111     281     114      40
true_4     118     127      65      71     274     119
true_5     105      82      51      39      64     565

=== Classification report (Validation) ===
              precision    recall  f1-score   support

           0     0.3307    0.2518    0.2859       826
           1     0.5174    0.5684    0.5417       862
           2     0.5386    0.6474    0.5880       743
           3     0.4290    0.3452    0.3826       814
           4     0.3785    0.3540    0.3658       774
           5     0.5246    0.6236    0.5698       906

    accuracy                         0.4668      4925
   macro avg     0.4531    0.4651    0.4557    