In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [None]:
# Location of the dataset: a path relative to the working directory, and a
# hosted copy of the same file used as a fallback.
CSV_PATH = "dataset/spotify_songs_with_genre_int.csv"
URL = "https://raw.githubusercontent.com/hyeonx3/ML_Project/main/dataset/spotify_songs_with_genre_int.csv"

# Prefer the local copy. Fall back to the hosted copy only when the local file
# does not exist; any other failure (malformed CSV, interrupted kernel, ...)
# is allowed to surface instead of being silently masked by a download.
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    df = pd.read_csv(URL)

In [None]:
# Name of the column used as the prediction target.
target_col = "genre_int"

# Candidate input columns.
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
    "tempo", "duration_ms",
]

# Keep only the candidates whose dtype is int64 or float64; a candidate stored
# with any other dtype is left out of the model inputs.
numeric_cols = list(
    df[features].select_dtypes(include=["int64", "float64"]).columns
)

# Feature matrix and target vector.
X = df[numeric_cols]
y = df[target_col]

In [5]:
# 1) Split (70/15/15), stratified on the target at both stages:
#    first hold out 30% of the rows, then cut that hold-out in half to obtain
#    the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Report the shape of each partition.
for split_label, split_X in (("train:", X_train), ("val  :", X_val), ("test :", X_test)):
    print(split_label, split_X.shape)

# 2) Lightweight manual search over the RBF-SVM hyper-parameters.
#    Every (C, gamma) pair is fitted on the training split and scored on the
#    validation split; the test split is not used here.
C_list     = [0.1, 1, 10]
gamma_list = ["scale", 0.01, 0.001]   # 'scale' is the data-dependent default

best_cfg = None
best_val_acc = -1
best_model = None   # fitted pipeline of the best configuration seen so far

for C in C_list:
    for gamma in gamma_list:
        # The scaler is part of the pipeline, so it is fitted together with
        # the classifier on X_train only.
        model = Pipeline([
            ("scaler", StandardScaler()),
            ("svc", SVC(C=C, gamma=gamma, kernel="rbf", cache_size=1000))
        ])
        model.fit(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        print(f"C={C}, gamma={gamma} → val acc={val_acc:.4f}")

        # Strict ">" keeps the earliest configuration when accuracies tie.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_cfg = (C, gamma)
            best_model = model

print(f"\n[선택] 제일 좋은 조합: C={best_cfg[0]}, gamma={best_cfg[1]} (val acc={best_val_acc:.4f})")

# 3) Model for the best combination (trained on Train).
#    `best_model` is the pipeline that was already fitted on X_train with the
#    winning (C, gamma) during the search. Fitting a fresh pipeline with the
#    same parameters on the same data would repeat an expensive SVC fit and
#    yield the same model (SVC only uses randomness when probability=True),
#    so the fitted pipeline is reused instead of being trained again.

# ---- Added section: validation metrics / confusion matrix / report ----
val_pred = best_model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
val_f1 = f1_score(y_val, val_pred, average="macro")
print(f"[Validation] acc={val_acc:.4f}, macroF1={val_f1:.4f}")

# Every class label seen in the train or validation targets, in ascending
# order (np.unique already returns its result sorted).
labels = np.unique(np.concatenate([y_train, y_val]))

# Confusion matrix with rows = true label and columns = predicted label.
cm = confusion_matrix(y_val, val_pred, labels=labels)
row_names = [f"true_{lab}" for lab in labels]
col_names = [f"pred_{lab}" for lab in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
print("\n=== Confusion matrix (Validation, SVM-RBF) ===", cm_df, sep="\n")

# Per-class precision / recall / F1 on the validation split.
print(
    "\n=== Classification report (Validation) ===",
    classification_report(y_val, val_pred, digits=4),
    sep="\n",
)


train: (22983, 12)
val  : (4925, 12)
test : (4925, 12)
C=0.1, gamma=scale → val acc=0.5145
C=0.1, gamma=0.01 → val acc=0.4812
C=0.1, gamma=0.001 → val acc=0.4372
C=1, gamma=scale → val acc=0.5356
C=1, gamma=0.01 → val acc=0.5046
C=1, gamma=0.001 → val acc=0.4731
C=10, gamma=scale → val acc=0.5362
C=10, gamma=0.01 → val acc=0.5194
C=10, gamma=0.001 → val acc=0.4808

[선택] 제일 좋은 조합: C=10, gamma=scale (val acc=0.5362)
[Validation] acc=0.5362, macroF1=0.5318

=== Confusion matrix (Validation, SVM-RBF) ===
        pred_0  pred_1  pred_2  pred_3  pred_4  pred_5
true_0     318      68     127     100      86     127
true_1      58     568      16      93      75      52
true_2      87      15     532      54      17      38
true_3     138     146      77     343      74      36
true_4     157     135      38      67     304      73
true_5     152      63      43      30      42     576

=== Classification report (Validation) ===
              precision    recall  f1-score   support

          