In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Local copy of the dataset, relative to the notebook's working directory.
CSV_PATH = "dataset/spotify_songs_with_genre_int.csv"
# Remote fallback (presumably the same file, hosted on GitHub — confirm it is kept in sync).
URL = "https://raw.githubusercontent.com/hyeonx3/ML_Project/main/dataset/spotify_songs_with_genre_int.csv"

try:
    df = pd.read_csv(CSV_PATH)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # Fall back only when the local copy is missing or unreadable. A bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    print(f"Could not read {CSV_PATH!r} ({type(exc).__name__}: {exc}); loading from URL instead.")
    df = pd.read_csv(URL)

In [7]:
# Label column to predict.
target_col = "genre_int"

# Candidate input columns; only the numeric ones are kept below.
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
    "tempo", "duration_ms",
]

# Select by the generic "number" kind rather than the exact dtypes int64/float64,
# so numeric columns stored at another width (int32, float32, ...) are not
# silently excluded from the model inputs.
numeric_cols = df[features].select_dtypes(include="number").columns.tolist()

# Make any filtered-out feature visible instead of dropping it silently.
dropped_cols = [col for col in features if col not in numeric_cols]
if dropped_cols:
    print("non-numeric features dropped:", dropped_cols)

X = df[numeric_cols]
y = df[target_col]

In [12]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Stage 1: stratified 70 / 30 split into train and a temporary hold-out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: halve the hold-out (again stratified), giving 15% validation and 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Report the (rows, columns) shape of each partition.
for tag, part in (("train:", X_train), ("val  :", X_val), ("test :", X_test)):
    print(tag, part.shape)


# Scaling lives inside the pipeline so that, during cross-validation, the scaler
# is fit only on the training portion of each fold.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    # `multi_class="multinomial"` is deliberately not passed: the argument is
    # deprecated in recent scikit-learn releases, and with the lbfgs solver a
    # multiclass target is already fit as a multinomial model by default.
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=1000
    ))
])

param_grid = {
    "clf__C": [0.01, 0.1, 1, 3, 10, 100],        # regularization strength (C is its inverse)
    # optional: "clf__class_weight": [None, "balanced"]
}

# Shuffled, stratified 5-fold splitter with a fixed seed for repeatable folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",     # macro-F1: suited to imbalanced multiclass problems
    cv=cv,
    n_jobs=-1,
    refit=True,             # refit the best configuration on the whole training split
    verbose=0
)

# Fit: grid search with cross-validation inside the training split, then
# evaluate the refit best estimator on the validation split.
gs.fit(X_train, y_train)
best_model = gs.best_estimator_

val_pred = best_model.predict(X_val)
val_acc  = accuracy_score(y_val, val_pred)
val_f1   = f1_score(y_val, val_pred, average="macro")

# Use every label seen in train or validation so the confusion matrix keeps a
# row/column for a class even if it is absent from the validation split.
labels = np.unique(np.r_[y_train, y_val])

cm = confusion_matrix(y_val, val_pred, labels=labels)

# Labelled table: rows are true classes, columns are predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{l}" for l in labels],
    columns=[f"pred_{l}" for l in labels]
)
print(f"[CV] best C      : {gs.best_params_['clf__C']}")
print(f"[CV] best f1_macro: {gs.best_score_:.4f}")

# The validation metrics were previously computed but never displayed.
print(f"[VAL] accuracy   : {val_acc:.4f}")
print(f"[VAL] f1_macro   : {val_f1:.4f}")

print("\n=== Confusion matrix (Validation, table) ===")
print(cm_df)

print("\n=== Classification report (Validation) ===")
print(classification_report(y_val, val_pred, digits=4))


train: (22983, 12)
val  : (4925, 12)
test : (4925, 12)




[CV] best C      : 10

=== Confusion matrix (Validation, table) ===
        pred_0  pred_1  pred_2  pred_3  pred_4  pred_5
true_0     210      56     167     101     131     161
true_1      63     457      19     117     103     103
true_2      46       9     480      79      38      91
true_3     100     150     112     302     109      41
true_4     114     119      65      80     276     120
true_5     106      81      53      44      70     552

=== Classification report (Validation) ===
              precision    recall  f1-score   support

           0     0.3286    0.2542    0.2867       826
           1     0.5241    0.5302    0.5271       862
           2     0.5357    0.6460    0.5857       743
           3     0.4177    0.3710    0.3930       814
           4     0.3796    0.3566    0.3678       774
           5     0.5169    0.6093    0.5593       906

    accuracy                         0.4623      4925
   macro avg     0.4504    0.4612    0.4533      4925
weighted avg   