In [None]:
!pip install lightgbm

In [None]:
!pip install catboost

# Spotify 장르 분류: MLP + Gradient Boosting

요약: Spotify의 오디오·메타데이터(수치 및 범주형)를 이용해 6개 장르를 분류합니다. 실험 흐름은 전처리 → 모델 학습(MLP, CatBoost, LightGBM) → 하이퍼파라미터 탐색 → 피처 중요도 해석입니다.

주요 사항:
- 데이터: dataset/spotify_songs_with_genre_int.csv (결측치 제거 후 사용)
- 목표 지표: Accuracy, Macro‑F1 (클래스 불균형 고려)
- 재현성: stratified split 및 random_state 고정

In [2]:
# === 1.1 필요 모듈 임포트 ===
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging

# Ignore FutureWarning, UserWarning and ConvergenceWarning for the rest of the session.
# NOTE(review): ignoring UserWarning globally can also hide genuine problems.
for _ignored_category in (FutureWarning, UserWarning, ConvergenceWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Only let messages of level ERROR or above through the 'lightgbm' logger.
logging.getLogger('lightgbm').setLevel(logging.ERROR)

In [3]:
# === 1.2 Load dataset ===
ROOT = Path("dataset")

# CSV locations under ROOT; only the integer-labelled file is read in this cell.
PATH_NO_LABEL = ROOT.joinpath("spotify_songs.csv")
PATH_INT_LABEL = ROOT.joinpath("spotify_songs_with_genre_int.csv")

# Read the labelled CSV and drop every row that contains a missing value.
df = pd.read_csv(PATH_INT_LABEL).dropna()

# Per-class sample counts and total row count after dropping incomplete rows.
genre_counts = df["genre_int"].value_counts()
n_samples = len(df)

print("=== 클래스 분포 ===")
print(genre_counts)
print("\n=== 데이터셋 크기 ===")
print("총 샘플 수:", n_samples)

=== 클래스 분포 ===
genre_int
5    6043
1    5743
0    5507
3    5431
4    5153
2    4951
Name: count, dtype: int64

=== 데이터셋 크기 ===
총 샘플 수: 32828


## 데이터·특징 및 전처리

특징(예): track_popularity, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms

전처리 요약:
- 수치형: StandardScaler — train에 fit → test/val에 transform
- 범주형(key, mode): OneHotEncoder 적용 (모델에 따라 원본 범주 전달 가능)
- 분할: stratified train/test (80/20), random_state 42로 고정 (MLP는 early_stopping을 위해 train의 10%를 내부 검증용으로 사용)

모델별 유의사항:
- LightGBM/CatBoost: 범주형 처리 방식 확인(원핫 vs 카테고리 인자).
- MLP: 입력 특성은 반드시 스케일링 필요(특히 duration_ms, tempo 등). 배치 크기와 early_stopping으로 수렴 조절.

In [5]:
# === 2. Data preprocessing ===

# Feature columns: numeric ones are standardised, categorical ones are one-hot encoded.
num_cols = [
    "loudness",
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]
cat_cols = ["key", "mode"]

# Preprocessing pipeline: one (name, transformer, columns) step per column group.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Feature matrix and integer genre target.
X = df[[*num_cols, *cat_cols]]
y = df["genre_int"]

# Single stratified hold-out split: 20% of the rows go to the test set, seed fixed at 42.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the preprocessor on the training split only, then reuse it on the test split.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

n_features_after_prep = X_train_prep.shape[1]
print("전처리 후 특징 수:", n_features_after_prep)

전처리 후 특징 수: 24


In [None]:
# === 3. MLP Classifier ===
from sklearn.neural_network import MLPClassifier

# Three hidden layers (256 -> 128 -> 64) trained with Adam; early stopping is
# enabled with validation_fraction=0.1 and the seed is fixed for reproducibility.
mlp_params = dict(
    hidden_layer_sizes=(256, 128, 64),
    activation="relu",
    solver="adam",
    batch_size=128,
    learning_rate_init=1e-3,
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42,
)
mlp = MLPClassifier(**mlp_params)

# Train on the preprocessed training split and predict the held-out test split.
mlp.fit(X_train_prep, y_train)
y_pred_mlp = mlp.predict(X_test_prep)

# Compute all metrics first, then print them in a fixed order.
acc_mlp = accuracy_score(y_test, y_pred_mlp)
macro_f1_mlp = f1_score(y_test, y_pred_mlp, average="macro")
report_mlp = classification_report(y_test, y_pred_mlp)

print("=== Neural Network ===")
print("Accuracy:", acc_mlp)
print("Macro-F1:", macro_f1_mlp)
print("\nClassification Report:")
print(report_mlp)

=== Neural Network ===
Accuracy: 0.5324398416082851
Macro-F1: 0.521551497122449

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.33      0.35      1101
           1       0.53      0.71      0.61      1149
           2       0.63      0.72      0.67       990
           3       0.45      0.45      0.45      1086
           4       0.51      0.33      0.40      1031
           5       0.64      0.64      0.64      1209

    accuracy                           0.53      6566
   macro avg       0.52      0.53      0.52      6566
weighted avg       0.53      0.53      0.52      6566



In [None]:
# === 4. CatBoost Classifier ===
from catboost import CatBoostClassifier

# 600 boosting iterations at depth 7; verbose=100 logs progress every 100 iterations.
cat_params = dict(
    iterations=600,
    learning_rate=0.1,
    depth=7,
    random_seed=42,
    verbose=100,
)
cat = CatBoostClassifier(**cat_params)

# Train on the preprocessed (scaled + one-hot) training split, predict the test split.
cat.fit(X_train_prep, y_train)
y_pred_cat = cat.predict(X_test_prep)

# Compute all metrics first, then print them in a fixed order.
acc_cat = accuracy_score(y_test, y_pred_cat)
macro_f1_cat = f1_score(y_test, y_pred_cat, average="macro")
report_cat = classification_report(y_test, y_pred_cat)

print("=== CatBoost ===")
print("Accuracy:", acc_cat)
print("Macro-F1:", macro_f1_cat)
print("\nClassification Report:")
print(report_cat)

0:	learn: 1.7269491	total: 73.3ms	remaining: 43.9s
100:	learn: 1.1053914	total: 2.95s	remaining: 14.6s
200:	learn: 1.0123475	total: 5.27s	remaining: 10.5s
300:	learn: 0.9429896	total: 7.62s	remaining: 7.57s
400:	learn: 0.8810588	total: 9.66s	remaining: 4.8s
500:	learn: 0.8239679	total: 11.6s	remaining: 2.28s
599:	learn: 0.7760735	total: 13.6s	remaining: 0us
=== CatBoost ===
Accuracy: 0.559549192811453
Macro-F1: 0.5540502570771376

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.36      0.37      1101
           1       0.60      0.66      0.63      1149
           2       0.68      0.75      0.72       990
           3       0.49      0.47      0.48      1086
           4       0.48      0.42      0.45      1031
           5       0.67      0.69      0.68      1209

    accuracy                           0.56      6566
   macro avg       0.55      0.56      0.55      6566
weighted avg       0.55      0.56      0.56      6566


In [None]:
# === 5. LightGBM Classifier ===
from lightgbm import LGBMClassifier

# 400 trees with max_depth=-1; verbosity=-1 suppresses internal LightGBM messages
# and n_jobs=-1 is passed through for parallelism.
lgb_params = dict(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=-1,
    verbosity=-1,
    n_jobs=-1,
    random_state=42,
)
lgb = LGBMClassifier(**lgb_params)

# Train on the preprocessed training split and predict the held-out test split.
lgb.fit(X_train_prep, y_train)
y_pred_lgb = lgb.predict(X_test_prep)

# Compute all metrics first, then print them in a fixed order.
acc_lgb = accuracy_score(y_test, y_pred_lgb)
macro_f1_lgb = f1_score(y_test, y_pred_lgb, average="macro")
report_lgb = classification_report(y_test, y_pred_lgb)

print("=== LightGBM ===")
print("Accuracy:", acc_lgb)
print("Macro-F1:", macro_f1_lgb)
print("\nClassification Report:")
print(report_lgb)

=== LightGBM ===
Accuracy: 0.5619859884252209
Macro-F1: 0.5570200166425794

Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.37      0.37      1101
           1       0.60      0.67      0.63      1149
           2       0.69      0.76      0.72       990
           3       0.49      0.45      0.47      1086
           4       0.50      0.42      0.46      1031
           5       0.69      0.69      0.69      1209

    accuracy                           0.56      6566
   macro avg       0.56      0.56      0.56      6566
weighted avg       0.56      0.56      0.56      6566

