In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load both datasets: the multi-class file and the two-genre (binary) file.
multi_data = pd.read_csv("data.csv")
binary_data = pd.read_csv("data_2genre.csv")

# Feature columns fed to the models: eight summary descriptors followed by
# the twenty MFCC columns, mfcc1 through mfcc20.
features = [
    "tempo",
    "beats",
    "chroma_stft",
    "rmse",
    "spectral_centroid",
    "spectral_bandwidth",
    "rolloff",
    "zero_crossing_rate",
    *(f"mfcc{idx}" for idx in range(1, 21)),
]

# Separate each dataset into its feature matrix (X) and its label column (y).
# Multi-class dataset first, then the binary one.
X_multi, y_multi = multi_data[features], multi_data["label"]
X_binary, y_binary = binary_data[features], binary_data["label"]


In [5]:
# Split each dataset into training and test sets: 20% is held out for testing
# and the seed is fixed so the split is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42}

X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, **split_options
)
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_binary, y_binary, **split_options
)

# Scale the features with StandardScaler. The statistics are learned from the
# training split only and then applied to the matching test split.
# NOTE: one scaler instance is refit for the binary dataset, so after this
# cell `scaler` holds the statistics of the binary training split.
scaler = StandardScaler()

scaler.fit(X_train_multi)
X_train_multi = scaler.transform(X_train_multi)
X_test_multi = scaler.transform(X_test_multi)

scaler.fit(X_train_binary)
X_train_binary = scaler.transform(X_train_binary)
X_test_binary = scaler.transform(X_test_binary)

In [11]:
print("------------------------KMeans----------------------")
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, silhouette_score

# Cluster the scaled multi-class training features into 10 groups; the seed
# keeps the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(X_train_multi)

# Evaluate: NMI compares the cluster assignments with the true labels, while
# the silhouette score is computed from the features and assignments alone.
nmi_score = normalized_mutual_info_score(y_train_multi, labels)
sil_score = silhouette_score(X_train_multi, labels)

report_lines = (
    "KMeans 分10群結果:",
    f"Normalized Mutual Info Score: {nmi_score:.4f}",
    f"Silhouette Score: {sil_score:.4f}",
    "--------------------KMeans--------------------",
)
for report_line in report_lines:
    print(report_line)


------------------------KMeans----------------------
KMeans 分10群結果:
Normalized Mutual Info Score: 0.3033
Silhouette Score: 0.1112
--------------------KMeans--------------------




In [36]:
print("------------------------AgglomerativeClustering----------------------")
from sklearn.cluster import AgglomerativeClustering

# Build the agglomerative (hierarchical) clustering model.
# n_clusters must be 10 to match the report header printed below and the
# KMeans run it is compared against. With a single cluster, silhouette_score
# raises ValueError because it needs at least 2 distinct labels.
agglo = AgglomerativeClustering(n_clusters=10)
agglo.fit(X_train_multi)

# Evaluate the cluster assignments of the training samples: NMI against the
# true labels, silhouette from the features and assignments alone.
labels = agglo.labels_
nmi_score = normalized_mutual_info_score(y_train_multi, labels)
sil_score = silhouette_score(X_train_multi, labels)

print("AgglomerativeClustering 分10群結果:")
print(f"Normalized Mutual Info Score: {nmi_score:.4f}")
print(f"Silhouette Score: {sil_score:.4f}")
print("--------------------AgglomerativeClustering--------------------")


------------------------AgglomerativeClustering----------------------
AgglomerativeClustering 分10群結果:
Normalized Mutual Info Score: 0.3107
Silhouette Score: 0.1004
--------------------AgglomerativeClustering--------------------


In [13]:
print("------------------------DBSCAN----------------------")
from sklearn.cluster import DBSCAN

# Build DBSCAN with hand-tuned eps / min_samples.
# Original author's tuning note: a min_samples of 4 or more caused every
# sample to be labelled as noise (only 2 worked) unless eps was raised.
dbscan = DBSCAN(eps=2, min_samples=10)
dbscan.fit(X_train_multi)

# Cluster assignment of every training sample.
labels = dbscan.labels_

# DBSCAN marks noise points with the label -1; leave them out of the evaluation.
valid_mask = labels != -1

# Number of distinct clusters among the non-noise samples.
n_clusters_found = len(set(labels[valid_mask]))

if n_clusters_found == 0:
    # Every sample was flagged as noise, so there is nothing to score.
    print("所有樣本都被標記為噪音，無法進行有效評估")
else:
    nmi_score = normalized_mutual_info_score(y_train_multi[valid_mask], labels[valid_mask])

    print("DBSCAN 分群結果:")
    print(f"Normalized Mutual Info Score: {nmi_score:.4f}")

    # silhouette_score raises ValueError unless there are at least 2 distinct
    # labels, so only compute it when DBSCAN found more than one cluster.
    if n_clusters_found >= 2:
        sil_score = silhouette_score(X_train_multi[valid_mask], labels[valid_mask])
        print(f"Silhouette Score: {sil_score:.4f}")
    else:
        print("有效群集少於 2 個，無法計算 Silhouette Score")

print("--------------------DBSCAN--------------------")


------------------------DBSCAN----------------------
DBSCAN 分群結果:
Normalized Mutual Info Score: 0.1999
Silhouette Score: 0.3639
--------------------DBSCAN--------------------
