In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Global plot styling: seaborn "whitegrid" theme with the "muted" palette, plus
# a default size for every figure that does not request its own.
sns.set(style="whitegrid", palette="muted")
plt.rcParams["figure.figsize"] = (10, 6)

# Load the raw dataset; the path is resolved against the working directory.
df = pd.read_csv("spotify_dataset.csv")
print("First 5 rows of dataset:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
df.info()

# Discard every row that has a missing value in any column.
df = df.dropna()

# Integer-encode the playlist label columns when the dataset provides them.
# The fitted encoders stay bound to le_genre / le_name.
# NOTE(review): the *_encoded columns are numeric, so the dtype-based selection
# below also passes them into scaling (and everything built on X_scaled).
# LabelEncoder codes are arbitrary integers -- confirm this is intended.
if "playlist_genre" in df.columns:
    le_genre = LabelEncoder().fit(df["playlist_genre"])
    df["playlist_genre_encoded"] = le_genre.transform(df["playlist_genre"])

if "playlist_name" in df.columns:
    le_name = LabelEncoder().fit(df["playlist_name"])
    df["playlist_name_encoded"] = le_name.transform(df["playlist_name"])

# Feature matrix: every numeric column present in df at this point.
num_features = df.select_dtypes(include=[np.number]).columns
X = df[num_features]

# Standardize the features (StandardScaler defaults: zero mean, unit variance).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Bar chart of row counts per playlist genre, most frequent genre first.
if "playlist_genre" in df.columns:
    genre_order = df["playlist_genre"].value_counts().index
    plt.figure(figsize=(8, 5))
    genre_ax = sns.countplot(
        data=df,
        x="playlist_genre",
        order=genre_order,
        palette="Set2",
    )
    genre_ax.set_title("Distribution of Playlist Genres")
    # Tilt the genre labels on the x axis.
    plt.setp(genre_ax.get_xticklabels(), rotation=30)
    plt.show()

# Annotated heatmap of the pairwise correlations between the numeric features.
feature_corr = df[num_features].corr()
plt.figure(figsize=(12, 8))
corr_ax = sns.heatmap(feature_corr, annot=True, fmt=".2f", cmap="coolwarm")
corr_ax.set_title("Correlation Matrix of Numerical Features")
plt.show()

# Scan candidate cluster counts and record two quality measures for each k:
# inertia (within-cluster sum of squared distances) and the mean silhouette
# score of the resulting labelling.
inertia = []
silhouette = []
K_range = range(2, 11)

for k in K_range:
    # n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
    # to "auto" (and 1.2/1.3 emit a FutureWarning when it is left unset), so
    # without it the results depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    # Computed over all rows; silhouette relies on pairwise distances, so its
    # cost grows quadratically with the number of rows.
    silhouette.append(silhouette_score(X_scaled, kmeans.labels_))

# Elbow plot: inertia as a function of k.
plt.figure()
plt.plot(K_range, inertia, marker="o")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal k")
plt.show()

# Silhouette plot: mean silhouette score as a function of k.
plt.figure()
plt.plot(K_range, silhouette, marker="o", color="purple")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Analysis for Clusters")
plt.show()

# Final model. k is hard-coded rather than derived from the curves above.
# NOTE(review): confirm 4 against the elbow / silhouette plots for this data.
k_optimal = 4
# Same n_init as the scan so the final fit matches the k=4 scan configuration.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# Project the scaled features onto their first two principal components and
# store the coordinates on df so the plots below can reference them by name.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df["PCA1"], df["PCA2"] = X_pca[:, 0], X_pca[:, 1]

# Projection coloured by cluster assignment.
plt.figure(figsize=(10, 6))
cluster_ax = sns.scatterplot(
    data=df,
    x="PCA1",
    y="PCA2",
    hue="Cluster",
    palette="Set1",
    s=60,
    alpha=0.8,
)
cluster_ax.set_title("Clusters of Songs (PCA 2D Projection)")
plt.show()

# Same projection coloured by each playlist label column that exists, with the
# marker style encoding the cluster. Entries: (column, title, extra kwargs).
# NOTE(review): the legend gets one entry per distinct hue value; if
# playlist_name has many distinct values the legend may swamp the plot.
label_views = (
    ("playlist_genre", "Clusters Colored by Playlist Genre", {"palette": "tab10"}),
    ("playlist_name", "Clusters Colored by Playlist Name", {}),
)
for label_col, plot_title, extra_kwargs in label_views:
    if label_col not in df.columns:
        continue
    plt.figure(figsize=(10, 6))
    label_ax = sns.scatterplot(
        data=df,
        x="PCA1",
        y="PCA2",
        hue=label_col,
        style="Cluster",
        s=70,
        **extra_kwargs,
    )
    label_ax.set_title(plot_title)
    plt.show()

# Number of rows assigned to each cluster.
print("\nCluster Counts:", df["Cluster"].value_counts(), sep="\n")

# Cluster x genre contingency table (rows: cluster, columns: genre), shown as
# text and as a stacked bar chart.
if "playlist_genre" in df.columns:
    pair_counts = df.groupby(["Cluster", "playlist_genre"]).size()
    cluster_summary = pair_counts.unstack(fill_value=0)
    print("\nCluster Summary by Genre:", cluster_summary, sep="\n")

    summary_ax = cluster_summary.plot(kind="bar", stacked=True, colormap="viridis")
    summary_ax.set_title("Cluster Distribution across Playlist Genres")
    summary_ax.set_xlabel("Cluster")
    summary_ax.set_ylabel("Count")
    plt.show()

# Persist the full frame, including the columns added above, without the index.
output_path = "spotify_clustered_output.csv"
df.to_csv(output_path, index=False)
print(f"\nFinal clustered dataset saved as {output_path}")
