# In [4]:
import pandas as pd  # For data manipulation
import altair as alt  # For creating visualizations
from sklearn.cluster import KMeans  # For K-Means clustering
from sklearn.preprocessing import StandardScaler  # For scaling data

# Load the playlist table from disk.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle('Data/4TC_playlist.pkl')

# Columns fed to the clustering step.
FEATURE_COLUMNS = [
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
]
N_CLUSTERS = 4

df_numeric = df[FEATURE_COLUMNS].copy()

# Scale the numeric features so no single column dominates the
# distance computations because of its range.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)

# Fit KMeans and label every row with its cluster.
# n_init is set explicitly: the library default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit makes the cluster
# assignments version-dependent and triggers a FutureWarning on some versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
df_clustered = df.copy()
# fit_predict returns a plain array, so labels are assigned by row position.
df_clustered['cluster'] = kmeans.fit_predict(df_scaled)

# Scatter plot of the clustered rows: danceability against energy,
# coloured by cluster label and sized by acousticness.
hover_fields = [
    'danceability',
    'energy',
    'acousticness',
    'cluster',
    'track_name',
    'artist',
    'album',
]

points = alt.Chart(df_clustered).mark_circle()
encoded_points = points.encode(
    x=alt.X('danceability'),
    y=alt.Y('energy'),
    color=alt.Color('cluster:N'),  # ':N' marks the label as nominal
    size=alt.Size('acousticness'),
    tooltip=hover_fields,
)
titled_points = encoded_points.properties(title='Cluster Scatter Plot')

# NOTE(review): the chart is only assigned here; nothing in the visible
# code displays or saves it.
chart = titled_points.interactive()