In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spotipy

%matplotlib inline

from pandas import json_normalize
from random import randint, choice
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Read the playlist export that every later cell in this notebook works on.
df = pd.read_csv('df_playlists.csv')

In [None]:
# Collapse rows that agree on every column except 'name' into a single row,
# collecting the distinct names of each such group in first-seen order.
names_dict = {}

for row_label, row in df.iterrows():
    # All values other than 'name' together identify the group of this row.
    key = tuple(row.drop('name'))

    # str() turns a missing name (NaN) into the text 'nan' so it can be joined.
    name = str(row['name'])

    # Start an empty list the first time a group is seen, then add the name
    # only if that group has not recorded it yet.
    seen_names = names_dict.setdefault(key, [])
    if name not in seen_names:
        seen_names.append(name)

# Build one record per group: the joined names under 'artists', followed by the
# group's remaining values under their original column labels.
data = []
for group_key, group_names in names_dict.items():
    record = {'artists': ", ".join(group_names)}
    record.update(zip(df.columns.drop('name'), group_key))
    data.append(record)

# Replace df with the de-duplicated table.
df = pd.DataFrame(data)


In [None]:
# Keep only the numeric columns of df as the clustering features.
# The explicit copy guarantees audio_df is an independent frame, so its columns
# can be reassigned later without pandas raising SettingWithCopyWarning and
# without any chance of writing through to df.
audio_df = df.select_dtypes(include=np.number).copy()

In [None]:
# Replace 'popularity' with its square.
# The squared values are derived from df (the untouched source column) rather
# than from audio_df itself, so re-running this cell yields the same result
# instead of squaring the column a second time. assign() returns a new frame
# and keeps the existing column order.
audio_df = audio_df.assign(popularity=df['popularity'] ** 2)

In [None]:
# Standardise the numeric features; the fitted scaler is kept in `scaler` and
# the transformed values in `audio_scaled` (a plain array without labels).
scaler = StandardScaler()
audio_scaled = scaler.fit_transform(audio_df)

# Display-only preview of the scaled values under the original column labels.
pd.DataFrame(data=audio_scaled, columns=audio_df.columns)

In [None]:
# Elbow method: fit one KMeans model for every candidate k in K and record the
# inertia of each fit, in the same order as K.
K = range(2, 30)
inertia = []

for k in K:
    # fit() returns the estimator itself, so the fitted model stays in `kmeans`.
    kmeans = KMeans(random_state=15, n_clusters=k).fit(audio_scaled)
    inertia.append(kmeans.inertia_)

In [None]:
# Plot inertia against k; the "elbow" of this curve suggests a suitable k.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('inertia')
# One tick per candidate k.
ax.set_xticks(np.arange(min(K), max(K) + 1, 1.0))
ax.set_title('Elbow Method showing the optimal k')

plt.show()

In [None]:
# Silhouette analysis: for every candidate k in K, cluster the scaled features
# and record the mean silhouette score, in the same order as K.
K = range(2, 30)
silhouette = []

for k in K:
    # A fixed random_state makes the scores reproducible across runs; 15 matches
    # the seed of the elbow-method cell so both diagnostics describe the same
    # initialisation.
    kmeans = KMeans(n_clusters=k, random_state=15)
    # fit_predict() fits the model and returns the training labels in one step,
    # replacing the separate fit() + predict() pass over the same data.
    k_labels = kmeans.fit_predict(audio_scaled)
    silhouette.append(silhouette_score(X=audio_scaled, labels=k_labels))

In [None]:
# Plot the silhouette score against k; higher scores indicate better-separated
# clusters.
plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
# One tick per candidate k.
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
# Title spelling corrected ("Silhoutte" -> "Silhouette").
plt.title('Silhouette Score showing the optimal k')

plt.show()

In [None]:
# Final clustering model with a fixed seed.
# NOTE(review): k=27 is presumably chosen from the elbow/silhouette plots --
# confirm and record the rationale.
kmeans = KMeans(random_state=42, n_clusters=27).fit(audio_scaled)

# Cluster label for every row of audio_scaled.
clusters = kmeans.predict(audio_scaled)

In [None]:
# Display the cluster labels predicted for the rows of audio_scaled.
clusters

In [None]:
# Number of rows assigned to each cluster, most populated cluster first.
cluster_labels = pd.Series(clusters)
cluster_labels.value_counts()

In [None]:
# Display the de-duplicated track table before it is written to disk.
df

In [None]:
# Write the track table to disk without the row index.
# NOTE(review): none of the visible cells attach `clusters` to df, so this
# export contains no cluster labels -- confirm that is intended.
df.to_csv('tracks_w_audio.csv', index=False)