In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
from dotenv import load_dotenv
import pandas as pd

# Read the Spotify API credentials from the local .env file into the
# process environment.
load_dotenv("./.env")

# Each value is None when the corresponding variable is not set.
client_id = os.getenv("SPOTIFY_CLIENT_ID")
client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")

# Spotify client authenticated through the client-credentials flow.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret),
    requests_timeout=45,
)

In [None]:
# Load the stored feature table and discard its existing 'cluster' column
# so that a fresh assignment can be computed.
df = pd.read_csv('features_clustered.csv').drop(columns='cluster')

In [99]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import numpy as np

def getClusterIds(df, n_clusters=1000, n_init=10, random_state=None, feature_columns=None):
    """Cluster the rows of ``df`` with K-Means on standardised audio features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains every column listed in ``feature_columns``.
    n_clusters : int, default 1000
        Number of clusters to fit.
        TODO: find the optimal number of clusters using an elbow graph.
    n_init : int, default 10
        Number of K-Means initialisations passed to ``KMeans``.
    random_state : int or None, default None
        Seed passed to ``KMeans``. ``None`` keeps the original, unseeded
        behaviour; pass an integer to make the clustering reproducible.
    feature_columns : sequence of str, optional
        Columns to cluster on. Defaults to the nine audio features
        danceability, energy, loudness, speechiness, acousticness,
        instrumentalness, liveness, valence and tempo.

    Returns
    -------
    cluster_ids : numpy.ndarray
        Cluster label of every row of ``df``, in row order.
    X_normalized : numpy.ndarray
        The standardised feature matrix the model was fitted on.
    kmeans : sklearn.cluster.KMeans
        The fitted model.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the requested feature columns.
    """
    if feature_columns is None:
        feature_columns = [
            "danceability", "energy", "loudness", "speechiness", "acousticness",
            "instrumentalness", "liveness", "valence", "tempo",
        ]
    else:
        # A tuple would be interpreted by pandas as a single (multi-level) key.
        feature_columns = list(feature_columns)

    # Normalize the data so every feature contributes on the same scale.
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(df[feature_columns])

    # Fit K-Means directly on the standardised features (no PCA step) and
    # label every row with its nearest cluster centre.
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_state)
    kmeans.fit(X_normalized)
    cluster_ids = kmeans.predict(X_normalized)
    return cluster_ids, X_normalized, kmeans

# Cluster the whole feature table; keep the per-row labels, the standardised
# feature matrix and the fitted model for the cells that follow.
cluster_ids, X_normalized, kmeans = getClusterIds(df)

In [101]:
import pickle

# Persist the fitted K-Means model to the given file so it can be reused
# without refitting.
with open("kmeans1000.pickle", "wb") as model_file:
    pickle.dump(kmeans, model_file)

In [92]:
# Restore the pickled K-Means model. The loaded object has to be bound to a
# name; otherwise it is discarded as soon as the statement finishes.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open("kmeans1000.pickle", "rb") as f:  # don't forget the correct path
    kmeans = pickle.load(f)

In [85]:
# Attach each row's cluster label to the table, then display how many rows
# fall into each cluster.
df['cluster'] = cluster_ids
df['cluster'].value_counts()

cluster
91      142
1133    133
323     123
126     122
237     118
       ... 
1750      1
878       1
1659      1
1595      1
1450      1
Name: count, Length: 2000, dtype: int64

In [44]:
# Distance from every row to its nearest cluster centre: KMeans.transform
# returns the distance to each centre, and the row-wise minimum is the
# distance to the closest one. The model is fitted on X_normalized (the PCA
# step in getClusterIds is commented out, so X_pca is never defined), so the
# same matrix has to be transformed here.
df['cluster_distance'] = kmeans.transform(X_normalized).min(axis=1)

In [58]:
# Show the rows of three specific tracks, selected by their id.
df.loc[df['id'].isin(['4cvzNj4FmjNtkY0tSTwufs', '2aUDpZRSit5Zg2R6L5r6dW', '3tXcaUTyYJm5nBpLYp7Tmb'])]

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,cluster,cluster_distance
16331,0.334,0.423,1,-13.562,1,0.0348,0.0173,0.994,0.0732,0.0859,100.058,audio_features,2aUDpZRSit5Zg2R6L5r6dW,spotify:track:2aUDpZRSit5Zg2R6L5r6dW,https://api.spotify.com/v1/tracks/2aUDpZRSit5Z...,https://api.spotify.com/v1/audio-analysis/2aUD...,93227,4,270,2.685626
49778,0.764,0.975,0,-5.68,1,0.176,0.052,0.512,0.254,0.111,150.017,audio_features,3tXcaUTyYJm5nBpLYp7Tmb,spotify:track:3tXcaUTyYJm5nBpLYp7Tmb,https://api.spotify.com/v1/tracks/3tXcaUTyYJm5...,https://api.spotify.com/v1/audio-analysis/3tXc...,363200,4,450,2.630694
74477,0.327,0.345,6,-13.282,0,0.031,0.0125,0.833,0.15,0.0364,200.096,audio_features,4cvzNj4FmjNtkY0tSTwufs,spotify:track:4cvzNj4FmjNtkY0tSTwufs,https://api.spotify.com/v1/tracks/4cvzNj4FmjNt...,https://api.spotify.com/v1/audio-analysis/4cvz...,175256,4,456,3.518514


In [69]:
# Write the current table to a new CSV file; the file that was read at the
# top of the notebook ('features_clustered.csv') is left untouched.
df.to_csv('features_clustered2.csv')

In [84]:
# Show the row of a single track, selected by its id.
df.loc[df['id'] == '2dHHgzDwk4BJdRwy9uXhTO']

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,cluster,cluster_distance


In [None]:
# Elbow search: fit one K-Means model per candidate cluster count and record
# its inertia (sum of squared distances of the samples to their closest
# cluster centre) for the elbow plot.
K = range(5, 40)  # candidate cluster counts: 5 .. 39
inertia = []

feature_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

# Normalize the data so every feature contributes on the same scale.
scaler = StandardScaler()
X_normalized = scaler.fit_transform(df[feature_columns])

for k in K:  # one model, and one inertia value, per candidate k
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    # n_init is set explicitly so the number of restarts matches the final
    # model (n_init=10) and does not depend on the scikit-learn version's
    # default; random_state makes the curve reproducible.
    kmeans1 = KMeans(n_clusters=k,
                     n_init=10,
                     random_state=1234)
    kmeans1.fit(X_normalized)
    inertia.append(kmeans1.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Elbow plot: inertia as a function of the number of clusters k.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('inertia')
ax.set_xticks(np.arange(min(K), max(K) + 1, 1.0))  # one tick per candidate k
ax.set_title('Elbow Method showing the optimal k')