In [77]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
from dotenv import load_dotenv

# Load the local .env file so the Spotify credentials can be read from the environment.
load_dotenv("./.env")

client_id = os.getenv("SPOTIFY_CLIENT_ID")
client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")

# Build the credentials manager first, then hand it to the shared client used
# by the rest of the notebook (requests_timeout is passed through as 45).
credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=credentials_manager, requests_timeout=45)

In [109]:
import pandas as pd
from joblib import Parallel, delayed

# Read the track list; the ids live in the column named '0'. Missing entries are dropped.
df = pd.read_csv('tracks6.csv')
track_column = df['0']
list_songs = track_column.dropna().values.tolist()

# Split the ids into consecutive batches of at most 99
# (presumably sized to stay under the API's per-request id limit -- TODO confirm).
batch_ids = []
for start in range(0, len(list_songs), 99):
    batch_ids.append(list_songs[start:start + 99])
# existing_features = pd.read_csv('features.csv')


In [110]:
def getFeatures(id):
    """Fetch Spotify audio features for one batch of ids.

    Parameters
    ----------
    id
        Forwarded unchanged to ``sp.audio_features``; the notebook passes one
        batch (a list of ids, presumably track ids) per call.

    Returns
    -------
    The value returned by ``sp.audio_features``, or ``None`` when the request
    raised an exception, so that one failed batch does not abort the run.
    """
    # Function-scope import keeps this cell runnable on its own.
    import warnings

    try:
        return sp.audio_features(id)
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt and
        # SystemExit stop the run; the warning makes a skipped batch visible
        # instead of dropping it silently.
        warnings.warn(f"audio_features request failed, batch skipped: {exc!r}")
        return None

# Queue one lazy getFeatures call per batch, then run the queue with n_jobs=50.
# `features` ends up with one entry per batch.
feature_jobs = (delayed(getFeatures)(batch) for batch in batch_ids)
features = Parallel(n_jobs=50)(feature_jobs)

# features = pd.read_csv('features.csv')


In [111]:
# Keep only the batches that were actually fetched (entries that are None are discarded).
features2 = [batch for batch in features if batch is not None]

# Flatten the per-batch lists into one list of per-track records, skipping
# falsy entries (presumably None placeholders for ids without features -- TODO confirm).
flattened = [track for batch in features2 for track in batch if track]


In [114]:
# One row per fetched record; rows that are exact duplicates are removed
# before the table is written to disk.
features_frame = pd.DataFrame(flattened)
df2 = features_frame.drop_duplicates()
df2.to_csv(path_or_buf='features.csv', index=False)

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import numpy as np

def getClusterIds(df, n_clusters=1000, random_state=None):
    """Cluster the rows of ``df`` by their audio-feature columns.

    The feature columns are standardised with ``StandardScaler``, projected
    with PCA (keeping as many components as there are feature columns) and
    grouped with KMeans.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``feature_columns`` below.
    n_clusters : int, default 1000
        Number of KMeans clusters. The default is the value that used to be
        hard-coded, so existing calls behave as before.
    random_state : int or None, default None
        Forwarded to KMeans. ``None`` keeps the previous unseeded behaviour;
        pass an int to get reproducible cluster assignments.

    Returns
    -------
    cluster_ids
        The KMeans label for each row, in row order.
    X_pca
        The PCA-transformed feature matrix that was clustered.
    """
    feature_columns = [
        "danceability", "energy", "loudness", "speechiness", "acousticness",
        "instrumentalness", "liveness", "valence", "tempo",
    ]

    # Normalize the data
    X_normalized = StandardScaler().fit_transform(df[feature_columns])

    # Perform PCA, keeping one component per input feature
    pca = PCA(n_components=len(feature_columns))
    X_pca = pca.fit_transform(X_normalized)

    # Predict clusters using KMeans
    # TODO: Find optimal number of clusters using an elbow graph
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    cluster_ids = kmeans.fit_predict(X_pca)
    return cluster_ids, X_pca

cluster_ids, X_pca = getClusterIds(df2)

# Plot clusters: one scatter series per cluster label, using the first two
# PCA columns as the x and y coordinates.
for label in np.unique(cluster_ids):
    members = X_pca[cluster_ids == label]
    plt.scatter(members[:, 0], members[:, 1], marker='.', s=10)

In [None]:
# Store each row's cluster label on the feature table, then show how many
# rows fell into each cluster.
df2['cluster'] = cluster_ids
cluster_sizes = df2['cluster'].value_counts()
cluster_sizes

In [104]:
# Write the feature table (including the cluster column, once added) without the index.
df2.to_csv(path_or_buf='features_clustered.csv', index=False)