In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from itertools import product

pd.set_option('display.max_columns', None)

In [None]:
# this cell may take up to 30 seconds to run to convert the Hugginface Dataset class to Pandas DataFrame for better EDA

HF_SPOTIFY_DATASET_PATH = "maharshipandya/spotify-tracks-dataset"
# https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset

ds = load_dataset(HF_SPOTIFY_DATASET_PATH) #only has 'train' split as key
df = pd.DataFrame(ds["train"])

df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.68k [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/114000 [00:00<?, ? examples/s]

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
# stratified based on genre
df_train, df_test_val = train_test_split(
    df,
    train_size=0.6,
    stratify=df['track_genre'],
    random_state=42
)

df_test, df_val = train_test_split(
    df_test_val,
    train_size=0.5,
    random_state=42
)



#X_val and X_test will have 50% of X_test_val
#Verify that we have a 60/20/20 split
total_samples = len(df)
train_samples = len(df_train)
val_samples = len(df_val)
test_samples = len(df_test)

print(f"Total samples: {total_samples}")
print(f"Training samples: {train_samples}")
print(f"Validation samples: {val_samples}")
print(f"Test samples: {test_samples}")

Total samples: 114000
Training samples: 68400
Validation samples: 22800
Test samples: 22800


In [None]:
# # Note: Used 20% of the data due to runtime crashing
# stratified based on genre
df_reduced, _ = train_test_split(
    df_train,
    train_size=0.2,
    random_state=42
)


In [None]:
# stdev center data due to difference in scale
#only use numeric values for pca
df_numeric = df_reduced.select_dtypes(include=['int64', 'float64']).drop(["Unnamed: 0", "mode", "time_signature", "key"], axis=1)

df_scaled = df_numeric.copy()
df_scaled = df_scaled / df_scaled.std()
df_scaled

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
8912,2.419987,2.644015,2.093910,2.420020,-2.617224,0.928370,0.320822,0.003877,0.650263,2.552995,2.675672
66898,0.358517,1.831852,4.361349,3.657620,-0.594618,0.442449,0.005187,0.004879,0.206181,2.579909,4.310307
30331,0.000000,1.425153,3.453217,2.077119,-1.704138,0.398011,0.380789,0.000000,0.740137,2.057006,4.175129
29623,2.240728,1.851403,2.984690,2.443669,-1.160105,0.313965,0.797558,0.000000,0.613256,0.569041,4.886591
29999,0.941106,2.360071,3.100376,1.639623,-1.969069,3.381159,0.503721,0.004782,0.377998,0.515213,2.287957
...,...,...,...,...,...,...,...,...,...,...,...
63910,1.434066,1.798395,4.239879,3.196476,-1.186677,0.264696,0.509717,0.000000,0.539243,3.702611,4.581674
110291,2.016656,1.840129,2.909494,3.724624,-1.140225,0.406705,0.006267,0.223589,0.729563,1.107323,4.648472
104887,1.165179,2.156632,1.382444,2.814161,-1.239624,0.927404,1.694061,0.000019,1.342820,1.587932,5.704364
49002,2.733689,2.450605,2.521947,3.425078,-0.486756,0.594118,0.162510,0.000000,0.517567,0.147259,5.222903


## Clustering

The problem we want to address is to see if there are similarities between different genres.  We settled on 8 clusters after performing silhouette analysis

In [None]:
from sklearn.cluster import AgglomerativeClustering

#hierarchical
hclust = AgglomerativeClustering(n_clusters=8, metric='euclidean', linkage='ward')
y_hclust_labels = hclust.fit(df_scaled).labels_

In [None]:
from sklearn.cluster import KMeans

#kmeans clust
kmeans = KMeans(n_clusters=8)
y_kmeans = kmeans.fit_predict(df_scaled)

In [None]:
px.scatter(df_scaled, x="energy", y="loudness",
            color=y_hclust_labels,
            opacity=0.2,
            hover_name=df_reduced["track_genre"],
            title="Hierarchical Clustering")

In [None]:
px.scatter(df_scaled, x="energy", y="loudness",
            color=y_kmeans,
            opacity=0.2,
            hover_name=df_reduced["track_genre"],
            title="Kmeans Clustering")

In [None]:
def tot_within_sum_of_square(data, clusters):
    data = data.values
    unique_clusters = np.unique(clusters)
    total_wss = 0
    for cluster_label in unique_clusters:
      cluster_points = data[clusters == cluster_label]
      centroid = np.mean(cluster_points, axis=0)
      for point in cluster_points:
        squared_distance = (point - centroid)**2
        total_wss += np.sum(squared_distance)
    return total_wss

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples, rand_score, adjusted_rand_score

# K means inertia
print(f"1. K-means Inertia: {round(kmeans.inertia_, 1)}")

# WSS k means
kmeans_wss = tot_within_sum_of_square(df_scaled, y_kmeans)
print(f"2. K-means WSS: {round(kmeans_wss, 1)}")

# WSS hierarchical
hclust_wss = tot_within_sum_of_square(df_scaled, y_hclust_labels)
print(f"3. Hierarchical Clustering WSS: {round(hclust_wss, 1)}")

# silhouette k means
kmeans_silhouette = silhouette_score(df_scaled, y_kmeans)
print(f"4. K-means Silhouette Score: {round(kmeans_silhouette, 3)}")

# silhouette hierarchical
hclust_silhouette = silhouette_score(df_scaled, y_hclust_labels)
print(f"5. Hierarchical Clustering Silhouette Score: {round(hclust_silhouette, 3)}")

# rand
rand = rand_score(y_kmeans, y_hclust_labels)
print(f"6. Rand Index (K-means vs Hierarchical): {round(rand, 3)}")

# adjusted rand
adj_rand = adjusted_rand_score(y_kmeans, y_hclust_labels)
print(f"7. Adjusted Rand Index (K-means vs Hierarchical): {round(adj_rand, 3)}")

1. K-means Inertia: 80145.3
2. K-means WSS: 80145.2
3. Hierarchical Clustering WSS: 87389.6
4. K-means Silhouette Score: 0.149
5. Hierarchical Clustering Silhouette Score: 0.115
6. Rand Index (K-means vs Hierarchical): 0.824
7. Adjusted Rand Index (K-means vs Hierarchical): 0.397


## PCA and Analysis

In [None]:
pca_U, pca_d, pca_V = np.linalg.svd(df_scaled)
pca_d

array([826.83227137, 190.73885458, 144.31584339, 128.97327718,
       120.42168058, 113.43291544, 108.18922143, 102.81905796,
        79.27588901,  66.03385223,  60.20723711])

In [None]:
pca_U.shape

(13680, 13680)

In [None]:
pca_V.shape

(11, 11)

In [None]:
pca_d.shape

(11,)

### Cumulative Variability

In [None]:
prop_var = np.square(pca_d) / sum(np.square(pca_d))
pd.DataFrame(
    {"PC": 1 + np.arange(0, prop_var.shape[0]),
     "variability_explained": prop_var.round(2),
     "cumulative_variability_explained": prop_var.cumsum().round(2)
     }).head(10)

Unnamed: 0,PC,variability_explained,cumulative_variability_explained
0,1,0.83,0.83
1,2,0.04,0.88
2,3,0.03,0.9
3,4,0.02,0.92
4,5,0.02,0.94
5,6,0.02,0.96
6,7,0.01,0.97
7,8,0.01,0.98
8,9,0.01,0.99
9,10,0.01,1.0


PC1 and 2 explain 88% of the variability.  For the purposes of dimensionality reduction, we will be using PC 1 and 2

In [None]:
px.line(x=np.arange(prop_var.shape[0]),
        y=prop_var,
        labels={"x": "PC",
                "y": "Proportion explained"})

The scree plot confirms that we should use PC1 and PC2

In [None]:
# create the PCA-transformed dataset

# multiply the original data and the PCA loadings
pca_transform = pca_V.T[:,:2]
transformed_df = df_scaled@pca_transform
transformed_df.columns = ['PC1', 'PC2']
transformed_df

Unnamed: 0,PC1,PC2
8912,-6.194073,-0.324122
66898,-7.393331,1.931981
30331,-6.364062,0.255976
29623,-6.796588,0.043149
29999,-5.348988,-0.479743
...,...,...
63910,-8.073351,1.296322
110291,-7.117883,1.071436
104887,-7.108550,-0.173060
49002,-7.079202,1.002664


In [None]:
# plot PC1 vs PC2
px.scatter(transformed_df, x="PC1", y="PC2",
           opacity=0.2, hover_name=df_reduced["track_genre"])

No significant results from plotting PC1 vs PC2.  Let's run a clustering algo

## PCA and Clustering

Let's see if certain genres have similarities.  The code below determines the best number of clusters to use.


In [None]:
hist = []
for i in range(5, 10):
  km = KMeans(n_clusters=i)
  y = km.fit_predict(transformed_df)
  sil = silhouette_score(transformed_df, y)
  hist.append(sil)


In [None]:
for i in range(5, 10):
  print(f"{i} clusters: {hist[i-6]}")

5 clusters: 0.3205184701682462
6 clusters: 0.35351634863959736
7 clusters: 0.3336061052408832
8 clusters: 0.33703937060278316
9 clusters: 0.32590529723613865


There isn't a significant difference between each cluster in regard to silhouette score.  We are opting to use 8 since our groups include: distortion heavy music (dubstep, heavy metal, etc.), pop music: (pop edm, regular pop, indie, etc.), instrumental: (guitar, piano, etc.), chill music: (lofi, chillstep), club/dance tracks: (deep house, tech house, etc.), which makes 5 groups with an extra 3 to reveal other patterns in the data that we have not considered.

In [None]:
pca_hclust_labels = hclust.fit(transformed_df).labels_
px.scatter(transformed_df, x="PC1", y="PC2",
            color=pca_hclust_labels,
            opacity=0.2,
            hover_name=df_reduced["track_genre"],
            title="Hierarchical Clustering")

In [None]:
pca_kmeans = kmeans.fit_predict(transformed_df)
px.scatter(transformed_df, x="PC1", y="PC2",
            color=pca_kmeans,
            opacity=0.2,
            hover_name=df_reduced["track_genre"],
            title="Kmeans Clustering")

In [None]:
# K means inertia
print(f"1. K-means Inertia: {round(kmeans.inertia_, 1)}")

# WSS k means
kmeans_wss = tot_within_sum_of_square(transformed_df, pca_kmeans)
print(f"2. K-means WSS: {round(kmeans_wss, 1)}")

# WSS hierarchical
hclust_wss = tot_within_sum_of_square(transformed_df, pca_hclust_labels)
print(f"3. Hierarchical Clustering WSS: {round(hclust_wss, 1)}")

# silhouette k means
kmeans_silhouette = silhouette_score(transformed_df, pca_kmeans)
print(f"4. K-means Silhouette Score: {round(kmeans_silhouette, 3)}")

# silhouette hierarchical
hclust_silhouette = silhouette_score(transformed_df, pca_hclust_labels)
print(f"5. Hierarchical Clustering Silhouette Score: {round(hclust_silhouette, 3)}")

# rand
rand = rand_score(pca_kmeans, pca_hclust_labels)
print(f"6. Rand Index (K-means vs Hierarchical): {round(rand, 3)}")

# adjusted rand
adj_rand = adjusted_rand_score(pca_kmeans, pca_hclust_labels)
print(f"7. Adjusted Rand Index (K-means vs Hierarchical): {round(adj_rand, 3)}")

1. K-means Inertia: 7120.7
2. K-means WSS: 7120.3
3. Hierarchical Clustering WSS: 8018.5
4. K-means Silhouette Score: 0.323
5. Hierarchical Clustering Silhouette Score: 0.273
6. Rand Index (K-means vs Hierarchical): 0.869
7. Adjusted Rand Index (K-means vs Hierarchical): 0.51
