In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Read the scaled feature table from disk.
scaled_csv_path = '../data/6_spotify_scaled_cleaned.csv'
df_scaled = pd.read_csv(scaled_csv_path)

# Snapshot the column order as loaded, before any derived columns are added,
# so later cells can refer to exactly these columns.
original_columns = list(df_scaled.columns)

n_rows, n_cols = df_scaled.shape
print("Dataset shape:", (n_rows, n_cols))

Dataset shape: (7282, 53)


KMeans Clustering and Model Training

In [25]:
# Initialize and train KMeans model
# Number of clusters to fit, named so the choice is visible and tunable in one place.
N_CLUSTERS = 53

# Fit only on the feature columns captured at load time. Selecting
# `original_columns` explicitly keeps this cell idempotent: on a re-run
# `df_scaled` already carries a 'cluster' column, and without the selection the
# previous labels would be fed back into KMeans as an extra feature (and the
# centroids would no longer line up with `original_columns`).
feature_matrix = df_scaled[original_columns]
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
cluster_labels = kmeans.fit_predict(feature_matrix)

# Rebuild the labelled frame: feature columns in their original order,
# with 'cluster' appended as the last column.
df_scaled = feature_matrix.assign(cluster=cluster_labels)

# Display cluster distribution (row count per cluster label, ordered by label)
cluster_dist = df_scaled['cluster'].value_counts().sort_index()
print("\nCluster distribution (number of songs in each cluster):")
display(cluster_dist)


Cluster distribution (number of songs in each cluster):


cluster
0     6212
1        1
2       43
3       15
4       11
5        2
6       24
7       16
8       36
9       30
10      31
11       3
12      16
13      13
14      22
15      26
16      15
17      28
18      21
19      15
20      37
21      21
22      22
23      52
24      18
25       1
26       3
27      18
28      19
29       2
30      18
31      32
32      15
33      16
34      16
35      17
36      19
37       1
38      20
39      22
40      87
41       4
42      48
43       6
44      16
45      51
46      23
47      25
48       9
49       8
50      24
51      11
52      21
Name: count, dtype: int64

In [26]:
# Save the model
import joblib
from pathlib import Path

model_path = Path('../models/kmeans_model.joblib')
clustered_csv_path = Path('../data/7_spotify_clustered.csv')

# Create the output directories if they are missing, so the first save on a
# fresh checkout does not fail with FileNotFoundError.
for out_path in (model_path, clustered_csv_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted KMeans estimator.
joblib.dump(kmeans, model_path)

# Save the clustered dataset (index omitted from the CSV)
df_scaled.to_csv(clustered_csv_path, index=False)
print("Model and clustered dataset saved successfully!")

Model and clustered dataset saved successfully!


In [19]:
# Analyze cluster characteristics
# One row per fitted centroid, one column per feature.
cluster_centers = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=original_columns  # Use original column order
)

# For each cluster, find the most prominent features.
# Iterate over the centroid rows themselves instead of a hard-coded range(53),
# so the loop stays correct if the number of clusters is changed.
for cluster in cluster_centers.index:
    print(f"\nCluster {cluster} characteristics:")
    # Top 5 features ranked by absolute centroid value. NOTE: the printed
    # numbers are magnitudes; the sign of each centroid coordinate is dropped.
    cluster_features = cluster_centers.iloc[cluster].abs().sort_values(ascending=False).head(5)
    print(cluster_features)


Cluster 0 characteristics:
acoustic blues        0.126672
metal                 0.102012
adult standards       0.099227
urban contemporary    0.093418
dub                   0.084807
Name: 0, dtype: float64

Cluster 1 characteristics:
atl trap          85.328776
atl hip hop       17.765379
release_year       1.565346
acoustic blues     0.126672
metal              0.102012
Name: 1, dtype: float64

Cluster 2 characteristics:
adult standards       10.077866
release_year           0.641188
acoustic blues         0.126672
metal                  0.102012
urban contemporary     0.093418
Name: 2, dtype: float64

Cluster 3 characteristics:
gothic symphonic metal    22.010604
gothic metal              12.028307
release_year               0.396470
acoustic blues             0.126672
metal                      0.102012
Name: 3, dtype: float64

Cluster 4 characteristics:
norwegian black metal    20.672517
metal                     3.499709
release_year              0.187883
acoustic blues          

In [27]:
# `df_clustered` is not created by any earlier cell, so this cell raised
# NameError after Restart & Run All. Load it from the clustered CSV written by
# the save cell. NOTE(review): assumes the original definition was a reload of
# this file — confirm.
df_clustered = pd.read_csv('../data/7_spotify_clustered.csv')

# Print all column names, numbered from 1
print("Column names in the clustered dataset:")
for i, col in enumerate(df_clustered.columns, 1):
    print(f"{i}. {col}")

Column names in the clustered dataset:
1. release_year
2. childrens choir
3. punk n roll
4. womens music
5. ambient
6. athens indie
7. atl trap
8. atmospheric sludge
9. australian electropop
10. disco
11. early music
12. gospel blues
13. gothic metal
14. gothic symphonic metal
15. hardcore
16. indie rock
17. jazz
18. latin rock
19. metal
20. mpb
21. new orleans blues
22. norwegian black metal
23. old school thrash
24. pagan black metal
25. post-punk argentina
26. ska argentino
27. socal pop punk
28. speed metal
29. suomi rock
30. symphonic rock
31. urban contemporary
32. vocal jazz
33. west coast rap
34. zolo
35. accordion
36. acoustic blues
37. adult standards
38. afro-cuban percussion
39. afropop
40. alternative dance
41. atl hip hop
42. baroque
43. bolero
44. boy band
45. bronx hip hop
46. c-pop
47. ccm
48. classic finnish pop
49. colombian pop
50. deep ragga
51. dub
52. dusseldorf electronic
53. emo
54. cluster


Joining Categorical and Numerical Columns with Cluster Assignments

In [30]:
# Read the earlier full dataset, which holds the descriptive columns.
df_full = pd.read_csv('../data/5_full_dataset.csv')

# Outer-join the selected descriptive columns with the clustered frame,
# aligning rows on the index of each frame.
descriptive_columns = ['title', 'artist', 'album', 'is_explicit', 'album_cover']
df_merged = df_full[descriptive_columns].merge(
    df_clustered,
    how='outer',
    left_index=True,
    right_index=True,
)

# Write the merged table to disk without the index column.
df_merged.to_csv('../data/7_clustered_dataset.csv', index=False)

# Report the shape of both inputs and of the result, then preview the first rows.
shape_report = [
    ("Original full dataset shape:", df_full),
    ("Clustered dataset shape:", df_clustered),
    ("Merged dataset shape:", df_merged),
]
for label, frame in shape_report:
    print(label, frame.shape)
print("\nFirst few rows of the merged dataset:")
display(df_merged.head())

Original full dataset shape: (7282, 25)
Clustered dataset shape: (7282, 54)
Merged dataset shape: (7282, 59)

First few rows of the merged dataset:


Unnamed: 0,title,artist,album,is_explicit,album_cover,release_year,childrens choir,punk n roll,womens music,ambient,...,bronx hip hop,c-pop,ccm,classic finnish pop,colombian pop,deep ragga,dub,dusseldorf electronic,emo,cluster
0,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,0.627359,-0.011719,-0.016575,-0.023444,-0.011719,...,-0.049779,-0.062128,-0.066436,-0.068491,-0.051147,-0.05986,-0.084807,-0.046926,-0.051147,0
1,On efface,Julie Zenatti,Comme vous...,False,https://i.scdn.co/image/ab67616d0000b27398d445...,0.338748,-0.011719,-0.016575,-0.023444,-0.011719,...,-0.049779,-0.062128,-0.066436,-0.068491,-0.051147,-0.05986,-0.084807,-0.046926,-0.051147,0
2,Howells Delight,Anonymous,The Best of the Baltimore Consort,False,https://i.scdn.co/image/ab67616d0000b27353a906...,0.843818,-0.011719,-0.016575,-0.023444,-0.011719,...,-0.049779,-0.062128,-0.066436,-0.068491,-0.051147,-0.05986,-0.084807,-0.046926,-0.051147,0
3,Martha Served,I Hate Sally,Don't Worry Lady,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,0.555206,-0.011719,-0.016575,-0.023444,-0.011719,...,-0.049779,-0.062128,-0.066436,-0.068491,-0.051147,-0.05986,-0.084807,-0.046926,-0.051147,0
4,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,1.637499,-0.011719,-0.016575,-0.023444,-0.011719,...,-0.049779,-0.062128,-0.066436,-0.068491,-0.051147,-0.05986,-0.084807,-0.046926,-0.051147,0


In [33]:
# Build the cell-level null mask once and derive every statistic from it.
null_mask = df_merged.isnull()

# Per-column null totals.
null_counts = null_mask.sum()

# List only the columns that actually contain nulls.
print("Columns with null values:")
print("------------------------")
for column, null_count in null_counts[null_counts > 0].items():
    print(f"{column}: {null_count} null values")

# Row-level view: which rows have at least one null, and how many there are.
rows_with_nulls = null_mask.any(axis=1)
total_rows_with_nulls = rows_with_nulls.sum()
pct_rows_with_nulls = total_rows_with_nulls/len(df_merged)*100
print(f"\nTotal rows with at least one null value: {total_rows_with_nulls}")
print(f"Percentage of rows with nulls: {pct_rows_with_nulls:.2f}%")

# Preview up to five affected rows (an empty frame is shown when there are none).
print("\nExample rows with null values:")
display(df_merged[rows_with_nulls].head())

Columns with null values:
------------------------

Total rows with at least one null value: 0
Percentage of rows with nulls: 0.00%

Example rows with null values:


Unnamed: 0,title,artist,album,is_explicit,album_cover,release_year,childrens choir,punk n roll,womens music,ambient,...,c-pop,ccm,classic finnish pop,colombian pop,deep ragga,dub,dusseldorf electronic,emo,cluster,isHot
