In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import pickle


In [None]:
# Load both song collections from CSV and stack them into one frame.
# ignore_index=True renumbers the combined rows before any row containing
# a missing value is discarded.
hot_100 = pd.read_csv('extended_hot_100.csv')
not_hot = pd.read_csv('extended_not_hot.csv')

songs_df = pd.concat(
    [hot_100, not_hot],
    ignore_index=True,
).dropna()


In [None]:
# Columns used as clustering inputs.
features = [
    'danceability',
    'valence',
]
songs_features = songs_df[features]

# Fit the scaler on the selected columns, then transform those same rows.
# The fitted scaler object is kept because later cells reuse it.
scaler = StandardScaler().fit(songs_features)
songs_features_scaled = scaler.transform(songs_features)

In [None]:
# Report per-column missing-value counts *before* fitting. KMeans rejects
# NaN input, so this diagnostic is only useful if it runs ahead of the fit;
# placed after the loop it would never be reached when NaNs are present.
print(songs_features.isnull().sum())

# Elbow method: fit KMeans for 1..10 clusters and record each fit's
# within-cluster sum of squares (the estimator's inertia_ attribute).
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(songs_features_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters.
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Keep only the rows whose selected feature values are all present.
songs_features_clean = songs_features.dropna()

# Per-column count of missing values left after the drop.
remaining_nulls = songs_features_clean.isna().sum()
print(remaining_nulls)

In [None]:
# Re-fit the existing scaler on the cleaned feature frame; this replaces
# whatever statistics the scaler held from any earlier fit.
songs_features_scaled_clean = scaler.fit_transform(songs_features_clean)

# Elbow method on the cleaned data: one KMeans fit per candidate cluster
# count, collecting the within-cluster sum of squares (inertia_) of each.
cluster_counts = range(1, 11)
wcss_clean = []
for k in cluster_counts:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(songs_features_scaled_clean)
    wcss_clean.append(kmeans.inertia_)

# WCSS versus number of clusters.
plt.plot(cluster_counts, wcss_clean)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Number of clusters for the final model. Defined once so the estimator and
# the plotting loop below cannot drift apart.
N_CLUSTERS = 3

# Fit the final model and get one cluster label per scaled row.
kmeans = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=300, n_init=10, random_state=0)
songs_clusters = kmeans.fit_predict(songs_features_scaled_clean)

# scatter plot: one series per cluster, drawn in the scaled feature space
# (column 0 on the x-axis, column 1 on the y-axis)
plt.figure(figsize=(10, 7))
for i in range(N_CLUSTERS):
    in_cluster = songs_clusters == i  # boolean mask of rows labelled i
    plt.scatter(songs_features_scaled_clean[in_cluster, 0],
                songs_features_scaled_clean[in_cluster, 1],
                label=f'Cluster {i+1}')

# Overlay the fitted cluster centres on the same axes.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red', marker='X', label='Centroids')
plt.title('KMeans Clustering of Songs')
plt.xlabel('Scaled Danceability')
plt.ylabel('Scaled Valence')
plt.legend()
plt.show()


# add cluster column to dataframe
# songs_clusters holds one label per row of songs_features_clean, in order.
# Select exactly those rows by index instead of re-deriving them with an
# independent dropna() on the full frame, which could keep a different set
# of rows and make the label array and the frame disagree in length.
# assign() builds a new frame, so no column is written into the result of a
# previous selection (avoids pandas' SettingWithCopyWarning).
songs_df_clean = songs_df.loc[songs_features_clean.index].assign(cluster=songs_clusters)

print(songs_df_clean.head())


In [None]:
# Persist the scaler and the KMeans model to disk with pickle.
# Both objects are written in whatever state they hold when this cell runs.
scaler_filename_pickle = 'scaler_pickle.pkl'
kmeans_filename_pickle = 'kmeans_pickle.pkl'

# Write the scaler first, then the model, each to its own binary file.
for pickle_path, fitted_object in (
    (scaler_filename_pickle, scaler),
    (kmeans_filename_pickle, kmeans),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)
