In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import seaborn as sns

In [None]:
# Input files: per-track rows and the playlist -> category table.
TRACKS_CSV = 'tracks_enriched.csv'
PLAYLISTS_CSV = 'playlists-collections.csv'

tracks = pd.read_csv(TRACKS_CSV)
playlists = pd.read_csv(PLAYLISTS_CSV)

In [None]:
# Show the first row to check which columns were loaded.
tracks.head(1)

In [None]:
# Create the category column with an integer placeholder (0). The next cell
# overwrites it for every track whose playlist_id appears in `playlists`, so a
# value of 0 afterwards marks a track with no matching playlist.
tracks['category_name'] = 0

In [None]:
# Attach each track's playlist category with a single vectorised lookup instead
# of rescanning all of `tracks` once per playlist row.
#
# Build a playlist_id -> category_name lookup. When a playlist_id is listed
# more than once the last row wins, which is what sequential overwriting in a
# row-by-row loop would produce. Rows without a playlist_id can never match a
# track, so they are dropped up front.
category_by_playlist = (
    playlists
    .dropna(subset=['playlist_id'])
    .drop_duplicates(subset='playlist_id', keep='last')
    .set_index('playlist_id')['category_name']
)

# The column starts out as an integer placeholder; make it object dtype before
# writing strings into it, since setting values of an incompatible dtype
# through .loc is deprecated in recent pandas releases.
tracks['category_name'] = tracks['category_name'].astype(object)

# Only tracks with a known playlist are updated; all others keep the
# placeholder value they already had.
has_playlist = tracks['playlist_id'].isin(category_by_playlist.index)
tracks.loc[has_playlist, 'category_name'] = (
    tracks.loc[has_playlist, 'playlist_id'].map(category_by_playlist)
)

In [None]:
# Show the first two rows to confirm category_name has been filled in.
tracks.head(2)

In [None]:
# Columns carried into the analysis: the track identifier, the numeric
# features, and the playlist category label (in that order).
ID_COLUMN = 'track_id'
LABEL_COLUMN = 'category_name'
FEATURE_COLUMNS = [
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

X = tracks[[ID_COLUMN] + FEATURE_COLUMNS + [LABEL_COLUMN]]

## Apply PCA to inspect the shape of the data in two dimensions

In [None]:
# Project the min-max scaled numeric features onto two principal components.
#
# random_state is fixed because PCA may select a randomized SVD solver
# (depending on data size and scikit-learn version), which would otherwise make
# the projection differ between runs. The seed matches the one used for KMeans
# further down.
pca = PCA(n_components=2, random_state=13)
scaler = MinMaxScaler()

# The identifier and the label are not features, so they are excluded before
# scaling; X_prepped is reused later as the clustering input.
X_prepped = scaler.fit_transform(X.drop(columns=['category_name', 'track_id']))
X_2D = pca.fit_transform(X_prepped)
X_2D

In [None]:
# Wrap the two principal components in a DataFrame and attach each track's
# category (aligned on the row index) so the projection can be coloured by it.
X_2D = (
    pd.DataFrame(X_2D, columns=['A', 'B'])
    .assign(category_name=tracks['category_name'])
)

In [None]:
# Show the first three projected rows together with their category.
X_2D.head(3)

#### Plot tracks from selected categories of the original playlists

In [None]:
# List the distinct category values present, to choose which ones to plot.
X_2D['category_name'].unique()

In [None]:
# Keep only the tracks belonging to a handful of categories for plotting.
selected_categories = ['Hip-Hop', 'Rock', 'R&B', 'Romance', 'Jazz', 'Reggae']
in_selection = X_2D['category_name'].isin(selected_categories)
subset_X_2D = X_2D.loc[in_selection]
subset_X_2D.shape

In [None]:
# Scatter of the two principal components, coloured by category; the
# regression fit is switched off so only the points are drawn.
sns.lmplot(
    data=subset_X_2D,
    x='A',
    y='B',
    hue='category_name',
    fit_reg=False,
)

### Apply KMeans Clustering

In [None]:
# Sweep the number of clusters and record, for each k, the inertia, the
# silhouette score and the cluster label assigned to every track.
K = range(5, 30)

inertia = []
silhouette = []
predictions = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=13)
    # fit_predict fits the model and returns the training labels in one call,
    # avoiding a second assignment pass over the same data via predict().
    pred = kmeans.fit_predict(X_prepped)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X_prepped, pred))
    predictions.append(pred)

In [None]:
# Side-by-side diagnostics for choosing k: inertia on the left, silhouette
# score on the right, with one x tick per candidate k.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

k_ticks = np.arange(min(K), max(K)+1, 1.0)
panels = zip(ax, (inertia, silhouette), ('inertia', 'silhouette score'))
for panel, scores, ylabel in panels:
    panel.plot(K, scores, 'bx-')
    panel.set_xlabel('k')
    panel.set_ylabel(ylabel)
    panel.set_xticks(k_ticks)

plt.show()