# Clustering tracks using KMeans and finding the nearest tracks to the clusters centers per each clustering session

# Imports:

In [2]:
import MySQLdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../')
import myenvvar
from sklearn.cluster import KMeans
import datetime as dt

In [12]:
from scipy.cluster.vq import vq

In [50]:
from sklearn.metrics import pairwise_distances_argmin_min

# Create DB connection

In [3]:
conn = MySQLdb.Connection(
        host=myenvvar.db_vars['host'],
        user=myenvvar.db_vars['user'],
        passwd=myenvvar.db_vars['password'],
        port=myenvvar.db_vars['port'],
        db=myenvvar.db_vars['db']
        )

# Import kmeans_clustering_params table from DB into df

In [4]:
q = "SELECT * FROM kmeans_clustering_params"
kmeans_clustering_params_df = pd.read_sql(q, conn)

In [5]:
kmeans_clustering_params_df.head()

Unnamed: 0,clustering_datetime,random_state,k,interia_score
0,2020-06-08 07:39:06,3,4,278.98907
1,2020-06-08 10:15:39,3,4,296.64117


# Import kmeans_clustering_params table from DB into df

In [6]:
q = "SELECT * FROM kmeans_clustering_centroids"
kmeans_clustering_centroids_df = pd.read_sql(q, conn)

In [7]:
kmeans_clustering_centroids_df.head()

Unnamed: 0,clustering_datetime,cluster_number,danceability_c,energy_c,loudness_c,speechiness_c,acousticness_c,instrumentalness_c,liveness_c,valence_c,tempo_c
0,2020-06-08 07:39:06,0,0.63331,0.43852,0.57906,0.11503,0.55891,0.0991,0.16212,0.42861,0.46887
1,2020-06-08 07:39:06,1,0.65696,0.35716,0.50855,0.09188,0.67125,0.8393,0.14519,0.37126,0.41454
2,2020-06-08 07:39:06,2,0.64188,0.73599,0.67261,0.05511,0.04879,0.77775,0.20413,0.27952,0.56177
3,2020-06-08 07:39:06,3,0.64076,0.75129,0.73627,0.09015,0.07814,0.0581,0.17893,0.46776,0.51825
4,2020-06-08 10:15:39,0,0.64482,0.73588,0.66794,0.05619,0.04913,0.77476,0.20069,0.28655,0.51462


# Find nearest tracks to clusters centroids for each KMeans clustering session

In [36]:
# filter tracks per added_datetime
def get_tracks_features_per_clustering_date_df(clustering_datetime):
    q = "select * from audio_features af where af.added_datetime <" +"'"+ str(clustering_datetime) + "'"
    tracks_features_per_clustering_date_df = pd.read_sql(q, conn)
    cols_to_drop = ['key', 'time_signature', 'total_available_markets', 'mode', 'duration_ms','popularity','added_datetime']
    tracks_features_per_clustering_date_df = tracks_features_per_clustering_date_df.drop(cols_to_drop, axis=1)
    tracks_features_per_clustering_date_df.set_index('track_id',inplace=True)
    return tracks_features_per_clustering_date_df

In [62]:
for ind in kmeans_clustering_params_df.index:
    tracks_features_per_clustering_date_df = get_tracks_features_per_clustering_date_df(kmeans_clustering_params_df['clustering_datetime'][ind])
    kmeans_model = KMeans(n_clusters=kmeans_clustering_params_df['k'][ind], random_state=kmeans_clustering_params_df['random_state'][ind]).fit(tracks_features_per_clustering_date_df)
    preds = kmeans_model.predict(tracks_features_per_clustering_date_df)
    sq_dist_vec_space = kmeans_model.transform(tracks_features_per_clustering_date_df)**2
    
    min_indices = []
    for label in np.unique(preds):
        indices_records_curr_cluster = np.where(preds==label)[0] # indices of records from curr cluster
        dist_arr_curr_cluster = [sq_dist_vec_space[i] for i in indices_records_curr_cluster]
        min_dist_curr_cluster = np.argmin([x[label] for x in dist_arr_curr_cluster])
        min_indices.append(min_dist_curr_cluster)
        
    print("clustering datetime:",kmeans_clustering_params_df['clustering_datetime'][ind])
    print("nearest tracks to cluster centroid indices", min_indices)
    track_ids_list = list(tracks_features_per_clustering_date_df.index.values)
    for i in range(len(min_indices)):
        print("track nearest to center in cluster ", i)
        print("spotify:track:"+str(track_ids_list[min_indices[i]]))
        
    print("") # print empty line between clustering sessions

clustering datetime: 2020-06-08 07:39:06
nearest tracks to cluster centroid indices [113, 212, 212, 497]
track nearest to center in cluster  0
spotify:track:0PNJQTpE2oBUbOFgzSxBz5
track nearest to center in cluster  1
spotify:track:180BGvBq7bOB8mX8iFT5Ni
track nearest to center in cluster  2
spotify:track:180BGvBq7bOB8mX8iFT5Ni
track nearest to center in cluster  3
spotify:track:2Q9N9K13AsJeutVQpNSO0r

clustering datetime: 2020-06-08 10:15:39
nearest tracks to cluster centroid indices [172, 92, 443, 230]
track nearest to center in cluster  0
spotify:track:0wtHjEAzLYYyGuk2WtpGwS
track nearest to center in cluster  1
spotify:track:0kvKNrIyDP9ScCw2Jo16bK
track nearest to center in cluster  2
spotify:track:2a1qzI3rJQqmJGG0MomA9b
track nearest to center in cluster  3
spotify:track:17gGSEIql8L3FwhqlmkZ1I

