In [3]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_samples, silhouette_score

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings
# raised by later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

### Dataset prep

In [4]:
# Load the three CSV inputs.
features = pd.read_csv('data/spotify_songs_features.csv')
played = pd.read_csv('data/lastfm_played_tracks.csv')
ids = pd.read_csv('data/spotify_tracks_ids.csv')

# raw: the whole data.
#   1. attach the ids table to the features (features.id == ids.sp_id),
#   2. index the result by (artist, song),
#   3. inner-join it onto the played tracks, keeping only matching rows.
raw = played.join(
    features.join(ids.set_index('sp_id'), on='id').set_index(['artist', 'song']),
    on=['artist', 'song'],
    how='inner',
)
# Columns not used anywhere downstream. The frame is reassigned instead of
# mutated in place so that re-running the cell always starts from a clean state.
raw = raw.drop(
    columns=['unix_timestamp', 'key', 'mode', 'type', 'uri', 'track_href', 'analysis_url']
)

# df: de-duplicated rows; the cluster labels are added to it later on.
# speechiness, liveness and danceability are left out of the analysis
# (original note: speechiness showed low similarity between the clusters).
df = raw.drop_duplicates().drop(columns=['speechiness', 'liveness', 'danceability'])

# X: numpy array built from the song features used for the clusterization.
#    Identifier columns and the features not used for clustering are removed,
#    and every remaining feature is min-max scaled to the [0, 1] range.
scaler = MinMaxScaler()
X = scaler.fit_transform(
    df.drop(
        columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo']
    )
)

### Feature analysis

In [5]:
# Column labels for the scaled matrix X: every df column that was kept for
# the clusterization.
feature_columns = df.drop(
    columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo']
).columns
scaled_features = pd.DataFrame(X, columns=feature_columns)

# Profile the scaled features and export the report as a standalone HTML file.
profile = ProfileReport(
    scaled_features,
    title='Song features 2019',
    html={'style': {'full_width': True}},
)
profile.to_file('profiling.html')

Summarize dataset: 100%|██████████| 19/19 [00:07<00:00,  2.54it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.04s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 180.40it/s]


In [6]:
# Pairwise scatter plots of all scaled clustering features.
scatter_frame = pd.DataFrame(
    X,
    columns=df.drop(
        columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo']
    ).columns,
)
fig = px.scatter_matrix(scatter_frame, width=1200, height=1600)
fig.show()

### Validating the number of clusters
*   **Elbow Method**

In [7]:
# Elbow method: fit k-means for k = 1..10 and record the inertia of each fit.
inertias = []
for n_clusters in range(1, 11):
    kmeans = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        n_init=10,
        tol=1e-04,
        random_state=4,
    ).fit(X)
    inertias.append(kmeans.inertia_)

# Plot inertia against the cluster count so the elbow can be found visually.
fig = go.Figure(data=go.Scatter(x=np.arange(1, 11), y=inertias))
fig.update_layout(
    title="Inertia vs Cluster Number",
    xaxis=dict(range=[0, 11], title="Cluster Number"),
    yaxis={'title': 'Inertia'},
)
fig.show()

*   **Silhouette**

In [8]:
# Silhouette analysis: average silhouette score for k = 2..10.
# The scan starts at 2 because the silhouette score needs at least two labels.
# KMeans is configured explicitly with the same init / n_init / tol as the
# elbow scan, so both validation methods evaluate the same kind of fit
# instead of depending on the installed scikit-learn defaults.
silhouette_avgs = []
for n_clusters in range(2, 11):
    kmeans = KMeans(
        n_clusters=n_clusters, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=4
    )
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avgs.append(silhouette_score(X, cluster_labels))

# Plot the average silhouette value against the cluster count.
fig = go.Figure(data=go.Scatter(x=np.arange(2, 11), y=silhouette_avgs))
fig.update_layout(
    title="Avg Silhouette Value vs Cluster Number",  # was misspelled "Silhoutte"
    xaxis=dict(range=[0, 11], title="Cluster Number"),
    yaxis={'title': 'Avg Silhouette Value'},
)
fig.show()

### Clusterization implementation

In [9]:
# Final model: 3 clusters, random centroid initialisation, fixed seed.
final_kmeans_params = dict(
    n_clusters=3,
    init="random",
    max_iter=10000,
    random_state=4,
)
kmeans = KMeans(**final_kmeans_params)
# Fit on X (the scaled feature matrix built in the dataset prep cell); left as
# the last expression so the fitted estimator is displayed as the cell output.
kmeans.fit(X)

KMeans(init='random', max_iter=10000, n_clusters=3, random_state=4)

In [10]:
# Visualize all feature distributions on scatter plots, coloured by cluster.

# Attach the cluster label of every row to df. Labels are stored as strings so
# that plotly colours them as discrete categories.
df['cluster'] = kmeans.labels_
df['cluster'] = df['cluster'].astype(str)

# Plot only the clustering features. 'cluster' must be excluded explicitly:
# df contains it at this point, and leaving it in would add the label itself
# as an extra row/column of the scatter matrix.
feature_columns = df.drop(
    columns=[
        'artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo',
        'cluster',
    ]
).columns

fig = px.scatter_matrix(
    df,
    dimensions=feature_columns,
    color="cluster",
    # Pin the label order so colours are assigned to clusters 0, 1, 2, ... in
    # that order rather than in the order the labels first appear in df.
    category_orders={"cluster": sorted(df["cluster"].unique(), key=int)},
    color_discrete_sequence=['#003f5c', '#bc5090', '#ffa600'],
    width=1200,
    height=1600,
    title="Labeled scatter plot after clusterization",
)
fig.update_traces(opacity=0.4)
fig.show()

In [11]:
# Names of the scaled features held in X (df minus the non-feature columns
# and the cluster label).
feature_columns = df.drop(
    columns=[
        'id', 'artist', 'song',
        'duration_ms', 'time_signature', 'no_id', 'tempo',
        'cluster',
    ]
).columns

# Mean of every scaled feature per cluster, reshaped to long form
# (cluster, variable, value) for the polar chart.
polar = (
    pd.DataFrame(X, columns=feature_columns)
    .assign(cluster=kmeans.labels_)
    .groupby("cluster")
    .mean()
    .reset_index()
    .melt(id_vars=["cluster"])
)

fig = px.line_polar(
    polar,
    r="value",
    theta="variable",
    color="cluster",
    color_discrete_sequence=['#003f5c', '#bc5090', '#ffa600'],
    line_close=True,
    title="Average feature value by cluster",
)
fig.show()

In [10]:
# Export the labelled songs without the 'no_id' column.
# drop() returns a new frame here instead of mutating df: an in-place drop
# made this cell fail with KeyError when run twice, and removed a column that
# later cells still list when selecting the feature columns.
df.drop(columns=['no_id']).to_csv('data/clusterization.csv', index=False)

Same polar chart with a dark theme, just to generate the playlist pictures :)

In [None]:
# Names of the scaled features held in X (df minus the non-feature columns
# and the cluster label). errors='ignore' is required because 'no_id' may
# already have been removed from df by the export cell above, in which case a
# strict drop raises KeyError.
feature_columns = df.drop(
    columns=[
        'id', 'artist', 'song',
        'duration_ms', 'time_signature', 'no_id', 'tempo',
        'cluster',
    ],
    errors='ignore',
).columns

# Mean of every scaled feature per cluster, reshaped to long form
# (cluster, variable, value) for the polar chart.
polar = pd.DataFrame(X, columns=feature_columns)
polar['cluster'] = kmeans.labels_
polar = polar.groupby("cluster").mean().reset_index()
polar = pd.melt(polar, id_vars=["cluster"])

# Dark-themed, fixed-size variant of the per-cluster polar chart.
fig = px.line_polar(
    polar,
    r="value",
    theta="variable",
    color="cluster",
    color_discrete_sequence=['lightblue', '#bc5090', '#ffa600'],
    template='plotly_dark',
    line_close=True,
    width=800,
    height=800,
)
fig.update_layout(showlegend=True)
fig.show()