In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the three CSV inputs: per-track audio features, the Last.fm play
# history, and the table keyed by Spotify id ('sp_id').
features = pd.read_csv('../data/spotify_songs_features.csv')
played = pd.read_csv('../data/lastfm_played_tracks.csv')
ids = pd.read_csv('../data/spotify_tracks_ids.csv')

# Look up every feature row's 'id' in the ids table, then re-key the result
# by (artist, song) so it can be matched against the play history.
features_by_track = (
    features
    .join(ids.set_index('sp_id'), on='id')
    .set_index(['artist', 'song'])
)

# Inner join: only plays whose (artist, song) has a feature row survive.
# Columns that are not used later in the notebook are removed right away.
unused_columns = ['unix_timestamp', 'key', 'mode', 'type', 'uri', 'track_href', 'analysis_url']
raw = (
    played
    .join(features_by_track, on=['artist', 'song'], how='inner')
    .drop(columns=unused_columns)
)

# Collapse fully identical rows; note that the original row index is kept.
df = raw.drop_duplicates()

In [None]:
# Build an HTML profiling report over the columns that remain after removing
# the identifier / excluded columns listed below.
profile_input = df.drop(
    columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo']
)
report_options = {'style': {'full_width': True}}
profile = ProfileReport(profile_input, title='Song features 2019', html=report_options)

# Written relative to the notebook's working directory.
profile.to_file('profiling.html')

In [None]:
# Pairwise scatter plots of every remaining feature, to eyeball their
# distributions and relationships.
scatter_input = df.drop(
    columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id', 'tempo']
)
fig = px.scatter_matrix(scatter_input, width=1200, height=1600)
fig.show()

In [None]:
# Elbow method: fit k-means for 1 to 10 clusters and plot the inertia of each
# fit, so a reasonable number of clusters can be picked from the curve.

# Feature matrix: df without the columns that are not clustered on.
X = df.drop(columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id'])

# MinMaxScaler (default feature_range) rescales every column to [0, 1].
# After this line X is a NumPy array, no longer a DataFrame.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Settings shared by every candidate model; only n_clusters varies.
elbow_params = dict(init="k-means++", n_init=10, tol=1e-04, random_state=4)

# Inertia (within-cluster sum of squared distances) for 1..10 clusters.
inertia = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, **elbow_params)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters to locate the elbow.
# update_layout returns the figure, so it renders as the cell output.
fig = go.Figure(data=go.Scatter(x=np.arange(1, 11), y=inertia))
fig.update_layout(
    title="Inertia vs Cluster Number",
    xaxis=dict(range=[0, 11], title="Cluster Number"),
    yaxis={'title': 'Inertia'},
)

In [None]:
# Final k-means model. The number of clusters (4) is the author's choice
# after looking at the inertia plot.
kmeans_params = dict(
    n_clusters=4,
    init="k-means++",
    n_init=10,
    max_iter=1000,
    tol=1e-04,
    random_state=4,
)
kmeans = KMeans(**kmeans_params)

# X is the min-max scaled feature array produced by the elbow cell.
kmeans.fit(X)

In [None]:
# Average (scaled) feature values per cluster, drawn as a polar/radar chart.
#
# X is a bare NumPy array, so the column names have to be recovered. They are
# taken from df with exactly the same drop list that was used to build X,
# which guarantees names and order match the array's columns. (The previous
# version read them from `kdf`, a frame that is never defined in this
# notebook, so the cell failed on a fresh kernel.)
feature_columns = df.drop(
    columns=['artist', 'song', 'id', 'duration_ms', 'time_signature', 'no_id']
).columns

clusters = pd.DataFrame(X, columns=feature_columns)
clusters['label'] = kmeans.labels_  # one label per row of X, same order

# One row per cluster with the mean of every feature, reshaped to long form
# (label, variable, value) as expected by plotly express.
polar = clusters.groupby("label").mean().reset_index()
polar = pd.melt(polar, id_vars=["label"])

fig = px.line_polar(
    polar,
    r="value",
    theta="variable",
    color="label",
    line_close=True,
    height=800,
    width=1400,
)
fig.show()

In [None]:
# Attach the Spotify track id to every clustered row, then count the tracks
# that landed in each cluster.
#
# The ids come from df, the frame X was built from (the previous version read
# them from `kdf`, which is never defined in this notebook). The assignment is
# positional on purpose: `clusters` has a fresh 0..n-1 index while df keeps
# the index left over from the join / drop_duplicates, so an index-aligned
# assignment would mismatch rows or fill NaN. Rows of X, the k-means labels
# and df are all in the same order, so position is the correct key.
clusters['id'] = df['id'].to_numpy()
clusters.groupby('label')['id'].count()

In [None]:
# Persist the labelled rows as CSV; the DataFrame index is not written.
output_path = '../data/clusterization.csv'
clusters.to_csv(output_path, index=False)