In [182]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


from pyclustertend import hopkins
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from collections import Counter

In [128]:
SEED = 2021
DATA_PATH = '../data/data.csv'

### Data exploration

In [129]:
df = pd.read_csv(DATA_PATH, index_col='Unnamed: 0')
df = df.drop('Unnamed: 1', axis=1)
df.head()

Unnamed: 0,name,artist,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
All Out 50s,Don't Be Cruel,Elvis Presley,59,0.697,0.55,2,-11.496,1,0.179,0.856,3.4e-05,0.0907,0.844,84.802,122893
All Out 50s,I've Got You Under My Skin - Remastered 1998,Frank Sinatra,66,0.585,0.247,1,-12.612,1,0.04,0.452,9e-06,0.107,0.591,127.15,223760
All Out 50s,Smoke Gets In Your Eyes,The Platters,0,0.29,0.227,3,-13.06,1,0.0311,0.944,7.9e-05,0.617,0.224,114.278,157293
All Out 50s,"What'd I Say, Pt. 1 & 2",Ray Charles,62,0.54,0.681,4,-5.44,1,0.0508,0.808,0.0,0.162,0.794,88.385,307053
All Out 50s,Dream A Little Dream Of Me,Ella Fitzgerald,0,0.455,0.167,0,-13.613,1,0.0739,0.918,0.0,0.173,0.404,76.118,185067


In [18]:
df.describe()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,53.938571,0.611729,0.627966,4.955714,-8.281466,0.77,0.056079,0.311508,0.016717,0.177165,0.649825,120.179027,217733.592857
std,27.399246,0.142201,0.204401,3.576994,3.577956,0.421133,0.053247,0.284552,0.094045,0.146845,0.225371,26.892387,54347.249353
min,0.0,0.149,0.021,0.0,-24.385,0.0,0.0228,6.5e-05,0.0,0.0207,0.065,62.658,109960.0
25%,50.0,0.526,0.488,2.0,-10.49025,1.0,0.0321,0.0582,0.0,0.085375,0.49,100.0395,179073.25
50%,65.0,0.623,0.6565,5.0,-7.5155,1.0,0.03885,0.2205,2e-06,0.121,0.6835,118.8605,214400.0
75%,72.0,0.71125,0.7865,8.0,-5.6145,1.0,0.055475,0.53725,0.000169,0.214,0.82825,132.96675,247076.75
max,85.0,0.967,0.993,11.0,-1.526,1.0,0.463,0.982,0.954,0.882,0.985,206.313,522307.0


#### popularity
The popularity of the track. The value will be between 0 and 100, with 100 being the most popular.
The popularity of a track is a value between 0 and 100, with 100 being the most popular. The popularity is calculated by algorithm and is based, in the most part, on the total number of plays the track has had and how recent those plays are.

Generally speaking, songs that are being played a lot now will have a higher popularity than songs that were played a lot in the past. Duplicate tracks (e.g. the same track from a single and an album) are rated independently. Artist and album popularity is derived mathematically from track popularity. Note that the popularity value may lag actual popularity by a few days: the value is not updated in real time.

In [22]:
len(df[df['popularity'] == 0])

112

In [19]:
feature_cols = df.drop(['name', 'artist'], axis=1).columns

In [21]:
fig = make_subplots(rows=5, cols=3)

for i, col in enumerate(feature_cols):
    row_index = i // 3
    col_index = i % 3
    fig.add_trace(go.Histogram(x=df[col], name=col), row=row_index + 1, col=col_index + 1)
fig.update_layout(title_text="Feature histograms", width=960, height=800, )
fig.show()

In [None]:
fig = make_subplots(rows=4, cols=3)

for i, col in enumerate(feature_cols):
    row_index = i // 3
    col_index = i % 3
    fig.add_trace(go.Box(y=df[col], name=col), row=row_index + 1, col=col_index + 1)
fig.update_layout(title_text="Audio feature boxplots", width=960, height=800, )
fig.show()

In [44]:
fig = go.Figure(data=go.Splom(
                  dimensions=[dict(label='Danceability', values=df['danceability']),
                              dict(label='Energy', values=df['energy']),
                              dict(label='Loudness', values=df['loudness']),
                              dict(label='Speechiness', values=df['speechiness']),
                              dict(label='Instrumentalness', values=df['instrumentalness']),
                              dict(label='Liveness', values=df['liveness']),
                              dict(label='Valence', values=df['valence']),
                              dict(label='Tempo', values=df['tempo'])],
                  marker=dict(color=df['popularity'],
                              colorscale='Viridis',
                              line_color='white',
                              line_width=0.5,
                              colorbar=dict(thickness=20)),
                  diagonal=dict(visible=False)))

fig.update_layout(title="Audio features scatterplot matrix colored by popularity",
                  dragmode='select',
                  width=1000,
                  height=1000)

fig.show()

In [117]:
cols = ['danceability', 'energy',  'speechiness', 'acousticness', 'liveness', 'valence']

def get_sample(seed=2021):
    sample = df[cols + ['name']].sample(random_state=seed)
    name = sample['name']
    sample = sample.drop('name', axis=1).T
    sample = sample.values.squeeze(1)
    #sample = (sample - np.mean(sample)) / np.std(sample)
    return sample, name.values[0]

In [119]:
sample_1, name_1 = get_sample(42)
sample_2, name_2 = get_sample(2021)

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=sample_1,
      theta=cols,
      fill='toself',
      name=name_1
    ))

fig.add_trace(go.Scatterpolar(
      r=sample_2,
      theta=cols,
      fill='toself',
      name=name_2
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

fig.show()

### Clustering

In [159]:
def check_clustering_tendency(data):
    return hopkins(data, data.shape[0])


def silhouette(data, max_k):
    silhouette_values = []

    for k in range(2, max_k):
        kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
        silhouette_value = silhouette_score(data, kmeans.labels_)
        silhouette_values.append(silhouette_value)
    return silhouette_values


def get_labels(data, k):
    kmeans = KMeans(n_clusters=k, random_state=SEED).fit(data)
    return kmeans.labels_

In [125]:
check_clustering_tendency(df[feature_cols])

0.08010525503690626

In [167]:
silhouette_results = silhouette(df[feature_cols], 10)

fig = px.line(silhouette_results, markers=True, x=[i for i in range(2, 10)], y=silhouette_results, title='Silhouette values by k')
fig.add_annotation(x=3, y=0.5490264,
            text="Maximum silhouette value indicates that 3 is best k",
            showarrow=True,
            arrowhead=5)
fig.update_xaxes(title_text='k')
fig.update_yaxes(title_text='silhouette')

fig.show()

In [156]:
max_metric_value = max(silhouette_results)
silhouette_results.index(max_metric_value) + 2

3

In [178]:
classes = get_labels(df[feature_cols], k=3)

In [191]:
labels = [0, 1, 2]
values = list(Counter(classes).values())

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

fig.update_layout(title='Class distribution in percent')
fig.show()

AttributeError: 'dict_values' object has no attribute 'to_list'