# Market Segmentation

## Hierarchical Agglomerative Clustering

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KernelDensity

import warnings
warnings.filterwarnings('ignore')

# Raise pandas' display limits (rows, columns, console width) for this notebook
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 40),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

### Load Data

In [2]:
# Read the artist-level training table into a DataFrame.
# The two 'Unnamed' columns are dropped right away (presumably leftover index
# columns from earlier to_csv round-trips - confirm against the export step).
# The drop is chained rather than done in place so `artists` is built in a
# single, re-runnable expression.
artists = (
    pd.read_csv('Data/artists_train.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() - doing so appends a stray "None" line to the output.
artists.info()

# Preview dataframe (last expression of the cell -> rich display)
artists.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194248 entries, 0 to 194247
Data columns (total 28 columns):
artist_id                  194248 non-null object
artist_name                191716 non-null object
artist_followers           194248 non-null float64
artist_popularity          194248 non-null float64
track_popularity_mean      194248 non-null float64
track_popularity_std       194248 non-null float64
track_release_year_mean    194248 non-null float64
track_release_year_std     194248 non-null float64
acousticness_mean          194248 non-null float64
acousticness_std           194248 non-null float64
danceability_mean          194248 non-null float64
danceability_std           194248 non-null float64
duration_ms_mean           194248 non-null float64
duration_ms_std            194248 non-null float64
energy_mean                194248 non-null float64
energy_std                 194248 non-null float64
instrumentalness_mean      194248 non-null float64
instrumentalness_std    

Unnamed: 0,artist_id,artist_name,artist_followers,artist_popularity,track_popularity_mean,track_popularity_std,track_release_year_mean,track_release_year_std,acousticness_mean,acousticness_std,danceability_mean,danceability_std,duration_ms_mean,duration_ms_std,energy_mean,energy_std,instrumentalness_mean,instrumentalness_std,liveness_mean,liveness_std,loudness_mean,loudness_std,speechiness_mean,speechiness_std,tempo_mean,tempo_std,valence_mean,valence_std
0,2jTsRGHAvKXXSbeO52G1t5,Webbstur,3453.0,28.0,17.8,10.19586,2017.3,0.948683,0.170563,0.2555,0.5925,0.116009,245121.3,51594.498027,0.8404,0.086234,0.41395,0.373703,0.15339,0.093559,-6.7449,1.570696,0.04442,0.015108,124.997,2.174399,0.42026,0.266024
1,5CRw2KQzb9aH8HuyuQ6QCL,Ozy,103.0,6.0,2.5,5.233439,2012.7,4.001389,0.197861,0.354064,0.6189,0.177373,267232.6,94528.177818,0.7245,0.278383,0.67246,0.294077,0.21175,0.136774,-9.2551,5.195449,0.09468,0.055908,148.4241,21.335355,0.44111,0.272943
2,11fxyJpCpCnuG2c6NMuZNt,Bia Macedo,5177.0,11.0,7.8,4.391912,2017.0,0.0,0.26262,0.159743,0.6322,0.087031,180036.6,12755.167904,0.8767,0.123941,0.0,0.0,0.377,0.226577,-2.9268,2.2887,0.12018,0.090222,153.8066,20.089296,0.7754,0.090969
3,0pRBqfB42iVqXKIFJdfFKS,Brett Detar,1781.0,20.0,14.1,7.752419,2011.8,1.549193,0.183107,0.27025,0.4248,0.128387,259215.9,49391.796063,0.6632,0.20092,0.072125,0.140028,0.22184,0.176418,-7.4456,2.286342,0.03678,0.017357,135.1359,29.490275,0.4869,0.26961
4,5MpELOfAiq7aIBTij30phD,Harry James,17750.5,53.0,34.7,9.894709,1977.8,31.701901,0.8419,0.158289,0.4435,0.140408,197270.7,38825.948564,0.24639,0.198372,0.267144,0.373276,0.18035,0.086693,-12.8619,4.617813,0.03994,0.018269,100.5924,21.342652,0.4042,0.173864


### Normalize Data

In [3]:
# Build two rescaled copies of the feature matrix, one per scaler.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Feature matrix: every column except the two identifier columns
X_train = artists.drop(['artist_id', 'artist_name'], axis=1)

# StandardScaler version, wrapped back into a DataFrame with the original column names
ss = StandardScaler()
X_train_sscaled = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)

# MinMaxScaler version, same treatment
mms = MinMaxScaler()
X_train_mmscaled = pd.DataFrame(mms.fit_transform(X_train), columns=X_train.columns)

In [None]:
# HAC on the standardized features.
#
# Without a connectivity constraint, agglomerative clustering has to consider all
# pairwise distances, i.e. O(n^2) memory. The training table has 194,248 rows,
# which would need on the order of 150 GB for the condensed distance matrix, so
# fitting on the full frame is not feasible on ordinary hardware. The model is
# therefore fitted on a reproducible random sample of rows.
HAC_SAMPLE_SIZE = 10000   # rows used for HAC; raise only if memory allows
HAC_SEED = 42             # fixed seed so the same rows are drawn on every run

X_hac_sample = X_train_sscaled.sample(
    n=min(HAC_SAMPLE_SIZE, len(X_train_sscaled)),
    random_state=HAC_SEED,
)

agg_clust = AgglomerativeClustering(n_clusters=10)

# One cluster label per row of X_hac_sample (NOT per row of the full table);
# X_hac_sample.index holds the positions of the sampled rows in X_train_sscaled.
assigned_clust = agg_clust.fit_predict(X_hac_sample)

In [None]:
# Visualizing how HAC works
# NOTE(review): `plot_agg_alg` is not defined anywhere in this notebook - presumably a
# local helper module (plot_agg_alg.py) that has to sit next to the notebook; confirm it
# is shipped with it, otherwise this cell fails with ModuleNotFoundError.
from plot_agg_alg import plot_agglomerative_algorithm
# Called with no arguments, so whatever it draws does not depend on the artist data above.
plot_agglomerative_algorithm()


In [None]:
# NOTE(review): `plot_agg` is not defined anywhere in this notebook - presumably a local
# helper module (plot_agg.py) kept alongside it; confirm it is shipped with the notebook.
from plot_agg import plot_agglomerative
# Called with no arguments, so it does not use any variable created in earlier cells.
plot_agglomerative()

In [None]:



# Create data - 6 Cluster Example
# The synthetic data is built BEFORE the dendrogram: the original order called
# ward(X) before X existed (NameError on a fresh kernel) and never imported
# make_blobs.
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.datasets import make_blobs

k = 6      # number of blob centres
m = 400    # number of samples
X, y = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=0.8, random_state=1234)

# Scatter of the raw blobs, coloured by the generating centre
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, s=10)

# Dendrogram
linkage_array = ward(X)

# Own figure, so the tree is not drawn on top of the scatter plot
fig, ax = plt.subplots()
dendrogram(linkage_array, ax=ax)

# Column 2 of the linkage matrix holds the distance at which each merge happens.
# The tree has exactly c clusters between the c-th-last and the (c-1)-th-last
# merge, so the cut line is placed halfway between those two heights instead of
# at hard-coded values that only fit one particular dataset.
merge_heights = linkage_array[:, 2]
bounds = ax.get_xbound()
for n_cut_clusters in (2, 3):
    cut_height = (merge_heights[-n_cut_clusters] + merge_heights[-(n_cut_clusters - 1)]) / 2
    ax.plot(bounds, [cut_height, cut_height], '--', c='k')
    ax.text(bounds[1], cut_height, ' %d clusters' % n_cut_clusters,
            va='center', fontdict={'size': 12})

ax.set_xlabel("Data index")
ax.set_ylabel("Cluster distance");

# Try different linkage settings in HAC algorithm
# n_clusters reuses k (the number of blob centres, 6) rather than repeating the literal.
agg_comp = AgglomerativeClustering(linkage="complete", n_clusters=k)
agg_avg = AgglomerativeClustering(linkage="average", n_clusters=k)
agg_ward = AgglomerativeClustering(linkage="ward", n_clusters=k)

as_comp = agg_comp.fit_predict(X)
as_avg = agg_avg.fit_predict(X)
as_ward = agg_ward.fit_predict(X)

# Visualize predictions
# One panel per linkage: three consecutive plt.scatter calls would all land on
# the same axes, leaving only the last assignment visible.
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharex=True, sharey=True)
linkage_results = [
    ('complete', as_comp),
    ('average', as_avg),
    ('ward', as_ward),
]
for panel, (linkage_name, assignments) in zip(axes, linkage_results):
    panel.scatter(X[:, 0], X[:, 1], c=assignments, s=10)
    panel.set_title('%s linkage' % linkage_name)

# Dendrogram for Ward cluster
from scipy.cluster.hierarchy import dendrogram, ward

linkage_array = ward(X)

# Full tree (one leaf per sample) on its own figure
fig, ax = plt.subplots()
dendrogram(linkage_array, ax=ax)
ax.set_xlabel("Sample index")
ax.set_ylabel("Cluster distance")

# Make visualization more interpretable
# Second figure: drawing the truncated tree on the same axes as the full tree
# would overlay the two plots. truncate_mode='lastp' with p=12 keeps only the
# last 12 merged clusters as leaves.
fig, ax = plt.subplots()
dendrogram(linkage_array, truncate_mode='lastp', p=12, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
ax.set_xlabel('cluster size')
ax.set_ylabel('distance')
plt.show()

# Run k-means to compare
from sklearn.cluster import KMeans

# random_state is fixed because k-means starts from random centroids; without a
# seed the labels (and every score computed from them) change between runs.
# n_clusters reuses k (the number of blob centres, 6).
k_means = KMeans(n_clusters=k, random_state=1234)
y_hat = k_means.fit_predict(X)   # fit and label the same data in one call
cl_centers = k_means.cluster_centers_

# Own figure: points coloured by assigned cluster, fitted centroids in black
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_hat, s=10)
ax.scatter(cl_centers[:, 0], cl_centers[:, 1], c='black', s=40);

# Evaluation
from sklearn import metrics

labels_kmeans = k_means.labels_
labels_comp = agg_comp.labels_
labels_avg = agg_avg.labels_
labels_ward = agg_ward.labels_

# scikit-learn renamed `calinski_harabaz_score` to `calinski_harabasz_score` and
# later removed the old spelling; prefer the new name and fall back to the old
# one so the cell also runs on older installations.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabaz_score

model_names = ['k-means', 'HAC ward', 'HAC average', 'HAC complete']
model_labels = [labels_kmeans, labels_ward, labels_avg, labels_comp]

# Metrics, in column order:
# - Adjusted Rand Index: compares against the true blob labels y.
#   1.0 is a perfect match; values near 0 correspond to random labelling.
# - Fowlkes-Mallows score: compares against y. Bounded between 0 and 1,
#   closer to 1 is better.
# - Calinski-Harabasz index: uses only X and the predicted labels.
#   Not bounded; the higher, the better.
# - Silhouette coefficient: uses only X and the predicted labels.
#   Bounded at -1 and 1; closer to -1 suggests incorrect clustering,
#   closer to +1 indicates dense, well-separated clusters.
metric_names = ['adjusted_rand', 'fowlkes_mallows', 'calinski_harabasz', 'silhouette']

score_rows = [
    [
        metrics.adjusted_rand_score(y, labels),
        metrics.fowlkes_mallows_score(y, labels),
        calinski_harabasz(X, labels),
        metrics.silhouette_score(X, labels),
    ]
    for labels in model_labels
]

# A single table (one row per model) instead of sixteen bare expressions, of
# which a notebook cell would only display the last.
cluster_scores = pd.DataFrame(score_rows, index=model_names, columns=metric_names)
cluster_scores
