In [None]:
import pandas as pd

# Load the raw customer table; the relative path is resolved against the
# kernel's working directory.
customer_data = pd.read_csv(filepath_or_buffer="customer_data.csv")

In [None]:
import seaborn as sns

# Drop the CustomerID column before plotting.
# errors='ignore' keeps this cell re-runnable: the cell reassigns
# customer_data, so a second execution would otherwise raise KeyError
# because the column is already gone.
customer_data = customer_data.drop(columns='CustomerID', errors='ignore')

# Pairwise scatter/histogram grid over the remaining columns.
sns.pairplot(customer_data)

In [None]:
# Histogram of the Age column.
customer_data.loc[:, "Age"].hist()

In [None]:
# Bin edges for the age buckets: 15, 20, 30, 40, 50, 60, 70.
intervals = [15, *range(20, 71, 10)]
col = customer_data['Age']
# Store the interval each customer's age falls into as a new column.
customer_data['Age Groups'] = pd.cut(col, intervals)

# Echo the new column as the cell output.
customer_data.loc[:, 'Age Groups']

In [None]:
# One-hot encode every non-numeric column of customer_data.
# dtype=int pins the indicator columns to integers: the default dummy dtype
# differs between pandas versions (uint8 before 2.0, bool from 2.0 on), and
# the plotting cells that follow expect numeric columns.
customer_data_encoded = pd.get_dummies(customer_data, dtype=int)
# Display the one-hot encoded dataframe
customer_data_encoded

In [None]:
# Pairwise relationship grid over the one-hot encoded frame.
sns.pairplot(data=customer_data_encoded)

In [None]:
# Show the documentation of scipy's linkage function.
# The import is done here because this cell runs before the cell that
# imports shc, so the lookup would not resolve on a fresh kernel.
import scipy.cluster.hierarchy as shc

help(shc.linkage)

In [None]:
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 20))
plt.title("Customers Dendrogram")

# Select Annual Income and Spending Score by column name rather than by
# position, so the selection does not silently change if the column order
# of the encoded frame changes. These are the same column names the
# labelled scatter plot later reads from selected_data.
selected_data = customer_data_encoded[['Annual Income (k$)', 'Spending Score (1-100)']]
# Hierarchical clustering with Ward linkage on Euclidean distances.
clusters = shc.linkage(selected_data,
            method='ward',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
plt.figure(figsize=(20, 20))
# Title the figure so it can be told apart from the other linkage variants.
plt.title("Customers Dendrogram (complete linkage)")
# Same data as the Ward dendrogram, but merged with complete linkage.
clusters = shc.linkage(selected_data,
            method='complete',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
plt.figure(figsize=(20, 20))
# Title the figure so it can be told apart from the other linkage variants.
plt.title("Customers Dendrogram (centroid linkage)")
# Same data as the Ward dendrogram, but merged with centroid linkage.
clusters = shc.linkage(selected_data,
            method='centroid',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
plt.figure(figsize=(20, 20))
# Title the figure so it can be told apart from the other linkage variants.
plt.title("Customers Dendrogram (average linkage)")
# Same data as the Ward dendrogram, but merged with average linkage.
clusters = shc.linkage(selected_data,
            method='average',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
plt.figure(figsize=(20, 20))
# Title the figure so it can be told apart from the other linkage variants.
plt.title("Customers Dendrogram (single linkage)")
# Same data as the Ward dendrogram, but merged with single linkage.
clusters = shc.linkage(selected_data,
            method='single',
            metric="euclidean")
shc.dendrogram(Z=clusters)
plt.show()

In [None]:
plt.figure(figsize=(10, 7))
# Spelling fixed ("Dendogram" -> "Dendrogram") to match the first dendrogram's title.
plt.title("Customers Dendrogram with line")
# Recompute the Ward linkage so this cell does not depend on whichever
# linkage variant was run last.
clusters = shc.linkage(selected_data,
            method='ward',
            metric="euclidean")
shc.dendrogram(clusters)
# Horizontal red reference line at a merge distance of 125.
plt.axhline(y = 125, color = 'r', linestyle = '-')
# Render explicitly so the cell output is the figure, not the Line2D repr.
plt.show()

In [None]:
# Preview the first rows of the columns at positions 1 and 2.
customer_data_encoded.head().iloc[:, 1:3]

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Select the two clustering features by name (the same columns the labelled
# scatter plot reads from selected_data) instead of by position.
selected_data = customer_data_encoded[['Annual Income (k$)', 'Spending Score (1-100)']]
# The 'affinity' keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4. Euclidean distance is the default and the only metric Ward linkage
# accepts, so omitting the argument behaves the same on old and new releases.
clustering_model = AgglomerativeClustering(n_clusters=5, linkage='ward')
clustering_model.fit(selected_data)
# Cluster index assigned to each row of selected_data.
clustering_model.labels_

In [None]:
# Color each customer by the label from the fitted agglomerative model.
data_labels = clustering_model.labels_
sns.scatterplot(
    data=selected_data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue=data_labels,
    palette='tab10',
).set_title('Labeled Customer Data')

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Number of clusters
k = 5
x = selected_data.iloc[:, 0].values
y = selected_data.iloc[:, 1].values
# (n_samples, 2) feature matrix built from the two selected columns.
X = np.column_stack((x, y))
# random_state makes the clustering reproducible across runs; n_init is set
# explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# Fitting the input data
kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.labels_
# Centroid values
centroids = kmeans.cluster_centers_
print(centroids)
# Base palette; indexed modulo its length so any k is supported.
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
    # Boolean mask keeps the (m, 2) shape even when a cluster is empty.
    points = X[labels == i]
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i % len(colors)])
# Mark the cluster centers with large stars.
ax.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505')

In [None]:
# Scatter the two selected features, colored by the fitted KMeans labels.
sns.scatterplot(
    x=x,
    y=y,
    hue=kmeans.labels_,
    palette='tab10',
).set_title('Labeled Customer Data')

In [None]:
# Candidate cluster counts for the elbow plot.
Nc = range(1, 20)
# Fixed random_state / n_init so the curve is reproducible between runs.
kmeans_lst = [KMeans(n_clusters=i, n_init=10, random_state=42) for i in Nc]
# KMeans.score returns the negated K-means objective, so higher is better.
score = [model.fit(X).score(X) for model in kmeans_lst]

plt.figure(figsize=(15,15))
plt.plot(Nc, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
# The cluster counts belong on the x axis; the y axis carries the score
# values, which are on a completely different scale.
plt.xticks(Nc)
plt.title('Elbow Curve')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score for each candidate cluster count from 2 to 24.
# Loop-local names (n_clusters, model) are used so this cell does not
# overwrite the k / kmeans / labels variables that earlier cells read.
sil = []
for n_clusters in range(2, 25):
    # Fixed random_state / n_init so the scores are reproducible.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    sil.append(silhouette_score(X, model.fit_predict(X)))

In [None]:
from matplotlib.ticker import AutoMinorLocator

ax = plt.subplot()

# Minor ticks on both axes.
ax.xaxis.set_minor_locator(AutoMinorLocator())
ax.yaxis.set_minor_locator(AutoMinorLocator())
# tick_params applies to both axes by default, so one pair of calls is enough.
ax.tick_params(which='minor', length = 4, color='red')
ax.tick_params(which='major', length = 8, color='blue')

# x positions are derived from len(sil) (scores start at 2 clusters) so the
# plot stays aligned if the scanned range is changed.
ax.plot(list(range(2, 2 + len(sil))), sil)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score by Number of Clusters')