In [34]:
# Initial import
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [35]:
# load data 
file_path = "shopping_data_processed.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,Card Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [36]:
df_shopping.hvplot.scatter(x='Annual_Income',y='Spending_Score')

In [37]:
# function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=0)
    model
    # Fit the model
    model.fit(df)
    
    # Add new class column to df_shopping
    df['class'] = model.labels_
    
    # plot the dataset
    fig = px.scatter_3d(df, x='Annual_Income', y='Spending_Score',z='Age', color='class',symbol='class',width=800)
    fig.update_layout(legend=dict(x=0,y=1))
    fig.show()

In [38]:
# pass df_shopping and 2 to the function
test_cluster_amount(df_shopping, 2)

# plot the results 
df_shopping.hvplot.scatter(x='Annual_Income',y='Spending_Score', by='class')

In [39]:
# make 3d plot
fig = px.scatter_3d(df_shopping, x='Annual_Income', y='Spending_Score',z='Age', color='class',symbol='class',width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [40]:
# test with 3 clusters
test_cluster_amount(df_shopping, 3)

In [41]:
# make 3d plot
fig = px.scatter_3d(df_shopping, x='Annual_Income', y='Spending_Score',z='Age', color='class',symbol='class',width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [42]:
# test with 4 clusters
test_cluster_amount(df_shopping, 4)

In [43]:
# test with 5 clusters
test_cluster_amount(df_shopping, 5)

### Create elbow curve to determine best number of clusters


In [44]:
# create empty dictionary with clusters as keys and inertia as values
inertia = []

k = list(range(1,11))
# loop through K
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(df_shopping)
    inertia.append(km.inertia_)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [45]:
inertia

[309336.0550000001,
 213353.4670076726,
 143595.18756788792,
 104527.52546158804,
 75424.02157802146,
 58413.18538324422,
 51219.472570910344,
 44443.10700365701,
 40760.33139288892,
 37717.325042342745]

In [46]:
# create dataframe
df_elbow = pd.DataFrame({'k':k,'inertia':inertia})
df_elbow.hvplot.line(x='k',y='inertia', xticks=k, title='Elbow Curve')

In [48]:
# create function to  get dataframe from k clusters
def get_clusters(k, data):
    # create copy of dataset
    data = data.copy()
    
    # initiate K-Means model
    km = KMeans(n_clusters=k, random_state=0)
    
    #fit the model
    km.fit(data)
    
    # predict clusters
    predictions = km.predict(data)
    
    # create return DataFrame with predicted clusters
    data['class'] = km.labels_
    
    return data

In [49]:
five_clusters = get_clusters(5, df_shopping)
five_clusters.head()

Unnamed: 0,Card Member,Age,Annual_Income,Spending_Score,class
0,1,19.0,15.0,39.0,0
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,0
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,0


In [50]:
six_clusters = get_clusters(6, df_shopping)
six_clusters.head()

Unnamed: 0,Card Member,Age,Annual_Income,Spending_Score,class
0,1,19.0,15.0,39.0,3
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,3
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,3


In [51]:
# plot 2d scatter for 5 clusters
five_clusters.hvplot.scatter(x='Annual_Income',y='Spending_Score', by='class')

In [56]:
# create 3d plot for 5 clusters
fig = px.scatter_3d(
    five_clusters,
    x='Age',
    y='Spending_Score',
    z='Annual_Income',
    color='class',
    symbol='class',
    width=800
)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [54]:
# 2d plot for six clusters
six_clusters.hvplot.scatter(x='Annual_Income', y='Spending_Score',by='class')

In [57]:
# 3d plot for six clusters
fig = px.scatter_3d(
    six_clusters,
    x='Age',
    y='Spending_Score',
    z='Annual_Income',
    color='class',
    symbol='class',
    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()