In [None]:
# Adapted from https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f

In [None]:
# Import required packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column groups used by later cells: the categorical column is one-hot
# encoded, the continuous columns are summarised with describe().
categorical_features = [
    'bidang_usaha',
]
continuous_features = [
    'pendapatan_tahunan',
    'usia_perusahaan',
    'jumlah_karyawan',
]

In [None]:
# Read the CSV (path is relative to the working directory) into a DataFrame
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='cluster_konsumen.csv')
data.head(n=5)

In [None]:
# Discard the "id" column, then preview the remaining columns.
data = data.drop(columns=["id"])
data.head(n=5)


In [None]:
# Preview the first five rows of every column except the first one.
data.head(n=5).iloc[:, 1:]

In [None]:
# Summary statistics for the continuous columns only.
data.loc[:, continuous_features].describe()

In [None]:
# One-hot encode the categorical columns: each listed column is removed and
# replaced by indicator columns named "<column>_<value>", which are appended
# after the remaining columns.
data = pd.get_dummies(data, columns=categorical_features)
data.head(n=5)

In [None]:
# Min-max scale every column except the first one; `mms` keeps the fitted
# scaler and `data_transformed` holds the scaled array.
# NOTE(review): the one-hot columns were appended at the end of `data`, so this
# positional slice drops whichever column is now first — confirm that column
# is really meant to be excluded from clustering.
scaler_input = data.iloc[:, 1:]
mms = MinMaxScaler()
data_transformed = mms.fit_transform(scaler_input)

In [None]:
# Elbow method: fit K-Means for k = 1..14 on the scaled features and record
# the inertia (within-cluster sum of squared distances) of each fit.
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    # A fixed random_state makes the curve reproducible from run to run.
    # n_init is set explicitly because its default differs between
    # scikit-learn releases.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Fit the final 6-cluster model on the scaled features.
# random_state pins the centroid initialisation so cluster labels are
# reproducible; n_init is set explicitly because its default differs between
# scikit-learn releases.
k_means = KMeans(n_clusters=6, n_init=10, random_state=42)
k_means.fit(data_transformed)

In [None]:
# Assign each row to a cluster with the model that was fitted on the scaled
# features. (Refitting on the raw `data` frame would throw that fit away,
# cluster on unscaled values, and on a re-run would treat the "label" column
# itself as a feature.)
clusters = k_means.predict(data_transformed)
data["label"] = clusters

# (colour, marker) for cluster labels 0..5, in label order.
CLUSTER_STYLES = [
    ('blue', 'X'),
    ('red', 'o'),
    ('green', 'o'),
    ('orange', 'o'),
    ('purple', 'o'),
    ('black', 'o'),
]


def _draw_cluster_panel(ax, frame, elev, azim, common_kwargs=None, per_label_kwargs=None):
    """Scatter every cluster of ``frame`` on a 3D axes and label all three axes.

    Parameters
    ----------
    ax : 3D matplotlib axes to draw on.
    frame : DataFrame with ``usia_perusahaan``, ``pendapatan_tahunan``,
        ``jumlah_karyawan`` and ``label`` columns.
    elev, azim : viewing angles forwarded to ``ax.view_init``.
    common_kwargs : optional extra keyword arguments for every ``scatter`` call.
    per_label_kwargs : optional mapping of cluster label to extra ``scatter``
        keyword arguments used for that cluster only.
    """
    common_kwargs = common_kwargs or {}
    per_label_kwargs = per_label_kwargs or {}
    for label, (colour, marker) in enumerate(CLUSTER_STYLES):
        members = frame[frame["label"] == label]
        ax.scatter(members["usia_perusahaan"], members["pendapatan_tahunan"], members["jumlah_karyawan"],
                   c=colour, s=60, edgecolors='white', marker=marker,
                   **common_kwargs, **per_label_kwargs.get(label, {}))
    ax.view_init(elev, azim)
    # Label through the axes object: plt.xlabel/plt.ylabel only affect the
    # current (last created) axes and would leave the other panel unlabeled.
    ax.set_xlabel("usia_perusahaan")
    ax.set_ylabel("pendapatan_tahunan")
    ax.set_zlabel('jumlah_karyawan')


# Two views of the same clusters from different camera angles.
fig = plt.figure(figsize=(20,10))

ax = fig.add_subplot(121, projection='3d')
_draw_cluster_panel(ax, data, elev=30, azim=120)

ax2 = fig.add_subplot(122, projection='3d')
_draw_cluster_panel(ax2, data, elev=15, azim=165,
                    common_kwargs={'alpha': 1},
                    per_label_kwargs={1: {'linewidths': 2}})

plt.show()

In [None]:
# usia_perusahaan values of the rows assigned to cluster 0.
data.loc[data["label"] == 0, "usia_perusahaan"]