In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager
%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')

# Use the Malgun Gothic font file so Korean text renders correctly in plots.
# The path only exists on Windows; elsewhere keep matplotlib's default font
# instead of failing on the missing file in the very first cell.
font_fname = 'C:/Windows/Fonts/malgun.ttf'
if os.path.exists(font_fname):
    font_family = font_manager.FontProperties(fname=font_fname).get_name()
    plt.rcParams["font.family"] = font_family
else:
    # Keep the name defined so later references do not raise NameError.
    font_family = None
    print(f"Font file not found: {font_fname}; keeping matplotlib's default font.")

In [None]:
def to_int(data):
    """Return a copy of ``data`` with its categorical columns integer-encoded.

    The columns ``Gender``, ``Job``, ``Edu``, ``Health``, ``Earn`` and
    ``Self_conf`` are replaced by integer codes. For each column the known
    labels are numbered 0, 1, 2, ... in the order listed below, and every
    other value (including missing values) receives the next code after the
    last listed label.

    The input frame is left untouched. Encoding in place would make a second
    call on the same frame collapse every column to its fallback code,
    because already-encoded integers no longer match any label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the six columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the six columns encoded; other columns are copied
        unchanged.

    Raises
    ------
    KeyError
        If one of the six columns is missing from ``data``.
    """
    # Column -> labels in code order (the position in the list is the code).
    label_order = {
        'Gender': ['남성'],
        'Job': ['비경활', '은퇴', '부분은퇴', '임금_고숙련',
                '임금_중숙련', '임금_저숙련', '자영업'],
        'Edu': ['초등', '중등', '고등'],
        'Health': ['활동에 제한 있음', '그렇지 않은 편'],
        'Earn': ['하', '중'],
        'Self_conf': ['하', '중'],
    }

    encoded = data.copy()
    for column, labels in label_order.items():
        values = data[column]
        # Labels are distinct, so at most one condition matches per row;
        # rows matching none fall through to the default code.
        encoded[column] = np.select(
            [values == label for label in labels],
            list(range(len(labels))),
            default=len(labels),
        )
    return encoded

In [None]:
from kmodes.kmodes import KModes
# Gather the eight data sets (defined outside this cell) so they can be
# processed in one loop.
df_lst = [
    data_1, data_2, data_3, data_4,
    data_5, data_6, data_7, data_8,
]

In [None]:
# Elbow method: for every data set, fit K-Modes once per candidate cluster
# count and plot the clustering cost against the number of clusters.
K = range(1, 20)
for idx, df in enumerate(df_lst):
    cost = []
    for num_clusters in K:
        # Fixed seed so the random initialisation, and therefore the curve,
        # is reproducible after a kernel restart.
        kmode = KModes(n_clusters=num_clusters, init='random', n_init=5,
                       random_state=2022)
        # Only the fitted cost is needed, so skip the extra prediction pass.
        kmode.fit(df)
        cost.append(kmode.cost_)

    fig, ax = plt.subplots()
    ax.plot(K, cost, 'bx-')
    ax.set_xlabel('No. of clusters')
    ax.set_ylabel('Cost')
    ax.set_title(f'Elbow Method For {idx+1}th Optimal K')
    plt.show()

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
# Encode a copy so the raw `data_1` frame (also held in `df_lst`) keeps its
# original labels and this cell gives the same result when re-run.
data = to_int(data_1.copy())
# Fixed seed: with `init='random'` the clusters would otherwise differ on
# every execution.
kmodes = KModes(n_clusters=6, init='random', random_state=2022)
kmodes.fit(data)

In [None]:
# Show the cluster label the fitted K-Modes model assigned to each row of `data`.
kmodes.labels_

In [None]:
# K-Modes groups rows by counting category mismatches, so score the clustering
# with the matching (Hamming) distance. The default Euclidean distance would
# treat the integer category codes as magnitudes.
silhouette_score(data, kmodes.labels_, metric='hamming')

In [None]:
def visualize_silhouette(cluster_lists, X_features, metric='hamming'):
    """Draw one silhouette plot per candidate cluster count, side by side.

    For every entry of ``cluster_lists`` a K-Modes model is fitted on
    ``X_features``. Each subplot shows the sorted per-sample silhouette
    values as one horizontal band per cluster, marks the average score with
    a dashed red line and reports it in the title.

    Parameters
    ----------
    cluster_lists : sequence of int
        Cluster counts to compare; one subplot is drawn per entry.
    X_features : DataFrame or array-like
        Samples to cluster. Presumably integer-encoded categorical features,
        since they are also passed to scikit-learn's silhouette functions --
        confirm against the caller.
    metric : str, default 'hamming'
        Distance passed to ``silhouette_score`` / ``silhouette_samples``.
        Hamming distance counts mismatching categories, in line with K-Modes;
        pass ``'euclidean'`` for scikit-learn's default metric.

    Raises
    ------
    ValueError
        If ``cluster_lists`` is empty.
    """
    from kmodes.kmodes import KModes
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    # One subplot column per requested cluster count.
    n_cols = len(cluster_lists)
    if n_cols == 0:
        raise ValueError("cluster_lists must contain at least one cluster count")

    # squeeze=False keeps `axs` a 2-D array even for a single subplot, so a
    # one-element `cluster_lists` can be indexed like any other.
    fig, axs = plt.subplots(figsize=(4 * n_cols, 4), nrows=1, ncols=n_cols,
                            squeeze=False)
    axs = axs[0]

    # Vertical gap, in rows, left between neighbouring cluster bands.
    band_gap = 10

    for ind, n_cluster in enumerate(cluster_lists):
        ax = axs[ind]

        # Fit K-Modes, then compute the average and per-sample silhouette values.
        clusterer = KModes(n_clusters=n_cluster, max_iter=500, random_state=2022)
        cluster_labels = clusterer.fit_predict(X_features)

        sil_avg = silhouette_score(X_features, cluster_labels, metric=metric)
        sil_values = silhouette_samples(X_features, cluster_labels, metric=metric)

        ax.set_title('Number of Cluster : ' + str(n_cluster) + '\n'
                     'Silhouette Score :' + str(round(sil_avg, 3)))
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.set_xlim([-0.1, 1])
        # Room for every sample plus the gaps between and around the bands.
        ax.set_ylim([0, len(X_features) + (n_cluster + 1) * band_gap])
        ax.set_yticks([])  # Clear the y-axis labels / ticks
        ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # Stack one filled band per cluster from bottom to top.
        y_lower = band_gap
        for i in range(n_cluster):
            ith_cluster_sil_values = np.sort(sil_values[cluster_labels == i])

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sil_values,
                             facecolor=color, edgecolor=color, alpha=0.7)
            # Label the band with its cluster index at mid-height.
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + band_gap

        # Dashed red line marks the average silhouette score.
        ax.axvline(x=sil_avg, color="red", linestyle="--")

In [None]:
# Compare silhouette plots for candidate cluster counts 2 through 6.
candidate_cluster_counts = [2, 3, 4, 5, 6]
visualize_silhouette(candidate_cluster_counts, data)