In [None]:
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings; warnings.filterwarnings('ignore')

from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import MinMaxScaler # distance 기반의 알고리즘의 경우 scaling 필요
from sklearn.decomposition import PCA

# K-means
from sklearn.cluster import KMeans
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage  # module is spelled "hierarchy"
# Spectral clustering
from sklearn.cluster import SpectralClustering
# DBSCAN
from sklearn.cluster import DBSCAN
# HDBSCAN
# !pip install -q hdbscan
import hdbscan

In [None]:
# Scaling: the clusterers used in this notebook are distance based, so every
# feature is min-max scaled before clustering.
scaler = MinMaxScaler().fit(X)
X_scal = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Dimensionality reduction, used only to draw the clusters in two dimensions.
# fit_transform() fits the model itself, so the separate .fit(X) call that used
# to precede it only repeated the same decomposition.
# NOTE(review): the projection is computed on the unscaled X while the
# clusterers are fitted on X_scal -- confirm this is intended.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_emm = pd.DataFrame(X_pca, columns=['axis_1', 'axis_2'])
print('pca variance : {}'.format(pca.explained_variance_ratio_))

In [None]:
# K-means
#
# parameters
#   n_clusters   : number of clusters (k)
#   n_init       : number of runs with different centroid seeds; pinned to 10
#                  so the result does not depend on the installed sklearn default
#   max_iter     : maximum number of iterations per run, default=300
#   random_state : fixes the centroid initialisation so reruns give the same labels
RANDOM_STATE = 42

# One marker / colour per cluster id. Both lists are loop-invariant and longer
# than the largest k tried below.
mar = ['o', '+', '*', 'D', ',', 'h',
       '1', '2', '3', '4', 's', '<', '>']
colo = ['red', 'orange', 'green', 'blue', 'cyan',
        'magenta', 'black', 'yellow', 'grey', 'orchid', 'lightpink']

# modeling
for cluster in range(2, 6):
    Cluster = KMeans(n_clusters=cluster, n_init=10, random_state=RANDOM_STATE)
    labels = Cluster.fit_predict(X_scal)  # fitted on the scaled features

    # Store the assignment next to the original rows.
    # NOTE(review): the spectral-clustering cell writes to the same
    # '<k> label' column names, so these values are overwritten there.
    data['{} label'.format(cluster)] = labels

    # Plot on the 2-D PCA embedding (X_emm, columns axis_1 / axis_2).
    plot_data = X_emm.assign(labels=labels)
    groups = plot_data.groupby('labels')

    fig, ax = plt.subplots(figsize=(5, 5))
    for j, (name, group) in enumerate(groups):
        ax.plot(group['axis_1'],
                group['axis_2'],
                marker=mar[j],
                linestyle='',
                label=name,
                c=colo[j],
                ms=10)
    # Build the legend once, after every group has been drawn.
    ax.legend(fontsize=12, loc='upper right')
    ax.set_title('Scatter plot', fontsize=20)
    ax.set_xlabel('AXIS1', fontsize=14)
    ax.set_ylabel('AXIS2', fontsize=14)
    plt.show()
    plt.close(fig)  # release the figure; one is created per k
    print('-'*60)

    gc.collect()

In [None]:
# Hierarchical clustering, displayed as a dendrogram.

# Linkage criteria to choose from; the last entry ('ward') is the one drawn below.
method = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']

# To compare criteria, repeat the linkage/dendrogram statements below once per
# entry of `method`.

chosen_method = method[-1]
dendrogram_options = {'leaf_rotation': 90, 'leaf_font_size': 20}

# Build the merge tree on the scaled features and draw it on a fresh figure.
HC = linkage(X_scal, method=chosen_method)
plt.figure(figsize=(5, 5))
dendrogram(HC, **dendrogram_options)
plt.show()

In [None]:
# Spectral clustering
#
# parameters
#   n_clusters   : number of clusters
#   affinity     : how the similarity matrix is built
#                  (nearest_neighbors, rbf, precomputed, precomputed_nearest_neighbors)
#   n_neighbors  : number of neighbours used when building the similarity
#                  matrix; ignored for 'rbf'
#   random_state : fixes the stochastic parts of the fit so reruns give the same labels
RANDOM_STATE = 42

# One marker / colour per cluster id; loop-invariant, so defined once.
mar = ['o', '+', '*', 'D', ',', 'h', '1', '2', '3', '4', 's', '<', '>']
colo = ['red', 'orange', 'green', 'blue', 'cyan', 'magenta', 'black', 'yellow', 'grey', 'orchid', 'lightpink']

# modeling
for cluster in range(2, 6):
    Cluster = SpectralClustering(n_clusters=cluster, random_state=RANDOM_STATE)
    labels = Cluster.fit_predict(X_scal)  # labels of the rows it was fitted on

    # Store the assignment next to the original rows.
    data['{} label'.format(cluster)] = labels

    # Plot on the 2-D PCA embedding (X_emm, columns axis_1 / axis_2).
    plot_data = X_emm.assign(labels=labels)
    groups = plot_data.groupby('labels')

    fig, ax = plt.subplots(figsize=(5, 5))
    for j, (name, group) in enumerate(groups):
        ax.plot(group['axis_1'],
                group['axis_2'],
                marker=mar[j],
                linestyle='',
                label=name,
                c=colo[j],
                ms=4)
    # Build the legend once, after every group has been drawn.
    ax.legend(fontsize=12, loc='upper right')
    ax.set_title('Scatter plot', fontsize=20)
    ax.set_xlabel('AXIS1', fontsize=14)
    ax.set_ylabel('AXIS2', fontsize=14)
    plt.show()
    plt.close(fig)  # release the figure; one is created per k
    print('-'*70)

    gc.collect()

# Evaluate the k=2 assignment against data['censor'].
y_true = data['censor']
y_pred = data['2 label']

# confusion matrix (rows: censor, columns: cluster id)
cm = confusion_matrix(y_true, y_pred)

# Cluster ids are arbitrary: the clusterer may call the censor==1 group
# "cluster 0". Scoring the raw ids would then report an inverted result, so
# swap the two ids when the off-diagonal agreement is the larger one.
# Assumes censor is coded 0/1, as the 2x2 indexing and the binary f1_score
# call below already require -- TODO confirm.
if cm[0, 1] + cm[1, 0] > cm[0, 0] + cm[1, 1]:
    y_pred = 1 - y_pred
    cm = cm[:, ::-1]
print(cm)

# ACC & F1-score
# Clustering only partitions the data by distance and never sees the target,
# so classifier-level accuracy should not be expected here.

print('Test Acc : {}'.format((cm[0,0] + cm[1,1])/cm.sum()))
print('F1-score : {}'.format(f1_score(y_true, y_pred)))

In [None]:
# DBSCAN
#
# parameters
#   eps         : distance within which two points count as neighbours
#   metric      : default=euclidean
#   min_samples : minimum number of neighbours that must lie within eps
#
# NOTE(review): test_data, sns and plot_kwds are not defined in the visible
# cells -- confirm they are created before this cell runs.

epsilon = [.1, .2, .3, .4, .5, .6, .7]
minPls = [5, 10, 15, 20]

# The palette does not change between runs, so fetch it once.
palette = sns.color_palette()

for e in epsilon:
    for m in minPls:
        print('epsilon : {}, minPls : {}'.format(e, m))
        db = DBSCAN(eps=e, min_samples=m).fit(test_data)
        # Negative labels are drawn grey. Cluster ids wrap around the palette
        # so a run that finds more clusters than there are colours no longer
        # raises IndexError.
        cluster_colors = [palette[col % len(palette)]
                          if col >= 0 else (.5, .5, .5) for col in db.labels_]
        plt.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds) # 2-D example data
        plt.show()

In [None]:
# HDBSCAN
#
# parameters
#   min_cluster_size          : minimum number of points a cluster must contain
#   cluster_selection_epsilon : combining HDBSCAN with DBSCAN
#
# NOTE(review): test_data, sns and plot_kwds are not defined in the visible
# cells -- confirm they are created before this cell runs.

minsize = [3, 5, 10, 15, 20, 30]

# The palette does not change between runs, so fetch it once.
palette = sns.color_palette()

for m in minsize:
    print('min_cluster_size : {}'.format(m))
    db = hdbscan.HDBSCAN(min_cluster_size=m).fit(test_data)
    # Negative labels are drawn grey. Cluster ids wrap around the palette so a
    # run that finds more clusters than there are colours no longer raises
    # IndexError.
    cluster_colors = [palette[col % len(palette)]
                      if col >= 0 else (.5, .5, .5) for col in db.labels_]
    plt.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds) # 2-D example data
    plt.show()