# クラスタリング

## 凝集型クラスタリング

### 例題 3.1

In [16]:
import numpy as np
import math

# Data set for example 3.1: four one-dimensional points.
D = [0, 1, 3, 5.5]

# The similarity between two instances is the reciprocal of their distance.
def sim(d_k, d_l):
    """Return the similarity of two scalar points as 1 / |d_k - d_l|.

    Args:
        d_k: First point (a number).
        d_l: Second point (a number).

    Returns:
        The reciprocal of the absolute difference. Identical points yield
        ``math.inf`` so that they rank as the most similar pair possible.
    """
    diff = math.fabs(d_k - d_l)
    # A zero distance must not map to similarity 0 (the *lowest* score):
    # that would make coincident points look maximally dissimilar.
    return math.inf if diff == 0.0 else 1 / diff

def merge(lst, lhs_idx, rhs_idx):
    """Return a new cluster list in which two clusters are merged.

    Args:
        lst: List of clusters, each cluster being a list of points.
        lhs_idx: Index of the first cluster to merge.
        rhs_idx: Index of the second cluster to merge.

    Returns:
        A new list holding every cluster except the two selected ones, in
        their original order, followed by the concatenation
        ``lst[lhs_idx] + lst[rhs_idx]``. ``lst`` itself is not modified.
    """
    # Indices are compared by value. Identity (`is`) only happens to work for
    # the small integers CPython caches and silently fails for larger ones.
    merged_indices = (lhs_idx, rhs_idx)
    remaining = [
        cluster for i, cluster in enumerate(lst) if i not in merged_indices
    ]
    return remaining + [lst[lhs_idx] + lst[rhs_idx]]

def aggolomerative_clustering(sim_fn):
    """Run agglomerative clustering on the module-level data set ``D``.

    Every point starts in its own cluster. While at least two clusters
    remain, the ordered pair of distinct clusters with the highest
    ``sim_fn`` score (first one wins on ties) is merged via ``merge``.
    The cluster list is printed before the first merge and after each one.

    Args:
        sim_fn: Callable taking two clusters (lists of points) and returning
            their similarity.
    """
    clusters = [[point] for point in D]
    print(clusters)

    while len(clusters) >= 2:
        # Collect the index pairs of all ordered pairs of distinct clusters,
        # along with the similarity score of each pair.
        pair_indices = []
        scores = []
        for left_idx, left in enumerate(clusters):
            for right_idx, right in enumerate(clusters):
                if right_idx == left_idx:
                    continue
                pair_indices.append((left_idx, right_idx))
                scores.append(sim_fn(left, right))

        # np.argmax returns the position of the first maximum.
        best_left, best_right = pair_indices[np.argmax(scores)]
        clusters = merge(clusters, best_left, best_right)
        print(clusters)

# Agglomerative clustering with the single-link method
def sim_single_link(c_i, c_j):
    """Return the single-link similarity of two clusters.

    This is the largest ``sim`` value over all pairs made of one point from
    ``c_i`` and one point from ``c_j``.
    """
    pairwise = []
    for left in c_i:
        for right in c_j:
            pairwise.append(sim(left, right))
    return max(pairwise)

# Run the clustering with single-link similarity; the cluster list is printed
# before the first merge and after every merge.
print("aggolomerative clustering with single-link method:")
aggolomerative_clustering(sim_single_link)

# Agglomerative clustering with the complete-link method
def sim_complete_link(c_i, c_j):
    """Return the complete-link similarity of two clusters.

    This is the smallest ``sim`` value over all pairs made of one point from
    ``c_i`` and one point from ``c_j``.
    """
    pairwise = []
    for left in c_i:
        for right in c_j:
            pairwise.append(sim(left, right))
    return min(pairwise)

# Run the clustering with complete-link similarity; the cluster list is
# printed before the first merge and after every merge.
print("aggolomerative clustering with complete-link method:")
aggolomerative_clustering(sim_complete_link)

aggolomerative clustering with single-link method:
[[0], [1], [3], [5.5]]
[[3], [5.5], [0, 1]]
[[5.5], [3, 0, 1]]
[[5.5, 3, 0, 1]]
aggolomerative clustering with complete-link method:
[[0], [1], [3], [5.5]]
[[3], [5.5], [0, 1]]
[[0, 1], [3, 5.5]]
[[0, 1, 3, 5.5]]


## k-平均法

### 例題 3.2

In [22]:
# k-means on a one-dimensional data set (example 3.2).
D = [0, 1, 3, 5.5]
k = 2

# Initial cluster centers; this list must contain exactly k values.
ms = [-1, 6]
# NaN never compares as close to anything, so the first iteration always runs.
old_ms = [math.nan] * k


def convergence(ms, old_ms):
    """Return True when every center is numerically unchanged.

    Args:
        ms: Current cluster centers.
        old_ms: Cluster centers from the previous iteration.

    Returns:
        True if all corresponding centers are close (``np.allclose``
        default tolerances), False otherwise.
    """
    # All centers are compared, not just the first: one center can settle
    # while the others are still moving.
    return bool(np.allclose(ms, old_ms))


while not convergence(ms, old_ms):
    clusters = [[] for _ in range(k)]

    for x in D:
        # Assign x to the nearest center. Minimising the distance is the same
        # as maximising its reciprocal, but stays correct when x lies exactly
        # on a center (distance 0).
        nearest = np.argmin([abs(x - m) for m in ms])
        clusters[nearest].append(x)

    old_ms = ms
    # A cluster that received no points keeps its previous center; the mean of
    # an empty list would be NaN and the loop could then never converge.
    ms = [np.mean(c) if c else m for c, m in zip(clusters, old_ms)]

    print("clusters: %s" % clusters)
    print("new center: %s" % ms)

clusters: [[0, 1], [3, 5.5]]
new center: [0.5, 4.25]
clusters: [[0, 1], [3, 5.5]]
new center: [0.5, 4.25]
