## 设计K-Means算法进行聚类

#### 读取训练数据

In [7]:
from tensorflow.examples.tutorials.mnist import input_data      

# Read the dataset stored under data/fashion (presumably Fashion-MNIST in the
# MNIST idx format, judging by the directory and file names -- TODO confirm).
data = input_data.read_data_sets('data/fashion')

# Only the training split is used for clustering; the labels are kept so the
# discovered clusters can be compared against the true classes afterwards.
X_train, y_train = data.train.images, data.train.labels

Extracting data/fashion/train-images-idx3-ubyte.gz
Extracting data/fashion/train-labels-idx1-ubyte.gz
Extracting data/fashion/t10k-images-idx3-ubyte.gz
Extracting data/fashion/t10k-labels-idx1-ubyte.gz


#### 设计K-Means算法

In [9]:
import numpy as np
import tensorflow as tf
from collections import Counter

k = 10  # number of clusters; choose your own value
max_iterations = 300  # hard cap on the number of centroid updates

# Pick k distinct training samples as the initial centroids. Sampling without
# replacement avoids duplicate seeds: two identical centroids would leave one
# of the twin clusters empty, because argmin always picks the first of them.
# (Requires k <= number of training samples.)
init_idx = np.random.choice(X_train.shape[0], size=k, replace=False)
start_pos = tf.Variable(X_train[init_idx, :], dtype=tf.float32)
# The name has to be passed by keyword: the second positional parameter of
# tf.Variable is `trainable`, not `name`.
centroids = tf.Variable(start_pos.initialized_value(), name='S', dtype=tf.float32)

# The points to cluster, plus helpers with one entry per point.
points = tf.Variable(X_train, name='X_train', dtype=tf.float32)
ones_like = tf.ones((points.get_shape()[0], 1))
prev_assignments = tf.Variable(tf.zeros((points.get_shape()[0], ), dtype=tf.int64))

# Squared norms of the points (p1) and of the centroids (p2), each tiled to
# shape (n_points, k) so that they can be added elementwise.
p1 = tf.matmul(
    tf.expand_dims(tf.reduce_sum(tf.square(points), 1), 1),
    tf.ones(shape=(1, k))
)
p2 = tf.transpose(tf.matmul(
    tf.reshape(tf.reduce_sum(tf.square(centroids), 1), shape=[-1, 1]),
    ones_like,
    transpose_b=True
))

# K-Means uses the Euclidean distance d(x, c) = sqrt(sum_j (x_j - c_j)^2),
# computed here through the expansion ||x||^2 + ||c||^2 - 2 * <x, c>.
# Rounding can push the expanded value slightly below zero, so it is clamped
# before the square root to keep NaNs out of the distance matrix.
squared_distance = tf.add(p1, p2) - 2 * tf.matmul(points, centroids, transpose_b=True)
distance = tf.sqrt(tf.maximum(squared_distance, 0.0))

# Assign every point to its nearest centroid.
point_to_centroid_assignment = tf.argmin(distance, axis=1)

# Recompute each centroid as the mean of the points assigned to it.
total = tf.unsorted_segment_sum(points, point_to_centroid_assignment, k)
count = tf.unsorted_segment_sum(ones_like, point_to_centroid_assignment, k)
# An empty cluster has count == 0 and total == 0; dividing would produce NaN
# centroids. Such a cluster keeps its previous centroid instead.
is_empty = tf.cast(tf.equal(count, 0.0), tf.float32)
means = total / tf.maximum(count, 1.0) + is_empty * centroids

# Keep iterating while any assignment differs from the previous iteration.
is_continue = tf.reduce_any(tf.not_equal(point_to_centroid_assignment, prev_assignments))

# The updates are created under a control dependency on is_continue so the
# comparison is evaluated before prev_assignments is overwritten.
with tf.control_dependencies([is_continue]):
    loop = tf.group(centroids.assign(means), prev_assignments.assign(point_to_centroid_assignment))

# NOTE: the session is left open on purpose so it stays usable after this
# cell; call sess.close() once it is no longer needed.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Iterate until the assignments stop changing or the iteration cap is hit.
has_changed, cnt = True, 0
while has_changed and cnt < max_iterations:
    cnt += 1
    has_changed, _ = sess.run([is_continue, loop])

# Show the result: for every cluster, the three most common true labels.
res = sess.run(point_to_centroid_assignment)
nums_in_clusters = [[] for _ in range(k)]  # sized by k, not a hard-coded 10
for cluster, real_num in zip(res, y_train):
    nums_in_clusters[cluster].append(real_num)

for members in nums_in_clusters:
    print(Counter(members).most_common(3))

[(1, 4914), (3, 2923), (0, 180)]
[(0, 3178), (3, 1539), (6, 964)]
[(5, 3231), (6, 797), (0, 652)]
[(9, 2615), (5, 256), (7, 184)]
[(9, 1872), (5, 211), (7, 11)]
[(6, 1841), (2, 1594), (0, 1339)]
[(7, 1367), (9, 798), (5, 691)]
[(8, 2998), (6, 104), (2, 62)]
[(4, 3284), (2, 3203), (6, 1743)]
[(7, 3544), (5, 1079), (8, 65)]
