# kernel k-means

In [3]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML

In [4]:
class kernel_k_means:
    """Kernel k-means clustering with an RBF (Gaussian) kernel.

    State kept on the instance:

    * ``points``        -- (n, d) array of input points.
    * ``labels``        -- (n,) array of ground-truth labels read from file.
    * ``gran_mat``      -- (n, n) Gram matrix of the RBF kernel over ``points``.
    * ``predicts``      -- (n, cluster_num) one-hot cluster membership matrix.
    * ``cluster_count`` -- (cluster_num, cluster_num) diagonal matrix whose
      i-th diagonal entry is ``1 / |cluster i|``, or 0 for an empty cluster.

    One iteration of the algorithm is ``E_step()`` followed by ``M_step()``.
    """

    def __init__(self, data, ground, cluster_num, sigma=4.0):
        """Load the data set and draw a random initial assignment.

        Args:
            data: Path to a text file with one whitespace-separated point
                per line.
            ground: Path to a text file with one numeric label per line.
            cluster_num: Number of clusters (must be >= 1).
            sigma: Bandwidth of the RBF kernel,
                ``k(a, b) = exp(-||a - b||^2 / (2 * sigma^2))``.

        Raises:
            ValueError: If ``cluster_num`` is below 1 or ``data`` holds no
                points.
        """
        if cluster_num < 1:
            raise ValueError("cluster_num must be at least 1")

        # split() (rather than split(" ")) tolerates tabs and repeated spaces.
        rows = [[float(tok) for tok in line.split()]
                for line in self._read_lines(data)]
        if not rows:
            raise ValueError("no points found in %r" % (data,))
        self.points = np.array(rows, dtype=float)

        self.labels = np.array(
            [float(line) for line in self._read_lines(ground)], dtype=float
        )

        self.cluster_num = cluster_num
        self.sigma = sigma

        self.gran_mat = self.calc_gran_mat()

        # Random one-hot initial assignment; uses the `random` module so that
        # random.seed() makes runs reproducible.
        self.predicts = np.zeros((len(self.points), self.cluster_num))
        for row in self.predicts:
            row[random.randint(0, self.cluster_num - 1)] = 1

        self._update_cluster_count()

    @staticmethod
    def _read_lines(path):
        """Return the stripped, non-blank lines of the text file at *path*."""
        with open(path) as f:
            return [line.strip() for line in f if line.strip()]

    def _update_cluster_count(self):
        """Rebuild ``cluster_count`` from the current ``predicts``.

        Empty clusters get a 0 on the diagonal instead of dividing by zero.
        """
        sizes = self.predicts.sum(axis=0)
        inverse = np.zeros(self.cluster_num)
        occupied = sizes > 0
        inverse[occupied] = 1.0 / sizes[occupied]
        self.cluster_count = np.diag(inverse)

    def calc_gran_mat(self):
        """Return the (n, n) RBF Gram matrix of ``self.points``.

        Computed with broadcasting rather than a Python double loop; this
        materialises an (n, n, d) intermediate array.
        """
        gamma = -1.0 / (2.0 * self.sigma ** 2)
        pts = np.asarray(self.points, dtype=float)
        diff = pts[:, np.newaxis, :] - pts[np.newaxis, :, :]
        return np.exp(gamma * (diff ** 2).sum(axis=-1))

    def RBF_kernel(self, x1, x2, gamma):
        """Return ``exp(gamma * ||x1 - x2||^2)`` for points of any dimension.

        ``gamma`` is expected to be negative (``-1 / (2 * sigma^2)``).
        """
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return math.exp(gamma * float(np.dot(diff, diff)))

    def E_step(self):
        """Reassign every point to its nearest cluster in feature space.

        For point i and cluster c the squared feature-space distance is
        ``K[i, i] - 2 * mean_j K[i, j] + mean_{j, l} K[j, l]`` with j, l
        ranging over the members of c.  ``K[i, i]`` does not depend on c, so
        it is dropped before taking the argmin.

        All distances are computed from the assignments as they were when
        the step started; ``predicts`` is only overwritten afterwards, so no
        point is compared against a half-updated clustering.  Empty clusters
        have no centroid and are never selected.
        """
        # Column c holds 1/|c| for members of cluster c and 0 elsewhere.
        weights = np.dot(self.predicts, self.cluster_count)
        # cross[i, c] = mean kernel value between point i and cluster c.
        cross = np.dot(self.gran_mat, weights)
        # within[c] = mean kernel value over all member pairs of cluster c.
        within = (weights * cross).sum(axis=0)

        distance = within - 2 * cross
        distance[:, self.cluster_count.diagonal() == 0] = np.inf

        nearest = np.argmin(distance, axis=1)
        self.predicts[:] = 0
        self.predicts[np.arange(len(self.points)), nearest] = 1

    def M_step(self):
        """Recompute the inverse cluster sizes after an ``E_step``."""
        self._update_cluster_count()

In [6]:
# Palette used to colour points by the index of their assigned cluster.
color_list = ["green", "blue", "red", "orange", "yellow", "purple", "black"]

# Two-cluster model over the test2 data set.
model = kernel_k_means("data/test2_data.txt", "data/test2_ground.txt", 2)

# Split the (x, y) rows into one coordinate sequence per axis for plotting.
x, y = (tuple(column) for column in model.points.T)

# A single figure/axes pair that is redrawn on every animation frame.
fig, ax = plt.subplots()
fig.set_size_inches(8, 4.5)

def update(i):
    """Draw animation frame *i* and return the scatter artist.

    Frame 0 shows the unclustered points in black.  Every later frame first
    advances the model by one E-step/M-step iteration and then colours each
    point by its currently assigned cluster.
    """
    ax.clear()

    if i != 0:
        model.E_step()
        model.M_step()
        # Index of the 1 in each one-hot row selects the palette entry.
        point_colors = [color_list[k] for k in np.argmax(model.predicts, axis=1)]
    else:
        point_colors = 'black'

    return ax.scatter(x, y, color=point_colors, marker='.')
    
# Render 10 frames, 300 ms apart, calling update() for each one, and embed
# the result in the notebook as an HTML5 video.
ani = animation.FuncAnimation(fig, update, frames=10, interval=300, blit=False)
HTML(ani.to_html5_video())