## Задание 3
Реализация алгоритма кластеризации `C-Means`

In [1]:
import numpy as np
from sklearn.datasets import make_blobs
import random

class C_Means():
    """Fuzzy C-Means clustering of a fixed two-dimensional dataset array.

    Every sample holds a membership degree for every cluster (one row of
    ``u`` per sample, one column per cluster).  ``fit`` alternates between
    updating the centroids from the memberships and the memberships from the
    centroids until the centroids stop moving.

    Public attributes:
        dataset: samples as a float array of shape (n_samples, n_features).
        n_clusters: number of clusters.
        fuzzy_c: fuzziness exponent ``m`` (must be greater than 1).
        cut_param: minimum membership a sample needs to receive a label.
        max_iter_num: upper bound on the number of ``fit`` iterations.
        tolerance: largest squared centroid shift that counts as converged.
        fitted: ``True`` once ``fit`` has run.
        dist: squared sample-to-centroid distances, (n_samples, n_clusters).
        centroids: current centroids, (n_clusters, n_features).
        u: membership matrix, (n_samples, n_clusters).
    """

    def __init__(self, dataset, n_clusters=3, fuzzy_c=2, cut_param=.9):
        """Store the data and hyper-parameters and draw a random initial state.

        Args:
            dataset: array-like of shape (n_samples, n_features).
            n_clusters: number of clusters to look for.
            fuzzy_c: fuzziness exponent; values close to 1 give crisper
                memberships.
            cut_param: samples whose best membership is below this value are
                labelled 0 by ``get_labels``.

        Raises:
            ValueError: if ``fuzzy_c`` is not greater than 1, or if
                ``n_clusters`` exceeds the number of samples.
        """
        # The membership exponent is 1 / (fuzzy_c - 1); it is undefined at 1
        # and inverts the ranking of clusters below 1.
        if fuzzy_c <= 1:
            raise ValueError("fuzzy_c must be greater than 1")

        self.dataset = np.asarray(dataset, dtype=float)
        self.n_clusters = n_clusters
        self.fuzzy_c = fuzzy_c
        self.cut_param = cut_param
        self.max_iter_num = 1000
        self.tolerance = .01
        self.fitted = False

        n_samples = self.dataset.shape[0]
        self.dist = np.zeros((n_samples, self.n_clusters))
        # Distinct samples picked at random.  ``fit`` recomputes the centroids
        # from ``u`` before using them, so these only act as the baseline of
        # the first convergence check.
        self.centroids = np.array(
            random.sample(list(self.dataset), k=n_clusters)
        )
        # Random memberships, normalised so that every row sums to 1.
        raw = np.random.uniform(0, 1, size=(n_samples, self.n_clusters))
        self.u = raw / raw.sum(axis=1, keepdims=True)

    def get_dist2(self, list1, list2):
        """Return the squared Euclidean distance between two coordinate sequences."""
        return sum((i - j) ** 2 for i, j in zip(list1, list2))

    def distribute_data(self):
        """Recompute ``dist`` and the membership matrix ``u`` from the centroids.

        Memberships are proportional to ``(1 / dist) ** (1 / (fuzzy_c - 1))``
        and every row is normalised to sum to 1.  A sample lying exactly on
        one or more centroids shares its whole membership equally among them.
        """
        data = np.asarray(self.dataset, dtype=float)
        centres = np.asarray(self.centroids, dtype=float)
        offsets = data[:, None, :] - centres[None, :, :]
        self.dist = (offsets ** 2).sum(axis=2)

        power = 1 / (self.fuzzy_c - 1)
        on_centroid = self.dist == 0
        has_zero = on_centroid.any(axis=1)

        # Scaling by the row minimum keeps every ratio in (0, 1], so a large
        # exponent cannot overflow; the scale cancels in the normalisation.
        safe = np.where(has_zero[:, None], 1.0, self.dist)
        nearest = safe.min(axis=1, keepdims=True)
        weights = (nearest / safe) ** power
        # Zero distances are handled explicitly instead of going through
        # inf / nan arithmetic.
        weights[has_zero] = on_centroid[has_zero]

        self.u = weights / weights.sum(axis=1, keepdims=True)

    def recalculate_centroids(self):
        """Move every centroid to the ``u ** fuzzy_c``-weighted mean of the data.

        A cluster whose total weight is zero keeps its previous centroid
        rather than becoming NaN.
        """
        weights = np.asarray(self.u, dtype=float) ** self.fuzzy_c
        totals = weights.sum(axis=0)
        weighted_sums = weights.T.dot(self.dataset)

        # Work on a copy so callers holding the old array are not affected.
        centres = np.array(self.centroids, dtype=float)
        nonempty = totals > 0
        centres[nonempty] = weighted_sums[nonempty] / totals[nonempty][:, None]
        self.centroids = centres

    def fit(self):
        """Alternate centroid and membership updates until convergence.

        Stops once the largest squared centroid shift of an iteration drops
        below ``tolerance``, or after ``max_iter_num`` iterations.
        """
        for _ in range(self.max_iter_num):
            previous = np.array(self.centroids, dtype=float)
            self.recalculate_centroids()
            self.distribute_data()
            shift = ((np.asarray(self.centroids) - previous) ** 2).sum(axis=1)
            if shift.max() < self.tolerance:
                break
        self.fitted = True

    def check_is_fitted(self):
        """Return ``True`` if ``fit`` has been called.

        Raises:
            AttributeError: if the model has not been fitted yet.
        """
        if self.fitted:
            return True
        raise AttributeError("You must train classifier before predicting data!")

    def get_labels(self):
        """Return one integer label per sample.

        The label is the 1-based index of the cluster with the highest
        membership, or 0 when that membership is below ``cut_param``.

        Raises:
            AttributeError: if the model has not been fitted yet.
        """
        self.check_is_fitted()

        memberships = np.asarray(self.u)
        # Look for the best cluster row by row; comparing against the whole
        # matrix would also match equal values that belong to other samples.
        best_cluster = memberships.argmax(axis=1)
        best_membership = memberships.max(axis=1)
        return np.where(
            best_membership < self.cut_param, 0, best_cluster + 1
        ).astype(int)

In [2]:
# Synthetic 2-D samples drawn around three blob centres; the second value
# returned by make_blobs is not needed here and is discarded.
dataset, _ = make_blobs(n_features=2, centers=3)

# Three clusters, fuzziness exponent 1.2, membership cut-off 0.9.
model = C_Means(dataset, n_clusters=3, fuzzy_c=1.2, cut_param=.9)
model.fit()

In [3]:
# Print one label per sample: 0 when the best membership is below the model's
# cut_param, otherwise the 1-based index of the best cluster.
print(model.get_labels())

[3 3 3 3 3 3 3 2 3 2 3 3 3 3 0 3 0 0 3 3 1 2 3 3 3 3 3 3 3 2 3 3 2 2 3 1 3
 3 3 3 3 3 3 1 3 3 2 3 3 3 0 3 3 1 3 0 2 2 3 3 3 1 3 1 2 3 2 0 3 2 3 3 3 1
 2 3 3 2 3 1 2 3 3 3 1 2 3 3 2 3 3 3 3 3 3 3 2 3 3 2]
