## Задание 2
Реализация `k-means` алгоритма + оптимизация количества кластеров

In [28]:
import numpy as np
from sklearn.datasets import load_iris
import random

class Classifier:
    
    def __init__(self, dataset, n_clusters=3):
        self.dataset = dataset
        self.n_clusters = n_clusters
        self.max_n_iter = 100
        self.fitted = False
        self.labels = np.array([])
        self.centroids = random.sample(dataset, k=n_clusters)
                
    def get_distance(self, list1, list2):
        return sum((i - j)**2 for i,j in zip(list1, list2))
        
    def distribute_data(self):
        self.labels = np.array([])
        
        for elem in self.dataset:
            distances = [self.get_distance(elem, center) for center in self.centroids]
            minimum_distance_index = distances.index(min(distances))
            self.labels = np.append(list(self.labels), minimum_distance_index).astype(int)
        
    def recalculate_centroids(self):
        for i in range(self.n_clusters):
            num = 0
            temp = np.zeros(self.dataset[0].shape)
            
            for k, label in enumerate(self.labels):
                if label == i:
                    temp = temp + self.dataset[k]
                    num += 1
                    
            self.centroids[i] = temp / num
                        
    def fit(self):
        iterator = 1
        
        while iterator < self.max_n_iter:
            prev_centroids = np.copy(self.centroids)
            self.distribute_data()
            self.recalculate_centroids()
            if max([self.get_distance(i, k) for i, k in zip(self.centroids, prev_centroids)]):
                break
            iterator += 1
            
        self.fitted = True
        
    def check_is_fitted(self):
        if self.fitted:
            return True
        else:
            raise AttributeError("You must train classifier before predicting data!")
           
    def predict(self, list):
        self.check_is_fitted()
        
        result = np.array([])
        
        for elem in list:
            distances = [self.get_distance(elem, center) for center in self.centroids]
            min_distance_index = distances.index(min(distances))
            result = np.append(result, min_distance_index).astype(int)
            
        return result

In [29]:
dataset = load_iris()
data = list(dataset['data'])

test_data = random.sample(data, k=5)

In [30]:
inertia_classes = []
K = range(2,15)
for k in K:
    km = Classifier(data, k)
    km.fit()
    inertia = 0
    for i in km.centroids:
        #inertia formula
        inertia += np.linalg.norm(i-data)
    inertia = inertia/len(km.centroids)
    inertia_classes.append(inertia)

In [31]:
inertia_classes

[33.732295516778834,
 35.83610472214159,
 36.106611858599315,
 35.195767160365975,
 36.12554097253959,
 33.28616363975803,
 37.53319432222951,
 40.35787790604981,
 34.819447776419246,
 36.45041650269941,
 38.928570901101686,
 35.75545609840049,
 37.226518695057436]

In [32]:
optimal_number_of_classes = inertia_classes.index(min(inertia_classes)) + 1
optimal_number_of_classes

6

In [33]:
model_euclidean2 = Classifier(data, optimal_number_of_classes)

model_euclidean2.fit()

prediction_result = model_euclidean2.predict(test_data)
print("Prediction result for Euclidean2 metric:")
print(prediction_result)

Prediction result for Euclidean2 metric:
[0 3 3 2 2]
