## Algorytmy klasteryzacji danych

Biblioteki

In [ ]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings

Wczytywanie danych

In [49]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the Iris table; the 'Id' column becomes the row index.
iris = pd.read_csv('Iris.csv', index_col='Id')

# Despite the name, these are the four numeric measurement columns that are
# fed to the models as input features.
label_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Selecting with a list (double brackets) keeps the species as a 2-D,
# single-column array rather than a flat vector.
feature_matrix = iris[label_columns].to_numpy()
species_column = iris[['Species']].to_numpy()

# Hold out 30% of the rows for evaluation.
features_train, features_test, labels_train, labels_test = train_test_split(
    feature_matrix,
    species_column,
    test_size=0.3,
)

### Algorytm K-Najbliższych Sąsiadów (KNN)

In [50]:
class NearestNeighborClassifier():
    """k-nearest-neighbours classifier based on Euclidean distance.

    ``fit`` only memorises the training set. ``predict`` gives every query
    row the most frequent label among its ``n_neighbors`` closest training
    rows.
    """

    def __init__(self, n_neighbors):
        """Store how many neighbours vote on each prediction."""
        self.n_neighbors = n_neighbors

    def fit(self, features, labels):
        """Remember the training data.

        Args:
            features: 2-D array-like, one row per training sample.
            labels: 2-D array-like, one row per training sample; the label
                is read from column 0 of each row.
        """
        self.features_train = features
        self.labels_train = labels

    def predict(self, features_test):
        """Return a list with one predicted label per row of ``features_test``.

        When several labels collect the same number of votes, the label
        whose neighbour is closest to the query row wins, so the result is
        deterministic.
        """
        predictions = []

        for row in features_test:
            # Labels arrive nearest-first. A dict keeps insertion order and
            # max() returns the first maximum, so a tie goes to the nearer
            # label instead of depending on set iteration order.
            votes = {}
            for label in self.closest_n_labels(row):
                votes[label] = votes.get(label, 0) + 1
            predictions.append(max(votes, key=votes.get))

        return predictions

    def closest_n_labels(self, row):
        """Return the labels of the training rows closest to ``row``.

        The labels are ordered from the nearest neighbour to the farthest.
        Every training row is considered exactly once, so a sample can never
        vote twice. If fewer than ``n_neighbors`` training rows exist, all
        of them are returned.

        Raises:
            ValueError: if ``n_neighbors`` is smaller than 1.
        """
        if self.n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")

        train = np.asarray(self.features_train, dtype=float)
        query = np.asarray(row, dtype=float)

        # One vectorised pass computes the distance to every training row.
        dists = np.linalg.norm(train - query, axis=1)

        # A stable sort keeps equal distances in training order, which makes
        # the selected neighbours reproducible.
        nearest = np.argsort(dists, kind='stable')[:self.n_neighbors]

        return [self.labels_train[idx][0] for idx in nearest]
    

# Train the k-NN classifier on the training split and score it on the
# held-out rows.
# NOTE(review): an even n_neighbors allows 1-1 vote ties; an odd value
# avoids them.
model_KNN = NearestNeighborClassifier(n_neighbors=2)

model_KNN.fit(features_train, labels_train)
predictions_KNN = model_KNN.predict(features_test)

result_KNN = accuracy_score(labels_test, predictions_KNN)
# Scale to percent before rounding: multiplying an already rounded float by
# 100 can reintroduce binary noise such as 95.55999999999999.
print(f'Skuteczność: {round(result_KNN * 100, 2)}%')

Skuteczność: 91.11%


### KNN dla centroidów

In [51]:
class KNN_Centroid():
    """Nearest-centroid classifier.

    ``fit`` computes one centroid (the per-feature mean) for every class.
    ``predict`` assigns each query row the label of the centroid with the
    smallest Euclidean distance.
    """

    def fit(self, features, labels):
        """Store the training data and compute one centroid per class.

        Args:
            features: 2-D array-like, one row per training sample.
            labels: array-like with one label per sample, either a flat
                vector or a single-column 2-D array.
        """
        self.features_train = features
        self.labels_train = labels
        self.unique_labels = np.unique(labels)
        self.centroids = self.get_centroids(self.features_train, self.labels_train)

    def predict(self, features_test):
        """Return a list with one predicted label per row of ``features_test``."""
        return [self.closest_centroid(row) for row in features_test]

    def get_centroids(self, features_train, labels_train):
        """Return a dict mapping each class label to its mean feature vector.

        The features stay numeric throughout. They are never merged with
        the labels into one mixed-type array, so the result does not depend
        on how object or string columns are averaged.
        """
        features = np.asarray(features_train, dtype=float)
        flat_labels = np.asarray(labels_train).ravel()

        return {
            label: features[flat_labels == label].mean(axis=0)
            for label in np.unique(flat_labels)
        }

    def closest_centroid(self, row):
        """Return the label whose centroid is closest to ``row``.

        On equal distances the label that comes first in
        ``self.unique_labels`` wins.
        """
        point = np.asarray(row, dtype=float)
        dists = [
            np.linalg.norm(point - self.centroids[label])
            for label in self.unique_labels
        ]
        # argmin returns the first minimum, which gives the tie rule above.
        return self.unique_labels[int(np.argmin(dists))]
    

# Train the centroid-based classifier on the training split and score it on
# the held-out rows.
model_KNN_centroid = KNN_Centroid()

model_KNN_centroid.fit(features_train, labels_train)

predictions_KNN_centroids = model_KNN_centroid.predict(features_test)

result_KNN_centroids = accuracy_score(labels_test, predictions_KNN_centroids)
# Scale to percent before rounding: multiplying an already rounded float by
# 100 can reintroduce binary noise such as 95.55999999999999.
print(f'Skuteczność: {round(result_KNN_centroids * 100, 2)}%')

Skuteczność: 86.67%


### Analiza głównych składowych (PCA) systemu decyzyjnego

In [52]:
def PCA(X, n_comp):
    """Project ``X`` onto its ``n_comp`` leading principal components.

    Args:
        X: 2-D array-like of shape (samples, features).
        n_comp: number of components to keep. A value larger than the number
            of features keeps all of them.

    Returns:
        A tuple ``(output, e_values, e_vectors)``:
        ``output`` is the centred data expressed in the component basis,
        ``e_values`` are the selected covariance eigenvalues in descending
        order, and ``e_vectors`` holds the matching eigenvectors as columns.
        The sign of each eigenvector, and of the matching output column, is
        arbitrary.
    """
    X = np.asarray(X, dtype=float)
    X_minus_mean = X - np.mean(X, axis=0)

    # np.cov returns a 0-d array for a single feature, so promote it to a
    # 1x1 matrix before decomposing.
    cov_matrix = np.atleast_2d(np.cov(X_minus_mean, rowvar=False))

    # A covariance matrix is symmetric, so use the symmetric solver: it
    # always yields real eigenvalues and orthonormal eigenvectors. The
    # general solver can return complex pairs from round-off.
    e_values, e_vectors = np.linalg.eigh(cov_matrix)

    # Keep the n_comp largest eigenvalues, biggest first.
    sorted_indices = np.argsort(e_values)[::-1][:n_comp]
    e_values = e_values[sorted_indices]
    e_vectors = e_vectors[:, sorted_indices]

    output = np.dot(X_minus_mean, e_vectors)

    return output, e_values, e_vectors

# Reduce the training features to their two leading principal components.
output_data, e_values, e_vectors = PCA(features_train, n_comp=2)

# Print each result under its heading: first the component vectors, then
# the projected data.
for heading, values in (("PC:", e_vectors), ("Output data:", output_data)):
    print(heading)
    print(values)

PC:
[[ 0.36439759 -0.65313273]
 [-0.09009356 -0.7337238 ]
 [ 0.85466749  0.18104425]
 [ 0.35866564  0.04785396]]
Output data:
[[ 3.08995502e-02  5.97897154e-01]
 [ 2.09122714e+00 -3.71989341e-01]
 [ 1.31794902e+00 -2.28683249e-01]
 [-2.80099965e+00  2.53076854e-01]
 [-2.68810250e+00  1.32495626e-01]
 [ 2.53286837e-01  5.70080232e-01]
 [ 1.95394111e+00  1.88487719e-01]
 [-3.23959869e+00  5.25329944e-01]
 [ 6.42119517e-01  3.58803387e-01]
 [-2.71928163e+00 -9.28814361e-02]
 [ 3.78152635e+00 -2.46496458e-01]
 [-2.66887678e+00 -7.97167994e-01]
 [-2.72926652e+00  2.62647646e-01]
 [ 2.41035116e+00 -3.81477655e-01]
 [ 1.37183898e+00  2.85846306e-01]
 [ 3.59435259e-01  3.02978427e-01]
 [ 1.27378500e+00  1.19010910e-01]
 [ 1.76462340e+00  5.20089821e-01]
 [ 2.60044976e+00 -3.38721382e-01]
 [ 1.88085389e+00 -1.25342779e-01]
 [ 1.51659667e-01  6.90885644e-01]
 [ 8.03063443e-01  3.87541920e-01]
 [-2.64056830e+00 -5.84296986e-01]
 [ 2.82977866e-01  3.58246383e-01]
 [-2.61816704e+00 -1.07979922e+00]