<a href="https://colab.research.google.com/github/olcaykursun/ML/blob/main/NearestCentroid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Nearest-mean (nearest-centroid) classifier: our from-scratch version vs. scikit-learn's
Created on Fri Feb  3 20:45:36 2023

@author: okursun

Useful links (mentioned in the lecture video):
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html
https://en.wikipedia.org/wiki/Nearest_centroid_classifier
https://numpy.org/doc/stable/user/basics.broadcasting.html
https://numpy.org/doc/stable/reference/generated/numpy.append.html
"""



import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the Iris data set and double its labels in place (0/1/2 -> 0/2/4);
# presumably so that class labels differ from the centroid row indices used
# by the from-scratch classifier further down.
iris = load_iris()
iris.target *= 2

# Stratified split: 60% of the samples go to training, the rest to testing.
# No random_state is passed, so every run produces a different split.
features, labels = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    train_size=0.6,
    # random_state=123,
    stratify=labels,
)
#%%

# Distinct class labels found in the training set, and how many there are.
classes = np.unique(y_train)
num_classes = len(classes)

# Version 1: one centroid per class, obtained by selecting the training rows
# of that class with a boolean mask and averaging every feature column.
means = [np.mean(X_train[y_train == c], axis=0) for c in classes]
print(means)
means = np.array(means)  # one row per class, one column per feature

# Version 2: the same centroids from a single matrix product.
# X_train.T (features x samples) times the one-hot label matrix
# (samples x classes) gives the per-class feature sums; dividing by the
# per-class sample counts (column sums of the one-hot matrix) turns the sums
# into means.
import pandas
y_one_hot_encoded = pandas.get_dummies(y_train).to_numpy()
#print(y_one_hot_encoded)
means2 = np.dot(X_train.T, y_one_hot_encoded) / np.sum(y_one_hot_encoded, axis=0)
print(means2)
means2 = means2.T  # transpose to (classes x features) so it lines up with `means`

# Sanity check that both versions agree.  The two computations add the values
# in a different order, so they may differ by floating-point round-off;
# compare with a tolerance rather than with exact equality.
print(np.allclose(means, means2))
# Total squared difference between the two results (expected to be ~0).
print(np.sum((means-means2)**2))

#%%

# Classify every test sample by the class of its nearest centroid.
predictions = []
for test_vector in X_test:
    # Broadcasting subtracts the sample from every centroid row at once;
    # summing the squares over the feature axis yields one squared distance
    # per class.
    squared_Euclidean_distances = np.sum((means2 - test_vector)**2, axis=1)
    print(squared_Euclidean_distances)
    # argmin returns a row index into the centroid matrix, not a label:
    # translate it back to the actual class label through `classes`.
    idx_closest_center = np.argmin(squared_Euclidean_distances)
    predicted_class = classes[idx_closest_center]
    print(predicted_class)
    predictions.append(predicted_class)

# Accuracy: fraction of test samples whose predicted label equals the true
# one.  The list is converted to an array first so the comparison is an
# explicit element-wise array operation.
print(np.mean(np.asarray(predictions) == y_test))


#%%
from sklearn.neighbors import NearestCentroid
# Reference result: scikit-learn's nearest-centroid classifier with default
# settings, trained and scored on the same split as the from-scratch version.
clf = NearestCentroid().fit(X_train, y_train)
sklearn_accuracy = clf.score(X_test, y_test)
print(sklearn_accuracy)


[array([4.94      , 3.35666667, 1.47333333, 0.24666667]), array([5.80333333, 2.69      , 4.21      , 1.30333333]), array([6.62      , 2.96666667, 5.61      , 2.06333333])]
[[4.94       5.80333333 6.62      ]
 [3.35666667 2.69       2.96666667]
 [1.47333333 4.21       5.61      ]
 [0.24666667 1.30333333 2.06333333]]
True
0.0
[ 0.0397      9.54022222 23.05228889]
0
[ 0.5817      8.74288889 20.41428889]
0
[ 0.16503333  8.60822222 21.13028889]
0
[12.77836667  0.77622222  1.90562222]
2
[8.36666667e-03 9.71088889e+00 2.31762889e+01]
0
[12.1177      0.71488889  2.01895556]
2
[6.27636667 0.45755556 5.66762222]
2
[15.6277      0.90688889  0.84495556]
4
[ 0.33236667  9.74422222 22.39028889]
0
[15.5617      1.01488889  1.57628889]
2
[ 0.05703333 10.77622222 24.59428889]
0
[ 0.82836667  8.54955556 19.93828889]
0
[7.37036667 0.38555556 5.77962222]
2
[ 0.07703333  9.98155556 23.40028889]
0
[ 0.13636667 10.02555556 24.15562222]
0
[10.8957      0.20955556  2.47962222]
2
[14.0957      0.94688889  1.256