In [1]:
import numpy as np
import pandas as pd

### KNN Classifier

In [2]:
def euclidean_distance(X_train, X_test):
    """Compute the Euclidean distance between every test and training sample.

    Parameters
    ----------
    X_train : array-like
        Training samples; the first axis indexes samples.
    X_test : array-like
        Test samples with the same per-sample shape as ``X_train``.

    Returns
    -------
    numpy.ndarray of shape ``(len(X_test), len(X_train))``
        ``dist[i, j]`` is the distance between ``X_test[i]`` and ``X_train[j]``.
    """
    # Work in float: with narrow/unsigned integer inputs the subtraction and
    # the squaring would otherwise wrap around and give wrong distances.
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)

    dist = np.zeros((len(X_test), len(X_train)))
    if dist.size == 0:
        # Nothing to compare; also avoids broadcasting against an empty array.
        return dist

    # Reduce over every axis except the sample axis, so scalar samples (1-D
    # input) and multi-dimensional samples behave like the per-pair np.sum.
    feature_axes = tuple(range(1, X_train.ndim))
    for i, sample in enumerate(X_test):
        # One vectorised pass over all training rows per test sample.
        dist[i] = np.sqrt(np.sum((X_train - sample) ** 2, axis=feature_axes))
    return dist
class KNN:
    """k-nearest-neighbours classifier: Euclidean distance plus majority vote."""

    def __init__(self, k = 5):
        """Remember how many neighbours take part in each vote.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 (a non-positive ``k`` would either vote
            over nothing or silently slice from the wrong end).
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Both inputs are converted to NumPy arrays so that ``predict`` can
        index the labels positionally even when lists or pandas objects are
        passed in.

        Raises
        ------
        ValueError
            If the number of samples and the number of labels differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must contain the same number of samples")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return the majority label among the ``k`` nearest training samples.

        The full distance matrix is kept on ``self.distances``.  When several
        labels are tied, the smallest label (first in ``np.unique`` order) wins.
        """
        self.distances = euclidean_distance(self.X_train, np.asarray(X_test))
        pred = []
        for dist in self.distances:
            k_nearest_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            # Single pass: distinct labels together with how often each occurs.
            labels, counts = np.unique(k_nearest_labels, return_counts=True)
            pred.append(labels[np.argmax(counts)])
        return np.array(pred)

In [3]:
# Read the dataset from the working directory.
df = pd.read_csv('diabetes.csv')
# Features are every column except the last two; the target is the
# second-to-last column.
# NOTE(review): the final column of the file is not used anywhere — confirm
# that dropping it is intended.
X, y = df.iloc[:, :-2].values, df.iloc[:, -2].values

In [4]:
# Deterministic hold-out split: the first TRAIN_PERCENT % of the rows train the
# model and the remaining rows are held out for testing (an 80/20 split).
# The boundary is derived from the data length instead of a hard-coded row
# number; for the 614 + 154 = 768 rows used here it is still row 614.
# NOTE(review): rows are taken in file order without shuffling, so this assumes
# the file is not sorted by class — TODO confirm.
TRAIN_PERCENT = 80
split_index = len(X) * TRAIN_PERCENT // 100  # integer arithmetic, no float rounding
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [5]:
# Build a classifier that votes over the 5 nearest neighbours and hand it the
# training portion of the data.
knn = KNN(k=5)
knn.fit(X_train, y_train)

In [6]:
# Label each held-out sample by majority vote among its nearest training samples.
y_pred = knn.predict(X_test)

In [7]:
# Put the true labels and the predicted labels side by side for a quick
# visual check of where the classifier disagrees with the ground truth.
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
compared = pd.DataFrame(comparison_columns)
print(compared)

     Actual  Predicted
0         1          1
1         0          0
2         0          0
3         0          0
4         1          0
..      ...        ...
149       0          1
150       0          0
151       0          0
152       1          1
153       0          0

[154 rows x 2 columns]


In [8]:
def accuracy(y_pred,y_test):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels, same shape as ``y_pred``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs differ in shape, or if they are empty (the ratio is
        undefined for zero samples).
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result, which would give a meaningless score.
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    if y_pred.shape != y_test.shape:
        raise ValueError("y_pred and y_test must have the same shape")
    if y_test.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.sum(y_pred == y_test)/len(y_test)

# Score the KNN predictions once, then report the result both as a fraction
# and as a percentage.
knn_accuracy = accuracy(y_pred, y_test)
print("Accuracy: ", knn_accuracy)
print("accuracy percentage for KNN classifier: ", knn_accuracy*100, "%")

Accuracy:  0.7077922077922078
accuracy percentage for KNN classifier:  70.77922077922078 %


### K-means Clustering

In [9]:
def kmeans(X, n_clusters, n_init, random_state):
    """Cluster the rows of ``X`` with k-means and return each row's cluster index.

    The algorithm alternates between assigning every row to its nearest
    centroid and moving each centroid to the mean of its assigned rows, until
    the centroids stop changing or the iteration budget is used up.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    n_clusters : int
        Number of clusters.  The initial centroids are distinct rows of ``X``
        sampled without replacement, so this must not exceed ``n_samples``.
    n_init : int
        Upper bound on the number of assign/update iterations.  Note that,
        despite the name, this is NOT a number of random restarts.
    random_state : int or None
        Seed for the initial centroid draw.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Cluster index in ``range(n_clusters)`` for every row of ``X``.
    """
    X = np.asarray(X, dtype=float)

    # A private legacy generator produces the same draw as np.random.seed()
    # followed by np.random.choice(), without reseeding the global NumPy RNG.
    rng = np.random.RandomState(random_state)
    centroids = X[rng.choice(X.shape[0], n_clusters, replace=False)]

    def nearest_centroid(current_centroids):
        # (n_samples, n_clusters) distance table -> index of the closest centroid.
        gaps = np.linalg.norm(X[:, np.newaxis] - current_centroids, axis=2)
        return np.argmin(gaps, axis=1)

    labels = None
    for _ in range(n_init):
        labels = nearest_centroid(centroids)

        new_centroids = centroids.copy()
        for i in range(n_clusters):
            members = X[labels == i]
            # An empty cluster keeps its previous centroid.  Taking the mean of
            # zero rows would give NaN, and argmin then sends every point to
            # the NaN centroid on the next pass.
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    if labels is None:
        # No iteration ran (n_init <= 0): label against the initial centroids.
        labels = nearest_centroid(centroids)
    return labels

In [10]:
# Clustering configuration: two clusters, at most 100 assign/update
# iterations, and a fixed seed for the initial centroid draw.
n_clusters, n_init, random_state = 2, 100, 0
y_kmeans = kmeans(X, n_clusters=n_clusters, n_init=n_init, random_state=random_state)

In [11]:
# Cluster ids produced by k-means are arbitrary: cluster 0 is not guaranteed to
# correspond to class 0.  With two clusters the only other possible matching is
# the swapped one, so score both and report the better of the two.
# NOTE(review): this shortcut assumes n_clusters == 2 and that y holds 0/1
# labels, as the printed KNN comparison suggests — confirm before reusing it.
kmeans_accuracy = max(accuracy(y_kmeans, y), accuracy(1 - y_kmeans, y))
print("Accuracy: ", kmeans_accuracy)
print("accuracy percentage for k-means clustering: ", kmeans_accuracy*100, "%")

Accuracy:  0.66015625
accuracy percentage for k-means clustering:  66.015625 %
