In [215]:
import numpy as np
import pandas as pd
import sympy as sp

In [216]:
def covariance(X):
    mean = np.mean(X,axis=0)
    X = X - mean
    return np.dot(X.T, X)/(X.shape[0]-1)

In [217]:
class PCA:
    def __init__(self, cumvar_threshold=0.95):
        self.cumvar_threshold = cumvar_threshold
        self.components = None
        self.explained_variance = None
        self.explained_variance_ratio = None
        self.cumulative_variance_ratio = None
        self.n_components = None
        
    def fit(self, X):
        # covariance matrix
        cov = covariance(X)
        # eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eig(cov)
        # sort eigenvalues and eigenvectors
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        # explained variance
        self.explained_variance = eigenvalues
        self.explained_variance_ratio = eigenvalues / eigenvalues.sum()
        self.cumulative_variance_ratio = np.cumsum(self.explained_variance_ratio)
        # number of components
        self.n_components = np.argmax(self.cumulative_variance_ratio >= self.cumvar_threshold) + 1
        # components
        self.components = eigenvectors[:, :self.n_components]
    
    def transform(self, X):
        return np.dot(X, self.components)

In [218]:
def euclidean_distance(X_train, X_test):
    dist = np.zeros((len(X_test), len(X_train)))
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))
    return dist

In [219]:
class KNN:
    def __init__(self, k = 5):
        self.k = k
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        self.distances = euclidean_distance(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            pred.append(np.unique(k_nearest_labels)[np.argmax(np.unique(k_nearest_labels, return_counts=True)[1])])
        return np.array(pred)

In [220]:
df = pd.read_csv('face.csv')

In [221]:
classes = df['target'].unique()
train = pd.DataFrame()
test = pd.DataFrame()

for i in classes:
    train = pd.concat([train, df[df['target'] == i].iloc[2:]], ignore_index=True)
    test = pd.concat([test, df[df['target'] == i].iloc[:2]], ignore_index=True)

print(test)

           0         1         2         3         4         5         6  \
0   0.309917  0.367769  0.417355  0.442149  0.528926  0.607438  0.657025   
1   0.454545  0.471074  0.512397  0.557851  0.595041  0.640496  0.681818   
2   0.541322  0.586777  0.640496  0.661157  0.685950  0.685950  0.690083   
3   0.644628  0.690083  0.702479  0.702479  0.706612  0.719008  0.727273   
4   0.578512  0.603306  0.632231  0.665289  0.677686  0.710744  0.723140   
..       ...       ...       ...       ...       ...       ...       ...   
75  0.144628  0.219008  0.326446  0.471074  0.570248  0.644628  0.677686   
76  0.252066  0.219008  0.227273  0.272727  0.318182  0.388430  0.458678   
77  0.355372  0.392562  0.446281  0.462810  0.475207  0.491736  0.500000   
78  0.545455  0.611570  0.640496  0.657025  0.636364  0.648760  0.690083   
79  0.334711  0.404959  0.475207  0.537190  0.561983  0.553719  0.586777   

           7         8         9  ...      4087      4088      4089      4090  \
0   0.

In [222]:
X_train = train.drop(['target'], axis=1).values
y_train = train['target'].values
X_test = test.drop(['target'], axis=1).values
y_test = test['target'].values

In [223]:
print(X_train)

[[0.3181818  0.40082645 0.49173555 ... 0.14049587 0.14876033 0.15289256]
 [0.1983471  0.19421488 0.19421488 ... 0.75206614 0.75206614 0.73966944]
 [0.5        0.54545456 0.58264464 ... 0.17768595 0.17355372 0.17355372]
 ...
 [0.5        0.53305787 0.607438   ... 0.17768595 0.14876033 0.19008264]
 [0.21487603 0.21900827 0.21900827 ... 0.57438016 0.59090906 0.60330576]
 [0.5165289  0.46280992 0.28099173 ... 0.35950413 0.3553719  0.38429752]]


In [224]:
pca = PCA(0.95)
pca.fit(X_train)

In [225]:
X_train1 = pca.transform(X_train)
print(X_train1.shape)

(320, 111)


In [226]:
X_test1 = pca.transform(X_test)
print(X_test1.shape)

(80, 111)


In [227]:
knn = KNN(5)
knn.fit(X_train1, y_train)

In [228]:
y_pred = knn.predict(X_test1)

  dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))


In [229]:
compared = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(compared)

    Actual  Predicted
0        0          0
1        0          0
2        1          1
3        1          1
4        2         39
..     ...        ...
75      37         37
76      38         38
77      38         38
78      39         39
79      39          4

[80 rows x 2 columns]


In [230]:
def accuracy(y_pred,y_test):
    return np.sum(y_pred == y_test)/len(y_test)

print("Accuracy: ", accuracy(y_pred, y_test))
print("accuracy percentage: ", accuracy(y_pred, y_test)*100, "%")

Accuracy:  0.8625
accuracy percentage:  86.25 %
