In [126]:
import numpy as np
import pandas as pd
import sympy as sp

In [127]:
def covariance(X):
    mean = np.mean(X,axis=0)
    X = X - mean
    return np.dot(X.T, X)/(X.shape[0]-1)

### PCA class

In [128]:
class PCA:
    def __init__(self, cumvar_threshold=0.95):
        self.cumvar_threshold = cumvar_threshold
        self.components = None
        self.explained_variance = None
        self.explained_variance_ratio = None
        self.cumulative_variance_ratio = None
        self.n_components = None
        
    def fit(self, X):
        # covariance matrix
        cov = covariance(X)
        # eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eig(cov)
        # sort eigenvalues and eigenvectors
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        # explained variance
        self.explained_variance = eigenvalues
        self.explained_variance_ratio = eigenvalues / eigenvalues.sum()
        self.cumulative_variance_ratio = np.cumsum(self.explained_variance_ratio)
        # number of components
        self.n_components = np.argmax(self.cumulative_variance_ratio >= self.cumvar_threshold) + 1
        # components
        self.components = eigenvectors[:, :self.n_components]
    
    def transform(self, X):
        return np.dot(X, self.components)

### KNN Classifier

In [129]:
def euclidean_distance(X_train, X_test):
    dist = np.zeros((len(X_test), len(X_train)))
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))
    return dist
class KNN:
    def __init__(self, k = 5):
        self.k = k
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        self.distances = euclidean_distance(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            pred.append(np.unique(k_nearest_labels)[np.argmax(np.unique(k_nearest_labels, return_counts=True)[1])])
        return np.array(pred)

In [130]:
data = pd.read_csv('gender.csv')

In [131]:
classes = data.iloc[:, 1].unique()
train = pd.DataFrame()
test = pd.DataFrame()

for i in classes:
    train = pd.concat([train, data[data.iloc[:, 1] == i].iloc[10:]], ignore_index=True)
    test = pd.concat([test, data[data.iloc[:, 1] == i].iloc[:10]], ignore_index=True)

print(test)

    Unnamed: 0 Unnamed: 1         0         1         2         3         4  \
0            1       male -0.066420  0.151611  0.027740  0.052771 -0.066105   
1            2       male -0.030614  0.049667  0.008084 -0.050324  0.007649   
2            3       male -0.096178  0.061127  0.035326 -0.035388 -0.090728   
3            4       male -0.103057  0.085044  0.078333 -0.035873 -0.028163   
4            5       male -0.125815  0.120046  0.023131 -0.042901  0.038215   
5            6       male -0.149119  0.125288  0.142323 -0.009087 -0.031394   
6            7       male -0.139035  0.073513 -0.001770 -0.034225 -0.101610   
7            8       male -0.074126 -0.000669  0.004166 -0.082413 -0.096091   
8            9       male -0.166220  0.042769 -0.031647 -0.036892 -0.143837   
9           10       male -0.185770  0.154008  0.073184 -0.070829 -0.144617   
10         400     female  0.039844  0.070357  0.130196 -0.007683 -0.077825   
11         401     female  0.001747  0.185678  0.073

In [132]:
X_train = train.iloc[:, 2:].values
y_train = train.iloc[:, 1].values
X_test = test.iloc[:, 2:].values
y_test = test.iloc[:, 1].values

In [133]:
print(X_train)

[[-0.10175994  0.09511936  0.02239008 ...  0.04522717  0.13483205
   0.0537758 ]
 [-0.12695727  0.06544437 -0.01474994 ... -0.02528606 -0.00342875
   0.05703329]
 [ 0.02178704  0.0477692   0.03115616 ... -0.05274343  0.03425189
   0.04634342]
 ...
 [-0.20285167  0.0370395   0.07973114 ...  0.03738441 -0.00625749
   0.03997689]
 [-0.08829999  0.06353012  0.04962703 ...  0.00970074 -0.01694169
   0.04807128]
 [-0.15620135  0.05516458  0.14271647 ... -0.0102984  -0.02885648
   0.0753232 ]]


In [134]:
pca = PCA(0.95)
pca.fit(X_train)

In [135]:
X_train1 = pca.transform(X_train)

In [136]:
print(X_train1)

[[-0.16201447  0.03870741  0.13590151 ...  0.0875488  -0.18615724
  -0.03118321]
 [-0.16369729  0.09623356 -0.26956394 ...  0.13118405 -0.16424507
  -0.03418276]
 [-0.13568836  0.33517518 -0.14235416 ...  0.04097319 -0.20750583
  -0.07148731]
 ...
 [-0.49782625  0.35616263 -0.23839154 ...  0.05620864 -0.2027782
   0.00444246]
 [-0.53388546  0.38099249 -0.1277724  ...  0.0451492  -0.22881357
  -0.03173833]
 [-0.54512133  0.05387415 -0.26936994 ...  0.06487287 -0.18464931
  -0.04050429]]


In [137]:
X_test1 = pca.transform(X_test)

In [138]:
print(X_test1)

[[-0.09864139  0.18567741 -0.18802642 ...  0.09239422 -0.15614565
   0.0072273 ]
 [-0.1110883   0.11430068 -0.32885361 ...  0.09853505 -0.20004937
  -0.05611838]
 [-0.13898797  0.06899293 -0.1259554  ...  0.08764153 -0.21843502
   0.02000268]
 ...
 [-0.57546444  0.02622321  0.0915027  ...  0.09526622 -0.20086673
   0.01346464]
 [-0.51033098  0.32639377 -0.20469761 ...  0.08536315 -0.18382309
   0.01236624]
 [-0.52183469  0.0644349   0.01447099 ...  0.02715599 -0.19544759
   0.08034389]]


In [139]:
knn = KNN(5)
knn.fit(X_train1, y_train)

In [140]:
y_pred = knn.predict(X_test1)

In [141]:
compared = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(compared)

    Actual Predicted
0     male      male
1     male      male
2     male      male
3     male      male
4     male      male
5     male      male
6     male      male
7     male    female
8     male      male
9     male      male
10  female      male
11  female      male
12  female    female
13  female    female
14  female    female
15  female    female
16  female    female
17  female    female
18  female    female
19  female    female


In [142]:
def accuracy(y_pred,y_test):
    return np.sum(y_pred == y_test)/len(y_test)

print("Accuracy: ", accuracy(y_pred, y_test))
print("accuracy percentage: ", accuracy(y_pred, y_test)*100, "%")

Accuracy:  0.85
accuracy percentage:  85.0 %
