In [1]:
import numpy as np
import sympy as sp
import pandas as pd

### Linear Discriminant Analysis (LDA)

In [2]:
class LDA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.X_train = None
        self.y_train = None
        self.Sw = None
        self.Sb = None
        self.eigen_values = None
        self.eigen_vectors = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        Sw = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        Sb = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        Sw = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m = np.mean(self.X_train[self.y_train == self.y_train[i]])
            Sw += np.dot((x - m), (x - m).T)
        m = np.mean(self.X_train)
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m_i = np.mean(self.X_train[self.y_train == self.y_train[i]])
            Sb += len(self.X_train[self.y_train == self.y_train[i]]) * np.dot((m_i - m), (m_i - m).T)
        self.Sw = Sw
        self.Sb = Sb
        A = np.dot(np.linalg.inv(Sw), Sb)
        eigen_values, eigen_vectors = np.linalg.eig(A)
        eigen_vectors = eigen_vectors.T
        idx = np.argsort(abs(eigen_values))[::-1]
        eigen_vectors = eigen_vectors[idx]
        eigen_values = eigen_values[idx]
        self.eigen_values = eigen_values[0:self.n_components]
        self.eigen_vectors = eigen_vectors[0:self.n_components]

    def transform(self, X):
        return np.dot(X, self.eigen_vectors.T)

In [3]:
def euclidean_distance(X_train, X_test):
    dist = np.zeros((len(X_test), len(X_train)))
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))
    return dist

In [4]:
def covariance(X):
    mean = np.mean(X,axis=0)
    X = X - mean
    return np.dot(X.T, X)/(X.shape[0]-1)

In [5]:
class KNN:
    def __init__(self, k = 5):
        self.k = k
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        self.distances = euclidean_distance(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            pred.append(np.unique(k_nearest_labels)[np.argmax(np.unique(k_nearest_labels, return_counts=True)[1])])
        return np.array(pred)

In [6]:
data = pd.read_csv('gender.csv')

In [7]:
classes = data.iloc[:, 1].unique()
train = pd.DataFrame()
test = pd.DataFrame()

for i in classes:
    train = pd.concat([train, data[data.iloc[:, 1] == i].iloc[10:]], ignore_index=True)
    test = pd.concat([test, data[data.iloc[:, 1] == i].iloc[:10]], ignore_index=True)

print(test)

    Unnamed: 0 Unnamed: 1         0         1         2         3         4  \
0            1       male -0.066420  0.151611  0.027740  0.052771 -0.066105   
1            2       male -0.030614  0.049667  0.008084 -0.050324  0.007649   
2            3       male -0.096178  0.061127  0.035326 -0.035388 -0.090728   
3            4       male -0.103057  0.085044  0.078333 -0.035873 -0.028163   
4            5       male -0.125815  0.120046  0.023131 -0.042901  0.038215   
5            6       male -0.149119  0.125288  0.142323 -0.009087 -0.031394   
6            7       male -0.139035  0.073513 -0.001770 -0.034225 -0.101610   
7            8       male -0.074126 -0.000669  0.004166 -0.082413 -0.096091   
8            9       male -0.166220  0.042769 -0.031647 -0.036892 -0.143837   
9           10       male -0.185770  0.154008  0.073184 -0.070829 -0.144617   
10         400     female  0.039844  0.070357  0.130196 -0.007683 -0.077825   
11         401     female  0.001747  0.185678  0.073

In [8]:
X_train = train.iloc[:, 2:].values
y_train = train.iloc[:, 1].values
X_test = test.iloc[:, 2:].values
y_test = test.iloc[:, 1].values
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")

X_train: 780
X_test: 20


In [9]:
lda = LDA(1)
lda.fit(X_train, y_train)

In [10]:
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")
print(X_train)

X_train: 780
X_test: 20
[[2.44600149e-06+0.j]
 [2.34091353e-06+0.j]
 [2.15523648e-06+0.j]
 [2.35880176e-06+0.j]
 [2.36760754e-06+0.j]
 [2.72282453e-06+0.j]
 [2.35996022e-06+0.j]
 [2.23167874e-06+0.j]
 [1.95743297e-06+0.j]
 [2.56550604e-06+0.j]
 [2.10378168e-06+0.j]
 [2.36028426e-06+0.j]
 [1.95646622e-06+0.j]
 [2.14526410e-06+0.j]
 [2.15403782e-06+0.j]
 [2.04833888e-06+0.j]
 [2.23740913e-06+0.j]
 [2.22047386e-06+0.j]
 [2.29610920e-06+0.j]
 [2.09869820e-06+0.j]
 [2.26044357e-06+0.j]
 [2.63529580e-06+0.j]
 [2.04368301e-06+0.j]
 [1.95623039e-06+0.j]
 [2.20509096e-06+0.j]
 [2.45267106e-06+0.j]
 [2.33065195e-06+0.j]
 [2.49606368e-06+0.j]
 [2.34590489e-06+0.j]
 [2.32396325e-06+0.j]
 [2.19421819e-06+0.j]
 [2.21273452e-06+0.j]
 [2.61549480e-06+0.j]
 [2.57753238e-06+0.j]
 [2.36552782e-06+0.j]
 [2.21110201e-06+0.j]
 [1.95641154e-06+0.j]
 [3.04672081e-06+0.j]
 [2.69177993e-06+0.j]
 [2.13894617e-06+0.j]
 [2.28188878e-06+0.j]
 [2.32001551e-06+0.j]
 [2.76598170e-06+0.j]
 [2.33225098e-06+0.j]
 [2.2215

In [11]:
knn = KNN(3)
knn.fit(X_train, y_train)

In [12]:
y_pred = knn.predict(X_test)
compared = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(compared)

    Actual Predicted
0     male      male
1     male      male
2     male      male
3     male      male
4     male      male
5     male      male
6     male      male
7     male      male
8     male      male
9     male      male
10  female      male
11  female      male
12  female    female
13  female    female
14  female    female
15  female    female
16  female    female
17  female    female
18  female    female
19  female    female


  dist[i,j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))


In [13]:
def accuracy(y_pred,y_test):
    return np.sum(y_pred == y_test)/len(y_test)

print("Accuracy: ", accuracy(y_pred, y_test))
print("accuracy percentage: ", accuracy(y_pred, y_test)*100, "%")

Accuracy:  0.9
accuracy percentage:  90.0 %
