Minji Kim 20153029

# MNIST binary classifier based on k random features

Build a binary classifier based on $k$ random features for each digit against all the other digits at MNIST dataset.

Let $x = (x_1, x_2, ... , x_m)$ be a vector representing an image in the dataset.

The prediction function $f_d(x; w)$ is defined by the linear combination of input vector $x$ and the model parameter $w$ for each digit $d$ :
$$f_d(x; w) = w_0 \times 1 + w_1 \times g_1 + w_2 \times g_2 + ... + w_k \times g_k$$

where $w = (w_0, w_1, ... , w_k)$ and the basis function $g_k$ is defined by the inner product of random vector $r_k$ and input vector $x$. $(k=10,100,1000,5000)$

You may want to try to use g_k = max( inner production( r_k, x ), 0 ) to see if it improves the performance.

The prediction function $f_d(x; w)$ should have the following values:
$$f_d(x; w) = +1\qquad if\ label(x) = d$$
$$f_d(x; w) = -1\qquad if\ label(x)\ is\ not\ d$$

The optimal model parameter w is obtained by minimizing the following objective function for each digit $d$ :
$$\sum_i ( f_d(x^{(i)}; w) - y^{(i)} )^2$$

and the label of input $x$ is given by:
$$argmax_d f_d(x; w)$$

1. Compute an optimal model parameter using the training dataset for each classifier $f_d(x, w)$
2. Compute (1) true positive rate, (2) error rate using (1) training dataset and (2) testing dataset.

## Compute true positive rate, error rate using training dataset (k = 10)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import *
import pandas as pd
import random

file_data		= "mnist_train.csv"
handle_file	= open(file_data, "r")
data        		= handle_file.readlines()
handle_file.close()

size_row	= 28    # height of the image
size_col  	= 28    # width of the image

num_image	= len(data)
count       	= 0     # count for the number of images
np.random.seed(0)


#
# Normalize the values of the input data to be [0, 1]
#
def normalize(data):

    data_normalized = (data - np.mean(data, axis=0)) / (np.std(data, axis=0))

    return(data_normalized)


#
# Make a matrix each column of which represents an images in a vector form 
#
list_image  = np.ones((num_image,(size_row * size_col)), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    line_data   = line.split(',')
    label       = line_data[0]
    im_vector   = np.asfarray(line_data[1:])
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[count, :]    = im_vector 

    count += 1


#
# Approximating polynomial function obtained by solving a least square problem
#

k = 10

r_k10 = np.random.randn((size_row * size_col),k)

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k10
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,3439,463,107,368,230,45,617,247,300,107
1,5,4745,197,390,35,0,178,1134,34,24
2,313,285,2037,296,789,2,797,811,528,100
3,377,562,392,3487,211,76,101,565,236,124
4,377,313,380,141,2840,11,1078,273,116,313
5,1091,1159,164,1264,400,92,324,438,332,157
6,317,173,346,38,370,0,4307,178,175,14
7,165,1251,240,495,397,22,111,3210,223,151
8,482,772,341,987,184,23,310,1131,1482,139
9,463,672,312,745,1172,44,647,1216,187,491


In [2]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.4355
Error Rate = 0.5645


## Compute true positive rate, error rate using training dataset (k = 100)

In [3]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 100

r_k100 = np.random.randn((size_row * size_col),k)

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k100
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,5397,6,46,51,41,125,131,33,79,14
1,1,6498,74,29,12,41,28,25,29,5
2,133,201,4742,137,85,35,235,172,165,53
3,98,111,307,4660,24,337,79,144,281,90
4,54,122,67,13,4483,41,232,120,120,590
5,368,115,107,567,131,3437,237,99,302,58
6,143,92,104,16,107,94,5308,11,28,15
7,84,150,88,42,125,17,33,5518,30,178
8,81,308,218,430,114,212,124,56,4165,143
9,77,112,38,113,568,67,57,473,124,4320


In [4]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.8088
Error Rate = 0.1912


## Compute true positive rate, error rate using training dataset (k = 1000)

In [5]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 1000

r_k1000 = np.random.randn((size_row * size_col),k)

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k1000
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,5813,3,9,5,4,15,35,4,32,3
1,0,6628,42,12,12,9,8,9,14,8
2,43,21,5570,43,47,9,30,73,112,10
3,18,10,90,5709,6,88,21,52,82,55
4,11,31,23,1,5542,4,30,11,20,169
5,40,13,15,91,20,5071,82,11,38,40
6,34,12,14,0,22,56,5755,0,21,4
7,16,41,50,13,65,6,3,5932,17,122
8,23,51,51,84,33,93,52,15,5382,67
9,24,15,18,76,104,25,8,100,47,5532


In [6]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.9489
Error Rate = 0.0511


## Compute true positive rate, error rate using training dataset (k = 5000)

In [7]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 5000

r_k5000 = np.random.randn((size_row * size_col),k)

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k5000
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,5880,2,2,2,2,3,11,0,17,4
1,0,6681,24,4,12,0,3,8,9,1
2,10,5,5863,10,8,1,1,23,33,4
3,2,1,35,5979,1,29,1,21,38,24
4,3,11,7,0,5744,1,12,2,9,53
5,8,3,8,26,3,5324,27,0,14,8
6,10,5,1,0,2,20,5868,0,11,1
7,8,17,21,5,17,1,0,6161,5,30
8,9,17,11,16,7,14,9,4,5741,23
9,13,2,5,37,41,16,1,35,20,5779


In [8]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.9836666666666667
Error Rate = 0.01633333333333333


## Compute true positive rate, error rate using testing dataset (k = 10)

In [9]:
file_data		= "mnist_test.csv"
handle_file	= open(file_data, "r")
data        		= handle_file.readlines()
handle_file.close()

num_image	= len(data)
count       	= 0     # count for the number of images


#
# Make a matrix each column of which represents an images in a vector form 
#
list_image  = np.ones((num_image,(size_row * size_col)), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    line_data   = line.split(',')
    label       = line_data[0]
    im_vector   = np.asfarray(line_data[1:])
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[count, :]    = im_vector 

    count += 1


#
# Approximating polynomial function obtained by solving a least square problem
#

k = 10

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k10
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,542,100,12,35,27,26,118,34,46,40
1,0,853,43,42,7,0,9,159,5,17
2,40,54,394,60,127,0,110,145,73,29
3,38,101,85,576,24,34,12,77,38,25
4,61,79,76,23,419,0,172,45,14,93
5,146,198,29,200,62,42,43,81,53,38
6,55,41,55,9,53,0,678,31,29,7
7,26,177,80,75,71,4,10,530,29,26
8,85,100,86,198,28,15,41,190,193,38
9,65,94,62,132,187,12,107,194,30,126


In [10]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.4353
Error Rate = 0.5647


## Compute true positive rate, error rate using testing dataset (k = 100)

In [11]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 100

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k100
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,909,0,7,8,4,14,17,5,15,1
1,0,1110,8,5,3,1,1,4,3,0
2,24,34,852,22,9,6,30,26,23,6
3,9,18,41,815,4,34,9,29,38,13
4,9,24,12,2,760,5,40,11,16,103
5,51,21,9,103,18,579,26,12,59,14
6,36,18,15,1,19,19,843,2,4,1
7,11,26,27,7,23,1,6,908,1,18
8,18,32,29,49,22,29,21,12,745,17
9,14,22,3,12,103,11,10,68,30,736


In [12]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.8257
Error Rate = 0.1743


## Compute true positive rate, error rate using testing dataset (k = 1000)

In [13]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 1000

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k1000
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,972,1,0,1,0,1,0,1,4,0
1,0,1127,5,0,0,0,2,1,0,0
2,6,0,999,1,7,0,4,6,8,1
3,0,0,5,979,0,5,0,6,12,3
4,0,2,3,0,949,0,4,0,1,23
5,6,0,0,8,0,865,7,0,3,3
6,4,3,1,0,2,5,942,1,0,0
7,1,7,9,1,3,1,3,987,1,15
8,0,2,5,8,3,9,2,4,938,3
9,4,5,0,5,9,4,1,5,4,972


In [14]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 0.973
Error Rate = 0.027


## Compute true positive rate, error rate using testing dataset (k = 5000)

In [15]:
#
# Approximating polynomial function obtained by solving a least square problem
#

k = 5000

A = np.ones((num_image,k+1), dtype=float)
A[:,1:] = list_image@r_k5000
A[A < 0] = 0
 

Y  = np.empty((num_image, 10), dtype=int)
for i in range(num_image):
    for j in range(10):
        if(list_label[i]==j):
            Y[i][j] = 1
        else:
            Y[i][j] = -1


theta = inv(A.T@A)@(A.T@Y)

PF = A@theta

Estimaite_label = np.empty(num_image, dtype=int)
for i in range(num_image):
    Estimaite_label[i] = np.argmax(PF[i,:])


#
# Compute the TP rate and Error rate using the train dataset
#

count = np.zeros((10, 10), dtype=int)

for i in range(num_image):
    count[list_label[i],Estimaite_label[i]] += 1


Result = pd.DataFrame(data=count, columns=['E0','E1', 'E2','E3','E4','E5','E6', 'E7', 'E8','E9'])
Result

Unnamed: 0,E0,E1,E2,E3,E4,E5,E6,E7,E8,E9
0,980,0,0,0,0,0,0,0,0,0
1,0,1135,0,0,0,0,0,0,0,0
2,0,0,1032,0,0,0,0,0,0,0
3,0,0,0,1010,0,0,0,0,0,0
4,0,0,0,0,982,0,0,0,0,0
5,0,0,0,0,0,892,0,0,0,0
6,0,0,0,0,0,0,958,0,0,0
7,0,0,0,0,0,0,0,1028,0,0
8,0,0,0,0,0,0,0,0,974,0
9,0,0,0,0,0,0,0,0,0,1009


In [16]:
TP_rate = np.trace(count) / num_image
Error_rate = (num_image - np.trace(count)) / num_image

print("TP Rate =",TP_rate)
print("Error Rate =", Error_rate)

TP Rate = 1.0
Error Rate = 0.0
