20153029 Minji Kim

# MNIST Binary Classifier

Build a binary classifier to classify digit 0 against all the other digits at MNIST dataset.

Let $x = (x_1, x_2, ... , x_m)$ be a vector representing an image in the dataset.

The prediction function $f_w(x)$ is defined by the linear combination of data $(1, x)$ and the model parameter $w$:
$$f_w(x) = w_0 \times 1 + w_1 \times x_1 + w_2 \times x_2 + ... + w_m \times x_m$$ 
$$where \ w = (w_0, w_1, ... , w_m)$$

The prediction function $f_w(x)$ should have the following values:
$$f_w(x) = +1 \qquad if\ label(x) = 0$$
$$f_w(x) = -1 \qquad if\ label(x)\ is\ not\ 0$$

The optimal model parameter w is obtained by minimizing the following objective function:
$$\sum_i ( f_w(x^{(i)} - y^{(i)} )^2$$



## Compute an optimal model parameter using the training dataset

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import *
import pandas as pd

file_data		= "mnist_train.csv"
handle_file	= open(file_data, "r")
data        		= handle_file.readlines()
handle_file.close()

size_row	= 28    # height of the image
size_col  	= 28    # width of the image

num_image	= len(data)
count       	= 0     # count for the number of images

#
# Normalize the values of the input data to be [0, 1]
#
def normalize(data):

    data_normalized = (data - min(data)) / (max(data) - min(data))

    return(data_normalized)


#
# Make a matrix each column of which represents an images in a vector form 
#
list_image  = np.ones((num_image,(size_row * size_col)+1), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    line_data   = line.split(',')
    label       = line_data[0]
    im_vector   = np.asfarray(line_data[1:])
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[count, 1:]    = im_vector 

    count += 1


#
# Approximating polynomial function obtained by solving a least square problem
#
Y  = []
for i in range(num_image):
    if(list_label[i]==0):
        Y.append(1)
    else:
        Y.append(-1)

theta = pinv(list_image.T@list_image)@list_image.T@Y

PF = list_image@theta

Estimaite_Y = []
for i in range(num_image):
    if(PF[i]>0):
        Estimaite_Y.append(1)
    else:
        Estimaite_Y.append(-1)

## Compute accuracy using training dataset

Compute (1) True Positive, (2) False Positive, (3) True Negative, (4) False Negative based on the computed optimal model paramete

In [3]:
 #
# Compute the accuracy using the train dataset
#

a = 0
b = 0
c = 0
d = 0

for i in range(num_image):
    if(Y[i] == 1 and Estimaite_Y[i] == 1):
        a += 1
    if(Y[i] == 1 and Estimaite_Y[i] == -1):
        b += 1
    if(Y[i] == -1 and Estimaite_Y[i] == 1):
        c += 1
    if(Y[i] == -1 and Estimaite_Y[i] == -1):
        d += 1

TP = a/(a+b)
FP = c/(c+d)
TN = d/(c+d)
FN = b/(a+b)

In [4]:
Result1 = pd.DataFrame(data=[[a,b,a+b],[c,d,c+d],[a+c,b+d,a+b+c+d]], index=['Y = 1', 'Y = -1', 'All'],
                     columns=['Estimaite_Y = 1', 'Estimaite_Y = -1','Total'])
Result1

Unnamed: 0,Estimaite_Y = 1,Estimaite_Y = -1,Total
Y = 1,5167,756,5923
Y = -1,179,53898,54077
All,5346,54654,60000


In [5]:
Result2 = pd.DataFrame(data=[[TP,FP],[TN,FN]], index=['Positive','Negative'],
                     columns=['True','False'])
Result2

Unnamed: 0,True,False
Positive,0.872362,0.00331
Negative,0.99669,0.127638


## Compute accuracy using testing dataset

Compute (1) True Positive, (2) False Positive, (3) True Negative, (4) False Negative based on the computed optimal model paramete

In [6]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import *
import pandas as pd

file_data		= "mnist_test.csv"
handle_file	= open(file_data, "r")
data        		= handle_file.readlines()
handle_file.close()

num_image	= len(data)
count       	= 0     # count for the number of images


#
# Make a matrix each column of which represents an images in a vector form 
#
list_image  = np.ones((num_image,(size_row * size_col)+1), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    line_data   = line.split(',')
    label       = line_data[0]
    im_vector   = np.asfarray(line_data[1:])
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[count, 1:]    = im_vector 

    count += 1


#
# Approximating polynomial function obtained by solving a least square problem
#
Y  = []
for i in range(num_image):
    if(list_label[i]==0):
        Y.append(1)
    else:
        Y.append(-1)

theta = pinv(list_image.T@list_image)@list_image.T@Y

PF = list_image@theta

Estimaite_Y = []
for i in range(num_image):
    if(PF[i]>0):
        Estimaite_Y.append(1)
    else:
        Estimaite_Y.append(-1)

        
#
# Compute the accuracy using the train dataset
#

a = 0
b = 0
c = 0
d = 0

for i in range(num_image):
    if(Y[i] == 1 and Estimaite_Y[i] == 1):
        a += 1
    if(Y[i] == 1 and Estimaite_Y[i] == -1):
        b += 1
    if(Y[i] == -1 and Estimaite_Y[i] == 1):
        c += 1
    if(Y[i] == -1 and Estimaite_Y[i] == -1):
        d += 1

TP = a/(a+b)
FP = c/(c+d)
TN = d/(c+d)
FN = b/(a+b)

In [8]:
Result1 = pd.DataFrame(data=[[a,b,a+b],[c,d,c+d],[a+c,b+d,a+b+c+d]], index=['Y = 1', 'Y = -1', 'All'],
                     columns=['Estimaite_Y = 1', 'Estimaite_Y = -1','Total'])
Result1

Unnamed: 0,Estimaite_Y = 1,Estimaite_Y = -1,Total
Y = 1,901,79,980
Y = -1,32,8988,9020
All,933,9067,10000


In [9]:
Result2 = pd.DataFrame(data=[[TP,FP],[TN,FN]], index=['Positive','Negative'],
                     columns=['True','False'])
Result2

Unnamed: 0,True,False
Positive,0.919388,0.003548
Negative,0.996452,0.080612
