# Visualize average images

20153029 Minji Kim



### 1. Load MNIST training dataset.

 The MNIST dataset stored in "mnist_train.csv" is a large database of handwritten digits. Each image is a 28x28 matrix of grayscale intensity levels. The file contains 60,000 images, one per row: the first column stores the digit label and the remaining columns store the pixel values of the image.
    
 To visualize the average images of this dataset, the data is first rearranged. The values in the first column are stored in list_label, and the remaining columns of each row are normalized to [0, 1] with the normalize function and stored as one column of list_image.

In [None]:
# import matplotlib.pyplot as plt
import numpy as np
from math import sqrt

file_data   = "mnist_train.csv"

# the context manager closes the file even if reading raises
with open(file_data, "r") as handle_file:
    data = handle_file.readlines()

size_row    = 28    # height of the image
size_col    = 28    # width of the image

num_image   = len(data)     # number of lines read from the file
count       = 0     # count for the number of images


#
# normalize the values of the input data to be [0, 1]
#
def normalize(data):
    """Rescale the values of ``data`` linearly so they span [0, 1].

    Parameters
    ----------
    data : array-like of numbers
        Values to rescale; converted to a float array.

    Returns
    -------
    numpy.ndarray
        ``(data - min) / (max - min)``. When every value is identical the
        span is zero and an array of zeros is returned instead of NaN.

    Raises
    ------
    ValueError
        If ``data`` is empty (it has no minimum or maximum).
    """
    data = np.asarray(data, dtype=float)
    low = data.min()
    span = data.max() - low

    # a constant input (e.g. an all-black image) would otherwise give 0/0 = NaN
    if span == 0:
        return np.zeros_like(data)

    return (data - low) / span


#
# make a matrix each column of which represents an images in a vector form 
#
# one column per image (size_row * size_col values each), one label per image
list_image  = np.empty((size_row * size_col, num_image), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    # the first field of a line is the label, the remaining fields are the image values
    line_data   = line.split(',')
    label       = line_data[0]
    # np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the same conversion
    im_vector   = np.asarray(line_data[1:], dtype=float)
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[:, count]    = im_vector

    count += 1

### 2. Compute the average images for each label (digit) based on L2-norm.

To compute the average images, the dataset is first sorted by label. The number of image vectors for each label is then counted, and the cumulative sum of these counts gives the range of columns that each label occupies in the sorted matrix. Using the counts and their cumulative sums, the dataset is split by label, and the average image of each label is computed pixel by pixel with the RMS formula below.

$$\mathrm{rms}(x) = \sqrt{\frac{x_1^2 + \cdots + x_n^2}{n}} = \frac{\lVert x \rVert_2}{\sqrt{n}}$$

In [None]:
#
# Sort vectors by label
#
# idx is the ordering of the images that puts their labels in ascending order
idx = list_label.argsort()
# apply the same ordering to the labels and to the image columns
list_label = np.take(list_label, idx)
list_image = np.take(list_image, idx, axis=1)


#
# Number and Cumulative sum of image_vectors by Label
#
def vector_count(x, y):
    """Count how many entries of ``x`` are equal to ``y``.

    Parameters
    ----------
    x : array-like
        Sequence of values (for example the label array).
    y : scalar
        Value to count.

    Returns
    -------
    int
        Number of entries of ``x`` that compare equal to ``y``.
    """
    # compare the whole input at once rather than indexing a fixed range(60000),
    # so inputs of any length (including empty) are counted correctly
    return int(np.count_nonzero(np.asarray(x) == y))

# label_count[0] is a leading 0 so that the running total below starts at 0;
# label_count[d + 1] is the number of images whose label is d
label_count = [0]
label_count.extend(vector_count(list_label, digit) for digit in range(10))

# label_cumsum[d] is the total of the counts for all digits smaller than d
label_cumsum = np.cumsum(label_count)

#
# Calculate rms_image : average images
#

# one column per digit: the pixel-wise RMS over all images with that label
rms_image  = np.empty((size_row * size_col, 10), dtype=float)
for i in range(10):
    # in the label-sorted matrix, digit i occupies columns
    # label_cumsum[i] .. label_cumsum[i+1] - 1; the slice start is inclusive,
    # so adding 1 to it would drop the first image of every digit
    begin, end = label_cumsum[i], label_cumsum[i + 1]
    # rms = ||x||_2 / sqrt(n), taken along the image axis for every pixel
    rms_image[:, i] = np.linalg.norm(list_image[:, begin:end], axis=1, ord=2) / sqrt(label_count[i + 1])

### 3. Visualize the average images.

In [12]:
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt

file_data   = "mnist_train.csv"

# the context manager closes the file even if reading raises
with open(file_data, "r") as handle_file:
    data = handle_file.readlines()

size_row    = 28    # height of the image
size_col    = 28    # width of the image

num_image   = len(data)     # number of lines read from the file
count       = 0     # count for the number of images


#
# normalize the values of the input data to be [0, 1]
#
def normalize(data):
    """Rescale the values of ``data`` linearly so they span [0, 1].

    Parameters
    ----------
    data : array-like of numbers
        Values to rescale; converted to a float array.

    Returns
    -------
    numpy.ndarray
        ``(data - min) / (max - min)``. When every value is identical the
        span is zero and an array of zeros is returned instead of NaN.

    Raises
    ------
    ValueError
        If ``data`` is empty (it has no minimum or maximum).
    """
    data = np.asarray(data, dtype=float)
    low = data.min()
    span = data.max() - low

    # a constant input (e.g. an all-black image) would otherwise give 0/0 = NaN
    if span == 0:
        return np.zeros_like(data)

    return (data - low) / span


#
# make a matrix each column of which represents an images in a vector form 
#
# one column per image (size_row * size_col values each), one label per image
list_image  = np.empty((size_row * size_col, num_image), dtype=float)
list_label  = np.empty(num_image, dtype=int)

for line in data:

    # the first field of a line is the label, the remaining fields are the image values
    line_data   = line.split(',')
    label       = line_data[0]
    # np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the same conversion
    im_vector   = np.asarray(line_data[1:], dtype=float)
    im_vector   = normalize(im_vector)

    list_label[count]       = label
    list_image[:, count]    = im_vector

    count += 1

    
#
# Sort vectors by label
#
# idx is the ordering of the images that puts their labels in ascending order
idx = list_label.argsort()
# apply the same ordering to the labels and to the image columns
list_label = np.take(list_label, idx)
list_image = np.take(list_image, idx, axis=1)



#
# Number and Cumulative sum of image_vectors by Label
#
def vector_count(x, y):
    """Count how many entries of ``x`` are equal to ``y``.

    Parameters
    ----------
    x : array-like
        Sequence of values (for example the label array).
    y : scalar
        Value to count.

    Returns
    -------
    int
        Number of entries of ``x`` that compare equal to ``y``.
    """
    # compare the whole input at once rather than indexing a fixed range(60000),
    # so inputs of any length (including empty) are counted correctly
    return int(np.count_nonzero(np.asarray(x) == y))

# label_count[0] is a leading 0 so that the running total below starts at 0;
# label_count[d + 1] is the number of images whose label is d
label_count = [0]
label_count.extend(vector_count(list_label, digit) for digit in range(10))

# label_cumsum[d] is the total of the counts for all digits smaller than d
label_cumsum = np.cumsum(label_count)

#
# Calculate rms_image : average images
#

# one column per digit: the pixel-wise RMS over all images with that label
rms_image  = np.empty((size_row * size_col, 10), dtype=float)
for i in range(10):
    # in the label-sorted matrix, digit i occupies columns
    # label_cumsum[i] .. label_cumsum[i+1] - 1; the slice start is inclusive,
    # so adding 1 to it would drop the first image of every digit
    begin, end = label_cumsum[i], label_cumsum[i + 1]
    # rms = ||x||_2 / sqrt(n), taken along the image axis for every pixel
    rms_image[:, i] = np.linalg.norm(list_image[:, begin:end], axis=1, ord=2) / sqrt(label_count[i + 1])


#
# Plot
#
                  
# draw the ten per-digit images in a 2 x 5 grid, one panel per label
f1 = plt.figure(1)

for i in range(10):

    label       = i
    im_vector   = rms_image[:, i]
    # turn the flat vector back into a size_row x size_col image
    im_matrix   = im_vector.reshape((size_row, size_col))

    # plt.subplot returns the Axes it makes current, so keep it as the frame
    frame = plt.subplot(2, 5, i + 1)
    plt.title(label)
    plt.imshow(im_matrix, cmap='Greys', interpolation='None')

    # hide both axes of the panel
    for axis in (frame.xaxis, frame.yaxis):
        axis.set_visible(False)

plt.show()


MemoryError: 