In [10]:
import os
import json
import random
from time import clock
from PIL import Image, ImageStat
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

## Data and Preprocessing

#### File Structure

To mitigate space/time complexity, we'll work instead with the Tiny Imagenet dataset, which is a subset of the Imagenet dataset.  

The Tiny Imagenet dataset is stored inside the directory `tiny-imagenet-200`, which itself contains 3 subdirectories: `train`, `val`, and `test`. Inside each of these subdirectories is a set of 200 folders corresponding to the 200 classes inside Tiny Imagenet.   

The folders are labeled by the Wordnet synset (e.g. `n01910747`) representing that class (e.g. ["Jellyfish", "cnidarian", "Coelenterate"]). Inside `train`, each class contains 500 sample images while inside each of `val` and `test`, each class contains 50 sample images.

Each image in Tiny Imagenet has a resolution of 64x64x3, which we artificially enlarged to 224x224x3 in order to preserve the architectural integrity of Alexnet (256x256x3 images from Imagenet had 224x224x3 patches extracted from the original image before being fed into the network).

#### Data Augmentation

In order to combat overfitting, Krizhevsky et al. made efforts to artificially enlarge Imagenet with label-preserving transformations, namely:

1. **Image Translations/Horizontal Reflections**: From each image, several random 224x224 patches were extracted, with a horizontal reflection being generated from each patch. This technique enlarged the training dataset by a factor of 2048 (significantly combatting the overfitting problem). At test time, 5 patches were taken (from the image's four corners and the centre of the image), with a reflection generated for each patch. The final prediction was the average of the network's predictions on each of these 10 samples.
   
2. **PCA Colour Augmentation**: PCA was performed on the set of RGB values throughout the training set. The found principal components were first scaled by a magnitude proportional to the corresponding eigenvalues times a random variable drawn from a Gaussian with mean zero and standard deviation 0.1, then added to the training images. The desired effect here was that object identity be invariant to intensity and colour of illumination.

#### Preprocessing

Each image was cropped to a size of 256x256 (in order to ensure fixed-size input data), then from each pixel channel (RGB), the mean channel value over the entire dataset was subtracted from that channel. For Tiny Imagenet, we found the means to be [142, 143, 145] for Red, Green, and Blue respectively.

### Utility Functions

#### Return a one-hot encoding (of size 200) for each class

In [11]:
def one_hot(index, num_classes=200):
    '''
    Build the one-hot encoding of a class index.

    Args:
        index: integer value in range 0 to num_classes - 1
        num_classes: length of the returned vector (defaults to 200)

    Return:
        vec: zero vector of size num_classes, with a 1 at the index position

    Raises:
        IndexError: if index lies outside 0 to num_classes - 1
    '''
    # A negative index would otherwise wrap around silently and mark the
    # wrong class, so reject anything outside the valid range up front.
    if not 0 <= index < num_classes:
        raise IndexError(
            "class index %d is out of range for %d classes" % (index, num_classes))

    vec = np.zeros(num_classes)
    vec[index] = 1.0
    return vec

#### Extract all Wordnet IDs from Tiny Imagenet

In [12]:
def get_wnids(folder_path):
    '''
    List the Wordnet IDs (class folder names) found inside a folder.

    Args:
        folder_path: path to a split folder, e.g. "tiny-imagenet-200/train"

    Return:
        wnids: alphabetically sorted list of the sub-directory names in
            folder_path
    '''
    # os.listdir returns entries in arbitrary order, and a wnid's position in
    # this list is what read_batch uses as its class index, so sort to keep the
    # wnid -> index mapping stable across runs and machines. Plain files
    # (e.g. ".DS_Store") are not class folders and are left out.
    wnids = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )
    return wnids

#### Rescale (using nearest neighbour alg) to size 224x224 and subtract channel means

In [13]:
def preprocess(image):
    '''
    Resize an image to 224x224 and centre its colour channels.

    Args:
        image: Image object

    Returns:
        an int32 numpy array of the resized 224x224 image with the R, G, and B
        means (142, 143, 145) subtracted from the respective channels
    '''
    channel_means = np.array([142, 143, 145], np.int32)

    # The resampling filter is whatever image.resize applies by default.
    resized = np.array(image.resize((224, 224)), np.int32)

    # Broadcasting subtracts one mean per channel along the last axis; only
    # the first three channels are touched.
    resized[:, :, :3] -= channel_means
    return resized

#### Randomly select an image from a given class folder and preprocess

In [14]:
def read_image(images_folder):
    '''
    Pick a random 64x64 image from a folder and preprocess it.

    Args:
        images_folder: path to a folder of image files

    Returns:
        image: the output of preprocess for one randomly selected image

    Raises:
        ValueError: if the folder contains no 64x64 image
    '''
    # List the folder once and visit it in random order. The first file that
    # qualifies is a uniform random pick among the valid images, and the search
    # terminates even when no file qualifies (the old retry loop never did).
    candidates = os.listdir(images_folder)
    random.shuffle(candidates)

    for file_name in candidates:
        image_path = os.path.join(images_folder, file_name)
        # The context manager releases the file handle once convert() has
        # copied the pixel data into a new image.
        with Image.open(image_path) as raw:
            img = raw.convert('RGB')
        # After the RGB conversion every image has 3 channels (grayscale files
        # included), so only the spatial size can disqualify an image.
        if img.size == (64, 64):
            # load and normalize image
            return preprocess(img)

    raise ValueError("no 64x64 image found in %r" % (images_folder,))

#### Randomly select a class folder, extract/preprocess image, get one-hot encoded class, return batch size of choice

In [15]:
def read_batch(batch_size, images_source, wnid_labels):
    '''
    Assemble a random batch of preprocessed images and one-hot labels.

    Args:
        batch_size: desired batch_size
        images_source: split folder, e.g. "tiny-imagenet-200/train"
        wnid_labels: list of wnids; a wnid's position in the list is its
            class index (only the first 200 entries are used)

    Returns:
        batch_images: numpy array stacking batch_size preprocessed images
        batch_labels: numpy array stacking the matching one-hot labels

    Raises:
        ValueError: if batch_size > 0 and no class has an "images" folder
            under images_source
    '''
    # one_hot yields 200-dimensional labels, so class indices must stay below
    # 200. Resolve the usable class folders once; the previous per-sample
    # "wait until the folder exists" loop never picked a new class and so spun
    # forever on a missing folder.
    candidates = []
    for class_index, wnid in enumerate(wnid_labels[:200]):
        folder = os.path.join(images_source, wnid, "images")
        if os.path.isdir(folder):
            candidates.append((class_index, folder))

    if batch_size > 0 and not candidates:
        raise ValueError("no class image folders found under %r" % (images_source,))

    batch_images = []
    batch_labels = []
    for _ in range(batch_size):
        # random class choice among the classes that actually have images
        class_index, folder = random.choice(candidates)
        batch_images.append(read_image(folder))
        batch_labels.append(one_hot(class_index))

    # Stack along a new leading batch axis (the old np.vstack results were
    # discarded, and vstack would have merged the batch and height axes).
    return np.array(batch_images), np.array(batch_labels)

In [16]:
# Class labels for the training split. read_batch indexes this list with the
# class index, so a wnid's position here defines its one-hot label.
wnids = get_wnids("tiny-imagenet-200/train")

In [17]:
# read_batch(1, "tiny-imagenet-200/train", wnids)

## Alexnet

#### ReLU Activation Functions

Krizhevsky et al. replaced the standard neuron activation $f(x) = \tanh(x)$ with the non-saturating (i.e. $\lim_{x \rightarrow \infty} f(x) = \infty$) Rectified Linear Unit (ReLU) function. The work showed that deep CNNs with ReLU activations trained several times faster than their counterparts with tanh units.

#### Local Response Normalization

After kernel $i$ is applied at position $(x, y)$ and passed through the ReLU, the neuron activity is normalized by the sum over $n$ adjacent kernel maps at the same spatial position. Essentially "brightness normalization", where peaks are enlarged and flat regions dampened. Reduced top-1 and top-5 error rates by 1.4% and 1.2% respectively. 

#### Overlapping Pooling

Whereas traditional pooling units were placed adjacently, without overlap, the Alexnet pooling units were set to overlap by a 1-pixel margin. Reduced top-1 and top-5 error rates by 0.4% and 0.3% respectively (compared to the non-overlapping scheme).

#### Dropout

For each neuron in the layer, we set a probability of 0.5 of setting the output of that neuron to zero. The neuron activity will not contribute to the forward pass or backpropagation. Reduces complex co-adaptations of neurons, forcing the learning of more robust image features. Roughly doubles the number of iterations required to converge.

In [18]:
# Session used for the rest of the notebook; the later .eval() / .run() calls
# are made without passing a session explicitly.
sess = tf.InteractiveSession()

### Wrapper Functions for TF Operations

In [19]:
def weight(shape, name, stddev=0.1):
    '''
    Create a weight Variable initialised from a truncated normal distribution.

    Args:
        shape: shape of the weight tensor
        name: name given to the Variable
        stddev: standard deviation of the truncated normal initialiser
            (defaults to 0.1, the value previously hard-coded)

    Returns:
        w: the tf.Variable holding the weights
    '''
    init = tf.truncated_normal(shape, stddev=stddev)
    w = tf.Variable(init, name=name)
    return w

In [20]:
def bias(val, shape, name):
    '''
    Create a bias Variable called `name` whose entries all start at `val`.

    Args:
        val: constant used to fill the initial tensor
        shape: shape of the bias tensor
        name: name given to the Variable
    '''
    return tf.Variable(tf.constant(val, shape=shape), name=name)

In [21]:
def conv(x, w, stride, padding="SAME"):
    '''2-D convolution of x with kernel w, using `stride` along both spatial axes.'''
    # Batch and channel strides stay at 1; only height and width are strided.
    strides = [1, stride, stride, 1]
    return tf.nn.conv2d(x, w, strides=strides, padding=padding)

In [22]:
def max_pool(x, kernel, stride, padding="VALID"):
    '''Max-pool x with a square `kernel` window moved by `stride` along both spatial axes.'''
    # Pooling acts on the two spatial axes only; batch and channel stay at 1.
    window = [1, kernel, kernel, 1]
    step = [1, stride, stride, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding=padding)

In [23]:
def lrn(x, radius=2, bias=1.0, alpha=2e-05, beta=0.75):
    '''
    Apply local response normalization to x.

    Args:
        x: input tensor
        radius: passed on as depth_radius
        bias, alpha, beta: passed through unchanged to the TF op
    '''
    return tf.nn.local_response_normalization(
        x,
        depth_radius=radius,
        bias=bias,
        alpha=alpha,
        beta=beta,
    )

In [24]:
def relu(x):
    '''Pass x through the rectified linear unit activation.'''
    activated = tf.nn.relu(x)
    return activated

In [25]:
def softmax(x):
    '''Pass x through the softmax function.'''
    probabilities = tf.nn.softmax(x)
    return probabilities

### Building the Model

In [26]:
# --- Optimiser settings ---
learning_rate = 0.001
momentum = 0.9
# Weight-decay coefficient. TODO: weight decay still has to be added; the
# loss built in the cells shown below does not use lmbda.
lmbda = 5e-04

# --- Regularisation ---
dropout = 0.5

# --- Training-loop settings ---
epochs = 1
batch_size = 128
display_step = 10

In [27]:
# Input images: a batch of 224x224 arrays with 3 channels (the size produced
# by preprocess).
x = tf.placeholder(tf.float32, [None, 224, 224, 3], name="x")
# Target labels: one 200-dimensional vector per sample (see one_hot).
y = tf.placeholder(tf.float32, [None, 200], name="y")
# lr = tf.placeholder(tf.float32)
# Keep probability handed to tf.nn.dropout in the fully-connected layers;
# fed separately for training and evaluation runs.
keep_prob = tf.placeholder(tf.float32, name="keep_prob")

#### Layer 1: Convolution $ \rightarrow $ Max-Pooling $ \rightarrow $ Local Response Normalization

In [28]:
# 96 kernels of size 11x11 over the 3 input channels.
cnn_w1 = weight([11, 11, 3, 96], name="cnn_w1")
cnn_b1 = bias(0.0, [96], name="cnn_b1")
# Stride 4 with SAME padding: 224x224x3 -> 56x56x96.
conv1 = tf.add(conv(x, cnn_w1, stride=4, padding="SAME"), cnn_b1)
conv1 = relu(conv1)
# Overlapping 3x3 / stride-2 pooling (VALID padding): 56x56 -> 27x27.
pool1 = max_pool(conv1, kernel=3, stride=2)
norm1 = lrn(pool1)

#### Layer 2: Convolution $ \rightarrow $ Max-Pooling $ \rightarrow $ Local Response Normalization

In [29]:
# 256 kernels of size 5x5 over the 96 channels produced by layer 1.
cnn_w2 = weight([5, 5, 96, 256], name="cnn_w2")
cnn_b2 = bias(0.0, [256], name="cnn_b2")
# Stride 1 with SAME padding keeps the 27x27 spatial size.
conv2 = tf.add(conv(norm1, cnn_w2, stride=1, padding="SAME"), cnn_b2)
conv2 = relu(conv2)
# Overlapping 3x3 / stride-2 pooling (VALID padding): 27x27 -> 13x13.
pool2 = max_pool(conv2, kernel=3, stride=2)
norm2 = lrn(pool2)

#### Layer 3: Convolution

In [30]:
# 384 kernels of size 3x3 over the 256 channels produced by layer 2.
cnn_w3 = weight([3, 3, 256, 384], name="cnn_w3")
cnn_b3 = bias(0.0, [384], name="cnn_b3")
conv3 = tf.add(conv(norm2, cnn_w3, stride=1, padding="SAME"), cnn_b3)
# Apply the ReLU non-linearity as in layers 1 and 2; without it this
# convolution and the next one would compose into a single linear map.
conv3 = relu(conv3)

#### Layer 4: Convolution

In [31]:
# 384 kernels of size 3x3 over the 384 channels produced by layer 3.
cnn_w4 = weight([3, 3, 384, 384], name="cnn_w4")
cnn_b4 = bias(0.0, [384], name="cnn_b4")
conv4 = tf.add(conv(conv3, cnn_w4, stride=1, padding="SAME"), cnn_b4)
# Apply the ReLU non-linearity as in layers 1 and 2; without it this
# convolution and the next one would compose into a single linear map.
conv4 = relu(conv4)

#### Layer 5: Convolution $\rightarrow$ Max-Pooling

In [32]:
# 256 kernels of size 3x3 over the 384 channels produced by layer 4.
cnn_w5 = weight([3, 3, 384, 256], name="cnn_w5")
cnn_b5 = bias(0.0, [256], name="cnn_b5")
conv5 = tf.add(conv(conv4, cnn_w5, stride=1, padding="SAME"), cnn_b5)
# Apply the ReLU non-linearity before pooling, as in layers 1 and 2.
conv5 = relu(conv5)
# Overlapping 3x3 / stride-2 pooling (VALID padding).
pool5 = max_pool(conv5, kernel=3, stride=2)

#### Reshaping

In [33]:
# Static shape of the last pooling output: [batch, height, width, channels].
dim = pool5.get_shape().as_list()
# Number of features per sample once the spatial and channel axes are merged.
flattened_dim = dim[1] * dim[2] * dim[3]  
# -1 lets the batch dimension be inferred at run time.
flattened = tf.reshape(pool5, [-1, flattened_dim])

#### Layer 6: Fully-Connected with Dropout

In [34]:
# Dense layer mapping the flattened conv features to 4096 units.
fc_w1 = weight([flattened_dim, 4096], name="fc_w1")
fc_b1 = bias(0.0, [4096], name="fc_b1")
fc1 = tf.add(tf.matmul(flattened, fc_w1), fc_b1)
fc1 = relu(fc1)
# Dropout controlled by the keep_prob placeholder (fed per run).
fc1 = tf.nn.dropout(fc1, keep_prob)

#### Layer 7: Fully-Connected with Dropout

In [35]:
# Second dense layer: 4096 -> 4096 units.
fc_w2 = weight([4096, 4096], name="fc_w2")
fc_b2 = bias(0.0, [4096], name="fc_b2")
fc2 = tf.add(tf.matmul(fc1, fc_w2), fc_b2)
fc2 = relu(fc2)
# Dropout controlled by the keep_prob placeholder (fed per run).
fc2 = tf.nn.dropout(fc2, keep_prob)

#### Layer 8: Fully-Connected

In [36]:
# Output layer: 4096 -> 200 class scores.
fc_w3 = weight([4096, 200], name="fc_w3")
fc_b3 = bias(0.0, [200], name="fc_b3")
# pred holds raw logits; no softmax is applied here because the loss below
# takes logits directly.
pred = tf.add(tf.matmul(fc2, fc_w3), fc_b3)

### Training and Evaluation

In [38]:
# Loss: softmax cross-entropy between the logits `pred` and the labels `y`,
# averaged over the batch. NOTE: the weight-decay coefficient lmbda is not
# part of this loss.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred, name="cross-entropy"))
# One training step of momentum SGD on the loss above.
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, name="optimizer").minimize(cross_entropy)
# A sample counts as correct when the arg-max of its logits matches the
# arg-max of its label vector; accuracy is the mean over the batch.
correct_prediction = tf.equal(tf.argmax(pred,1), tf.argmax(y,1), name="correct_prediction")
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

# Initialise every Variable created above.
sess.run(tf.global_variables_initializer())

In [None]:
# tf.reset_default_graph()
# Smoke-test run: a single optimisation step on a single-image batch.
for i in range(1):
    batch = read_batch(1, "tiny-imagenet-200/train", wnids)
    # Report accuracy every display_step steps.
    if i % display_step == 0:
        # keep_prob 1.0 switches dropout off while measuring accuracy.
        train_accuracy = accuracy.eval(feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
    # The `dropout` hyperparameter (0.5) is fed as the keep probability.
    optimizer.run(feed_dict={x: batch[0], y: batch[1], keep_prob: dropout})

# NOTE(review): this "test" batch is drawn from the train folder, so the figure
# printed below is not a held-out estimate. Point read_batch at a labelled
# validation split once one is available in the per-class folder layout.
test_batch = read_batch(5, "tiny-imagenet-200/train", wnids)
print("test accuracy %g"%accuracy.eval(feed_dict={
    x: test_batch[0], y: test_batch[1], keep_prob: 1.0}))