# ImageNet Classification with Deep Convolutional Neural Networks
### Advances in Neural Information Processing Systems 25 (NIPS 2012)
#### https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks


__Alex Krizhevsky__
University of Toronto
kriz@cs.utoronto.ca

__Ilya Sutskever__
University of Toronto
ilya@cs.utoronto.ca

__Geoffrey E. Hinton__
University of Toronto
hinton@cs.utoronto.ca

In [2]:
import tensorflow as tf

## Dataset

TODO: ILSVRC 2012 dataset

## Architecture



In [3]:
def _conv_relu(x, kernel_shape, stride, name, mu, sigma):
    """Apply a SAME-padded 2-D convolution, add a per-channel bias, then ReLU.

    Args:
        x: 4-D input tensor in NHWC layout.
        kernel_shape: (height, width, in_channels, out_channels) of the filter.
        stride: Spatial stride used for both height and width.
        name: Name given to the convolution op.
        mu: Mean of the truncated-normal weight initializer.
        sigma: Standard deviation of the truncated-normal weight initializer.

    Returns:
        The activated feature map.
    """
    weights = tf.Variable(
        tf.truncated_normal(shape=kernel_shape, mean=mu, stddev=sigma))
    # One bias per output channel, so it always matches the kernel's last dim.
    biases = tf.Variable(tf.zeros(kernel_shape[-1]))
    x = tf.nn.conv2d(x, weights, strides=[1, stride, stride, 1],
                     padding="SAME", name=name)
    x = tf.nn.bias_add(x, biases)
    return tf.nn.relu(x)


def _local_response_norm(x):
    """Apply local response normalization with k=2, alpha=1e-4, beta=0.75.

    depth_radius=2 makes the window span 2 * 2 + 1 = 5 adjacent channels.
    """
    return tf.nn.local_response_normalization(
        x, depth_radius=2, bias=2.0, alpha=1e-4, beta=0.75)


def _overlapping_max_pool(x):
    """Apply 3x3 max pooling with stride 2 (window larger than the stride)."""
    return tf.nn.max_pool(x, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                          padding="VALID")


def _dense(x, in_dim, out_dim, mu, sigma):
    """Apply an affine layer ``x @ W + b`` with freshly created variables."""
    weights = tf.Variable(
        tf.truncated_normal(shape=(in_dim, out_dim), mean=mu, stddev=sigma))
    biases = tf.Variable(tf.zeros(out_dim))
    return tf.matmul(x, weights) + biases


def alexnet_v1(inputs=None,
               num_classes=1000,
               is_training=True,
               keep_prob=0.5,
               scope='alexnet_v1',
               global_pool=False):
    """Build a single-tower AlexNet graph and return the class logits.

    The network is five convolutional layers (the first two followed by local
    response normalization and overlapping max pooling, the fifth by max
    pooling only) and three fully connected layers. Each layer consumes the
    output of the previous one. Because this is a single tower, every kernel's
    input-channel count equals the full channel count of the layer feeding it.

    Args:
        inputs: 4-D image tensor in NHWC layout with 3 channels. The spatial
            size must be statically known so the first fully connected layer
            can be sized; the shape comments below assume 224x224x3.
        num_classes: Number of output logits.
        is_training: Python bool. Dropout is applied only when this is true.
        keep_prob: Probability of keeping a unit in the dropout layers.
        scope: Name scope under which the ops and variables are created.
        global_pool: Accepted for interface compatibility; currently unused.

    Returns:
        A 2-D tensor of unnormalized logits with ``num_classes`` columns.

    Raises:
        ValueError: If ``inputs`` is None, or if the flattened feature size
            cannot be determined from the static shape.
    """
    if inputs is None:
        raise ValueError("alexnet_v1 requires an `inputs` image tensor")

    mu = 0
    sigma = 0.01

    with tf.name_scope(scope):
        # Layer 1: Convolutional. 224x224x3 -> 56x56x96 -> pool -> 27x27x96
        net = _conv_relu(inputs, (11, 11, 3, 96), 4, "conv1", mu, sigma)
        net = _local_response_norm(net)
        net = _overlapping_max_pool(net)

        # Layer 2: Convolutional. 27x27x96 -> 27x27x256 -> pool -> 13x13x256
        net = _conv_relu(net, (5, 5, 96, 256), 1, "conv2", mu, sigma)
        net = _local_response_norm(net)
        net = _overlapping_max_pool(net)

        # Layer 3: Convolutional. 13x13x256 -> 13x13x384
        net = _conv_relu(net, (3, 3, 256, 384), 1, "conv3", mu, sigma)

        # Layer 4: Convolutional. 13x13x384 -> 13x13x384
        net = _conv_relu(net, (3, 3, 384, 384), 1, "conv4", mu, sigma)

        # Layer 5: Convolutional. 13x13x384 -> 13x13x256 -> pool -> 6x6x256
        net = _conv_relu(net, (3, 3, 384, 256), 1, "conv5", mu, sigma)
        net = _overlapping_max_pool(net)

        # Flatten. 6x6x256 -> 9216 for a 224x224 input.
        net = tf.contrib.layers.flatten(net)
        # Size the first dense layer from the actual static shape instead of
        # hard-coding it, so other fixed input resolutions also work.
        flat_dim = net.get_shape().as_list()[-1]
        if flat_dim is None:
            raise ValueError(
                "alexnet_v1 needs statically known spatial dimensions on "
                "`inputs` to size the first fully connected layer")

        # Layer 6: Fully Connected. flat_dim -> 4096
        net = tf.nn.relu(_dense(net, flat_dim, 4096, mu, sigma))
        if is_training:
            net = tf.nn.dropout(net, keep_prob)

        # Layer 7: Fully Connected. 4096 -> 4096
        net = tf.nn.relu(_dense(net, 4096, 4096, mu, sigma))
        if is_training:
            net = tf.nn.dropout(net, keep_prob)

        # Layer 8: Fully Connected. 4096 -> num_classes.
        # No activation or dropout here: the raw logits are returned.
        logits = _dense(net, 4096, num_classes, mu, sigma)

    return logits

## Image Preprocessing

TODO
The input images vary in size, while the network takes a fixed 224x224x3 input. Five random crops of each image, along with a horizontally flipped copy, are taken during training.