### Marissa McKee
#### Introduction to Convolutional Neural Networks using TensorFlow
UNT ADTA 5550 Summer 2020

### MNIST Dataset
Relevant Information: 
- 70,000 images in the dataset total
- Training data (mnist.train) 55,000 images
- Validation data (mnist.validation) 5,000 images – not used in this project
- Test data (mnist.test) 10,000 images 


The dataset consists of pairs – a handwritten digit image and its label
- Numbers range from 0 to 9, meaning 10 patterns in total
- Handwritten number image: a grayscale image of size 28 x 28 pixels
- Label: the actual digit that the handwritten image represents (0 – 9)

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

## 1. Import Data

In [2]:
# Import MNIST dataset from TensorFlow library
# The files are read from (or extracted into) the local "MNIST_data/" folder
# one_hot=True: each label is returned as a one-hot vector of 10 values
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True) 

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [3]:
# Training record count: number of examples in mnist.train
mnist.train.num_examples

55000

In [4]:
# Testing record count: number of examples in mnist.test
mnist.test.num_examples

10000

## 2. Data Preparation

### Initialize Weights in Filter

In [5]:
def initialize_weights(filter_shape):
    """Create the tf.Variable that stores the weights of a filter.

    The starting values are random numbers drawn from a truncated normal
    distribution with a standard deviation of 0.1.

    Parameters:
    ---filter_shape: shape of the weight tensor to create

    Return: a tf.Variable holding the randomly initialized weights
    """
    random_start_values = tf.truncated_normal(shape=filter_shape, stddev=0.1)
    weights = tf.Variable(random_start_values)
    return weights

### Initialize Bias

In [6]:
def initialize_bias(bias_shape):
    """Create the tf.Variable that stores a bias.

    Every element of the bias starts at the constant value 0.1.

    Parameters:
    ---bias_shape: shape of the bias tensor to create

    Return: a tf.Variable holding the initialized bias
    """
    constant_start_values = tf.constant(value=0.1, shape=bias_shape)
    bias = tf.Variable(constant_start_values)
    return bias

### Convolutional Layer Computation: Dot Product (x*W)

In [7]:
def create_convolution_layer_and_compute_dot_product(inputs, filter_shape):
    """Set up a convolution layer (conv2d) and compute the dot product x*W.

    The filter weights are freshly initialized on every call. No bias is
    added and no activation is applied here.

    Parameters:
    ---inputs: [batch, H, W, channels]
    ---filter_shape: [filter H, filter W, in_channels, out_channels]
        ---in_channels = in_depth = in_num_filters
        ---out_channels = out_depth = out_num_filters
        ---for example: [5,5,1,32]

    Return: outputs of the layer: the dot product inputs * Weights (x*W)
    """
    # Stride of 1 in every dimension, with 'SAME' padding
    unit_strides = [1, 1, 1, 1]

    # Weights of the filter, randomly initialized
    kernel = initialize_weights(filter_shape)

    return tf.nn.conv2d(inputs, kernel, strides=unit_strides, padding='SAME')

### Create a ReLU Layer and Perform Computation: Dot Product + Bias (x.W+b)

In [8]:
def create_relu_layer_and_compute_dotproduct_plus_b(inputs, filter_shape):
    """Set up a ReLU (Rectified Linear Unit) activation layer for a convolution layer.

    The bias is initialized in this layer: one bias value per OUTPUT channel
    of the preceding convolution (filter_shape[3]). The layer first adds the
    bias to the incoming dot product, (x*W) + b, and then applies ReLU.

    Parameters:
    ---inputs: outputs from the preceding convolution layer: dot product inputs * weights
    ---filter_shape: [filter H, filter W, in_channels, out_channels]
        ---in_channels = in_depth = in_num_filters
        ---out_channels = out_depth = out_num_filters
        ---for example: [5,5,1,32]

    Return: outputs of the layer: ReLU applied to (x*W) + b
    """
    # One bias per output channel of the convolution
    out_channels = filter_shape[3]
    bias = initialize_bias([out_channels])

    # Add the bias first, then apply the activation function
    pre_activation = inputs + bias
    return tf.nn.relu(pre_activation)

### Create a Pooling Layer and Reduce Spatial Size

In [9]:
def create_maxpool2by2_and_reduce_spatial_size(inputs):
    """Set up a max pooling layer that reduces the spatial size of its inputs.

    Pooling method: Max pooling
    Kernel size: 2x2
    Stride: 2
    Padding: 'SAME'

    Parameters:
    ---inputs: outputs of the preceding layer

    Return: outputs of the pooling layer
    """
    # The same [1,2,2,1] shape is used for the pooling window and the stride
    two_by_two = [1, 2, 2, 1]
    return tf.nn.max_pool(inputs, ksize=two_by_two, strides=two_by_two, padding='SAME')

### Create Fully Connected Layer and Perform Computation: (inputs*Weights)+Bias

In [10]:
def create_fully_connected_layer_and_compute_dotproduct_plus_bias(inputs,output_size):
    """Set up a fully connected (FC) layer and compute (inputs * Weights) + bias.

    No activation function is applied here.

    Parameters:
    ---inputs: outputs of the preceding layer or operation (for example reshaping);
        the second dimension is read as the number of input channels
    ---output_size: the size of the outputs (number of output channels)

    Return: outputs of the FC layer: (x*W) + b
    """
    # Number of input channels, read from the second dimension of the inputs
    in_channels = int(inputs.get_shape()[1])

    # Weights with shape [in_channels, out_channels], then one bias per output channel
    weights = initialize_weights([in_channels, output_size])
    bias = initialize_bias([output_size])

    # Dot product inputs * W, then add the bias
    dot_product = tf.matmul(inputs, weights)
    return dot_product + bias

## 3. Phase 1: Build the Convolutional Neural Network

### CNN: Build and Train
#### PHASE 1 - Build the convolutional neural network
- Step 1: Create the first convolution layer (CONV) that uses ReLU as the activation function  
- Step 2: Create the first pooling layer (POOL) that works hand in hand with the convolution layer. Repeat steps 1 and 2 until all the convolutional and pooling layers have been created
- Step 3: Create a reshape/flattening layer to reshape output data from the last pooling layer
- Step 4: Create a fully connected FC layer that accepts outputs from the reshape layer as its inputs. Repeat step 4 to create other fully connected layers if necessary
- Step 5: Create a dropout layer to drop some outputs from the last fully connected layer
- Step 6: Create the final output layer, a FC layer, that accepts the outputs from the dropout layer as its inputs and produces the final outputs
- Step 7: Use the loss function (softmax cross entropy loss) to compute the cross entropy loss (the gap between the predicted outputs and the labels).
    - In other words, measure the probability error in discrete classification tasks in which the classes are mutually exclusive - each entry is exactly one class
- Step 8: Create an optimizer to optimize the model by minimizing the cross entropy loss
- Step 9: Create an AI deep learning trainer to train the neural network

#### PHASE 2 - Train and test the convolutional neural network 
- Step 1: Train the network using the AI deep learning trainer created above and test it

### Create Placeholders for Inputs and Labels: x and y_true

In [11]:
# x PLACEHOLDER

# Placeholder for the input images: x
# x: 2D array placeholder; None means it can hold any number of rows/records
# ---Each row is a vector (1D array) holding the data for one image
# ---Each row/image has 784 values (28 x 28): 1 pixel = 1 value

x = tf.placeholder(dtype=tf.float32, shape=[None, 784])

In [12]:
# y_true PLACEHOLDER

# Placeholder for the labels: y_true
# y_true: 2D array placeholder; None means it can hold any number of rows/records
# ---Each row is a vector (1D array) of 10 values that indicates a digit between 0 and 9
# ---Each row: the label is stored in the one-hot format

y_true = tf.placeholder(dtype=tf.float32, shape=[None, 10])

### Reshape the Input Placeholder x

In [13]:
# Prepare to feed the inputs into the first conv layer
# Reshape the placeholder x from one flat vector (784 values) per image
# to the 4D layout [batch, H, W, depth channels]
# Depth: gray scale = 1 color channel
# Reshaped inputs: x_image [batch,28,28,1]; the -1 leaves the batch size open

x_image = tf.reshape(x, shape=[-1, 28, 28, 1])

### Create First Convolutional Layer and Perform Computation: x*W+b

### Convolutional Layer 1
- Convolution 4D shape: [batch, H, W, depth] = [batch,28,28,32]
    - batch: subset of data
    - H: height of input
    - W: width of input
    - depth: indicates the number of feature maps that we can get from a convolution layer
- 2D data size: 28x28
- Input shape: 28x28x1 Depth = 1 in_channel
    - color is grayscale for in_channel
- Output shape: 28x28x32 Depth = 32 out-channel
    - extracts 32 features
- Filter/Kernel/Window size: 5x5
- Filter shape: [5,5,1,32]
- Stride: 1
- Stride shape: [1,1,1,1]
- Padding: same
- Activation function layer: ReLU

In [14]:
# Create the first convolution layer
# Inputs: x_image: reshaped inputs with shape [batch,28,28,1]
# filter_shape: [5,5,1,32]
# ---Filter: 5x5
# ---Input channels: 1
# ---Output channels: 32

# Extract the features and get the results: dot product of inputs * weights
# Return the outputs
conv_layer_1_outputs = create_convolution_layer_and_compute_dot_product(
    inputs=x_image,
    filter_shape=[5, 5, 1, 32])

### Create First ReLu Layer and Perform Computation: x*W+b

### ReLU Layer 1
- No filter
- Input shape: 28x28x32
- Output shape: 28x28x32

In [15]:
# Create the ReLU layer of the first convolution layer
# Accept the outputs from the first convolution layer as the inputs
# Perform the computation at the layer: inputs + bias, then ReLU
# Return the outputs of the layer

conv_relu_layer_1_outputs = create_relu_layer_and_compute_dotproduct_plus_b(
    inputs=conv_layer_1_outputs,
    filter_shape=[5, 5, 1, 32])

### Create First Pooling Layer and Reduce Spatial Size

### Pooling Layer 1
- Pooling method: Max pooling
- Filter/Kernel/Window size = 2x2
- Filter/Kernel/Window shape: [1,2,2,1]
- Stride: 2
- Stride shape: [1,2,2,1]
- Padding: same
- Input channels: 32 inputs
- Input shape: 28x28x32
- Output channels: 32 outputs
- Output shape: 14x14x32 

In [16]:
# Create the first pooling layer
# It reduces the spatial size of the ReLU outputs of the first convolution layer
# Return the outputs of the layer

pooling_layer_1_outputs = create_maxpool2by2_and_reduce_spatial_size(
    inputs=conv_relu_layer_1_outputs)

### Reshape and Flatten Data Making it Ready to be Fed into First FC Layer

In [17]:
# Reshape and flatten the output of the first pooling layer
# Each image becomes one row of 14*14*32 values, ready to be fed
# into the first fully connected layer
pooling_layer_1_outputs_flat = tf.reshape(pooling_layer_1_outputs, shape=[-1, 14 * 14 * 32])

### Create First FC Layer, ReLU Layer, and Output Data to Dropout Layer

### Fully Connected Layer 1
- FC_1 shape: [inputs (in_channels), outputs (out_channels)] = [14x14x32, 1024]

In [18]:
# First create the FC layer
# Feed the already flattened output of the first pooling layer as the inputs into this layer
# Then perform the computation: dot product + bias (x*W)+b

# Parameters:
# ---pooling_layer_1_outputs_flat
# ---output_size: 1024 out channels

# Returns outputs of the computation (x*W)+b

fc_layer_1_outputs = create_fully_connected_layer_and_compute_dotproduct_plus_bias(
    inputs=pooling_layer_1_outputs_flat,
    output_size=1024)

# Create the ReLU layer of the first FC layer
# ReLU is the activation function applied to the FC outputs

# Return the outputs of the layer

fc_relu_layer_1_outputs = tf.nn.relu(fc_layer_1_outputs)

### Create Dropout Layer and Dropout a Fraction of Outputs Randomly

### Fully Connected Dropout Layer
- Dropout shape: inputs 1024 values, outputs 1024 values (the layer has no weights; dropped outputs are set to 0)

In [19]:
# Placeholder for the keep probability:
# the fraction of the output channels that will be kept
# Which nodes/channels are kept or dropped is selected randomly

hold_prob = tf.placeholder(dtype=tf.float32)

# Dropout: the dropped outputs are set to 0 so they will be ignored in the next layer

fc_dropout_outputs = tf.nn.dropout(
    fc_relu_layer_1_outputs,
    keep_prob=hold_prob)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### Create Final FC Layer, Compute (x.W+b), and Produce Final Outputs

### Fully Connected Final Output Layer
- FC_3 shape: [inputs (in_channels), outputs (out_channels)] = [1024,10]

In [20]:
# Create the final FC layer
# Compute x*W+b
# Parameters:
# ---fc_dropout_outputs: outputs from the dropout layer
# ---output_size: 10 out channels, one per digit 0 - 9

# Return y_pred: final predicted outputs

y_pred = create_fully_connected_layer_and_compute_dotproduct_plus_bias(
    inputs=fc_dropout_outputs,
    output_size=10)

### Define Loss Function and Calculate Softmax Cross Entropy loss

In [21]:
# Define the loss function: softmax cross entropy with logits
# y_pred supplies the logits, y_true supplies the one-hot labels

softmax_cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(
    logits=y_pred,
    labels=y_true)

# Compute the mean of the losses
cross_entropy_mean = tf.reduce_mean(softmax_cross_entropy_loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



### Create an Optimizer to Optimize CNN Model and Set Learning Rate

In [22]:
# Get an ADAM optimizer with a learning rate of 0.001
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

### Create a Trainer to Train CNN Model

In [23]:
# Create the CNN model trainer: the op that trains the model
# It optimizes the model by minimizing the mean softmax cross entropy loss

cnn_trainer = optimizer.minimize(loss=cross_entropy_mean)

## 4. Phase 2: Train and Test the CNN Model

### Create a Variable Initializer to Initialize All Variables

In [24]:
# Get a variable initializer: the op that initializes the global variables of the graph
# It is run once at the start of the training session

vars_initializer = tf.global_variables_initializer()

### Set the Steps

In [25]:
# x: mnist.train = 55,000 images
# Each time of training (run entire process) = 1 step
# Each step, use one batch of inputs
# batch size = 50 images
#     Total number of batches in the training data: 55,000 / 50 = 1,100 batches

steps = 3000

### Run tf.Session() to Train and Test the CNN Model

In [26]:
# Build the accuracy ops ONCE, before the session starts.
# Creating them inside the training loop would add a new copy of these
# nodes to the graph at every evaluation.

# Compare to find matches of y_pred and y_true (predicted class vs. true class)
matches = tf.equal(tf.argmax(y_pred,axis=1), tf.argmax(y_true,axis=1))

# Cast the boolean matches to tf.float32
# Calculate the accuracy using the mean of matches
acc = tf.reduce_mean(tf.cast(matches, tf.float32))

with tf.Session() as sess:
    # Initialize variables
    sess.run(vars_initializer)

    # Feed for testing: the whole test dataset
    # Dropout: NONE (keep everything) because this is a test, not training
    test_feed = {x: mnist.test.images,
                 y_true: mnist.test.labels,
                 hold_prob: 1.0}

    for i in range(steps):
        # batch = 50 images
        batch_x, batch_y = mnist.train.next_batch(50)

        # Train the model
        # Dropout: keep_prob 0.5, so 50% of the FC outputs will be dropped
        sess.run(cnn_trainer, feed_dict={x: batch_x,
                                         y_true: batch_y,
                                         hold_prob: 0.5})

        # Test the model: at each 100th step (steps 0, 100, 200, ...)
        if i % 100 == 0:
            print('ON STEP: {}'.format(i),'\nACCURACY: ')

            # Run the prebuilt accuracy op on the test dataset
            test_accuracy = sess.run(acc, feed_dict=test_feed)
            print(test_accuracy)
        

ON STEP: 0 
ACCURACY: 
0.244
ON STEP: 100 
ACCURACY: 
0.9034
ON STEP: 200 
ACCURACY: 
0.9102
ON STEP: 300 
ACCURACY: 
0.9384
ON STEP: 400 
ACCURACY: 
0.9548
ON STEP: 500 
ACCURACY: 
0.9625
ON STEP: 600 
ACCURACY: 
0.9684
ON STEP: 700 
ACCURACY: 
0.9694
ON STEP: 800 
ACCURACY: 
0.9711
ON STEP: 900 
ACCURACY: 
0.9741
ON STEP: 1000 
ACCURACY: 
0.978
ON STEP: 1100 
ACCURACY: 
0.9819
ON STEP: 1200 
ACCURACY: 
0.9817
ON STEP: 1300 
ACCURACY: 
0.9786
ON STEP: 1400 
ACCURACY: 
0.9799
ON STEP: 1500 
ACCURACY: 
0.982
ON STEP: 1600 
ACCURACY: 
0.9817
ON STEP: 1700 
ACCURACY: 
0.9828
ON STEP: 1800 
ACCURACY: 
0.985
ON STEP: 1900 
ACCURACY: 
0.9866
ON STEP: 2000 
ACCURACY: 
0.9863
ON STEP: 2100 
ACCURACY: 
0.9863
ON STEP: 2200 
ACCURACY: 
0.9812
ON STEP: 2300 
ACCURACY: 
0.9863
ON STEP: 2400 
ACCURACY: 
0.9827
ON STEP: 2500 
ACCURACY: 
0.9844
ON STEP: 2600 
ACCURACY: 
0.9838
ON STEP: 2700 
ACCURACY: 
0.9852
ON STEP: 2800 
ACCURACY: 
0.9862
ON STEP: 2900 
ACCURACY: 
0.9857
