Matthew Twete

Experiments with training a LeNet-architecture CNN on the CIFAR-10 dataset with various activations and hyperparameters, including visualizing some feature maps of the best-performing LeNet model. Additionally, some hand-crafted CNN models were trained on the CIFAR-10 dataset. The training progress of the models was plotted as well.

Some code taken from or inspired by https://machinelearningmastery.com/how-to-develop-a-cnn-from-scratch-for-cifar-10-photo-classification/ and other sources.

In [None]:
#Import libraries
import tensorflow.keras
import copy
from keras import callbacks
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, AveragePooling2D
from keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from matplotlib import pyplot as plt
from keras.models import Model

#Import the dataset (train and test splits of images and integer class labels)
(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.cifar10.load_data()
#Convert labels to one hot encoding. The number of classes is passed explicitly
#so both splits always get 10 columns, rather than letting to_categorical infer
#the width from the largest label that happens to be present in each array
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

In [None]:
#Function to rescale test and train data by 1/255
def normalize(test, train):
  """Return float32 copies of both arrays with every value divided by 255.

  The arguments are taken, and the results returned, in (test, train) order.
  The inputs themselves are not modified.
  """
  norm_test, norm_train = (data.astype('float32') / 255.0 for data in (test, train))
  return norm_test, norm_train

#Normalize data; normalize takes and returns the arrays in (test, train) order
x_test, x_train = normalize(test=x_test, train=x_train)

In [None]:
#Define class to make a CNN with the LeNet structure
class LeNetCNN():
    """Wrapper around a Keras Sequential model with a LeNet style layout.

    Layers, in order: Conv2D(6 filters, 5x5) -> 2x2 average pooling ->
    Conv2D(16 filters, 5x5) -> 2x2 average pooling -> Flatten -> Dense(120) ->
    Dense(84) -> Dense(num_classes, softmax).

    The model is built but NOT compiled here; it is exposed as the `network`
    attribute so the caller can compile it with the optimizer and loss it wants.
    """

    #Constructor. actFunc is the activation used by every layer except the
    #softmax output layer. input_shape and num_classes default to the values
    #this file uses for cifar 10, so LeNetCNN(actFunc) behaves as before
    def __init__(self, actFunc, input_shape=(32,32,3), num_classes=10):
        #Build the network
        self.network = Sequential()
        self.network.add(Conv2D(filters=6, kernel_size=5, strides=1, activation=actFunc, input_shape=input_shape))
        self.network.add(AveragePooling2D(pool_size=(2, 2), strides=2))
        self.network.add(Conv2D(filters=16, kernel_size=5, strides=1, activation=actFunc))
        self.network.add(AveragePooling2D(pool_size=(2, 2), strides=2))
        self.network.add(Flatten())
        self.network.add(Dense(units=120, activation=actFunc))
        self.network.add(Dense(units=84, activation=actFunc))
        self.network.add(Dense(units=num_classes, activation='softmax'))

    #Simple function to print the network summary
    def get_summary(self):
        #summary() prints the table itself and returns None, so wrapping it in
        #print() would add a stray "None" line after the summary
        self.network.summary()

    #Wrapper function to train the network, takes the training data and labels, epochs to train for,
    #batch size, and a list of callbacks (needed to get test loss after each epoch).
    #Returns whatever network.fit returns (its .history['loss'] is read elsewhere in this file)
    def fit(self, xdata=None, ydata=None, epoch=1, b_size=64, callback=None):
        return self.network.fit(x=xdata, y=ydata, epochs=epoch, verbose=2, batch_size=b_size, callbacks=callback)






In [None]:
#Code based on code from https://github.com/keras-team/keras/issues/2548
#Class that overrides a function that will run after each training epoch,
#this is needed to get the test loss after each epoch
class TestEvalCallback(callbacks.Callback):
    """Keras callback that evaluates the model on held-out data after each epoch.

    The result of every evaluation is appended to the list supplied by the
    caller, so the caller can read the per-epoch test losses from that same
    list once training has finished.
    """

    #Constructor, takes the test data and labels, a list to hold the test loss after each
    #epoch and the batch size used for evaluation
    def __init__(self, xtest, ytest, test_loss_list, b_size):
        #Let the Keras base class set up its own state before adding ours
        super().__init__()
        self.x_test = xtest
        self.y_test = ytest
        #Kept by reference (not copied): appends below are visible to the caller
        self.test_loss = test_loss_list
        self.batch_size = b_size

    #Function that will run after each epoch, it will evaluate the model on the test data, store the
    #test loss and print it as well. logs is accepted for the Keras callback signature but is unused;
    #it defaults to None rather than a shared mutable dict
    def on_epoch_end(self, epoch, logs=None):
        #self.model is the model being trained, provided through the Callback base class.
        #NOTE(review): evaluate is expected to return a single loss value here because every
        #model in this file is compiled without metrics - confirm if metrics are ever added
        loss = self.model.evaluate(self.x_test, self.y_test, batch_size=self.batch_size, verbose=0)
        self.test_loss.append(loss)
        print("Loss: ", loss)

In [None]:
#Activation functions
sigmoid = "sigmoid"
tanh = "tanh"

#Learning rates
lr = [0.1,0.01,0.001]

#Arrays to hold the networks for each combination of activation function
#and loss function; each ends up with one network per learning rate,
#in the same order as lr
tanhCE_array = []
tanhMSE_array = []
sigCE_array = []
sigMSE_array = []

#For every learning rate build one network per activation/loss combination,
#compile it and file it under the matching array
for learn_rate in lr:
  #Instantiate networks
  tanhCE, tanhMSE = LeNetCNN(tanh), LeNetCNN(tanh)
  sigCE, sigMSE = LeNetCNN(sigmoid), LeNetCNN(sigmoid)
  #Compile them, each with its own Adam optimizer at the current learning rate
  for net, loss_name in (
    (tanhCE, "CategoricalCrossentropy"),
    (tanhMSE, "MeanSquaredError"),
    (sigCE, "CategoricalCrossentropy"),
    (sigMSE, "MeanSquaredError"),
  ):
    net.network.compile(optimizer=tensorflow.keras.optimizers.Adam(learning_rate=learn_rate), loss=loss_name)
  #Add to the arrays
  for bucket, net in (
    (tanhCE_array, tanhCE),
    (tanhMSE_array, tanhMSE),
    (sigCE_array, sigCE),
    (sigMSE_array, sigMSE),
  ):
    bucket.append(net)


In [None]:
#Variables to hold the number of epochs, batch size and number of learning rates
epochs = 15
batch_size = 32
#Derived from the learning rate list so the count cannot drift out of sync with it
num_learn_rate = len(lr)
#Arrays to hold losses for training and test data for each model: one
#[train_losses, test_losses] pair per learning rate
tanhCE_losses = [[[], []] for _ in range(num_learn_rate)]
tanhMSE_losses = [[[], []] for _ in range(num_learn_rate)]
sigCE_losses = [[[], []] for _ in range(num_learn_rate)]
sigMSE_losses = [[[], []] for _ in range(num_learn_rate)]

#Each group of networks paired with the container that receives its losses,
#listed in the order the groups are trained
training_runs = [
  (tanhCE_array, tanhCE_losses),
  (tanhMSE_array, tanhMSE_losses),
  (sigCE_array, sigCE_losses),
  (sigMSE_array, sigMSE_losses),
]

#Loop over the learning rates and networks with the two different activation functions and loss functions
for j in range(num_learn_rate):
  for networks, losses in training_runs:
    #Train the network with the callback to get the test loss at the end of each epoch;
    #the callback appends directly into losses[j][1]
    history = networks[j].fit(x_train, y_train, epochs, batch_size, callback=[TestEvalCallback(x_test, y_test, losses[j][1], batch_size)])
    #Store the training loss over the epochs (a shallow copy is enough for a flat list of numbers)
    losses[j][0] = list(history.history['loss'])



In [None]:
#Function to plot the train & test loss vs epoch for the networks of a given
#activation and loss function, one subplot per learning rate
def plot_multiple_network_errors(error_array, act_func, loss, learning_rates=None):
  """Plot train and test loss curves side by side, one panel per network.

  error_array: sequence of [train_losses, test_losses] pairs, one per network.
  act_func, loss: strings used only to build the overall figure title.
  learning_rates: learning rate for each entry of error_array, used in the
    panel titles. Defaults to the module-level `lr` list.
  """
  if learning_rates is None:
    learning_rates = lr
  #One column per entry instead of a fixed count, so the layout follows the data
  num_panels = len(error_array)
  #Set up figure for subplots
  plt.figure(figsize=(25, 10))
  plt.subplots_adjust(hspace=0.5)
  #Add overall title
  plt.suptitle(act_func + ' activation function using ' + loss + ' loss CNN models train and test loss vs epoch', fontsize=18, y=0.95)
  #Loop over the networks (one per learning rate)
  for i, (train_loss, test_loss) in enumerate(error_array):
    #Plot the subplot with a title listing the learning rate
    ax = plt.subplot(1, num_panels, i+1)
    ax.plot(train_loss)
    ax.plot(test_loss)
    ax.set_title('model trained with a learning rate of ' + str(learning_rates[i]))
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    #Show plain tick values rather than an offset notation on the loss axis
    ax.get_yaxis().get_major_formatter().set_useOffset(False)
    ax.legend(['train', 'test'], loc='lower right')
  #Show the figure explicitly, like the other plotting helpers in this file
  plt.show()

#Plot the graphs for networks with each of the activation and loss functions
for loss_curves, act_name, loss_name in (
  (tanhCE_losses, 'Tanh', 'Cross-Entropy'),
  (tanhMSE_losses, 'Tanh', 'Mean Squared Error'),
  (sigCE_losses, 'Sigmoid', 'Cross-Entropy'),
  (sigMSE_losses, 'Sigmoid', 'Mean Squared Error'),
):
  plot_multiple_network_errors(loss_curves, act_name, loss_name)

In [None]:
#Display feature maps, I will use the first 10 training data points as the 10 images
num_images = 10

#I choose the sigmoid CE network trained with a learning rate of 0.001
#(index 2 matches the third entry of the learning rate list)
lenet = sigCE_array[2].network

#Redefine a model to output after the second convolution layer
#(layers[0] is the first convolution, layers[1] the first pooling layer)
model = Model(inputs=lenet.input, outputs=lenet.layers[2].output)

#Get the feature maps for all the images with a single batched predict call,
#rather than copying each image and calling predict once per image in the loop
feature_maps = model.predict(x_train[:num_images])

#The second convolution layer has 16 filters, shown as a 4x4 grid
square = 4

#Now plot the feature maps of first 10 training data points
for i in range(num_images):
  #Print the image number
  print("Image #", i+1)
  #Plot the training data point in its original form for comparison
  print("Original Image")
  plt.imshow(x_train[i])
  plt.show()

  # plot all 16 maps in 4x4 squares
  print("Feature Maps")
  for ix in range(1, square*square + 1):
    #Specify subplot and turn off axis ticks
    ax = plt.subplot(square, square, ix)
    ax.set_xticks([])
    ax.set_yticks([])
    #Plot filter channel in grayscale (subplot positions are 1 based, channels 0 based)
    plt.imshow(feature_maps[i, :, :, ix-1], cmap='gray')
  #Show the figure
  plt.show()

In [None]:
#Define relu model: same layer sequence as the LeNetCNN class but with 3x3
#convolution kernels and ReLU activations
actFunc = 'relu'
network = Sequential([
  Conv2D(filters=6, kernel_size=3, strides=1, activation=actFunc, input_shape=(32,32,3)),
  AveragePooling2D(pool_size=(2, 2), strides=2),
  Conv2D(filters=16, kernel_size=3, strides=1, activation=actFunc),
  AveragePooling2D(pool_size=(2, 2), strides=2),
  Flatten(),
  Dense(units=120, activation=actFunc),
  Dense(units=84, activation=actFunc),
  Dense(units=10, activation='softmax'),
])
network.compile(optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001), loss="CategoricalCrossentropy")

#List that the callback fills with the test loss after each epoch
relu_test_error = []

#Train the network (batch_size is the value set in the earlier training cell)
epochs = 20
relu_test_eval = TestEvalCallback(x_test, y_test, relu_test_error, batch_size)
history = network.fit(x=x_train, y=y_train, epochs=epochs, verbose=2, batch_size=batch_size, callbacks=[relu_test_eval])




In [None]:
#Define plotting function for relu model
def plot_relu_network_errors(error_array, act_func, loss,lr):
  """Plot train and test loss vs epoch for the 3x3 kernel model on one figure.

  error_array: error_array[0] is plotted as the train curve and
    error_array[1] as the test curve.
  act_func, loss, lr: used only to build the figure title.
  """
  plt.figure(figsize=(15, 10))
  heading = (act_func + ' activation function using ' + loss
             + ' loss 3x3 kernel CNN model with a learning rate of ' + str(lr)
             + ' train and test loss vs epoch')
  plt.title(heading)
  #Draw the train curve first, then the test curve, to match the legend order
  for curve_index in (0, 1):
    plt.plot(error_array[curve_index])
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='lower right')
  plt.show()
  
#Plot the results: training loss from the fit history, test loss from the callback list
relu_curves = [history.history['loss'], relu_test_error]
plot_relu_network_errors(relu_curves, 'ReLU', 'Cross-Entropy', 0.001)

In [None]:
#Define a final model, I will use an ReLU activation, with cross-entropy loss and a learning rate of 0.001
actFunc = 'relu'
deep_network = Sequential()
#First convolution/pooling stage: 6 filters, also declares the input shape
deep_network.add(Conv2D(filters=6, kernel_size=3, strides=1, activation=actFunc, input_shape=(32,32,3), padding="same"))
#NOTE(review): every pooling layer here uses strides of 1 with "same" padding, so presumably the
#spatial size is never reduced before Flatten - confirm this is intended
deep_network.add(AveragePooling2D(pool_size=(2, 2), strides=1, padding="same"))
#Four more convolution/pooling stages with 16 filters each
for _ in range(4):
  deep_network.add(Conv2D(filters=16, kernel_size=3, strides=1, activation=actFunc, padding="same"))
  deep_network.add(AveragePooling2D(pool_size=(2, 2), strides=1, padding="same"))
#Classifier head
deep_network.add(Flatten())
for units in (120, 84):
  deep_network.add(Dense(units=units, activation=actFunc))
deep_network.add(Dense(units=10, activation='softmax'))
deep_network.compile(optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001), loss="CategoricalCrossentropy")

#List that the callback fills with the test loss after each epoch
deep_network_test_error = []

#Train the network (batch_size is the value set in the earlier training cell)
epochs = 20
deep_test_eval = TestEvalCallback(x_test, y_test, deep_network_test_error, batch_size)
history = deep_network.fit(x=x_train, y=y_train, epochs=epochs, verbose=2, batch_size=batch_size, callbacks=[deep_test_eval])

In [None]:
#Define plotting function for final model
def plot_deep_network_errors(error_array, act_func, loss,lr):
  """Plot train and test loss vs epoch for the 5 layer model on one figure.

  error_array: error_array[0] is plotted as the train curve and
    error_array[1] as the test curve.
  act_func, loss, lr: used only to build the figure title.
  """
  plt.figure(figsize=(15, 10))
  heading = (act_func + ' activation function using ' + loss
             + ' loss 3x3 kernel 5 layer CNN model with a learning rate of ' + str(lr)
             + ' train and test loss vs epoch')
  plt.title(heading)
  #Draw the train curve first, then the test curve, to match the legend order
  for curve_index in (0, 1):
    plt.plot(error_array[curve_index])
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='lower right')
  plt.show()
  
#Plot the results: training loss from the fit history, test loss from the callback list
deep_curves = [history.history['loss'], deep_network_test_error]
plot_deep_network_errors(deep_curves, 'ReLU', 'Cross-Entropy', 0.001)