In [None]:
    import numpy as np
    import matplotlib.pyplot as plt
    import keras
    from keras.datasets import mnist
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.optimizers import Adam
    from keras.utils.np_utils import to_categorical
    from keras.models import Model
    from keras.layers import Flatten
    from keras.layers.convolutional import Conv2D
    from keras.layers.convolutional import MaxPooling2D
    from keras.layers import Dropout
    import random

In [None]:
# Seed every RNG the notebook draws from: NumPy's global generator and the
# stdlib `random` module, which picks the sample images shown in the class grid.
np.random.seed(0)
random.seed(0)

In [None]:
# mnist.load_data() returns a (train, test) pair, each an (images, labels) tuple.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Show the array shapes of the training and test images, in that order.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Sanity-check both splits: one label per image, and every image is 28 x 28.
for images, labels in ((X_train, y_train), (X_test, y_test)):
    assert images.shape[0] == labels.shape[0], "The number of images is not equal to the number of labels."
    assert images.shape[1:] == (28, 28), "The dimensions of the images are not 28 x 28."

In [None]:
cols = 5
num_classes = 10

# Slice the training set once per class (not once per grid cell) and take the
# per-class counts from those slices, so the counts no longer depend on which
# column of the grid happens to be drawn.
samples_by_class = [X_train[y_train == j] for j in range(num_classes)]
num_of_samples = [len(x_selected) for x_selected in samples_by_class]

# squeeze=False keeps `axs` two-dimensional even when cols == 1.
fig, axs = plt.subplots(nrows=num_classes, ncols=cols, figsize=(5, 10), squeeze=False)
fig.tight_layout()

title_col = cols // 2  # the middle column carries the class label (2 when cols == 5)
for i in range(cols):
    for j, x_selected in enumerate(samples_by_class):
        # Show one randomly chosen training image of class j.
        axs[j][i].imshow(x_selected[random.randint(0, len(x_selected) - 1), :, :], cmap=plt.get_cmap('gray'))
        axs[j][i].axis("off")
        if i == title_col:
            axs[j][i].set_title(str(j))

In [None]:
# Bar chart of how many training images each class has.
print(num_of_samples)
class_ids = range(num_classes)
plt.figure(figsize=(12, 4))
plt.bar(class_ids, num_of_samples)
plt.title("Distribution of the train dataset")
plt.xlabel("Class number")
plt.ylabel("Number of images")
plt.show()

In [None]:
# Previously we flattened each image into a 1D array of 784 pixel intensities so that it could be fed into a fully connected
# (feedforward) neural network as inputs. For a convolutional network it is a little different: we leave each image as 28x28
# but also add a depth of 1. The first step in preparing our data for use in a convolutional neural network is to add depth
# to it. As mentioned earlier, a CNN works by applying a filter to the channels of the image being viewed. In the case of
# grayscale images there is one channel present, so our data must reflect the presence of this depth. By adding this depth
# of one, our data will be in the desired shape to be used as an input for the convolutional layer, which we will code momentarily.

# Add a trailing channel axis of size 1. Using -1 lets NumPy infer the number of
# samples instead of hard-coding the size of each split.
X_train = X_train.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)

In [None]:
# One-hot encode the integer labels. The ndim guard makes the cell safe to re-run:
# without it, a second execution would one-hot encode the already encoded labels.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes)

In [None]:
# Scale pixel intensities by 1/255 (assumes 8-bit pixel values, i.e. a 0-255 range).
# The division turns the integer arrays into floats, so the integer-dtype guard
# keeps a re-run of this cell from dividing a second time.
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train/255
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test/255

In [None]:
# As with any other NN model, we must first design the network before we are able to use it. When it comes to CNNs, there are many
# pre-built architectures that have already been designed. These pre-built designs have been shown to perform effectively and
# efficiently and are gradually being improved over time.
# The very first revolutionary CNN was designed back in 1998. This architecture is referred to as the LeNet model, and it was designed
# to recognize handwritten numbers on checks. After the initial model was described, many other CNN architectures were researched and
# examined, such as AlexNet, ZFNet and GoogLeNet. These newer architectures are more sophisticated; however, the initial impact of
# LeNet should not be understated, as this architecture type is still used today.
# The LeNet architecture typically consists of the following structure:
# the first layer of the network is a convolutional layer with an unspecified number of filters; the output of this convolutional
# layer is then fed into a pooling layer; that layer is then connected to another convolutional layer; and finally we have one more
# pooling layer before the data is fed forward into a fully connected layer, which eventually connects to an output classifier.


#define the LeNet_model function

def LeNet_model():
    """Build and compile the LeNet-style CNN used to classify the 28x28x1 digit images.

    The network is a Sequential stack: two Conv2D/MaxPooling2D stages extract
    features, Flatten turns the resulting volume into a vector, and two Dense
    layers classify it. The output width is read from the module-level
    `num_classes`.

    Conv2D is used with its default stride and padding, so each convolution
    trims the borders of its input. Conv2D also accepts `strides` and `padding`
    arguments: a smaller stride performs more convolution operations and retains
    more of the input, and padding='same' zero-pads the borders so the output
    keeps the spatial size of the input ('valid', the default, adds no padding).

    Returns:
        The compiled model (Adam optimizer with lr=0.01, categorical
        cross-entropy loss, accuracy metric).
    """
    layers = [
        # First convolution: 30 filters of size 5x5 with relu activation. Only the
        # first layer needs input_shape. 28x28x1 -> 24x24x30 feature maps.
        # Parameters: 30 filters * 25 weights + 30 biases = 780.
        # The number of filters and the filter size are hyperparameters; more
        # filters mean more parameters and more computation.
        Conv2D(30, (5, 5), input_shape=(28, 28, 1), activation='relu'),
        # Max pooling keeps the largest value in every 2x2 neighbourhood, halving
        # each feature map: 24x24x30 -> 12x12x30.
        MaxPooling2D(pool_size=(2, 2)),
        # Second convolution: fewer (15) and smaller (3x3) filters, yet more
        # parameters than the first one because every filter spans all 30 input
        # channels: 15 * 30 * 3 * 3 + 15 biases = 4065. 12x12x30 -> 10x10x15.
        Conv2D(15, (3, 3), activation='relu'),
        # Second pooling stage: 10x10x15 -> 5x5x15.
        MaxPooling2D(pool_size=(2, 2)),
        # Flatten has no parameters; it reshapes 5x5x15 into a vector of 375 values
        # so the data can be fed into the fully connected layers.
        Flatten(),
        # Fully connected hidden layer with 500 nodes. Fewer nodes typically cost a
        # little accuracy; more nodes cost more computing power.
        Dense(500, activation='relu'),
        # Output classifier: one softmax unit per class.
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(Adam(lr=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
model = LeNet_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

In [None]:
# Training configuration: 10 epochs, batches of 400, 10% of the training data
# held out for validation.
fit_options = dict(epochs=10, validation_split=0.1, batch_size=400, verbose=1, shuffle=1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Training vs. validation loss per epoch.
loss_curves = ['loss', 'val_loss']
for curve in loss_curves:
    plt.plot(history.history[curve])
plt.legend(loss_curves)
plt.title('Loss')
plt.xlabel('epoch')

In [None]:
# Training vs. validation accuracy per epoch.
# Keras < 2.3 records the metric under 'acc'; later releases record it under
# 'accuracy'. Pick whichever key is present so the cell does not raise KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.legend(['acc', 'val_acc'])
plt.title('Accuracy')
plt.xlabel('epoch')

In [None]:
import requests
from io import BytesIO
from PIL import Image

url = 'https://www.researchgate.net/profile/Jose_Sempere/publication/221258631/figure/fig1/AS:305526891139075@1449854695342/Handwritten-digit-2.png'
# The timeout keeps a dead connection from hanging the cell, and raise_for_status()
# surfaces an HTTP error directly instead of letting PIL fail on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Read from the fully downloaded, content-decoded body rather than the raw socket stream.
img = Image.open(BytesIO(response.content))
plt.imshow(img, cmap=plt.get_cmap('gray'))

In [None]:
import cv2

# Bring the downloaded image into the format the network was trained on:
# 28x28, single channel, light digit on a dark background.
img = np.asarray(img)
img = cv2.resize(img, (28, 28))
# The array comes from PIL, which uses RGB channel order, so the RGB (not BGR)
# conversion code applies the grayscale weights to the right channels.
img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
# Invert the intensities (presumably the source image is dark-on-light — confirm).
img = cv2.bitwise_not(img)
plt.imshow(img, cmap=plt.get_cmap('gray'))


In [None]:
# Scale by 1/255 like the training data, then shape as a batch of one 28x28x1 image.
img = (img / 255).reshape(1, 28, 28, 1)

In [None]:
# Sequential.predict_classes() no longer exists in recent Keras/TensorFlow releases.
# Taking the argmax over the softmax output gives the same class indices on every version.
prediction = np.argmax(model.predict(img), axis=-1)
print("predicted digit:", str(prediction))

In [None]:
# evaluate() returns [loss, accuracy] for this model (one compiled metric).
score = model.evaluate(X_test, y_test, verbose=0)
print(type(score))
for label, value in zip(('Test score: ', 'Test accuracy: '), score):
    print(label, value)

In [None]:
# Generally, whenever the validation error is higher than the training error, that often marks the point at which your model
# starts to overfit. So, as effective as this convolutional network was at classifying test images, we are going to
# introduce another technique to reduce overfitting of our data. We are going to go back to our LeNet model function and
# add a specific layer type that will help reduce overfitting. This layer type is called the 'dropout' layer.
# This layer essentially functions by randomly setting a fraction (the rate) of input units to zero at each update during training,
# which helps prevent overfitting. Some nodes will be turned off and will no longer communicate information along the network. As
# mentioned earlier, the dropout layer randomly selects a predefined fraction of nodes to be turned off. This means that every time
# the network updates its parameters during the training process, it selects random nodes to turn off. It is important to note that
# a different set of nodes is turned off with each update. This process works to prevent overfitting because it forces the neural
# network to use various combinations of nodes to classify the same data, as with each update a random set of nodes survives the
# dropout process. Each node is forced to learn the data set in a more independent way, with a unique neural arrangement each time.
# This causes the NN in general to become more versatile and allows the weights to be more uniformly distributed along the network.
# Now, this only occurs during training. When the network is used on new data (test data, for example), dropout is not used and all
# the neural nodes are used to classify the new data. Once dropout is no longer used, the nodes can combine all of their independent
# learning to form a more effective NN. This results in a network that has reduced generalization error and demonstrates less
# overfitting when it is exposed to new data.
# As an example of this, imagine the following case:
# a simple perceptron model with two input neurons and one output node. Our system is relatively simple: neurons 1 and 2 feed
# data into neuron 3, and neuron 3 combines this data to give an output. Now let's assume our network functions as follows:
# neuron 1 has been well trained and feeds the correct input to neuron 3 about 90% of the time, while neuron 2 doesn't perform as
# well and seems to provide accurate input to neuron 3 only at random. Therefore, as the network receives more training, it simply
# learns to listen fully to neuron 1 and completely ignores the input associated with neuron 2. We essentially get a perceptron
# that behaves as follows:
# --> 1 --> 3 --> output
# This can be viewed as redundant, because two neurons are essentially behaving as one and neuron 2 becomes obsolete. This is where
# dropout plays a key role in improving the effectiveness of the network. Applying dropout to this network means that at random
# instances either neuron 1 or neuron 2 is turned off. When neuron 2 is turned off, not much changes, as the network was ignoring
# this neuron anyway. However, in the cases when neuron 1 is turned off, the NN is forced to listen to neuron 2. While at the
# beginning this might seem like a bad situation due to the poor performance of neuron 2, it is actually a good thing, because
# these situations force the 2nd neuron to learn and adapt to the data as well, independently of neuron 1. This also forces the NN
# to listen to neuron 2 and provides resources towards improving the accuracy of neuron 2. Now, when the network moves on to the
# testing data, neurons 1 and 2 start to work together more efficiently: while neuron 1 is still correct 90% of the time, neuron 2
# has also been trained and provides an added level of accuracy that pushes the accuracy of our network over 90%. This is just a
# simple example of how dropout layers can provide a more uniform weight distribution over the network and can decrease
# generalization error. In more complex networks this results in decreased overfitting.

# Now we import 'Dropout' and update the LeNet model to place a dropout layer within our NN. For the purpose of demonstration, we
# are only going to use a single dropout layer. However, more than one dropout layer can be used in a given network to obtain the
# desired performance. The location of these layers in a network can also vary: you can place them between the convolutional layers
# or between your fully connected layers. Typically the dropout layer is used in between layers that have a high number of
# parameters, because these high-parameter layers are more likely to overfit and memorize the training data. For this reason we
# will place our dropout layer between our two fully connected layers to fix our overfitting problem.

def LeNet_model_1(dropout_rate=0.5):
    """Build and compile the LeNet-style CNN with a dropout layer before the classifier.

    The stack is Conv2D(30, 5x5) -> MaxPooling2D(2x2) -> Conv2D(15, 3x3) ->
    MaxPooling2D(2x2) -> Flatten -> Dense(500) -> Dropout -> Dense(num_classes),
    where `num_classes` is read from the module level. The dropout layer sits
    between the two fully connected layers.

    Args:
        dropout_rate: Fraction of the inputs to the dropout layer that are dropped
            at each training update; 0 drops no nodes and 1 refers to dropping
            all of them. Defaults to 0.5, the rate recommended by the researchers
            who first proposed the dropout technique.

    Returns:
        The compiled model (Adam optimizer with lr=0.01, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    # Feature extraction: two convolution + max-pooling stages.
    model.add(Conv2D(30, (5, 5), input_shape=(28, 28, 1), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(15, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    # Classification head: flatten, one hidden layer, dropout, softmax output.
    model.add(Flatten())
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(Adam(lr=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
model_1 = LeNet_model_1()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model_1.summary()

In [None]:
# Same training configuration as the model without dropout, so the curves are comparable.
fit_options = dict(epochs=10, validation_split=0.1, batch_size=400, verbose=1, shuffle=1)
history_1 = model_1.fit(X_train, y_train, **fit_options)

In [None]:
# As you can see, our validation accuracy jumps up to match our training accuracy. This can also be seen in the error plot, and it shows
# that our overfitting issue was successfully taken care of. Notice how the validation error remains lower than the training error, which
# can also signify a reduction in overfitting, as the NN has generalized to also correctly classify the validation data with minimal error.

In [None]:
# Training vs. validation loss per epoch for the dropout model.
loss_curves = ['loss', 'val_loss']
for curve in loss_curves:
    plt.plot(history_1.history[curve])
plt.legend(loss_curves)
plt.title('Loss')
plt.xlabel('epoch')

In [None]:
# Training vs. validation accuracy per epoch for the dropout model.
# Keras < 2.3 records the metric under 'acc'; later releases record it under
# 'accuracy'. Pick whichever key is present so the cell does not raise KeyError.
acc_key = 'acc' if 'acc' in history_1.history else 'accuracy'
plt.plot(history_1.history[acc_key])
plt.plot(history_1.history['val_' + acc_key])
plt.legend(['acc', 'val_acc'])
plt.title('Accuracy')
plt.xlabel('epoch')

In [None]:
import requests
from io import BytesIO
from PIL import Image

url = 'https://www.researchgate.net/profile/Jose_Sempere/publication/221258631/figure/fig1/AS:305526891139075@1449854695342/Handwritten-digit-2.png'
# The timeout keeps a dead connection from hanging the cell, and raise_for_status()
# surfaces an HTTP error directly instead of letting PIL fail on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Read from the fully downloaded, content-decoded body rather than the raw socket stream.
img = Image.open(BytesIO(response.content))
plt.imshow(img, cmap=plt.get_cmap('gray'))

In [None]:
import cv2

# Bring the downloaded image into the format the network was trained on:
# 28x28, single channel, light digit on a dark background.
img = np.asarray(img)
img = cv2.resize(img, (28, 28))
# The array comes from PIL, which uses RGB channel order, so the RGB (not BGR)
# conversion code applies the grayscale weights to the right channels.
img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
# Invert the intensities (presumably the source image is dark-on-light — confirm).
img = cv2.bitwise_not(img)
plt.imshow(img, cmap=plt.get_cmap('gray'))

In [None]:
# Scale by 1/255 like the training data, then shape as a batch of one 28x28x1 image.
img = (img / 255).reshape(1, 28, 28, 1)

In [None]:
# Sequential.predict_classes() no longer exists in recent Keras/TensorFlow releases.
# Taking the argmax over the softmax output gives the same class indices on every version.
prediction = np.argmax(model_1.predict(img), axis=-1)
print("predicted digit:", str(prediction))

In [None]:
# evaluate() returns [loss, accuracy] for this model (one compiled metric).
score = model_1.evaluate(X_test, y_test, verbose=0)
print(type(score))
for label, value in zip(('Test score: ', 'Test accuracy: '), score):
    print(label, value)

In [None]:
# It is evident that the CNN architecture helped us improve our training and validation accuracy while decreasing the amount of
# overfitting present in our NN. We then even further decreased the degree of overfitting by adding a dropout layer to the network,
# and now that we have optimized our CNN we believe we are able to use it for more complex image classification problems.

# However, before we move on to more complex datasets, let's ensure that we have a concrete grasp of just how CNNs function.
# As mentioned earlier, convolutional layers take an input image and use a smaller filter to extract features from the image.
# More than one filter can be applied within each layer, and the size of the filters can vary as well.
# In our code, the first layer contains 30 filters of size 5x5 while the second layer contains 15 filters of size 3x3. The purpose of
# using multiple filters per layer is that each filter learns to look for its own unique set of features and can analyze the image in
# a unique way. The analysis of all the filters can then be combined to get a network with a higher degree of accuracy.

# Let's attempt to actually visualize in code what each of our filters is seeing. This will help us see the variety of features that
# the filters detect and how they can work together to accurately classify data sets.
# We will begin by instantiating the two layers that we want to see the output for. This process basically requires us to export layers
# from our fully defined model so that we can look at the model at various sections along the network. We can do so by using something
# called the Model class API. The Model class API is an incredibly useful tool, as it allows us to define a model in much the same way
# as the Sequential function we have become familiar with. Using the Model class API allows us to instantiate layers from pre-trained
# models, effectively allowing us to reuse sections of previously trained models. We are going to take advantage of this ability to
# help us visualize the outputs from our two convolutional layers.
# The Model function requires 2 arguments: the first argument defines all the inputs into your network, while the 2nd argument defines
# the outputs that you want from the model. In the case of layer1, our inputs are defined as the inputs into our very first layer.
# Therefore we use 'model.layers' to access the layers within our model, and we use the index position 0 to access the first layer in
# the model. If you view the model summary we printed, you can see that our first convolutional layer is the first layer in our
# network, so accessing the inputs to this layer is done by adding the 'input' attribute to our layer. We then define the outputs of
# the same layer using 'model.layers[0].output': once again we access the first index position, and this time we access the output of
# this layer. Our first layer is complete.
# Sub-models that share the trained network's input and stop at the first
# convolution (layer index 0) and the second convolution (layer index 2).
first_conv, second_conv = model.layers[0], model.layers[2]
layer1 = Model(inputs=first_conv.input, outputs=first_conv.output)
layer2 = Model(inputs=first_conv.input, outputs=second_conv.output)

# Run the preprocessed image through both sub-models to obtain their feature
# maps, then show the shape of each result.
visual_layer1 = layer1.predict(img)
visual_layer2 = layer2.predict(img)
for feature_maps in (visual_layer1, visual_layer2):
    print(feature_maps.shape)

In [None]:
# Draw every feature map of the first convolutional layer, five per row. The
# grid is sized from the array itself rather than from a hard-coded filter count.
n_maps = visual_layer1.shape[-1]
n_cols = 5
n_rows = -(-n_maps // n_cols)  # ceiling division
plt.figure(figsize=(10, 6))
for i in range(n_maps):
    plt.subplot(n_rows, n_cols, i+1)
    plt.imshow(visual_layer1[0, :, :, i], cmap=plt.get_cmap('jet'))
    plt.axis('off')

In [None]:
# We plot the output of our 2nd convolutional layer in much the same way as earlier, with red signifying high intensity while blue
# signifies low intensity. These images appear to be a bit more abstract, and this makes perfect sense: as discussed, the feature maps
# start to become unrecognizable towards the very end, as they contain less information about the image but more information about the
# specific feature that is distinct to the kernel that was convolved with the image. The deeper you get into the NN, the more complex
# the filters become, building on top of one another and becoming more sophisticated in terms of the kinds of shapes that they encode.
# These high-level shapes are present in very specific parts of the image, and so we end up with filters that only retain the
# information in the image that is relevant to them.
# Draw every feature map of the second convolutional layer, five per row. The
# grid is sized from the array itself rather than from a hard-coded filter count.
n_maps = visual_layer2.shape[-1]
n_cols = 5
n_rows = -(-n_maps // n_cols)  # ceiling division
plt.figure(figsize=(10, 6))
for i in range(n_maps):
    plt.subplot(n_rows, n_cols, i+1)
    plt.imshow(visual_layer2[0, :, :, i], cmap=plt.get_cmap('jet'))
    plt.axis('off')