In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import scipy
import tensorflow as tf
from tensorflow.python.framework import ops
from Preprocessing import *

import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

import keras.backend as K
K.set_image_data_format('channels_last')
from matplotlib.pyplot import imshow

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_train_dataset():
    """Assemble the full training set from the four per-class loaders.

    The classes are stacked in a fixed order -- id cards, slides, paper
    documents, receipts -- using the first 482, 316, 306 and 300 samples
    returned by the respective loader (1404 samples in total).

    The loader functions are not defined in this notebook; they are expected
    to come from ``from Preprocessing import *``.

    Returns:
        tuple: ``(X_train, Y_train)``. ``X_train`` is an ``int32`` array of
        shape ``(1404, 64, 64, 3)``; ``Y_train`` is an ``int32`` array of
        shape ``(1404, 4)`` holding one label row per sample.

    Raises:
        ValueError: if a loader returns fewer samples than listed below.
    """
    # (loader, number of leading samples to take), in stacking order.
    # Offsets into the output arrays are derived from these counts instead of
    # being written out by hand.
    sources = (
        (load_train_id_cards, 482),
        (load_train_slides, 316),
        (load_train_paper_documents, 306),
        (load_train_receipts, 300),
    )
    total = sum(count for _, count in sources)

    X_train = np.empty((total, 64, 64, 3), dtype="int32")
    Y_train = np.empty((total, 4), dtype="int32")

    offset = 0
    for loader, count in sources:
        X_part, Y_part = loader()
        # Fail loudly rather than leaving uninitialised np.empty rows behind.
        if len(X_part) < count or len(Y_part) < count:
            raise ValueError(
                "%s returned %d images / %d labels, expected at least %d"
                % (loader.__name__, len(X_part), len(Y_part), count)
            )
        # One vectorised copy per class replaces the per-row Python loops.
        X_train[offset:offset + count] = X_part[:count]
        Y_train[offset:offset + count] = Y_part[:count]
        offset += count

    return X_train, Y_train

In [3]:
def load_test_dataset():
    """Assemble the full test set from the four per-class loaders.

    The classes are stacked in a fixed order -- id cards, slides, paper
    documents, receipts -- using the first 24, 10, 14 and 17 samples returned
    by the respective loader (65 samples in total).

    The loader functions are not defined in this notebook; they are expected
    to come from ``from Preprocessing import *``.

    Returns:
        tuple: ``(X_test, Y_test)``. ``X_test`` is an ``int32`` array of shape
        ``(65, 64, 64, 3)``; ``Y_test`` is an ``int32`` array of shape
        ``(65, 4)`` holding one label row per sample.

    Raises:
        ValueError: if a loader returns fewer samples than listed below.
    """
    # (loader, number of leading samples to take), in stacking order.
    # Offsets into the output arrays are derived from these counts instead of
    # being written out by hand.
    sources = (
        (load_test_id_cards, 24),
        (load_test_slides, 10),
        (load_test_paper_documents, 14),
        (load_test_receipts, 17),
    )
    total = sum(count for _, count in sources)

    X_test = np.empty((total, 64, 64, 3), dtype="int32")
    Y_test = np.empty((total, 4), dtype="int32")

    offset = 0
    for loader, count in sources:
        X_part, Y_part = loader()
        # Fail loudly rather than leaving uninitialised np.empty rows behind.
        if len(X_part) < count or len(Y_part) < count:
            raise ValueError(
                "%s returned %d images / %d labels, expected at least %d"
                % (loader.__name__, len(X_part), len(Y_part), count)
            )
        # One vectorised copy per class replaces the per-row Python loops.
        X_test[offset:offset + count] = X_part[:count]
        Y_test[offset:offset + count] = Y_part[:count]
        offset += count

    return X_test, Y_test

In [4]:
# Build the raw train / test arrays from the per-class loaders.
X_train_orig, Y_train_orig = load_train_dataset()
X_test_orig, Y_test_orig = load_test_dataset()

# Scale pixel values by 1/255 for faster convergence; labels are used as-is.
X_train, X_test = X_train_orig / 255., X_test_orig / 255.
Y_train, Y_test = Y_train_orig, Y_test_orig

# Report dataset sizes and array shapes.
print("number of training examples =", X_train.shape[0])
print("number of test examples =", X_test.shape[0])
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

number of training examples = 1404
number of test examples = 65
X_train shape: (1404, 64, 64, 3)
Y_train shape: (1404, 4)
X_test shape: (65, 64, 64, 3)
Y_test shape: (65, 4)


In [5]:
def TrainingModel(input_shape, num_classes=4):
    """Build the small two-block CNN used as the baseline classifier.

    Architecture::

        ZeroPadding(3) -> Conv 8@4x4 -> BN -> ReLU -> MaxPool 8x8
                       -> Conv 16@2x2 -> BN -> ReLU -> MaxPool 4x4
                       -> Flatten -> Dense(num_classes, softmax)

    Args:
        input_shape: shape of a single input image without the batch
            dimension (this notebook passes ``(64, 64, 3)``).
        num_classes: number of units in the final softmax layer. Defaults to
            4, the value that used to be hard-coded.

    Returns:
        An uncompiled Keras ``Model`` named ``'trainingModel'``.
    """
    # Input placeholder for one image of shape input_shape.
    X_input = Input(input_shape)

    # Pad the border with zeros: 3 pixels on each side of height and width.
    X = ZeroPadding2D((3, 3))(X_input)

    # Block 0: CONV -> BN -> RELU -> MAXPOOL.
    # BatchNormalization uses axis 3 because the notebook sets the
    # 'channels_last' image data format.
    X = Conv2D(8, (4, 4), strides=(1, 1), name='conv0')(X)
    X = BatchNormalization(axis=3, name='bn0')(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((8, 8), name='max_pool0')(X)

    # Block 1: CONV -> BN -> RELU -> MAXPOOL.
    X = Conv2D(16, (2, 2), strides=(1, 1), name='conv1')(X)
    X = BatchNormalization(axis=3, name='bn1')(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((4, 4), name='max_pool1')(X)

    # Flatten to a vector and classify with a fully-connected softmax layer.
    X = Flatten()(X)
    X = Dense(num_classes, activation='softmax', name='fc')(X)

    # Wrap the graph in a Model instance; compiling/training is left to the caller.
    model = Model(inputs=X_input, outputs=X, name='trainingModel')

    return model

In [6]:
# Per-image input shape = every dimension of X_train except the leading sample axis.
trainingModel = TrainingModel(X_train.shape[1:])

In [7]:
# Adam optimizer + categorical cross-entropy to match the 4-way softmax output.
trainingModel.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [8]:
# Train the baseline CNN for 40 epochs in mini-batches of 32 samples.
trainingModel.fit(x=X_train, y=Y_train, epochs=40, batch_size=32)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fd1c17cedd8>

In [9]:
# Score the baseline CNN on the test arrays; preds[0] is the loss and
# preds[1] the accuracy metric requested at compile time.
preds = trainingModel.evaluate(x=X_test, y=Y_test)
print()
print("Loss =", preds[0])
print("Test Accuracy =", preds[1])


Loss = 0.3531882168008731
Test Accuracy = 0.8769230769230769


In [10]:
# Print the layer-by-layer architecture, output shapes and parameter counts.
trainingModel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 64, 3)         0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 70, 70, 3)         0         
_________________________________________________________________
conv0 (Conv2D)               (None, 67, 67, 8)         392       
_________________________________________________________________
bn0 (BatchNormalization)     (None, 67, 67, 8)         32        
_________________________________________________________________
activation_1 (Activation)    (None, 67, 67, 8)         0         
_________________________________________________________________
max_pool0 (MaxPooling2D)     (None, 8, 8, 8)           0         
_________________________________________________________________
conv1 (Conv2D)               (None, 7, 7, 16)          528       
__________

## Test on own image
- Just for this example, I reused test examples to see what the mistakes were. 
- Should actually use a fresh set of test examples

In [11]:
def make_prediction(path, end, model=None):
    """Classify numbered JPEG images and print one verdict line per image.

    The images read are ``<path>1.jpg`` up to ``<path><end - 1>.jpg``.

    Args:
        path: prefix placed in front of the image number, e.g.
            ``"slides_test/"``.
        end: exclusive upper bound of the 1-based image numbers.
        model: object whose ``predict`` method is used. Defaults to the
            notebook-level ``trainingModel``.

    Returns:
        None. The verdicts are printed.
    """
    if model is None:
        model = trainingModel

    # Verdict text, indexed by the position of the winning class score.
    class_messages = (
        "It is an id card/passport.",
        "It is a slide.",
        "It is a paper document.",
        "It is a receipt.",
    )

    for i in range(1, end):
        # Preprocess the image first.
        img_path = path + str(i) + ".jpg"
        # The context manager closes the file handle once the pixels have
        # been extracted; it used to stay open for every image.
        with Image.open(img_path) as raw_img:
            # standardize comes from Preprocessing; assumed to return a NumPy
            # array with 64*64*3 values -- TODO confirm.
            pixels = standardize(raw_img)
        # Add the batch axis and apply the same 1/255 scaling as the training data.
        batch = pixels.reshape((1, 64, 64, 3)) / 255.

        # prediction holds one row of class scores for the single image, so
        # the flat argmax is the winning class index.
        prediction = model.predict(batch)
        item = int(np.argmax(prediction))

        # Convert the class index to text (unknown indices print nothing).
        if 0 <= item < len(class_messages):
            print(class_messages[item])

In [12]:
# Classify slides_test/1.jpg .. slides_test/10.jpg (the upper bound is exclusive).
make_prediction("slides_test/", 11)

It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is an id card/passport.
It is a slide.
It is a paper document.


## Results from testing
- **ID cards:** 22/24. The 2 misses were incorrectly identified as receipts, because each is a picture of a two-page passport spread, which is longer.
- **slides:** 9/10. 1 was incorrectly identified as an id card/passport.
- **paper_docs:** 6/14. 8 were incorrectly identified as receipts. Technically, receipts are a form of paper document.
- **receipts:** 14/14

## Applying transfer learning 
- Using pre-trained weights from VGG model
- Freeze all the pre-trained VGG16 layers and train only the 3 fully-connected layers added on top
- Should give a better accuracy

In [13]:
# Load VGG16 with ImageNet weights, without its top (fully-connected) layers.
# NOTE(review): the arrays fed to this model later are scaled by 1/255;
# ImageNet-trained VGG16 weights are normally paired with Keras'
# preprocess_input -- confirm the different scaling is intentional.
vgg16_model = keras.applications.vgg16.VGG16(weights='imagenet', include_top=False)

In [14]:
# Check the type of the loaded object: it is a Keras Model, which is called
# on an input tensor further below.
type(vgg16_model)

keras.engine.training.Model

In [15]:
# Print the VGG16 convolutional stack (spatial dimensions are None because
# no input shape was given when loading it).
vgg16_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________

In [16]:
# Freeze every VGG16 layer so its pre-trained weights are not updated during
# training; only the layers added on top of it remain trainable.
for layer in vgg16_model.layers:
    layer.trainable = False

In [17]:
# Input tensor for the transfer-learning model: one 64x64 image with 3 channels.
X_input = Input(shape=(64, 64, 3), name='image_input')

In [18]:
# Run the frozen VGG16 stack on the new input tensor to obtain its feature maps.
output_vgg16_conv = vgg16_model(X_input)

In [19]:
# Add the trainable classifier head on top of the VGG16 features:
# flatten, two 2048-unit ReLU layers, then a 4-way softmax output.
X = Flatten(name='flatten')(output_vgg16_conv)
X = Dense(2048, activation='relu', name='fc1')(X)
X = Dense(2048, activation='relu', name='fc2')(X)
X = Dense(4, activation='softmax', name='predictions')(X)

In [20]:
# Tie the image input to the softmax output to form the transfer-learning model.
my_model = Model(inputs=X_input, outputs=X, name='myModel')

In [21]:
# Print the combined model; the trainable / non-trainable parameter counts
# show that only the added dense layers will be trained.
my_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     (None, 64, 64, 3)         0         
_________________________________________________________________
vgg16 (Model)                multiple                  14714688  
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
fc1 (Dense)                  (None, 2048)              4196352   
_________________________________________________________________
fc2 (Dense)                  (None, 2048)              4196352   
_________________________________________________________________
predictions (Dense)          (None, 4)                 8196      
Total params: 23,115,588
Trainable params: 8,400,900
Non-trainable params: 14,714,688
________________________________________________________

In [22]:
# Same training configuration as the baseline: Adam, categorical cross-entropy, accuracy.
my_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [23]:
# Train the new dense layers for 5 epochs in mini-batches of 32 samples.
# NOTE(review): validation_data is the test set, so the later evaluation on
# the same arrays is not a fully independent check -- confirm this is acceptable.
my_model.fit(x=X_train, y=Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

Train on 1404 samples, validate on 65 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd1a82c2320>

In [24]:
# Score the transfer-learning model on the test arrays; preds[0] is the loss
# and preds[1] the accuracy metric requested at compile time.
preds = my_model.evaluate(x=X_test, y=Y_test)
print("Loss =", preds[0])
print("Test Accuracy =", preds[1])

Loss = 0.31653624085279614
Test Accuracy = 0.9384615384615385


In [25]:
def make_prediction_VGG(path, end, model=None):
    """Classify numbered JPEG images with the VGG16-based model and print one verdict per image.

    The images read are ``<path>1.jpg`` up to ``<path><end - 1>.jpg``.

    Args:
        path: prefix placed in front of the image number, e.g.
            ``"slides_test/"``.
        end: exclusive upper bound of the 1-based image numbers.
        model: object whose ``predict`` method is used. Defaults to the
            notebook-level ``my_model``.

    Returns:
        None. The verdicts are printed.
    """
    if model is None:
        model = my_model

    # Verdict text, indexed by the position of the winning class score.
    class_messages = (
        "It is an id card/passport.",
        "It is a slide.",
        "It is a paper document.",
        "It is a receipt.",
    )

    for i in range(1, end):
        # Preprocess the image first.
        img_path = path + str(i) + ".jpg"
        # The context manager closes the file handle once the pixels have
        # been extracted; it used to stay open for every image.
        with Image.open(img_path) as raw_img:
            # standardize comes from Preprocessing; assumed to return a NumPy
            # array with 64*64*3 values -- TODO confirm.
            pixels = standardize(raw_img)
        # Add the batch axis and apply the same 1/255 scaling as the training data.
        batch = pixels.reshape((1, 64, 64, 3)) / 255.

        # prediction holds one row of class scores for the single image, so
        # the flat argmax is the winning class index.
        prediction = model.predict(batch)
        item = int(np.argmax(prediction))

        # Convert the class index to text (unknown indices print nothing).
        if 0 <= item < len(class_messages):
            print(class_messages[item])

## Double checking on test images
- checking if the accuracy is correct
- finding out where the mistakes are

In [26]:
# Classify slides_test/1.jpg .. slides_test/10.jpg (the upper bound is exclusive).
make_prediction_VGG("slides_test/", 11)

It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.
It is a slide.


In [27]:
# Classify id_cards_test/1.jpg .. id_cards_test/24.jpg (the upper bound is exclusive).
make_prediction_VGG("id_cards_test/", 25)

It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.


In [28]:
# Classify paper_documents_test/1.jpg .. paper_documents_test/14.jpg (the upper bound is exclusive).
make_prediction_VGG("paper_documents_test/", 15)

It is a receipt.
It is a paper document.
It is a paper document.
It is a receipt.
It is a paper document.
It is a receipt.
It is a receipt.
It is a paper document.
It is a paper document.
It is a paper document.
It is a paper document.
It is a paper document.
It is a paper document.
It is a paper document.


In [29]:
# Classify receipts_test/1.jpg .. receipts_test/17.jpg (the upper bound is exclusive).
make_prediction_VGG("receipts_test/", 18)

It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
