In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import scipy
import tensorflow as tf
from tensorflow.python.framework import ops
from Preprocessing_resnet import *

import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

import keras.backend as K
K.set_image_data_format('channels_last')
from matplotlib.pyplot import imshow

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_train_dataset():
    """Assemble the complete training set from the four per-class loaders.

    The classes are stacked in a fixed order -- id cards, slides, paper
    documents, receipts -- using the first 482, 316, 306 and 300 samples of
    each loader respectively.

    Returns:
        tuple: ``(X_train, Y_train)``, int32 arrays of shape
        ``(1404, 200, 200, 3)`` and ``(1404, 4)``.
    """
    # Samples taken per class, in stacking order:
    # id cards, slides, paper documents, receipts.
    counts = (482, 316, 306, 300)
    total = sum(counts)  # 1404

    X_train = np.empty((total, 200, 200, 3), dtype="int32")
    Y_train = np.empty((total, 4), dtype="int32")

    id_x, id_y = load_train_id_cards()
    slide_x, slide_y = load_train_slides()
    paper_x, paper_y = load_train_paper_documents()
    receipt_x, receipt_y = load_train_receipts()

    image_parts = (id_x, slide_x, paper_x, receipt_x)
    label_parts = (id_y, slide_y, paper_y, receipt_y)

    # Fill the images first, then the labels, writing each class into its own
    # contiguous row range of the destination array.
    for target, parts in ((X_train, image_parts), (Y_train, label_parts)):
        offset = 0
        for part, count in zip(parts, counts):
            target[offset:offset + count] = part[:count]
            offset += count

    return X_train, Y_train

In [3]:
def load_test_dataset():
    """Assemble the complete test set from the four per-class loaders.

    The classes are stacked in a fixed order -- id cards, slides, paper
    documents, receipts -- using the first 24, 10, 14 and 17 samples of each
    loader respectively.

    Returns:
        tuple: ``(X_test, Y_test)``, int32 arrays of shape
        ``(65, 200, 200, 3)`` and ``(65, 4)``.
    """
    # Samples taken per class, in stacking order:
    # id cards, slides, paper documents, receipts.
    counts = (24, 10, 14, 17)
    total = sum(counts)  # 65

    X_test = np.empty((total, 200, 200, 3), dtype="int32")
    Y_test = np.empty((total, 4), dtype="int32")

    id_x, id_y = load_test_id_cards()
    slide_x, slide_y = load_test_slides()
    paper_x, paper_y = load_test_paper_documents()
    receipt_x, receipt_y = load_test_receipts()

    image_parts = (id_x, slide_x, paper_x, receipt_x)
    label_parts = (id_y, slide_y, paper_y, receipt_y)

    # Fill the images first, then the labels, writing each class into its own
    # contiguous row range of the destination array.
    for target, parts in ((X_test, image_parts), (Y_test, label_parts)):
        offset = 0
        for part, count in zip(parts, counts):
            target[offset:offset + count] = part[:count]
            offset += count

    return X_test, Y_test

In [4]:
# Build the raw train/test arrays from the per-class loaders.
X_train_orig, Y_train_orig = load_train_dataset()
X_test_orig, Y_test_orig = load_test_dataset()

# Normalizing for faster convergence: divide the pixel values by 255.
# The labels are used unchanged.
X_train, X_test = X_train_orig / 255., X_test_orig / 255.
Y_train, Y_test = Y_train_orig, Y_test_orig

# Report dataset sizes and array shapes.
print("number of training examples =", X_train.shape[0])
print("number of test examples =", X_test.shape[0])
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

number of training examples = 1404
number of test examples = 65
X_train shape: (1404, 200, 200, 3)
Y_train shape: (1404, 4)
X_test shape: (65, 200, 200, 3)
Y_test shape: (65, 4)


In [5]:
# To resolve the issue of BN in keras layers:
# statically set the Keras learning phase to 1 (train mode). This is done
# before any model is created; per the notes further down in this notebook,
# a statically set phase locks Keras into that mode for fit(), evaluate()
# and predict() alike.
# NOTE(review): presumably this also keeps the Dropout layers active during
# evaluate()/predict() -- confirm this is intended.
K.set_learning_phase(1)

In [6]:
# Instantiate ResNet50 with the 'imagenet' weights and include_top=False,
# i.e. without the network's original top (classification) layers.
resnet50_model = keras.applications.resnet50.ResNet50(weights='imagenet',include_top=False)

In [7]:
# Sanity check: show the type of the object returned by the ResNet50 constructor.
type(resnet50_model)

keras.engine.training.Model

In [8]:
# Print the layer-by-layer structure of the pretrained base.
resnet50_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [9]:
# Freeze every layer of the pretrained ResNet50 base by marking it
# non-trainable.
for layer in resnet50_model.layers:
    layer.trainable = False

In [10]:
# Create your own input format (here 200x200x3): a named input tensor that
# matches the image arrays produced by the dataset loaders.
X_input = Input(shape=(200,200,3),name = 'image_input')

In [11]:
# Call the ResNet50 base on the new input tensor; the result is the tensor
# the classification head is attached to.
output_resnet50_conv = resnet50_model(X_input)

In [36]:
# Classification head on top of the ResNet50 output:
# flatten -> two fully connected ReLU layers, each followed by dropout ->
# 4-way softmax (one output per document class).
X = Flatten(name='flatten')(output_resnet50_conv)
X = Dense(2048, activation='relu', name='fc1')(X)
X = Dropout(0.7)(X)
X = Dense(1024, activation='relu', name='fc2')(X)
X = Dropout(0.7)(X)
X = Dense(4, activation='softmax', name='predictions')(X)

In [37]:
# Wrap the graph from the custom input tensor to the softmax output in a
# single Keras Model.
my_model = Model(inputs=X_input, outputs=X,name='myModel')

In [38]:
# Print the structure of the combined model (ResNet50 base + new head).
my_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     (None, 200, 200, 3)       0         
_________________________________________________________________
resnet50 (Model)             multiple                  23587712  
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
fc1 (Dense)                  (None, 2048)              4196352   
_________________________________________________________________
dropout_3 (Dropout)          (None, 2048)              0         
_________________________________________________________________
fc2 (Dense)                  (None, 1024)              2098176   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
__________

In [39]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the metric.
my_model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

In [47]:
# Train for 3 epochs with mini-batches of 32.
# NOTE(review): the test set is passed as validation_data and is also what
# evaluate() is run on later, so the validation numbers are not an
# independent hold-out.
my_model.fit(x = X_train, y = Y_train, epochs = 3, batch_size = 32, validation_data=(X_test,Y_test))

Train on 1404 samples, validate on 65 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa22bbdec50>

In [41]:
# Evaluate on the test set; preds[0] is the loss and preds[1] the accuracy
# metric configured at compile time.
preds = my_model.evaluate(x=X_test, y=Y_test)
print("Loss =", preds[0])
print("Test Accuracy =", preds[1])

Loss = 1.4287343242993722
Test Accuracy = 0.7384615384615385


## Poor results on test case
- Test results are no better than random guessing despite the accuracy reached during training (Test Accuracy = 0.215)
- The issue is due to the presence of batch normalization (BN)
- VGG performs far better because it is an architecture that does not use BN

## Reason (from DatumBox Blog)
- In the case of BN, during training we use the **mean and variance of the mini-batch** to rescale the input. During inference, on the other hand, we use the **moving averages of the mean and variance** that were estimated during training.
- Keras knows in which mode to run because it has a built-in mechanism called **learning_phase**. The **learning phase** controls whether the network is in train or test mode.
- If it is not manually set by the user, the network runs with learning_phase=1 (train mode) during fit(). While producing predictions (for example when we call predict() or evaluate(), or at the validation step of fit()), the network runs with learning_phase=0 (test mode).

## Workaround
- Even though it is not recommended, the user can also statically set the learning_phase to a specific value, but this needs to happen before any model or tensor is added to the graph. If the learning_phase is set statically, Keras will be locked to whichever mode the user selected.
- Results are still not very impressive.

In [42]:
def make_prediction_ResNet(path, end):
    """Classify a numbered series of JPEG images and print one verdict each.

    The images ``<path>1.jpg`` through ``<path><end-1>.jpg`` are loaded,
    preprocessed with ``standardize``, scaled by 1/255 and passed one at a
    time to the module-level ``my_model``.

    Args:
        path: Prefix prepended to the numbered file names (e.g. a directory
            ending in "/").
        end: Exclusive upper bound of the 1-based image numbers.
    """
    # Index of the strongest model output -> sentence printed for it.
    messages = {
        0: "It is an id card/passport.",
        1: "It is a slide.",
        2: "It is a paper document.",
        3: "It is a receipt.",
    }

    for i in range(1, end):
        img_path = path + str(i) + ".jpg"

        # Image.open keeps the underlying file open; the context manager
        # guarantees it is closed once the pixels have been extracted, so the
        # loop does not accumulate open file handles. The division produces a
        # fresh array that no longer depends on the image object.
        with Image.open(img_path) as raw_img:
            batch = standardize(raw_img).reshape((1, 200, 200, 3)) / 255.

        # A single-image batch goes in, one row of class scores comes out;
        # take the position of the largest score in that row.
        prediction = my_model.predict(batch)
        item = int(np.argmax(prediction, axis=1)[0])

        # Converting the result to text (indices outside 0-3 print nothing).
        if item in messages:
            print(messages[item])

In [43]:
# `end` is exclusive (the function iterates range(1, end)), so this
# classifies slides_test/1.jpg .. slides_test/10.jpg.
make_prediction_ResNet("slides_test/",11)

It is an id card/passport.
It is a paper document.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is a receipt.
It is an id card/passport.


In [44]:
# `end` is exclusive (the function iterates range(1, end)), so this
# classifies id_cards_test/1.jpg .. id_cards_test/24.jpg.
make_prediction_ResNet("id_cards_test/",25)

It is a receipt.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is a slide.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is a receipt.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.


In [45]:
# `end` is exclusive (the function iterates range(1, end)), so this
# classifies paper_documents_test/1.jpg .. paper_documents_test/14.jpg.
make_prediction_ResNet("paper_documents_test/",15)

It is a receipt.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is a paper document.
It is a paper document.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is a receipt.
It is an id card/passport.


In [46]:
# `end` is exclusive (the function iterates range(1, end)), so this
# classifies receipts_test/1.jpg .. receipts_test/17.jpg.
make_prediction_ResNet("receipts_test/",18)

It is an id card/passport.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is a receipt.
It is a receipt.
It is an id card/passport.
It is a receipt.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is an id card/passport.
It is a paper document.
It is an id card/passport.
It is a receipt.
