##  Training an OCR using CNN + RNN + CTC on Synthetic Images ##
- Here we change the network architecture slightly from RNN+CTC: we add a convolutional stack before the BLSTM layer
    - Now the input to the recurrent layers is not the raw pixel values. Instead, we apply steps of convolution and max-pooling, and the resulting output is reshaped into a Time x featDim structure before it is fed to the BLSTM.
    - The convolutional stack can be increased in depth to get better feature representations

<b> Compared to the RNN+CTC code, the only change here is in the model definition part where we have a convolutional stack ahead of the BLSTM stack</b>


In [1]:
# =============================================================================
# Use a CNN + BRNN + CTC to recognize a given word image
# Network is trained on images rendered using PIL 
# ============================================================================
# 


from __future__ import print_function
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
import numpy as np
import time,math
from time import sleep
import random
import sys,codecs,glob 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from warpctc_pytorch import CTCLoss
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Seed Python's random module so that shuffling and font sampling repeat
# from run to run.
random.seed(0)
# TODO - MAKE SURE CTC IS INSTALLED IN ALL MACHINES

# True when torch reports a usable CUDA device; read by later cells to decide
# whether tensors and the model are moved to the GPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('CUDA is available')

#use_cuda=False   #uncomment this if you dont want to use cuda variables

CUDA is available


#### vocabulary and the fonts ####
-  loading the lexicon of 90k words
- get the fontslist to be used


In [2]:
# All word images are resized to a height of 32 pixels.
imHeight = 32
# The image width is fixed as well. RNNs can handle variable-length
# sequences, but a fixed width keeps batching simple.
#imWidth=100
imWidth = 100
# Fonts used for rendering; to use a different set of fonts, change the
# path below.
fontsList = glob.glob('fontsForRendering/' + '*.ttf')
# A 90k-word lexicon.
# lexicon source : http://www.robots.ox.ac.uk/~vgg/data/text/
# The context manager closes the file handle once the words are read.
with codecs.open('lexicon.txt', 'r') as vocabFile:
    words = vocabFile.read().split()
vocabSize = len(words)
# Kept as a list (not a set): random.sample()/random.choice() need a
# sequence, and sets are rejected by random.sample() on Python >= 3.11.
fontSizeOptions = ['16', '20', '24', '28', '30', '32', '36', '38']
batchSize = 5
alphabet = '0123456789abcdefghijklmnopqrstuvwxyz-'
#alphabet="(3)-"
# Character -> class label. Labels start at 1, so label 0 is never assigned
# to a character. NOTE: the name shadows the builtin `dict`; it is kept
# because Str2Labels refers to this global by name.
dict = {}
for i, char in enumerate(alphabet):
    dict[char] = i + 1


    


In [3]:
## a simple helper function to compute time since some 'start time'
def time_since(since):
    """Format the wall-clock time elapsed since `since` as '<m>m <s>s'.

    `since` is a timestamp in seconds, as returned by time.time().
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
 

In [4]:
# return the class labels for each character in the targetsequence 
def Str2Labels(text):
    """Convert a target string into its sequence of class labels.

    Each character is lower-cased and looked up in the module-level `dict`
    mapping; a character missing from that mapping raises KeyError.

    Returns a tuple (labels, length): the list of labels and its length.
    """
    labels = []
    for char in text:
        labels.append(dict[char.lower()])
    return labels, len(labels)
#StrtoLabels("0-1")

### from the predicted sequence of labels for an image, decode the string
# function returns the rawstring and also the decoded string after removing blanks and duplicates

#eg: if the label sequence you get after an argmax on the output activation matrix is [12,12,0,0,15,0,15,15,0,0]
#then your raw label string would be "bb~~e~ee~~" and the output string "bee"
def Labels2Str(predictedLabelSequences):
    """Decode per-frame label predictions into strings (greedy CTC decoding).

    predictedLabelSequences: a 2-D tensor (or Variable) of integer labels,
        indexed as [sample, frame]. Label 0 is the blank; label k > 0 maps to
        alphabet[k - 1].

    Returns (predictedRawStrings, predictedStrings), one entry per sample:
      * the raw string has one symbol per frame, with "~" for a blank;
      * the decoded string merges consecutive repeats of the same label and
        then drops the blanks.
    Example: [12,12,0,0,15,0,15,15,0,0] -> raw "bb~~e~ee~~", decoded "bee".

    The previous implementation guarded its branches with
    `prevId != 1000 or prevId != idx`, which is always true; the branch that
    was meant to print repeated labels in the raw string was unreachable and
    repeats were shown as "~" instead.
    """
    predictedRawStrings = []
    predictedStrings = []
    for i in range(predictedLabelSequences.size(0)):
        # .data works on both Variables and plain tensors; tolist() gives
        # plain Python ints so comparisons and indexing behave predictably.
        labels = predictedLabelSequences.data[i].tolist()
        rawChars = []
        decodedChars = []
        prevId = None  # no previous frame yet
        for idx in labels:
            idx = int(idx)
            symbol = "" if idx == 0 else alphabet[idx - 1]
            rawChars.append(symbol if symbol else "~")
            # Emit a character only when the label changes; blanks
            # contribute the empty string.
            if idx != prevId:
                decodedChars.append(symbol)
            prevId = idx
        predictedRawStrings.append("".join(rawChars))
        predictedStrings.append("".join(decodedChars))

    return predictedRawStrings, predictedStrings



def image2tensor(im):
    """Return the pixel values of a PIL image as a 2-D float torch tensor.

    The pixel values are read through im.getdata(), cast to uint8, arranged
    as (height, width) and scaled by 1/255 so they lie in the 0-1 range.
    """
    width, height = im.size
    pixels = np.array(list(im.getdata()), dtype=np.uint8).astype(float)
    grid = pixels.reshape((height, width))
    return torch.from_numpy(grid).float() / 255.0


### Render the images, prepare a training batch ###
- renders a batch of word images, from the list of words supplied
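- if singleFont is true then only one font would be used to render images. This is useful when you want to test overfitting the network to easy examples. (Note: the `GetBatch` defined below takes no `singleFont` argument; it always samples a random font and font size.)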
- Along with the rendered images, the target strings are converted to corresponding sequence of labels; for example the word "bee" would be converted to [12,15,15] 

In [5]:
def GetBatch ( batchOfWords ):
    """
    Renders a batch of word images and returns them with their ground truth.

    Each word in batchOfWords is drawn with PIL in black on a white
    greyscale canvas, using a font picked at random from fontsList and a
    size picked at random from fontSizeOptions, then resized to
    imWidth x imHeight.

    Returns a tuple (batchImageTensor, labelSequencesTensor,
    labelSeqLengthsTensor):
      * batchImageTensor: the rendered images concatenated along dim 0
        (one imHeight x imWidth slice per word);
      * labelSequencesTensor: IntTensor with the labels of all words
        concatenated one after another;
      * labelSeqLengthsTensor: IntTensor with the label count of each word.
    """
    wordImages = []
    labelSequences = []
    labelSeqLengths = []

    # random.sample() no longer accepts sets (Python >= 3.11); sorting gives
    # a sequence with a deterministic order whether fontSizeOptions is a set
    # or a list.
    fontSizes = sorted(fontSizeOptions)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under its current name. Fall back for very old Pillow versions.
    resampleFilter = getattr(Image, 'LANCZOS', None)
    if resampleFilter is None:
        resampleFilter = Image.ANTIALIAS

    for wordText in batchOfWords:
        fontName = random.choice(fontsList)
        fontSize = random.choice(fontSizes)
        imageFont = ImageFont.truetype(fontName, int(fontSize))
        if hasattr(imageFont, 'getbbox'):
            # ImageFont.getsize() was removed in Pillow 10. The right/bottom
            # edges of the bounding box give the canvas needed when drawing
            # at (0, 0) -- NOTE(review): assumed equivalent to the old
            # getsize() result; confirm against the Pillow docs.
            _, _, right, bottom = imageFont.getbbox(wordText)
            textSize = (int(right), int(bottom))
        else:
            textSize = imageFont.getsize(wordText)
        img = Image.new("L", textSize, (255))
        draw = ImageDraw.Draw(img)
        draw.text((0, 0), wordText, (0), font=imageFont)
        img = img.resize((imWidth, imHeight), resampleFilter)
        #img.save(text+'.jpeg')

        # A leading dimension is added so the images can be concatenated
        # into one batch tensor below.
        wordImages.append(image2tensor(img).unsqueeze(0))

        labelSeq, l = Str2Labels(wordText)
        labelSequences.extend(labelSeq)
        labelSeqLengths.append(l)

    batchImageTensor = torch.cat(wordImages, 0)
    #batchImageTensor=torch.transpose(batchImageTensor,1,2)
    labelSequencesTensor = torch.IntTensor(labelSequences)
    labelSeqLengthsTensor = torch.IntTensor(labelSeqLengths)
    return batchImageTensor, labelSequencesTensor, labelSeqLengthsTensor


### Adding a convolutional stack to the BLSTM + CTC Architecture ###
Remember that earlier we were feeding raw pixel columns as inputs at each time step <br>
Here we will use a convolutional stack to get some useful representations from the word image <br>
Sequences of these convolutional features are then fed to the BLSTM layer above <br>

![CRNN Architecture](crnnstack.png)


In [6]:
# minesh TODO split blstm into a separate class ?

class crnnocr (nn.Module):
    """Convolutional stack followed by an LSTM stack and a linear output layer.

    Args:
        inputDim: height in pixels of the input images. It determines the
            feature size fed to the LSTM (previously hard-coded to 384,
            which is the value obtained for inputDim=32).
        hiddenDim: LSTM hidden size per direction.
        outputDim: number of output classes per time step.
        numLayers: number of stacked LSTM layers (passed to nn.LSTM).
        numDirections: 2 for a bidirectional LSTM, anything else for a
            unidirectional one.
    """

    def __init__(self, inputDim, hiddenDim, outputDim,  numLayers, numDirections):
        super(crnnocr, self).__init__()
        self.inputDim=inputDim
        self.hiddenDim=hiddenDim
        self.outputDim=outputDim
        self.numLayers=numLayers
        self.numDirections=numDirections

        # cnn stack: two 3x3 convolutions (no padding), each followed in
        # forward() by ReLU and 2x2 max-pooling
        self.conv1 = nn.Conv2d(1, 64, 3)
        self.conv2 = nn.Conv2d(64, 64, 3)

        # Each conv shrinks the height by 2 and each pooling halves it
        # (rounding down); the 64 channels are stacked on the remaining
        # rows to form the per-time-step feature vector.
        pooledHeight = ((inputDim - 2) // 2 - 2) // 2
        self.featDim = 64 * pooledHeight

        # rnn part: takes the image features as inputs
        self.blstm1 = nn.LSTM(self.featDim, hiddenDim, numLayers,
                              bidirectional=(numDirections == 2), batch_first=True)
        self.linearLayer2=nn.Linear(hiddenDim*numDirections, outputDim) # linear layer at the output
        # Kept as an attribute for callers that may use it; forward() returns
        # un-normalised activations and does not apply it.
        self.softmax = nn.Softmax()

    def forward(self, x ):
        """Map a batch of images to per-time-step class activations.

        x is indexed as batch x height x width (one channel is added here).
        Returns a tensor of size T x batch x outputDim, where T is the width
        left after the convolutional stack; the values are raw linear-layer
        outputs (no softmax applied).
        """
        x=x.unsqueeze(1) # add an extra dimension at 1 for #channels

        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # e.g. an input of 50x1x32x100 becomes 50x64x15x49 and then 50x64x6x23
        x=x.contiguous()
        B = x.size(0)
        # Fold channels and rows together: B x (C*D) x T, then make T the
        # second dimension since the LSTM is batch_first.
        x=x.view(B,x.size(1)*x.size(2),-1)
        x=x.transpose(1,2)

        lstmOut1, _  =self.blstm1(x ) # x is batchSize x seqLen x featDim
        B,T,D  = lstmOut1.size(0), lstmOut1.size(1), lstmOut1.size(2)
        lstmOut1=lstmOut1.contiguous()

        # output of RNN is reshaped to B*T x D before it is fed to the linear layer
        outputLayerActivations=self.linearLayer2(lstmOut1.view(B*T,D))
        # the activations are reshaped to B x T x outputDim,
        # then B and T are swapped so that T comes first
        outputLayerActivations= outputLayerActivations.view(B,T,-1).transpose(0,1)
        return outputLayerActivations

In [7]:
###########
# Prepare the synthetic validation data
##############

# A small, fixed set of words rendered once and reused as validation data.
valWords = ['azadi', 'from', 'beef', 'janata', 'party']
valImages, valLabelSeqs, valLabelSeqlens = GetBatch(valWords)

# Wrap the images, keep them contiguous, and move them to the GPU if in use.
valImages = autograd.Variable(valImages).contiguous()
if use_cuda:
    valImages = valImages.cuda()

# The label tensors are wrapped as well but are not moved to the GPU.
valLabelSeqs = autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)
valLabelSeqlens = autograd.Variable(valLabelSeqlens)
    

In [9]:
###########################################
# TRAINING
##################################################
# Batches of words are fetched sequentially from the vocabulary.
# One epoch runs until all the words in the vocabulary have been seen once;
# then the word list is shuffled and the process is repeated.
nHidden=80
batchSize=5 #if you have more gpu memory you may increase it and your training will be faster
# Character labels run from 1 to len(alphabet) and label 0 is the CTC blank,
# so the network needs len(alphabet) + 1 outputs. With only len(alphabet)
# outputs the label of the last character ('-') is out of range.
nClasses= len(alphabet) + 1
criterion = CTCLoss()

numLayers=2 # number of stacked LSTM layers, passed to nn.LSTM inside the model
numDirections=2 # 2 since we need to use a bidirectional LSTM
model = crnnocr(imHeight,nHidden,nClasses,numLayers,numDirections)

optimizer=optim.Adam(model.parameters(), lr=0.001)
start = time.time()
if use_cuda:
    model=model.cuda()
    criterion=criterion.cuda()

# Defined up front so the stop checks below never read an unset name.
WER=100

for iter in range (0,200):
    # Running sum of the training cost as plain floats, so that no autograd
    # graph is kept alive between reports.
    avgTrainCost=0
    batchesSinceReport=0
    random.shuffle(words)

    for i in range (0,vocabSize-batchSize+1,batchSize):

        batchOfWords=words[i:i+batchSize]
        images,labelSeqs,labelSeqlens =GetBatch(batchOfWords)
        images=autograd.Variable(images)
        images=images.contiguous()
        if use_cuda:
            images=images.cuda()
        labelSeqs=autograd.Variable(labelSeqs)
        labelSeqlens=autograd.Variable(labelSeqlens)

        outputs=model(images)
        outputs=outputs.contiguous()
        # every sample in the batch has the same number of output time steps
        outputsSize=autograd.Variable(torch.IntTensor([outputs.size(0)] * batchSize))
        trainCost = criterion(outputs, labelSeqs, outputsSize, labelSeqlens) / batchSize

        avgTrainCost+=trainCost.data[0]
        batchesSinceReport+=1
        if i%500==0:
            avgTrainCost=avgTrainCost/batchesSinceReport
            #print ('avgTraincost since the last report is',avgTrainCost)
            avgTrainCost=0
            batchesSinceReport=0

            valOutputs=model(valImages)
            valOutputs=valOutputs.contiguous()
            valOutputsSize=autograd.Variable(torch.IntTensor([valOutputs.size(0)] * len(valWords)))
            valCost=criterion(valOutputs, valLabelSeqs, valOutputsSize, valLabelSeqlens) / len(valWords)
            print ('ctc Cost on validation data is',valCost.data[0])
            # NOTE(review): this ends the whole process/kernel once the
            # validation cost is low enough; kept as in the original.
            if valCost.data[0] < 0.3:
                sys.exit()

            ### get the actual predictions and compute word error ################
            valOutputs=valOutputs.transpose(0,1)
            # second output of max() is the argmax along the required dimension
            _, argMaxActivations= valOutputs.max(2)
            # Older PyTorch keeps the reduced dimension, newer versions drop
            # it; squeeze only when it is still there.
            if argMaxActivations.dim()==3:
                argMaxActivations=argMaxActivations.squeeze(2)
            # each row is the sequence of labels predicted for one sample
            predictedSeqLabels=argMaxActivations #batchSize * seqLen
            predictedRawStrings,predictedStrings=Labels2Str(predictedSeqLabels)
            nCorrectWords=0
            for ii in range(0,len(valWords)):
                print (predictedRawStrings[ii]+"==>"+predictedStrings[ii])
                if predictedStrings[ii]==valWords[ii]:
                    nCorrectWords+=1
            WER=((len(valWords)-nCorrectWords)*100)/len(valWords)
            print ('word error on validation data is', WER)

            print('Time since we began trainiing [%s]' % (time_since(start)))

        # Clear gradients once, right before back-propagating this batch.
        optimizer.zero_grad()
        trainCost.backward()
        optimizer.step()
        if WER == 0:
            break
    if WER==0:
        break
    
    #iterString=int(iter)
    #torch.save(model.state_dict(), iterString+'.pth')


ctc Cost on validation data is 67.3995513916
m~~~~~~~~~~~~~~~~~~~~g~==>mg
g~~~~~~~m~~~~~~~~g~mg~~==>gmgmg
g~~m~~~~~~~~~~~~~~g~~~~==>gmg
mg~~m~~~~~~~~~~~~~~g~~~==>mgmg
m~~~~~~~~~~~~~~~~~g~~~~==>mg
word error on validation data is 100
Time since we began trainiing [0m 2s]
ctc Cost on validation data is 18.230588913
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
word error on validation data is 100
Time since we began trainiing [0m 3s]
ctc Cost on validation data is 17.5130767822
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
word error on validation data is 100
Time since we began trainiing [0m 4s]
ctc Cost on validation data is 18.241809845
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~==>
word error on validation data is 100
Time