# OpenCV text recognition testing

In this notebook i am adapting code from [pyimagesearch](https://www.pyimagesearch.com/2018/09/17/opencv-ocr-and-text-recognition-with-tesseract/) to detect and recognize text from images of bookshelves using OpenCV and Tesseract. The goal here it to take a photo of a bookshelf and return a list of books and authors. In the next notebook I will match the list of titles and authors to records in a database of popular books. 

In [9]:
# necessary packages
from imutils.object_detection import non_max_suppression
import numpy as np
import pytesseract
import cv2

# cleaning strings
import re
import string

from PIL import Image

# if using on cli
import argparse

## OCR functions

These are the functions I will use to do OCR on my images. 

In [10]:
def decode_predictions(scores, geometry):
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < args["min_confidence"]:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)


In [11]:
def get_image_text(args):
    '''
    takes dict of parameters
    returns text results from image

        resizes, preprocesses images
        gets bondary boxes
        pulls text
    '''
    
    # load the input image and grab the image dimensions
    image = cv2.imread(args["image"])
    orig = image.copy()
    (origH, origW) = image.shape[:2]

    # set the new width and height and then determine the ratio in change
    # for both the width and height
    (newW, newH) = (args["width"], args["height"])
    rW = origW / float(newW)
    rH = origH / float(newH)

    # resize the image and grab the new image dimensions
    image = cv2.resize(image, (newW, newH))
    (H, W) = image.shape[:2]
    
    # additional preprocessing
    #image = get_grayscale(image)
    
    #image = remove_noise(image)

    # define the two output layer names for the EAST detector model that
    # we are interested -- the first is the output probabilities and the
    # second can be used to derive the bounding box coordinates of text
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    # load the pre-trained EAST text detector
    #print("[INFO] loading EAST text detector...")
    net = cv2.dnn.readNet(args["east"])

    # construct a blob from the image and then perform a forward pass of
    # the model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(image, 
                                 1.0, 
                                 (W, H),
                                 (123.68, 116.78, 103.94), 
                                 swapRB=True, 
                                 crop=False)

    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # decode the predictions, then  apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes

    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)
    
    # initialize the list of results
    results = []

    # loop over the bounding boxes
    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # in order to obtain a better OCR of the text we can potentially
        # apply a bit of padding surrounding the bounding box -- here we
        # are computing the deltas in both the x and y directions
        dX = int((endX - startX) * args["padding"])
        dY = int((endY - startY) * args["padding"])

        # apply padding to each side of the bounding box, respectively
        startX = max(0, startX - dX)
        startY = max(0, startY - dY)
        endX = min(origW, endX + (dX * 2))
        endY = min(origH, endY + (dY * 2))

        # extract the actual padded ROI
        roi = orig[startY:endY, startX:endX]

        # in order to apply Tesseract v4 to OCR text we must supply
        # (1) a language, (2) an OEM flag of 4, indicating that the we
        # wish to use the LSTM neural net model for OCR, and finally
        # (3) an OEM value, in this case, 7 which implies that we are
        # treating the ROI as a single line of text
        config = ("-l eng --oem 1 --psm 7")
        text = pytesseract.image_to_string(roi, config=config)

        # add the bounding box coordinates and OCR'd text to the list
        # of results
        results.append(((startX, startY, endX, endY), text))
        
    # sort the results bounding box coordinates from top to bottom
    results = sorted(results, key=lambda r: r[0][1])
    
    return results

## Text processing

In [12]:
# cleaning up strings

def clean_string(text):
    # clean string to remove non-ASCII text
    text = "".join([c if ord(c) < 128 else "" for c in text])

    # standard cleaning
    text = text.lower().strip()
    
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…–]', '', text)
    text = re.sub('\n', '', text)
    
    return text

In [13]:
# return each horizontal line of text as a dict value

def clean_tesseract_output(raw_results, line_buffer):
    '''
    Function to clean text pulled from images by Tesseract OCR function.
    
    raw_results = list produced by Tesseract 
        ex. list of items that look like this: ((73, 49, 426, 114), "CAPUTO'S\n\x0c")
        
    line_buffer = int representing amount of space to allow between lines 
        This may have to be tweaked depending on how close books are to one another in frame.
        POTENTIALLY DETERMINE THIS IN FUNCTION? 
    '''
    # initialize empty dict and list
    d = {}
    startY_points = []

    # iterate through each roi
    for ((startX, startY, endX, endY), text) in results:
        
        text = clean_string(text)

        # only first pass, where no value in d yet
        if not startY_points: 
            d[startY] = text
            startY_points.append(startY)
            
        else:    
            # checking whether text from roi is on same line as last roi (within line_buffer)
            if abs(startY - startY_points[-1]) < line_buffer:
                 # if so, adding to dict key of that line
                d[startY_points[-1]] = ' '.join([d[startY_points[-1]], text]).strip()

            else: 
                # if not, adding to dict key for new line
                d[startY] = text
                startY_points.append(startY)
    
    # return output as a list of names
    return [val for val in d.values()]

## Testing on images

### Sample images

In [14]:
args = {'image': 'opencv-text-recognition/images/example_01.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.1}

for i in range(1,5):
    args['image'] = 'opencv-text-recognition/images/example_0' + str(i) + '.jpg'
    results = get_image_text(args)
    final = clean_tesseract_output(results, 10)
    print(args['image'])
    print(final, '\n')

opencv-text-recognition/images/example_01.jpg
['oh ok'] 

opencv-text-recognition/images/example_02.jpg
['middleborougch'] 

opencv-text-recognition/images/example_03.jpg
['estate  agents', ' saxons'] 

opencv-text-recognition/images/example_04.jpg
['caputos shop bake'] 



### Bookshelf test images 1 - from google

Doesn't detect titles when text is horizontal. Since this is the default for bookshelf images, i will have to rotate them manually for now, but later will find a way to rotate them automatically. 

In [33]:
args = {'image': 'images/bookshelf9.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.1}

In [34]:
# initial look at test bookshelf pictures

for i in range(1,9):
    args['image'] = 'images/bookshelf' + str(i) + '.jpg'
    results = get_image_text(args)
    final = clean_tesseract_output(results, 10)
    print(args['image'])
    print(final, '\n')

images/bookshelf1.jpg
['classics'] 

images/bookshelf2.jpg
['dle tds s poeetry sd spensers po ie', 'sherlo conandovle'] 

images/bookshelf3.jpg
['summer ov', 'bdenan once', 'and  for all', 'lem nowinski fs on aa', 'queens  cy  eae ac', 'aie', 'rire ills', 'fe schwab darkersianeo oemagic', 'ane uo ro yinige', 'ae'] 

images/bookshelf4.jpg
['lag', '', 'moe'] 

images/bookshelf5.jpg
['ae'] 

images/bookshelf6.jpg
[] 

images/bookshelf7.jpg
['ca', 'julian  barnes', 'picador lemon tr', 'v irginia eats qv', 'nda panruool', 'lawrenci the rainbow dil', 'tule'] 

images/bookshelf8.jpg
['destruction', ''] 



In [37]:
# testing confidence 

args = {'image': 'images/bookshelf7.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.1}

for i in np.linspace(0,1,10):
    args['min_confidence'] = i
    results = get_image_text(args)
    final = clean_tesseract_output(results, 10)
    print('confidence = ',args['min_confidence'])
    print(final, '\n')

confidence =  0.0
['a of', 'sager', 'fear', 'ag', '', 'ca periential g seon', '', 'ey sian', '', 'i  s ge', '', '', 'squirrel ia', 'chipmunk', 'ly in c  ere', 'ses recs', 'es julian', 'barnes picador lemon tr', 'j table', '', 'shadows seus corey eye ad ee me', 'as ames', 'sais tte ba', 'eto ya ola  ', 'a ', 'ents crea ue', 'r ', 'a  a   oe ole e', ' am er', '', 'f the waves', '', 'v irginia eats qv', '', ' a i ', '', '', 'nda panruool', 'a', '', 'pa', '', 'ww', 'mut s lawrenci', 'the rainbow dil', 'zz pe an', 'as ol', '', 'ap', 'tule', 'spanish i roy ia'] 

confidence =  0.1111111111111111
['ca seon', 'julian  barnes', 'picador lemon tr', '', 'eto ya ola  ', 'v irginia eats qv', 'nda panruool', 'lawrenci the rainbow dil', 'tule'] 

confidence =  0.2222222222222222
['ca seon', 'julian  barnes', 'picador lemon tr', '', 'eto ya ola  ', 'v irginia eats qv', 'nda panruool', 'lawrenci the rainbow dil', 'tule'] 

confidence =  0.3333333333333333
['ca seon', 'julian  barnes', 'picador lemon tr

In [36]:
# testing padding

args = {'image': 'images/bookshelf7.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.1}

for i in np.linspace(0,.5,10):
    args['padding'] = i
    results = get_image_text(args)
    final = clean_tesseract_output(results, 10)
    print('padding = ',args['padding'])
    print(final, '\n')

padding =  0.0
['aetoy', 'julian barnes', 'picador lemon', 'virginia woolf', 'andreny fara', 'fawren the rainbow dhl', 'soya iil'] 

padding =  0.05555555555555555
['sey', 'julian barnes', 'picador lemon', 'i ont woolf', 'min parry', 'tlawrenc the rainbow dhl', 'spanish'] 

padding =  0.1111111111111111
['en', 'julian  barnes', 'picador lemon ti', 'es irginia eats ', 'ay pan rol', 'lawrenci the rainbow pl', 'as eg icie'] 

padding =  0.16666666666666666
[' tames re', 'julian n barnes t', 'picador lemon ta', ' v rertats a', 'elon', 'lawrence  the rainbow', 'sulci'] 

padding =  0.2222222222222222
['oe', 'julian b n barnes th', 'picador lemon tab', 'fi on the w', 'aindred harold man', 'jlawrence  the rainbow dhla', 'uc'] 

padding =  0.2777777777777778
['none', 'julian be an barnes th', 'ccador e lemon tabl', 'e on the wave', 'an at aol', 'hlawrence v e the raine dhlay', 'a see ulon a'] 

padding =  0.3333333333333333
['st', 'julian bai an barnes the', 'picador  e lemon table', 'e ens oe

## Bookshelf test images 2 - taken myself

In [18]:
args = {'image': 'images/bookshelf9.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.1}

# padding = 0.2, buffer = 300, min_confidence = 0.5
results = get_image_text(args)
final = clean_tesseract_output(results, 300)
final

['gg ll  ns bill wi ea aimle',
 'drow diaz junot',
 'rule evle hl  history o world gombrich zoe the',
 'immortality milan kundera i',
 'great gatcrby itzgeral d the',
 'narcissus an d goldmund hermann he  hesse fsg fall',
 'lhe albert camus']

This is WAY better! Images must do better when clearly framed and maybe even in higher resolution or with images taken up close. 

## Display results

See bounding box in context of image with OCR results alongside ROIs

In [None]:
# loop over the results
for ((startX, startY, endX, endY), text) in results:
    # display the text OCR'd by Tesseract
    print("OCR TEXT")
    print("========")
    print("{}\n".format(text))

    # strip out non-ASCII text so we can draw the text on the image
    # using OpenCV, then draw the text and a bounding box surrounding
    # the text region of the input image
    
    text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
    output = orig.copy()
    cv2.rectangle(output, 
                  (startX, startY), 
                  (endX, endY),
                  (0, 0, 255), 2)
    
    cv2.putText(output, 
                text, 
                (startX, startY - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 
                1.2, 
                (0, 0, 255), 3)

    # show the output image
    cv2.imshow("Text Detection", output)
    cv2.waitKey(0)

OCR TEXT
oo ts




## CLI Implementation

To use this instead of dict of args to run from shell

In [None]:
# # construct the argument parser and parse the arguments
# ap = argparse.ArgumentParser()

# ap.add_argument('-f') # ADDED TO RUN IN NOTEBOOK (argparse is meant for CLI)

# ap.add_argument("-i", 
#                 "--image", 
#                 type=str,
#                 help="path to input image")

# ap.add_argument("-east", 
#                 "--east", 
#                 type=str,
#                 help="path to input EAST text detector")

# ap.add_argument("-c", 
#                 "--min-confidence", 
#                 type=float, 
#                 default=0.5,
#                 help="minimum probability required to inspect a region")

# ap.add_argument("-w", 
#                 "--width", 
#                 type=int, 
#                 default=320,
#                 help="nearest multiple of 32 for resized width")

# ap.add_argument("-e", 
#                 "--height", 
#                 type=int, 
#                 default=320,
#                 help="nearest multiple of 32 for resized height")

# ap.add_argument("-p", 
#                 "--padding", 
#                 type=float, 
#                 default=0.0,
#                 help="amount of padding to add to each border of ROI")

# args = vars(ap.parse_args())

# Image pre-processing

Consider adding these when processing images.

In [12]:
# https://nanonets.com/blog/ocr-with-tesseract/
# get grayscale image
def get_grayscale(image):
    kernel = np.ones((5,5), np.uint8)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY, kernel)

# noise removal
def remove_noise(image):
    return cv2.medianBlur(image,5)
 
#thresholding
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

#dilation
def dilate(image):
    kernel = np.ones((5,5), np.uint8)
    return cv2.dilate(image, kernel, iterations = 1)
    
#erosion
def erode(image):
    kernel = np.ones((5,5), np.uint8)
    return cv2.erode(image, kernel, iterations = 1)

#opening - erosion followed by dilation
def opening(image):
    kernel = np.ones((5,5), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image):
    return cv2.Canny(image, 100, 200)

#skew correction
def deskew(image):
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED) 

In [13]:
kernel = np.ones((5,5), np.uint8)
noise = cv2.medianBlur(img, 5)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY, kernel)
erosion = cv2.erode(gray, kernel, iterations = 1)
dilation = cv2.dilate(gray, kernel, iterations = 1)
opening = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
closing = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
gradient = cv2.morphologyEx(gray, cv2.MORPH_GRADIENT, kernel)
tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel)
blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)

NameError: name 'img' is not defined

In [40]:
args = {'image': 'images/bookshelf9.jpg',
       'east': 'opencv-text-detection/frozen_east_text_detection.pb',
       'min_confidence': 0.5,
       'width': 320,
       'height': 320,
       'padding': 0.05}

In [21]:
# padding = 0.05
results = get_image_text(args)
final = clean_tesseract_output(results, 25)
final

['ga babe  nes bil',
 'ess aim',
 'drov diaz junot',
 'mle tlr listory woo rtt fsombrich of the',
 'immortalitv milan kundera',
 'great gatery tzgerald the',
 'narcissus di  goldmund hermann t hesse',
 'fsg',
 'fall lhe',
 'albert camus']

['ga babe  nes bil ess aim',
 'drov diaz junot',
 'mle tlr listory woo rtt fsombrich of the',
 'immortalitv milan kundera',
 'great gatery tzgerald the',
 'narcissus di  goldmund hermann t hesse fsg fall',
 'lhe albert camus']