# In [50]:
import cv2
import numpy as np
import pytesseract
import os

#### 1 - Preprocessing Image
def preProcess(image, doDebug=False):
    """Binarize an image for OCR and return the inverted threshold result.

    Pipeline: convert to grayscale (unless the input is already 2-D), apply
    an inverted Gaussian adaptive threshold, black out every contour whose
    area is below 5, then invert the image again so that pixels darker than
    their neighbourhood end up as 0 and everything else as 255.

    Args:
        image: Image array exposing ``.shape``. Anything that is not 2-D is
            converted with ``cv2.COLOR_BGR2GRAY``.
        doDebug: When true, each intermediate image is displayed through
            ``show_wait_destroy`` (which blocks until a key is pressed).

    Returns:
        The final inverted binary image.
    """
    def debug_view(title, picture):
        # Only pop up a window when the caller asked for debugging.
        if doDebug:
            show_wait_destroy(title, picture)

    debug_view("raw image", image)

    # A 2-D array is already single-channel; otherwise collapse to gray.
    already_gray = len(image.shape) == 2
    gray = image if already_gray else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    debug_view("gray image", gray)

    # A large neighbourhood seems to work well (blockSize = 51, C = 9).
    block_size = 51
    offset_c = 9
    thresh = cv2.adaptiveThreshold(
        gray,
        255.0,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        block_size,
        offset_c,
    )
    debug_view("thresh", thresh)

    # Remove tiny specks. This does not appear to be strictly necessary,
    # but it is kept as a safety net.
    # findContours returns (contours, hierarchy) or
    # (image, contours, hierarchy) depending on the OpenCV version; the
    # contour list is the second-to-last element in both layouts.
    found = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[-2]
    specks = [contour for contour in contours if cv2.contourArea(contour) < 5]
    for speck in specks:
        # Fill each speck individually with black.
        cv2.drawContours(thresh, [speck], -1, (0, 0, 0), -1)
    debug_view("thresh2", thresh)

    roi = cv2.bitwise_not(thresh)
    debug_view("thresh3", roi)
    return roi

def show_wait_destroy(winname, img):
    """Show ``img`` in a window, wait for a key press, then close the window.

    Args:
        winname: Title of the window; also used to move and destroy it.
        img: Image to display.
    """
    window_x, window_y = 500, 0  # screen position the window is moved to

    cv2.imshow(winname, img)
    cv2.moveWindow(winname, window_x, window_y)
    cv2.waitKey(0)  # a delay of 0 waits indefinitely for a key press
    cv2.destroyWindow(winname)

def extract_largest_contour(input, output):
    """Keep only the part of ``output`` covered by the largest contour of ``input``.

    A mask with the same shape as ``input`` is built from the contour with
    the greatest area, slightly enlarged, and AND-ed with ``output``.

    Args:
        input: Single-channel binary image in which contours are searched.
        output: Image to be masked; it is combined with the mask through
            ``cv2.bitwise_and`` and must therefore be compatible with it.

    Returns:
        ``output`` AND-ed with the mask. If ``input`` contains no contours
        the mask stays empty, so the result is entirely zero.
    """
    mask = np.zeros(input.shape, np.uint8)

    # OpenCV 4.x returns (contours, hierarchy) while 3.x returns
    # (image, contours, hierarchy); the contour list is the second-to-last
    # element in both cases.
    found = cv2.findContours(input, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[-2]

    # Guard against images without any contour: there is nothing to draw,
    # and passing a missing contour to drawContours would raise.
    if len(contours) > 0:
        best_cnt = max(contours, key=cv2.contourArea)

        cv2.drawContours(mask, [best_cnt], 0, 255, -1)  # filled with 255
        cv2.drawContours(mask, [best_cnt], 0, 0, 2)     # outline erased, thickness 2

        # Grow the mask so the lines are not cut away by the bitwise AND.
        mask = cv2.dilate(mask, None, iterations=3)

    return cv2.bitwise_and(output, mask)

def outputOcrImage(image, txt):
    """Display a 100x100 preview of ``image`` with the OCR text drawn on top.

    Args:
        image: Image to preview; it is resized, the original is not modified.
        txt: OCR result. Each ``'\\n'``-separated line is drawn on its own
            row, starting at y = 30 and advancing 30 pixels per line.
    """
    # The third positional parameter of cv2.resize is ``dst``, so the
    # interpolation flag has to be passed by keyword to take effect.
    output = cv2.resize(image, (100, 100), interpolation=cv2.INTER_AREA)

    y0, dy = 30, 30
    for i, line in enumerate(txt.split('\n')):
        y = y0 + i * dy
        # NOTE: Hershey fonts only cover ASCII; per the OpenCV docs,
        # characters they cannot render are drawn as question marks.
        cv2.putText(output, line, (5, y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)

    show_wait_destroy("ocr", output)

def ocr(roi):
    """Run Tesseract on ``roi`` and return the recognized text.

    Recognition is limited via ``tessedit_char_whitelist`` to uppercase
    A-Z, the letters Æ, Ø and Å, and the hyphen, using page segmentation
    mode 6 (a single uniform block of text).

    Args:
        roi: Image handed to ``pytesseract.image_to_string``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The leading quoted space presumably adds the space character to the
    # whitelist -- confirm against how pytesseract tokenizes the config.
    allowed_chars = "' 'ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅ-"
    tesseract_config = "-c tessedit_char_whitelist=" + allowed_chars + " --psm 6"

    # Alternative without a whitelist: config="--psm 6"
    return pytesseract.image_to_string(roi, config=tesseract_config)

# ####################################

# image = cv2.imread('temp/roi184.png')
# roi = preProcess(image, doDebug=True)
# txt = ocr(roi)
# outputOcrImage(image, txt)

# f = open("temp/ocr_output.txt", "w")
# Run the OCR pipeline on every "roi*" file in the temp directory and show
# each result. os.listdir returns entries in arbitrary order, so sort them
# to get a stable, reproducible review order.
for file in sorted(os.listdir("temp")):
    if file.startswith("roi"):
        path = os.path.join("temp", file)

        # cv2.imread returns None (it does not raise) when the file cannot
        # be read or decoded; skip such files instead of crashing later.
        image = cv2.imread(path)
        if image is None:
            print('skipping unreadable image: %s' % path)
            continue

        roi = preProcess(image)
        txt = ocr(roi)
        outputOcrImage(image, txt)

        # f.write('file: %s ocr: %s \n' % (path, txt))
        # f.flush()

# f.close()
