In [None]:
import cv2
import numpy as np
import pytesseract
import os
from utils import *
from crossword_utils import *
import re
from PIL import ImageFont, ImageDraw, Image

#### 1 - Preprocessing Image
def pre_process(image):
    """Binarise an image and strip borders and small specks ahead of OCR.

    Args:
        image: Image array. Anything whose ``shape`` is not 2-D is converted
            with ``cv2.COLOR_BGR2GRAY``; a 2-D array is used as-is.

    Returns:
        A 5-tuple ``(roi, gray, threshInitial, threshNoBorders, threshCleaned)``:
        the bitwise-inverted cleaned threshold, the grayscale image, a copy of
        the raw adaptive threshold, the result of ``removeBorder`` and the
        threshold after small contours have been blacked out.
    """
    is_grayscale = len(image.shape) == 2
    gray = image if is_grayscale else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Tuning notes: a big block size seemed to work well (blockSize=51, C=9);
    # to keep fine details, 3 and 2 seemed to work better.
    binary = cv2.adaptiveThreshold(
        gray,
        maxValue=255.0,
        adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        thresholdType=cv2.THRESH_BINARY_INV,
        blockSize=29,
        C=9,
    )

    # Alternative that was tried: Otsu's thresholding after Gaussian filtering
    # blur = cv2.GaussianBlur(gray,(3,3),0)
    # ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Snapshot the raw threshold before the border-removal helper sees it.
    initial = binary.copy()
    without_borders = removeBorder(binary, 2)
    cleaned = without_borders.copy()

    # Black out tiny contours (noise). This did not seem strictly necessary,
    # but is kept anyway.
    found = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    contours = found[1] if len(found) != 2 else found[0]
    # A cut-off larger than 20 starts removing dashes, i.e. "-".
    specks = [contour for contour in contours if cv2.contourArea(contour) < 20]
    for speck in specks:
        cv2.drawContours(cleaned, [speck], -1, (0, 0, 0), -1)

    inverted = cv2.bitwise_not(cleaned)
    return inverted, gray, initial, without_borders, cleaned

def getOcrImage(image, txt, h, w, bottomTxt='', fontPath='/Library/Fonts/Arial.ttf'):
    """Draw OCR text onto a copy of ``image`` and return the annotated copy.

    PIL is used instead of ``cv2.putText`` because ``putText`` only supports a
    restricted character set and cannot draw characters such as æøå.

    Args:
        image: Array to draw on; it is copied, not modified.
        txt: Text to draw. Each ``'\\n'``-separated line goes on its own row,
            starting at y=8 and spaced 10 pixels apart.
        h: Image height; ``bottomTxt`` is drawn at ``y = h - 12``.
        w: Unused; kept so existing callers keep working.
        bottomTxt: Extra label drawn near the bottom edge.
        fontPath: TrueType font file to use. Defaults to the macOS Arial
            location; when it cannot be loaded (e.g. on Linux/Windows or a
            server without that file) Pillow's built-in default font is used
            instead of failing.

    Returns:
        A new numpy array holding the annotated image.
    """
    output = image.copy()

    # Convert from a cv2-style array to a PIL image so we can draw Unicode text.
    img_pil = Image.fromarray(output)
    draw = ImageDraw.Draw(img_pil)

    try:
        font = ImageFont.truetype(fontPath, 11)
    except OSError:
        # ImageFont.truetype raises OSError when the font file cannot be read.
        # The overlay is only a debugging aid, so degrade to the built-in font
        # rather than aborting the whole run.
        font = ImageFont.load_default()

    y0, dy = 8, 10
    for i, line in enumerate(txt.split('\n')):
        draw.text((2, y0 + i * dy), line, (255, 255, 0), font=font)

    draw.text((2, h - 12), bottomTxt, (0, 255, 255), font=font)

    # Convert the PIL image back to a cv2-style array.
    return np.array(img_pil)

def ocr(roi):
    """Return the text Tesseract recognises in ``roi``.

    Runs with the Norwegian language data (``lang='nor'``) and page
    segmentation mode 6 (``--psm 6``).
    """
    # Alternative that was tried: restrict recognition to a whitelist, i.e.
    # config="-c tessedit_char_whitelist=' 'ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅ- --psm 6"
    return pytesseract.image_to_string(roi, lang='nor', config="--psm 6")

def processFilepath(path):
    """OCR one image file, show the intermediate stages, and return the text.

    The image is pre-processed with ``pre_process``, passed to ``ocr``, and the
    result is printed. A stacked overview of every stage is then displayed via
    ``stackImages`` / ``show_wait_destroy``.

    Args:
        path: Path of the image file to process.

    Returns:
        The text returned by ``ocr`` for the pre-processed image.

    Raises:
        IOError: If ``cv2.imread`` cannot load ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside pre_process.
        raise IOError('could not read image: %s' % path)

    roi, gray, threshInitial, threshNoBorders, threshCleaned = pre_process(image)

    txt = ocr(roi)
    print('file: %s\n%s' % (path, txt))

    h, w = gray.shape
    imgBlank = np.zeros((h, w, 3), np.uint8)

    # Take the number from the file name only, so digits in a directory name
    # are not picked up, and tolerate names that contain no digits at all.
    numberMatch = re.search(r"(\d+)", os.path.basename(path))
    fileNumber = numberMatch.group(1) if numberMatch else ''
    ocrImage = getOcrImage(imgBlank, txt, h, w, fileNumber)

    # Image array for display: two rows of four stages, with matching labels.
    imageArray = [
        [image, gray, threshInitial, threshNoBorders],
        [threshCleaned, roi, ocrImage, imgBlank],
    ]
    labels = [
        ['raw', 'gray', 'threshold', 'no borders'],
        ['cleaned', 'roi', 'ocr', 'blank'],
    ]
    # NOTE(review): the second argument (4) is presumably a display scale
    # factor - confirm against utils.stackImages.
    stackedImage = stackImages(imageArray, 4, labels)

    show_wait_destroy("stacked", stackedImage)
    return txt


# ####################################
# MAIN
# ####################################

# Set to True to process only a fixed, hand-picked set of images instead of
# every "roi*" file found in temp/.
testRunSingle = False

if testRunSingle:
    # test-run on a hand-picked set of images
    for number in (8, 410, 561, 188, 428, 161, 174, 97, 83):
        processFilepath('temp/roi%d.png' % number)

else:
    # The context manager closes the output file even if processing an image
    # raises. UTF-8 is set explicitly because the OCR text contains non-ASCII
    # characters (e.g. æøå) and the platform default encoding may not be UTF-8.
    with open("temp/ocr_output.txt", "w", encoding="utf-8") as f:
        # os.listdir returns entries in arbitrary order; sort so the output
        # file is reproducible between runs.
        for file in sorted(os.listdir("temp")):
            if not file.startswith("roi"):
                continue

            filepath = os.path.join("temp", file)
            txt = processFilepath(filepath)

            f.write('file: %s\n%s\n' % (filepath, txt))
            # Flush after every image so results already obtained are on disk
            # while the next image is being processed.
            f.flush()
