In [1]:
import pytesseract
from pytesseract import Output
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
from glob import glob
import pandas as pd
import os
import re
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from tqdm import tqdm

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

# TrueType font handed to ImageFont.truetype() in df_to_image() to draw the
# recognised text. NOTE(review): this is a hard-coded absolute path, so the
# font has to exist at this exact location on the machine running the notebook.
font_path = "/usr/share/fonts/truetype/fonts-japanese-mincho.ttf"

In [3]:
def remove_line(img):
    """Erase long horizontal and vertical ruling lines from a BGR image.

    The image is binarised with Otsu's threshold (inverted, so dark ink
    becomes 255). The binary image is then opened (erode followed by dilate)
    once with a 50x1 kernel and once with a 1x40 kernel, which leaves only
    long horizontal and long vertical strokes respectively. Every pixel that
    belongs to either result is painted white in ``img``.

    Args:
        img: image array accepted by ``cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)``.
            It is modified in place.

    Returns:
        The same ``img`` object, with the detected line pixels set to 255.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu's thresholding; THRESH_BINARY_INV turns the ink into foreground (255).
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Opening with a wide, 1-pixel-high kernel keeps only long horizontal runs.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
    horizontal_img = cv2.erode(binary, horizontal_kernel, iterations=1)
    horizontal_img = cv2.dilate(horizontal_img, horizontal_kernel, iterations=1)

    # Opening with a tall, 1-pixel-wide kernel keeps only long vertical runs.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_img = cv2.erode(binary, vertical_kernel, iterations=1)
    vertical_img = cv2.dilate(vertical_img, vertical_kernel, iterations=1)

    # Combine the two masks with a logical OR. Adding the uint8 images instead
    # wraps around where a horizontal and a vertical line cross
    # (255 + 255 -> 254), so those crossing pixels would not compare equal to
    # 255 and would be left in the image.
    line_mask = (horizontal_img == 255) | (vertical_img == 255)
    img[line_mask] = 255

    return img
def df_to_image(df, img):
    """Draw the recognised text of ``df`` on a white canvas shaped like ``img``.

    Args:
        df: DataFrame with 'conf', 'text', 'left', 'top' and 'height' columns.
            Rows whose 'conf' equals -1 are skipped.
        img: array whose ``shape`` defines the canvas; its pixels are not read.

    Returns:
        A new numpy array containing the rendered text.
    """
    overlay = np.full(img.shape, 255, dtype=np.uint8)
    pil_image = Image.fromarray(overlay)
    draw = ImageDraw.Draw(pil_image)

    # The font depends only on the image size, so load it once instead of
    # re-opening the font file for every row. It is loaded lazily so that no
    # font file is touched when there is nothing to draw.
    font = None
    for _, row in df.iterrows():
        if row['conf'] == -1:
            continue
        if font is None:
            font = ImageFont.truetype(font_path, np.max(img.shape)//100)
        # NOTE(review): the y coordinate is the bottom edge of the word box
        # (top + height), presumably inherited from an earlier cv2.putText
        # version - confirm the text lands where intended with PIL.
        draw.text((row['left'], row['top'] + row['height']), row['text'], font=font, fill=0)
    return np.array(pil_image)

In [4]:
def is_next(previous, current, is_phrase=True):
    """Tell whether ``current`` continues the element that ends with ``previous``.

    With ``is_phrase=True`` both arguments are word records indexed by 'left'
    and 'width'. The word continues the phrase unless the distance between
    the two left edges is larger than 10 times the previous word's width.

    With ``is_phrase=False`` both arguments are
    ``(text, (left, top, height, width))`` tuples. The phrase continues the
    paragraph unless its top lies more than 1.5 previous-heights below the
    previous top, or none of the left edges, centres or right edges of the
    two boxes are within one previous-height of each other.

    Returns:
        True when ``current`` follows ``previous``, False otherwise.
    """
    if is_phrase:
        max_gap_in_widths = 10
        left_distance = current['left'] - previous['left']
        # NOTE: a height-similarity check used to live here but is disabled.
        return not (left_distance > max_gap_in_widths * previous['width'])

    max_gap_in_heights = 1.5
    max_offset_in_heights = 1.0

    cur_left, cur_top, _cur_height, cur_width = current[1]
    prev_left, prev_top, prev_height, prev_width = previous[1]

    # Too far below the previous phrase to belong to the same paragraph.
    if cur_top - prev_top > max_gap_in_heights * prev_height:
        return False

    # Accept left-, centre- or right-alignment, whichever fits best.
    left_offset = abs(cur_left - prev_left)
    centre_offset = abs(cur_left + cur_width / 2 - prev_left - prev_width / 2)
    right_offset = abs(cur_left + cur_width - prev_left - prev_width)
    best_offset = min(left_offset, centre_offset, right_offset)
    return not (best_offset > max_offset_in_heights * prev_height)
def process_row(rows):
    """Merge word records into one phrase and its enclosing bounding box.

    Args:
        rows: iterable of records indexed by 'text', 'left', 'top', 'height'
            and 'width' (for example rows of a pandas DataFrame).

    Returns:
        ``(line, (left, top, height, width))`` where ``line`` is the
        concatenation of the texts in the given order and the box encloses
        every word. For empty input the legacy placeholder
        ``("", (9999, 9999, 0, 0))`` is returned.
    """
    rows = list(rows)
    if not rows:
        return ("", (9999, 9999, 0, 0))

    line = "".join(row['text'] for row in rows)

    # Collect the four extremes first and derive height/width afterwards.
    # Measuring each word against the running top/left under-estimates the
    # box whenever a later word sits higher or further left than earlier ones.
    left = min(row['left'] for row in rows)
    top = min(row['top'] for row in rows)
    right = max(row['left'] + row['width'] for row in rows)
    bottom = max(row['top'] + row['height'] for row in rows)

    return (line, (left, top, bottom - top, right - left))
def process_line(df):
    '''
    Split one line of OCR words into phrases that are far from each other.

    @input: a dataframe holding the words of a single line, with the columns
            read by is_next and process_row
            ('text', 'left', 'top', 'height', 'width')
    @output: a list of tuples (phrase, location); empty when df has no rows
    '''
    # len(df) counts every row; df['text'].count() skips NaN texts and would
    # silently leave the trailing rows unprocessed.
    row_count = len(df)
    if row_count == 0:
        return []

    # result will be saved here
    phrases = []

    # words collected for the phrase currently being built
    rows = [df.iloc[0]]
    previous_row = rows[0]

    for indx in range(1, row_count):
        row = df.iloc[indx]
        if is_next(previous_row, row):
            rows.append(row)
        else:
            # gap too large: close the current phrase and start a new one
            phrases.append(process_row(rows))
            rows = [row]
        previous_row = row

    # the last phrase is still open when the loop ends
    phrases.append(process_row(rows))
    return phrases
 
def process_phrases(blocks):
    '''
    Group phrases into paragraphs when they are close to each other.

    @input: a list of lines; each line is a list of (phrase, location) tuples
            with location = (left, top, height, width), for example
            [('納品書', (764, 165, 41, 198))]
            [('御中', (661, 267, 46, 68)), ('納品No.DUA①②③①②④0⑤0⑤A', (977, 269, 41, 616))]
            [('件名:', (145, 426, 65, 90)), ('oo株式会社', (897, 423, 65, 155))]
            [('東京都新宿区新宿①.②③', (899, 529, 65, 306))]
    @output: a list of paragraphs; each paragraph is a list of
             (phrase, location) tuples. Empty when blocks is empty.
    '''
    if not blocks:
        return []

    # every phrase of the first line starts its own paragraph
    paragraphs = [[item] for item in blocks[0]]

    for block in blocks[1:]:
        # Paragraphs opened by this line are collected separately and added
        # afterwards, so phrases of the same line are never matched against
        # a paragraph that this line has just created.
        new_paragraphs = []
        for current_item in block:
            for paragraph in paragraphs:
                # attach to the first paragraph whose last phrase it follows
                if is_next(paragraph[-1], current_item, is_phrase=False):
                    paragraph.append(current_item)
                    break
            else:
                # no paragraph matched (or none exists yet): open a new one
                new_paragraphs.append([current_item])
        paragraphs.extend(new_paragraphs)
    return paragraphs

In [12]:
# Run OCR on the input images and print the recognised text grouped into paragraphs.
input_folder = "data"
image_paths = []
image_paths.extend(glob(os.path.join(input_folder, "*.png")))
image_paths.extend(glob(os.path.join(input_folder, "*.jpg")))
# Debug override: the globbed list is replaced by this single sample image.
image_paths = ["data/nouhin_test_ocr_ImgOutput0.jpg"]
# Figure and grid are only used by the commented-out preview at the end of the loop.
fig=plt.figure(figsize=(100, 100))
rows = len(image_paths)
cols = 2
for i, path in enumerate(image_paths[:1]):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError("cv2.imread could not read image: {}".format(path))
    img = remove_line(img)
    
    # dealing with rotated image. Choose the best image's direction based on tesseract confidence score
    # Only the original orientation is tried for now (range(1)).
    df = None
    conf = -1 
    _img = None
    # `rotation` instead of `i`, so the outer image index is not overwritten.
    for rotation in range(1):
        img = (np.rot90(img) if rotation != 0 else img)
        _df = pytesseract.image_to_data(Image.fromarray(img), lang="jpn", output_type=Output.DATAFRAME)
        _df.columns = _df.columns.str.strip()
        _df = _df.dropna()
        mean_conf = _df['conf'].mean()
        if conf < mean_conf:
            conf = mean_conf
            df = _df
            _img = img.copy()
    
    if df is None:
        # No orientation produced any usable text (mean confidence is NaN or not above -1).
        print("No text recognised in {}".format(path))
        continue

    # Column names were already stripped and NaN rows dropped on the candidate frame above.
    df = df.drop(columns=['level', 'page_num'])
    df_lines = [
        line
        for _, block in df.groupby('block_num')
        for _, line in block.groupby('line_num')
    ]
    phrases = [process_line(df_line) for df_line in df_lines]

    paragraphs = process_phrases(phrases)
    print("####################################")
    print(path)
    print("####################################")
    for paragraph in paragraphs:
        for phrase in paragraph:
            print(phrase[0])
        print()
#     new_img = df_to_image(df, _img)
#     
#     cv2.imwrite(os.path.join(input_folder, os.path.basename(path).split(".")[0] + "_output." + os.path.basename(path).split(".")[1]), new_img)
    
#     fig.add_subplot(rows, cols, i * cols + 1)
#     plt.imshow(_img.copy())
#     fig.add_subplot(rows, cols, i * cols + 2)
#     plt.imshow(new_img.copy())
plt.show()

####################################
data/nouhin_test_ocr_ImgOutput0.jpg
####################################
納品書

御中

納品No.DUA①②③①②④0⑤0⑤A

ご担当:

様

納品日

⑳①⑨/⑨/①⑧

件名:
下記のとおり、納品致します。

oo株式会社
〒
東京都新宿区新宿①.②③
新宿第①ビル②階

納期:

TEL:

支払条件:月末締め翌月末払い

FAX

E-Mail:
担当

合計金額\0(税込)

No.摘要

数量単価

金額

小計
消費税
合計_

'

vo
vo
vo

備考



<Figure size 7200x7200 with 0 Axes>

### Not ready yet: Excel export (work in progress)

In [5]:
def write_to_excel(df, ws):
    """Write each OCR text line of ``df`` into column 1 of ``ws``, top to bottom.

    Args:
        df: DataFrame with 'line_num', 'top', 'left' and 'text' columns. When
            'block_num' and/or 'par_num' are present they are part of the line
            key, so equal line numbers in different blocks or paragraphs are
            kept as separate lines.
        ws: object exposing ``cell(column=..., row=..., value=...)``.

    Returns:
        The same ``ws`` object.
    """
    line_keys = [key for key in ('block_num', 'par_num') if key in df.columns]
    line_keys.append('line_num')
    grouper = line_keys[0] if len(line_keys) == 1 else line_keys

    # Order the words of each line by their left edge only. Sorting the whole
    # frame by (top, left) scrambles a line because the tops of neighbouring
    # words differ by a few pixels.
    lines = [
        line_df.sort_values(by='left', kind='mergesort')
        for _, line_df in df.groupby(grouper, sort=False)
    ]
    # Lines go into the sheet from the top of the page to the bottom.
    lines.sort(key=lambda line_df: line_df['top'].min())

    for row_number, line_df in enumerate(lines, start=1):
        ws.cell(column=1, row=row_number, value="".join(line_df['text'].tolist()))
    return ws

In [5]:
# Write one entry of `block_df` to a new workbook and save it as hoge.xlsx.
# NOTE(review): `block_df` is not defined in any cell visible here; on a fresh
# kernel this cell raises NameError unless it is created elsewhere - confirm.
wb = Workbook()
ws = wb.active
ws.title = "Hoge"
ws = write_to_excel(block_df[3], ws)
wb.save(filename="hoge.xlsx")

In [61]:
# Debug: list the distinct OCR tokens. Relies on `df` left over from an earlier cell.
print(df['text'].unique())

['請' '求' '書' ' ' '江' '阿' '年' '|' '昌' '〒①②-⑤⑥⑦' '楠' 'COOmoomoomni②③'
 'bSLiiSeee' '下' '記' 'の' '通' 'り' '佳' '睦' '田' 'し' 'よ' 'げ' 'ま' 'す' '。' '代'
 '表' '森' 'R' 'OOO' '〇' '①' 'W' '絵' '・' '嘆' '御' '金' '類' '節' '⑯.000-' '円'
 'ぉ' 'kimmis' '⑳⑦' '月' '⑨' 'Co' '②' '侵' '①0000' '⑳00' 'FRNdAitAUAE' 'て'
 'AWun' 'た' '、' '渚' '⑳' 'o' 'ama' 'を' ',' '〉' '」' 'C' 'ら' 'RR' 'COke' 'Wm'
 '①Ne⑦' 'DL' '和' '君' '肋' '⑳⑤.000']


In [63]:
# Debug: show the encoded bytes of a one-character whitespace string
# (presumably to identify the blank token in the OCR output above - confirm).
print(" ".encode())

b' '
