In [75]:
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from src.config import *
from src.utilities_upgrade import *

# Load one scanned page, rotate it slightly, crop it and resize it to the working height.
table_image_path = "Data/ky_1000gd_maumoi/ky_1000gd_maumoi_page_10.jpg"
table_image = cv2.imread(table_image_path)
if table_image is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, which would otherwise surface as an AttributeError on `.shape`.
    raise FileNotFoundError(f"Could not read image: {table_image_path}")

height, width = table_image.shape[:2]
center = (width/2, height/2)
# Rotate by -1 degree around the image centre, keeping the original canvas size.
rotate_matrix = cv2.getRotationMatrix2D(center=center, angle=-1, scale=1)
table_image = cv2.warpAffine(src=table_image, M=rotate_matrix, dsize=(width, height))
# Drop a 110-pixel margin at the top/left; the 5660 upper bounds are hard-coded
# (NumPy slicing silently clamps them to the actual image size).
table_image = table_image[110:5660, 110:5660]
# Resize_Image_Keep_Ratio and IMG_HEIGHT are not defined in this notebook —
# presumably they come from the star-imports of src.utilities_upgrade / src.config (confirm).
table_image = Resize_Image_Keep_Ratio(table_image, height=IMG_HEIGHT)

In [76]:
# Tell pytesseract which Tesseract executable to use. PATH_TESSERACT_EXE is not
# defined in this notebook — presumably it comes from `from src.config import *` (confirm).
# NOTE(review): pytesseract is not called in any of the cells visible here.
import pytesseract
pytesseract.pytesseract.tesseract_cmd = PATH_TESSERACT_EXE

In [77]:
# Sanity check: (height, width, channels) of the page after crop and resize.
table_image.shape

(720, 992, 3)

In [78]:
def preprocess(img, factor: int):
    """Sharpen an image and, when it looks flat, boost its contrast as well.

    Args:
        img: Image array. It is converted with ``cv2.COLOR_BGR2GRAY``, so a
            3-channel BGR array is expected.
        factor: Enhancement factor handed to both PIL enhancers.

    Returns:
        The enhanced image as a NumPy array.
    """
    # A grayscale standard deviation below 30 is the "low contrast" signal.
    low_contrast = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).std() < 30

    enhanced = ImageEnhance.Sharpness(Image.fromarray(img)).enhance(factor)
    if low_contrast:
        enhanced = ImageEnhance.Contrast(enhanced).enhance(factor)
    return np.array(enhanced)

# Binarise the page so that dark pixels (ink, ruling lines) become white (255)
# on a black background, which is what the morphology steps below operate on.
gray = cv2.cvtColor(table_image, cv2.COLOR_BGR2GRAY)
# With THRESH_OTSU the threshold is picked automatically: the 128 passed here is
# ignored and `thresh` receives the value Otsu selected. THRESH_BINARY_INV yields
# the inverted mask directly (same result as THRESH_BINARY followed by 255 - img).
thresh, img_bin = cv2.threshold(
    gray, 128, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

cv2.imshow('img_bin', img_bin)
cv2.waitKey(0)
# Close the preview window once a key was pressed; otherwise it is left dangling
# (and can freeze the kernel's GUI thread).
cv2.destroyAllWindows()

-1

In [79]:
def group_h_lines(h_lines, thin_thresh, pad_factor=5):
    """Merge near-duplicate horizontal segments into one line per row.

    Segments are processed from the smallest ``y1`` upwards. Every segment whose
    ``y1`` lies within ``thin_thresh`` of the current smallest one is merged
    into a single line that sits at that smallest ``y1`` and spans the union of
    the group's x-extent, padded on both sides.

    Args:
        h_lines: Iterable of segments in ``cv2.HoughLinesP`` layout, i.e. each
            item is ``[[x1, y1, x2, y2]]``. ``None`` (what ``HoughLinesP``
            returns when nothing is detected) is treated as "no segments".
        thin_thresh: Maximum distance in ``y1`` for two segments to be grouped.
        pad_factor: The merged line is extended by ``int(pad_factor * thin_thresh)``
            on each end. Defaults to 5, the previously hard-coded value.

    Returns:
        List of ``[x_min, y, x_max, y]`` lines ordered by increasing ``y``.
        Padded ends may fall outside the image (e.g. negative ``x_min``).
    """
    if h_lines is None:
        return []

    remaining = list(h_lines)
    padding = int(pad_factor * thin_thresh)
    merged = []
    while remaining:
        # The top-most remaining segment anchors the next group.
        anchor_y = min(remaining, key=lambda seg: seg[0][1])[0][1]
        low, high = anchor_y - thin_thresh, anchor_y + thin_thresh

        # Split the remaining segments into this group and everything else.
        group, rest = [], []
        for seg in remaining:
            (group if low <= seg[0][1] <= high else rest).append(seg)
        remaining = rest

        xs = [seg[0][idx] for seg in group for idx in (0, 2)]
        merged.append([min(xs) - padding, anchor_y, max(xs) + padding, anchor_y])
    return merged

# Extract the horizontal ruling lines of the table.
# Kernel length is 1/120 of the image width.
kernel_len = gray.shape[1]//120
# A kernel_len x 1 rectangle: eroding and then dilating with it (5 iterations each)
# is a morphological opening, meant to keep only long horizontal runs of white pixels.
hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
image_horizontal = cv2.erode(img_bin, hor_kernel, iterations=5)
horizontal_lines = cv2.dilate(image_horizontal, hor_kernel, iterations=5)

# Probabilistic Hough transform: rho = 1 px, theta = 1 degree, 30 votes minimum,
# gaps of up to 250 px are bridged into one segment.
# NOTE(review): cv2.HoughLinesP returns None when it finds no segments — make sure
# group_h_lines can cope with that.
h_lines = cv2.HoughLinesP(
    horizontal_lines, 1, np.pi/180, 30, maxLineGap=250)

# Collapse segments whose y differs by at most kernel_len into one line each.
new_horizontal_lines = group_h_lines(h_lines, kernel_len)

In [80]:
# Debug view: the merged horizontal lines drawn in white on a black canvas.
# The canvas is uint8 like a regular image; the default float64 canvas only
# displayed correctly because cv2.imshow scales float images by 255 and saturates.
h_image = np.zeros(table_image.shape, dtype=np.uint8)
for x1, y1, x2, y2 in new_horizontal_lines:
    h_image = cv2.line(h_image, (x1, y1), (x2, y2), (255, 255, 255), 3)

cv2.imshow('h_image', h_image)
cv2.waitKey(0)
# Close the preview window after the key press instead of leaving it open.
cv2.destroyAllWindows()

-1

In [81]:
def group_v_lines(v_lines, thin_thresh, pad_factor=4):
    """Merge near-duplicate vertical segments into one line per column.

    Segments are processed from the smallest ``x1`` upwards. Every segment whose
    ``x1`` lies within ``thin_thresh`` of the current smallest one is merged
    into a single line that sits at that smallest ``x1`` and spans the union of
    the group's y-extent, padded on both sides.

    Args:
        v_lines: Iterable of segments in ``cv2.HoughLinesP`` layout, i.e. each
            item is ``[[x1, y1, x2, y2]]``. ``None`` (what ``HoughLinesP``
            returns when nothing is detected) is treated as "no segments".
        thin_thresh: Maximum distance in ``x1`` for two segments to be grouped.
        pad_factor: The merged line is extended by ``int(pad_factor * thin_thresh)``
            on each end. Defaults to 4, the previously hard-coded value.

    Returns:
        List of ``[x, y_min, x, y_max]`` lines ordered by increasing ``x``.
        Padded ends may fall outside the image (e.g. negative ``y_min``).
    """
    if v_lines is None:
        return []

    remaining = list(v_lines)
    padding = int(pad_factor * thin_thresh)
    merged = []
    while remaining:
        # The left-most remaining segment anchors the next group.
        anchor_x = min(remaining, key=lambda seg: seg[0][0])[0][0]
        low, high = anchor_x - thin_thresh, anchor_x + thin_thresh

        # Split the remaining segments into this group and everything else.
        group, rest = [], []
        for seg in remaining:
            (group if low <= seg[0][0] <= high else rest).append(seg)
        remaining = rest

        ys = [seg[0][idx] for seg in group for idx in (1, 3)]
        merged.append([anchor_x, min(ys) - padding, anchor_x, max(ys) + padding])
    return merged

# Extract the vertical ruling lines of the table (mirror of the horizontal step).
# Kernel length is 1/120 of the image width.
kernel_len = gray.shape[1]//120
# A 1 x kernel_len rectangle: eroding and then dilating with it (5 iterations each)
# is a morphological opening, meant to keep only long vertical runs of white pixels.
ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
image_vertical = cv2.erode(img_bin, ver_kernel, iterations=5)
vertical_lines = cv2.dilate(image_vertical, ver_kernel, iterations=5)

# Probabilistic Hough transform: rho = 1 px, theta = 1 degree, 30 votes minimum,
# gaps of up to 250 px are bridged into one segment.
# NOTE(review): cv2.HoughLinesP returns None when it finds no segments — make sure
# group_v_lines can cope with that.
v_lines = cv2.HoughLinesP(vertical_lines, 1, np.pi/180, 30, maxLineGap=250)
# Collapse segments whose x differs by at most kernel_len into one line each.
new_vertical_lines = group_v_lines(v_lines, kernel_len)
# Number of gaps between consecutive vertical lines (presumably the column count).
print(len(new_vertical_lines) - 1)

9


In [82]:
# Debug view: the merged vertical lines drawn in white on a black canvas.
# The canvas is uint8 like a regular image; the default float64 canvas only
# displayed correctly because cv2.imshow scales float images by 255 and saturates.
v_image = np.zeros(table_image.shape, dtype=np.uint8)
for x1, y1, x2, y2 in new_vertical_lines:
    v_image = cv2.line(v_image, (x1, y1), (x2, y2), (255, 255, 255), 3)

cv2.imshow('v_image', v_image)
cv2.waitKey(0)
# Close the preview window after the key press instead of leaving it open.
cv2.destroyAllWindows()

-1

In [83]:
def seg_intersect(line1: list, line2: list):
    """Return the crossing point of the infinite lines through two segments.

    Args:
        line1: Pair ``[a1, a2]`` of 2-element NumPy arrays (segment end points).
        line2: Pair ``[b1, b2]`` of 2-element NumPy arrays (segment end points).

    Returns:
        NumPy float array ``[x, y]``. No bounds check is done, so the point may
        lie outside either segment. Parallel lines give a zero denominator,
        which is not handled here.
    """
    start_a, end_a = line1
    start_b, end_b = line2
    dir_a = end_a - start_a
    dir_b = end_b - start_b
    offset = start_a - start_b

    # dir_a rotated by 90 degrees, (x, y) -> (-y, x), keeping dir_a's dtype.
    normal_a = np.empty_like(dir_a)
    normal_a[0], normal_a[1] = -dir_a[1], dir_a[0]

    denominator = np.dot(normal_a, dir_b)
    numerator = np.dot(normal_a, offset)
    # Parameter along line2 at which it meets line1.
    t = numerator / denominator.astype(float)
    return t * dir_b + start_b

# Collect the grid corners: every place where a merged horizontal line crosses
# a merged vertical line inside the extent of both.
points = []
for hline in new_horizontal_lines:
    x1A, y1A, x2A, y2A = hline
    # The horizontal line is the same for the whole inner loop, so build it once.
    line1 = [np.array([x1A, y1A]), np.array([x2A, y2A])]
    for vline in new_vertical_lines:
        x1B, y1B, x2B, y2B = vline
        line2 = [np.array([x1B, y1B]), np.array([x2B, y2B])]

        x, y = seg_intersect(line1, line2)
        if x1A <= x <= x2A and y1B <= y <= y2B:
            # Round instead of truncating: the intersection is computed in floating
            # point, so a corner at y = 5 can come back as 4.999999..., and int()
            # would turn that into 4. The cell search later matches corners by exact
            # equality, so an off-by-one coordinate silently loses cells.
            points.append([int(round(x)), int(round(y))])


In [84]:
# Debug view: merged lines in white plus the detected grid corners as red dots.
# The canvas is uint8 like a regular image; the default float64 canvas only
# displayed correctly because cv2.imshow scales float images by 255 and saturates.
p_image = np.zeros(table_image.shape, dtype=np.uint8)
for x1, y1, x2, y2 in new_vertical_lines:
    p_image = cv2.line(p_image, (x1, y1), (x2, y2), (255, 255, 255), 3)
for x1, y1, x2, y2 in new_horizontal_lines:
    p_image = cv2.line(p_image, (x1, y1), (x2, y2), (255, 255, 255), 3)
for x, y in points:
    # (0, 0, 255) is red in OpenCV's BGR order; thickness=-1 fills the circle.
    p_image = cv2.circle(p_image, (x, y), radius=5, color=(0, 0, 255), thickness=-1)

cv2.imshow('p_image', p_image)
cv2.waitKey(0)
# Close the preview window after the key press instead of leaving it open.
cv2.destroyAllWindows()

-1

In [85]:
def get_bottom_right(right_points, bottom_points, points):
    """Find the bottom-right corner that closes a cell.

    Candidates are tried with ``right_points`` as the outer order and
    ``bottom_points`` as the inner order; the first combination whose corner
    ``[right_x, bottom_y]`` is present in ``points`` wins.

    Args:
        right_points: Candidate points to the right; only their x is used.
        bottom_points: Candidate points below; only their y is used.
        points: All known grid corners as ``[x, y]`` lists.

    Returns:
        ``(right_x, bottom_y)`` of the first matching corner, or
        ``(None, None)`` when no combination exists in ``points``.
    """
    candidates = (
        (right_pt[0], bottom_pt[1])
        for right_pt in right_points
        for bottom_pt in bottom_points
        if [right_pt[0], bottom_pt[1]] in points
    )
    return next(candidates, (None, None))

# Build the table cells: each grid corner is tried as a top-left corner, and the
# nearest matching bottom-right corner (if any) closes the cell.
cells = []
# Crop from an untouched snapshot: rectangles are drawn onto table_image inside
# the loop, and cropping from table_image itself would make every later crop
# contain the red borders of the cells processed before it.
clean_image = table_image.copy()
for point in points:
    left, top = point
    # Corners on the same row to the right, nearest first.
    right_points = sorted(
        [p for p in points if p[0] > left and p[1] == top], key=lambda x: x[0])
    # Corners in the same column below, nearest first.
    bottom_points = sorted(
        [p for p in points if p[1] > top and p[0] == left], key=lambda x: x[1])

    right, bottom = get_bottom_right(
        right_points, bottom_points, points)
    # get_bottom_right signals "no cell" with (None, None); test for None
    # explicitly rather than relying on truthiness, which also rejects 0.
    if right is not None and bottom is not None:
        cropped_image = clean_image[top:bottom, left:right]
        # Shows each cell in turn; a key press advances to the next one.
        cv2.imshow('f_image', cropped_image)
        cv2.waitKey(0)
        cv2.rectangle(table_image, (left, top), (right, bottom), (0, 0, 255), 2)
        cells.append([left, top, right, bottom])
# Close the preview window once all cells have been shown.
cv2.destroyAllWindows()

In [74]:
# Final overview: outline every detected cell in red on a copy of the page, so
# table_image itself is not modified by this cell.
f_image = table_image.copy()

for x1, y1, x2, y2 in cells:
    f_image = cv2.rectangle(f_image, (x1, y1), (x2, y2), (0, 0, 255), 1)

cv2.imshow('f_image', f_image)
cv2.waitKey(0)
# Close the preview window after the key press instead of leaving it open.
cv2.destroyAllWindows()

-1