In [4]:
import re
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from paddleocr import PaddleOCR
from skimage.feature import match_template

In [2]:
def non_max_suppression(boxes, scores, threshold):
    """Greedily keep the best-scoring boxes and drop the ones overlapping them.

    Boxes are visited from highest to lowest score. Each visited box is kept,
    and every remaining box whose IoU with it is not strictly below
    ``threshold`` is discarded before the next round.

    Args:
        boxes: NumPy array with one box per row; the columns are read as
            ``x1, y1, x2, y2`` by ``calculate_iou``.
        scores: NumPy array holding one score per row of ``boxes``.
        threshold: IoU limit; only boxes with an IoU below it survive a round.

    Returns:
        A list with the kept box rows, ordered from highest to lowest score.
    """
    # Rank everything once, best score first.
    ranking = np.argsort(scores)[::-1]
    boxes, scores = boxes[ranking], scores[ranking]

    kept = []
    while len(boxes) > 0:
        # The head of the ranking is always accepted.
        best = boxes[0]
        kept.append(best)

        # Compare the accepted box against everything that is still pending.
        tail_boxes, tail_scores = boxes[1:], scores[1:]
        low_overlap = calculate_iou(best, tail_boxes) < threshold

        # Only sufficiently distinct boxes move on to the next round.
        boxes, scores = tail_boxes[low_overlap], tail_scores[low_overlap]

    return kept

def calculate_iou(box, boxes):
    """Compute the IoU of one box against every row of ``boxes``.

    Widths and heights are measured as ``max - min + 1``, i.e. both corner
    coordinates are counted as part of the box.

    Args:
        box: Sequence whose first four items are ``x1, y1, x2, y2``.
        boxes: 2-D NumPy array whose first four columns are ``x1, y1, x2, y2``.

    Returns:
        A 1-D array with one IoU value per row of ``boxes``.
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    box_area = (right - left + 1) * (bottom - top + 1)

    lefts, tops, rights, bottoms = (boxes[:, col] for col in range(4))
    other_areas = (rights - lefts + 1) * (bottoms - tops + 1)

    # Extent of the overlap rectangle; clamped to zero when the boxes are disjoint.
    overlap_w = np.maximum(0, np.minimum(right, rights) - np.maximum(left, lefts) + 1)
    overlap_h = np.maximum(0, np.minimum(bottom, bottoms) - np.maximum(top, tops) + 1)
    overlap = overlap_w * overlap_h

    # Union = both areas minus the part counted twice.
    return overlap / (box_area + other_areas - overlap)

def crop_image(image, start_x, start_y, end_x, end_y):
    """Return the part of ``image`` between the given pixel coordinates.

    Args:
        image: Array indexed as ``image[row, column]``.
        start_x, end_x: Column range (``end_x`` excluded).
        start_y, end_y: Row range (``end_y`` excluded).

    Returns:
        The slice ``image[start_y:end_y, start_x:end_x]``.
    """
    row_span = slice(start_y, end_y)
    col_span = slice(start_x, end_x)
    return image[row_span, col_span]

def is_date(txt):
    """Tell whether ``txt`` contains a ``YYYY-MM-DD``-shaped token.

    Only the digit layout is checked (4-2-2 digits separated by hyphens and
    delimited by word boundaries); the value is not validated as a real date.

    Returns:
        True if at least one such token occurs anywhere in ``txt``, else False.
    """
    return re.search(r"\b\d{4}-\d{2}-\d{2}\b", txt) is not None

def is_time(txt):
    """Tell whether ``txt`` contains an ``HH:MM:SS``-shaped token.

    Only the digit layout is checked (three two-digit groups separated by
    colons and delimited by word boundaries); ranges are not validated.

    Returns:
        True if at least one such token occurs anywhere in ``txt``, else False.
    """
    return re.search(r"\b\d{2}:\d{2}:\d{2}\b", txt) is not None

def is_number(txt):
    """Tell whether ``txt`` contains a numeric token.

    The pattern accepts digits with optional ``.`` thousands groups and an
    optional ``,`` followed by one or two decimals (e.g. ``1.000,00``), and
    also any plain run of digits delimited by word boundaries.

    Returns:
        True if at least one such token occurs anywhere in ``txt``, else False.
    """
    return re.search(r"\b(?:\d{1,3}(?:\.\d{3})*|\d+)(?:,\d{1,2})?\b", txt) is not None

In [3]:
# Minimum template-matching response for a location to count as a candidate;
# applied as `result >= threshold` on the match_template output.
threshold = 0.2
# IoU threshold passed to non_max_suppression when pruning overlapping candidates.
threshold_nms = 0.5
# Shared OCR engine, called later as `ocr.ocr(image, cls=True)`.
# NOTE(review): presumably use_angle_cls enables the text-angle classifier and
# lang='en' selects the English model — confirm against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

In [16]:
# Collects item [1][0] of every entry in array_ls[0]; the recorded output shows
# these items are the recognised text strings.
# NOTE(review): `array_ls` is not assigned in any cell visible in this notebook, so
# this cell relies on leftover kernel state — confirm where it is defined.
[line[1][0] for line in array_ls[0]]

['Riwayat',
 'P',
 'Uraian',
 'Tipe',
 'Nominal',
 'Saldo Akhir',
 'TRANSFER KE BIAYA ADMIN',
 'D',
 '1.000,00',
 '5.033.061,00',
 '(GOPAY CUSTNO',
 ':P51675321714966',
 '2023-02-02',
 '14:08:39',
 'TRANSFER KE GOPAY CUST',
 'D',
 '100.000,00',
 '5.034.061,00',
 'P51675321714966',
 '2023-02-02',
 '14:08:39',
 'TRANSFER KE Sdr NABIH',
 'D',
 '4.000.000,00',
 '5.134.061,00',
 'HABIBI NOOR',
 '2023-02-02',
 '10:32:25',
 'BYTRX BIFAST',
 'D',
 '2.500,00',
 '9.134.061,00',
 '2023-02-02',
 '00:00:00',
 'TRF/PAY/TOP-UP',
 'D',
 '74.000,00',
 '9.136.561,00',
 'ECHANNEL KARTU',
 '0000000000000000BIZID',
 '20230202BNINIDJA010',
 'Q02079583762331562337',
 '2023-02-02',
 '09:39:59',
 '1/8',
 'I >',
 'Download']

In [None]:
def _split_transactions(tokens):
    """Group OCR tokens into one list per transaction.

    A group is closed at a date token. If the token right after the date is a
    time, it is kept in the same group, because the OCR output lists the time
    immediately after the date of the same row. Tokens after the last date
    (page footer and the like) do not form a group.
    """
    groups, current = [], []
    skip_next = False
    for idx, token in enumerate(tokens):
        if skip_next:
            # Already consumed as the time of the previous group.
            skip_next = False
            continue
        current.append(token)
        if is_date(token):
            following = tokens[idx + 1] if idx + 1 < len(tokens) else ""
            if is_time(following):
                current.append(following)
                skip_next = True
            groups.append(current)
            current = []
    return groups


def _classify_tokens(tokens):
    """Sort the tokens of one transaction into codes, amounts, times, dates and words."""
    codes, nominals, times, dates, words = [], [], [], [], []
    for token in tokens:
        if token in ("K", "D"):
            # Single-letter transaction type marker.
            codes.append(token)
        elif "." in token or "," in token:
            # Amounts are recognised by their thousands/decimal separators.
            nominals.append(token)
        elif is_time(token):
            times.append(token)
        elif is_date(token):
            dates.append(token)
        else:
            words.append(token)
    return codes, nominals, times, dates, words


# Load the screenshot and the template, then locate the template in the screenshot.
image_rgb = np.array(Image.open("./image/image.jpeg").convert("RGB"))
template_rgb = np.array(Image.open("./image/template.jpeg").convert("RGB"))
image_gray = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
template_gray = cv2.cvtColor(template_rgb, cv2.COLOR_RGB2GRAY)
result = match_template(image_gray, template_gray)

# Every location whose response reaches the threshold becomes a candidate box
# (x1, y1, x2, y2) with the size of the template.
loc = np.where(result >= threshold)
boxes = np.column_stack((loc[1], loc[0], loc[1] + template_gray.shape[1], loc[0] + template_gray.shape[0]))
selected_boxes = non_max_suppression(boxes, result[loc], threshold_nms)

# Accumulators are created before branching so the table can always be built,
# including when the template is not found.
array = []
data = []
arr_desc = []
arr_type = []
arr_date = []
arr_time = []
arr_cash = []
arr_saldo_akhir = []

if len(selected_boxes) == 0:
    # No template match: nothing to read, the table stays empty.
    print("BUDI")
else:
    # Best match first; keep everything below/right of its top-left corner.
    x1, y1, x2, y2 = selected_boxes[0]
    image_rgb = image_rgb[y1:, x1:]

    # NOTE(review): some PaddleOCR versions return None for a page without
    # text, hence the fallback to an empty list — confirm for the installed version.
    ocr_lines = ocr.ocr(image_rgb, cls=True)[0] or []
    array = [line[1][0] for line in ocr_lines]

    data = _split_transactions(array)
    for tokens in data:
        codes, nominals, times, dates, words = _classify_tokens(tokens)
        arr_desc.append(' '.join(words))
        arr_type.append(codes)
        arr_date.append(dates)
        arr_time.append(times)
        # First amount is the transaction value, second the closing balance;
        # None marks an amount the OCR did not deliver.
        arr_cash.append(nominals[0] if len(nominals) > 0 else None)
        arr_saldo_akhir.append(nominals[1] if len(nominals) > 1 else None)

table = pd.DataFrame({
    'Keterangan': arr_desc,
    'Tipe': arr_type,
    'Date': arr_date,
    'Time': arr_time,
    'Nominal': arr_cash,
    'Saldo Akhir': arr_saldo_akhir
})

In [14]:
# NOTE(review): `array_ls[]` has no index expression and is not valid Python as
# written; the recorded output is an IndexError, so an index was presumably lost
# in export. Leftover scratch cell — confirm the intent or remove it.
array_ls[]

IndexError: list index out of range