In [1]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
import matplotlib.pyplot as plt
import cv2
from craft_text_detector import (
    read_image,
    load_craftnet_model,
    load_refinenet_model,
    get_prediction,
    export_detected_regions,
    export_extra_results,
    empty_cuda_cache
)
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from tqdm import tqdm

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-printed')

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-printed and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
refine_net = load_refinenet_model(cuda=True)
craft_net = load_craftnet_model(cuda=True)

In [3]:
def rectify_poly(img, poly):
    # Use Affine transform
    n = int(len(poly) / 2) - 1
    width = 0
    height = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        width += int(
            (np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2
        )
        height += np.linalg.norm(box[1] - box[2])
    width = int(width)
    height = int(height / n)

    output_img = np.zeros((height, width, 3), dtype=np.uint8)
    width_step = 0
    for k in range(n):
        box = np.float32([poly[k], poly[k + 1], poly[-k - 2], poly[-k - 1]])
        w = int((np.linalg.norm(box[0] - box[1]) + np.linalg.norm(box[2] - box[3])) / 2)

        # Top triangle
        pts1 = box[:3]
        pts2 = np.float32(
            [[width_step, 0], [width_step + w - 1, 0], [width_step + w - 1, height - 1]]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        # Bottom triangle
        pts1 = np.vstack((box[0], box[2:]))
        pts2 = np.float32(
            [
                [width_step, 0],
                [width_step + w - 1, height - 1],
                [width_step, height - 1],
            ]
        )
        M = cv2.getAffineTransform(pts1, pts2)
        warped_img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REPLICATE
        )
        warped_mask = np.zeros((height, width, 3), dtype=np.uint8)
        warped_mask = cv2.fillConvexPoly(warped_mask, np.int32(pts2), (1, 1, 1))
        cv2.line(
            warped_mask, (width_step, 0), (width_step + w - 1, height - 1), (0, 0, 0), 1
        )
        output_img[warped_mask == 1] = warped_img[warped_mask == 1]

        width_step += w
    return output_img

In [5]:
data_image_test_new_path = "../data_GNN/data_image_test_new"
data_csv_test_new_path = "../data_GNN/data_csv_test_new" 

list_images = os.listdir(data_image_test_new_path)
for image_name in tqdm(sorted(list_images)):
    image_path = os.path.join(data_image_test_new_path, image_name)
    csv_write_path = os.path.join(data_csv_test_new_path, image_name[:-4] + ".csv")
    df_write = pd.DataFrame(columns=["xmin", "ymin", "xmax", "ymax", "Object", "label"])
    image = read_image(image_path)
    prediction_result = get_prediction(image=image,
                                        craft_net=craft_net,
                                        refine_net=refine_net,
                                        text_threshold=0.7,
                                        link_threshold=0.4,
                                        low_text=0.4,
                                        cuda=True,
                                        long_size=1280)
    for box in prediction_result["boxes"]:
        # get x, y, w, h
        coor_a,coor_b,coor_c,coor_d = box
        xmin = int(max(coor_a[0], coor_d[0]))
        xmax = int(min(coor_b[0], coor_c[0]))
        ymin = int(max(coor_a[1], coor_b[1]))
        ymax = int(min(coor_c[1], coor_d[1]))


        box_text = rectify_poly(image.copy(), box)
        image_img = Image.fromarray(box_text)
        pixel_values = processor(image_img, return_tensors="pt").pixel_values 
        generated_ids = model.generate(pixel_values, max_length = 100)
        generated_text_pre = processor.batch_decode(generated_ids, skip_special_tokens=True)
        # append to dataframe
        df_write.loc[len(df_write)] = [xmin, ymin, xmax, ymax, generated_text_pre[0], ""]

        # print(generated_text_pre)
    # save to csv
    df_write.to_csv(csv_write_path, index=False)
    # break


  0%|          | 0/184 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`: