In [1]:
import ast
import json

import cv2
import pandas as pd
import torch
from PIL import Image, ImageDraw
from transformers import AutoModel, AutoModelForTokenClassification, AutoProcessor

In [2]:
# Load the fine-tuned token-classification checkpoint and its processor.
# apply_ocr=True is requested because the next cell hands the processor only
# an image (no words or boxes).
checkpoint_name = "nassimb0u/chart-text-role-classification-model"
model = AutoModelForTokenClassification.from_pretrained(checkpoint_name)
processor = AutoProcessor.from_pretrained(checkpoint_name, apply_ocr=True)

In [18]:
# Read the chart as RGB and let the processor build the model inputs from it.
chart_path = "PMC1618809___4.jpg"
image = Image.open(chart_path).convert("RGB")
encoding = processor(image, return_tensors="pt")

# List which tensors the processor produced.
encoding_keys = encoding.keys()
print(encoding_keys)

dict_keys(['input_ids', 'attention_mask', 'bbox', 'pixel_values'])


In [4]:
# Show the per-token bounding boxes of the first sample in the batch.
first_sample_boxes = encoding["bbox"][0]
print(first_sample_boxes)

tensor([[   0,    0,    0,    0],
        [   4,  471,   30,  601],
        [   4,  240,   30,  459],
        [  60,    4,  125,   32],
        [  60,    4,  125,   32],
        [  60,    4,  125,   32],
        [  60,  146,  155,  173],
        [  60,  146,  155,  173],
        [ 138,  116,  160,  196],
        [  60,  270,  155,  315],
        [  60,  270,  155,  315],
        [  60,  270,  155,  315],
        [ 142,  260,  162,  338],
        [  60,  411,  155,  457],
        [  60,  411,  155,  457],
        [  60,  574,  125,  601],
        [  60,  574,  125,  601],
        [  60,  574,  125,  601],
        [  60,  686,  123,  798],
        [  60,  686,  123,  798],
        [  60,  686,  123,  798],
        [ 140,  695,  160,  789],
        [  60,  858,  125,  885],
        [  60,  858,  125,  885],
        [  60,  858,  125,  885],
        [ 406,  906,  441,  933],
        [ 547,  906,  720,  933],
        [ 547,  906,  720,  933],
        [ 547,  906,  720,  933],
        [ 335,

In [8]:
# Run the classifier on the current encoding and turn each token's arg-max
# class id into its label name.
# This is inference only, so autograd is disabled to avoid building a graph
# that is never used.
with torch.no_grad():
    outputs = model(**encoding)
predictions = outputs.logits.argmax(-1)
print(len(predictions))  # size of the batch dimension
labels = [model.config.id2label[class_id] for class_id in predictions[0].tolist()]
print(labels)
print(len(labels))  # one label per position of the first sample



1
['TICK_LABEL', 'AXIS_TITLE', 'AXIS_TITLE', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL', 'AXIS_TITLE', 'AXIS_TITLE', 'LEGEND_LABEL', 'AXIS_TITLE', 'AXIS_TITLE', 'OTHER', 'AXIS_TITLE', 'OTHER', 'TICK_LABEL', 'TICK_LABEL', 'TICK_LABEL']
40


In [9]:
# Display the raw model output object produced by the preceding inference cell.
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[-8.6732e-01, -1.3840e+00,  1.6726e-01, -3.4561e-01,  1.6711e+00,
          -6.1787e-01,  8.5743e-02, -2.5648e-01,  3.5146e-01],
         [-1.4438e+00, -1.1077e+00, -5.3403e-01,  1.1121e+01, -1.4786e+00,
          -1.1999e+00, -1.3696e+00, -1.2927e+00, -1.4122e+00],
         [-1.4850e+00, -1.1519e+00, -5.4889e-01,  1.0999e+01, -1.4063e+00,
          -1.1243e+00, -1.4132e+00, -1.2618e+00, -1.2905e+00],
         [-1.8632e+00, -1.9006e+00, -1.5643e+00, -1.4714e+00,  1.2035e+01,
          -1.8848e+00, -1.6300e+00, -2.1249e+00, -1.4960e+00],
         [-1.7252e+00, -1.9428e+00, -1.5070e+00, -1.4451e+00,  1.2022e+01,
          -1.8114e+00, -1.5993e+00, -2.2243e+00, -1.5517e+00],
         [-1.8038e+00, -1.8307e+00, -1.5169e+00, -1.3335e+00,  1.2021e+01,
          -1.8810e+00, -1.6059e+00, -2.1396e+00, -1.5216e+00],
         [-1.7618e+00, -1.8407e+00, -1.7034e+00, -1.6079e+00,  1.2068e+01,
          -2.0256e+00, -1.3378e+00, -2.0045e+00, -1.4702e

In [19]:
# Overlay every predicted token label on a copy of the chart and save it.
# The boxes in `encoding["bbox"]` are expressed on a 0-1000 normalised grid,
# so they are scaled back to pixel coordinates before drawing.
annotated = image.copy()  # leave `image` untouched so the cell can be re-run
draw = ImageDraw.Draw(annotated)
img_w, img_h = annotated.size

bbox = encoding["bbox"][0]

for b, label in zip(bbox, labels):
    x0, y0, x1, y1 = b.tolist()
    # All-zero boxes carry no position; there is nothing to draw for them.
    if (x0, y0, x1, y1) == (0, 0, 0, 0):
        continue
    left, right = round(x0 * img_w / 1000), round(x1 * img_w / 1000)
    top, bottom = round(y0 * img_h / 1000), round(y1 * img_h / 1000)
    draw.rectangle([left, top, right, bottom], outline="red", width=2)
    # Clamp so labels of boxes touching the top edge stay on the canvas.
    draw.text((left, max(0, top - 10)), label, fill="red")

annotated.save("output_with_bboxes.jpg")

In [20]:
# Same overlay as the PIL cell, rendered with OpenCV.
img = cv2.imread("PMC1618809___4.jpg")
# cv2.imread signals a missing/unreadable file by returning None, not raising.
if img is None:
    raise FileNotFoundError("PMC1618809___4.jpg")
img_h, img_w = img.shape[:2]

for b, label in zip(bbox, labels):
    x0, y0, x1, y1 = map(int, b.tolist())
    # Skip zero boxes if needed
    if (x0, y0, x1, y1) == (0, 0, 0, 0):
        continue
    # The boxes live on a 0-1000 normalised grid; convert them to pixels.
    left, right = round(x0 * img_w / 1000), round(x1 * img_w / 1000)
    top, bottom = round(y0 * img_h / 1000), round(y1 * img_h / 1000)
    # Draw rectangle
    cv2.rectangle(img, (left, top), (right, bottom), color=(0, 0, 255), thickness=2)
    # Draw label just above the box, clamped to the top of the image
    cv2.putText(
        img,
        label,
        (left, max(0, top - 10)),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        (0, 0, 255),
        1,
        cv2.LINE_AA,
    )

cv2.imwrite("output2_with_bboxes.jpg", img)

True

In [22]:
# Inspect the token ids held by the current encoding and its container type.
print(encoding["input_ids"])
print(type(encoding))

# Hard-coded id sequence (it is not read from `encoding`), decoded back to
# text with the processor's tokenizer.
sample_token_ids = [
    0, 204, 29, 321, 12270, 9, 181, 19508, 33609, 7571, 7571, 6200, 7571,
    4525, 2041, 1242, 2544, 102, 1499, 1021, 17137, 5109, 30131, 1878, 2,
]
decoded_sequence = processor.tokenizer.decode(sample_token_ids)
print(decoded_sequence)

tensor([[    0, 27490, 11772,   321,     4,   541,   321,  1244,   204,   321,
             4,   844,   204,  9465,  4283,   321,     4,   698,   321,     4,
          2546,   204,   321,     4,   612,   158,   379,     4,   844, 17134,
          8913,  1258,   944, 24645,    36,    29,    43,   564,   389,     2]])
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<s> 4s 0 Number of paretpants 1960 1960 1970 1960 1990 Yearotintaion otwaterpipe 200</s>


In [23]:
# Reload the checkpoint for the second experiment, in which the words and
# their boxes are supplied by the caller instead of being extracted by OCR.
# apply_ocr is switched off explicitly: the processor refuses user-supplied
# boxes while OCR is enabled, so this must not depend on whatever value the
# checkpoint's saved processor config happens to carry.
model = AutoModelForTokenClassification.from_pretrained(
    "nassimb0u/chart-text-role-classification-model"
)
processor = AutoProcessor.from_pretrained(
    "nassimb0u/chart-text-role-classification-model", apply_ocr=False
)

In [61]:
# Print the repr of the tokenizer bundled inside the processor.
layout_tokenizer = processor.tokenizer
print(layout_tokenizer)

LayoutLMv3TokenizerFast(name_or_path='nassimb0u/chart-text-role-classification-model', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['“', '”']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
	50265: AddedToken("“", rstri

In [24]:
# Load the per-text-element annotations and regroup them into one record per
# chart id.
# JSON is UTF-8 by specification; naming the encoding keeps the read
# independent of the platform's default locale.
with open(
    "./orderd_test_data_wtext/icpr22_dataset_ordered_wpred_wtext.json",
    encoding="utf-8",
) as f:
    raw_records = json.load(f)

# `bbox` is parsed with ast.literal_eval (presumably it is stored as a string
# such as "[x0, y0, x1, y1]" -- confirm against the file). Rows sharing an
# `id` are then collapsed into lists; sort=False keeps ids in order of first
# appearance.
df = (
    pd.DataFrame(raw_records)
    .assign(bbox=lambda frame: frame["bbox"].apply(ast.literal_eval))
    .groupby("id", sort=False)
    .agg({"bbox": list, "text": list})
    .reset_index()
)
data = df.to_dict(orient="records")
print(len(data[0]["text"]))
print(data[0])

15
{'id': 0, 'bbox': [[333, 961, 785, 997], [4, 235, 32, 604], [268, 901, 287, 935], [404, 901, 443, 931], [545, 901, 584, 933], [681, 901, 720, 933], [822, 903, 861, 935], [958, 901, 997, 933], [58, 855, 127, 885], [56, 713, 125, 743], [56, 569, 127, 599], [56, 427, 125, 457], [56, 286, 125, 315], [56, 144, 127, 176], [58, 2, 127, 32]], 'text': [' Sedimentation Coetticient (s)', ' Relative concentration', ' 5', ' 10', ' 15', ' 20', ' 25', ' 30', ' 0.00', ' 0.05', ' 0.10', ' 0.15', ' 0.20', ' 0.25', ' 0.30']}


In [99]:
# Start from a fresh copy of the chart and remember its pixel dimensions.
image = Image.open("PMC1618809___4.jpg").convert("RGB")
w, h = image.size
size = (w, h)

# Sanity check: rescale one pixel-space box (x0, y0, x1, y1) onto the 0-1000
# grid; x values are divided by the width, y values by the height.
pixel_box = (154, 420, 363, 436)
print(*(int(1000 * coord / size[axis % 2]) for axis, coord in enumerate(pixel_box)))


# Words and boxes of the first chart in the grouped dataset.
first_chart = data[0]
words = first_chart["text"]
boxes = first_chart["bbox"]


# Encode the image together with the caller-supplied words and boxes.
encoding = processor(image, words, boxes=boxes, return_tensors="pt")
print(encoding.keys())
for field in ("input_ids", "attention_mask"):
    print(encoding[field])
print(len(encoding["bbox"]))
print(encoding["pixel_values"][0][0])

333 961 785 997
dict_keys(['input_ids', 'attention_mask', 'bbox', 'pixel_values'])
tensor([[    0, 17134,  8913,  1258,   944,  2645, 35056,    36,    29,    43,
         27490, 11772,   195,   158,   379,   291,   564,   389,   321,     4,
           612,   321,     4,  2546,   321,     4,   698,   321,     4,   996,
           321,     4,   844,   321,     4,  1244,   321,     4,   541,     2]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
1
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])


In [114]:
# Classify every token of the current encoding, then keep one prediction per
# word.
# Inference only: autograd is disabled so no graph is built.
with torch.no_grad():
    outputs = model(**encoding)
print(encoding["bbox"])
predictions = outputs.logits.argmax(-1)
print(len(predictions[0]))
labels = [model.config.id2label[class_id] for class_id in predictions[0].tolist()]

print(encoding["bbox"].shape[1])

# mask[i] is 1 for the first token of each word and 0 for special tokens and
# for the remaining sub-word pieces. The tokenizer's own token -> word mapping
# is used rather than comparing neighbouring boxes, because two adjacent words
# that share an identical box would otherwise be merged into one.
word_ids = encoding.word_ids(batch_index=0)
mask = [
    int(word_idx is not None and (pos == 0 or word_idx != word_ids[pos - 1]))
    for pos, word_idx in enumerate(word_ids)
]

print(mask)
print([label for (m, label) in zip(mask, labels) if m == 1])
print(len(labels))

tensor([[[  0,   0,   0,   0],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [333, 961, 785, 997],
         [  4, 235,  32, 604],
         [  4, 235,  32, 604],
         [268, 901, 287, 935],
         [404, 901, 443, 931],
         [545, 901, 584, 933],
         [681, 901, 720, 933],
         [822, 903, 861, 935],
         [958, 901, 997, 933],
         [ 58, 855, 127, 885],
         [ 58, 855, 127, 885],
         [ 58, 855, 127, 885],
         [ 56, 713, 125, 743],
         [ 56, 713, 125, 743],
         [ 56, 713, 125, 743],
         [ 56, 569, 127, 599],
         [ 56, 569, 127, 599],
         [ 56, 569, 127, 599],
         [ 56, 427, 125, 457],
         [ 56, 427, 125, 457],
         [ 56, 427, 125, 457],
         [ 56, 286, 125, 315],
         [ 56, 286, 125, 315],
        

In [50]:
# Overlay one predicted label per dataset word on a copy of the chart and
# save it.
annotated = image.copy()  # leave `image` untouched so the cell can be re-run
draw = ImageDraw.Draw(annotated)
img_w, img_h = annotated.size

bbox = boxes

# `labels` has one entry per token (special tokens and sub-word pieces
# included) while `bbox` has one entry per word, so zipping the two directly
# pairs boxes with the wrong labels. Use the label of each word's first token.
word_labels = {}
for token_pos, word_idx in enumerate(encoding.word_ids(batch_index=0)):
    if word_idx is not None:
        word_labels.setdefault(word_idx, labels[token_pos])

for word_idx, b in enumerate(bbox):
    label = word_labels.get(word_idx)
    if label is None:
        # The word produced no token in the encoding; nothing to show.
        continue
    x0, y0, x1, y1 = b
    # Skip zero bboxes if needed
    if (x0, y0, x1, y1) == (0, 0, 0, 0):
        continue
    # The boxes are on the 0-1000 normalised grid the processor consumes;
    # scale them back to pixel coordinates before drawing.
    left, right = round(x0 * img_w / 1000), round(x1 * img_w / 1000)
    top, bottom = round(y0 * img_h / 1000), round(y1 * img_h / 1000)
    draw.rectangle([left, top, right, bottom], outline="red", width=2)
    # Clamp so labels of boxes touching the top edge stay on the canvas.
    draw.text((left, max(0, top - 10)), label, fill="red")

annotated.save("output_with_bboxes.jpg")

In [34]:
# Same per-word overlay as the PIL cell, rendered with OpenCV.
img = cv2.imread("PMC1618809___4.jpg")
# cv2.imread signals a missing/unreadable file by returning None, not raising.
if img is None:
    raise FileNotFoundError("PMC1618809___4.jpg")
img_h, img_w = img.shape[:2]

# `labels` is per token whereas `bbox` is per word; map every word to the
# label predicted for its first token so boxes and labels line up.
word_labels = {}
for token_pos, word_idx in enumerate(encoding.word_ids(batch_index=0)):
    if word_idx is not None:
        word_labels.setdefault(word_idx, labels[token_pos])

for word_idx, b in enumerate(bbox):
    label = word_labels.get(word_idx)
    if label is None:
        # The word produced no token in the encoding; nothing to show.
        continue
    x0, y0, x1, y1 = b
    # Skip zero boxes if needed
    if (x0, y0, x1, y1) == (0, 0, 0, 0):
        continue
    # The boxes are on a 0-1000 normalised grid; convert them to pixels.
    left, right = round(x0 * img_w / 1000), round(x1 * img_w / 1000)
    top, bottom = round(y0 * img_h / 1000), round(y1 * img_h / 1000)
    # Draw rectangle
    cv2.rectangle(img, (left, top), (right, bottom), color=(0, 0, 255), thickness=2)
    # Draw label just above the box, clamped to the top of the image
    cv2.putText(
        img,
        label,
        (left, max(0, top - 10)),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        (0, 0, 255),
        1,
        cv2.LINE_AA,
    )

cv2.imwrite("output2_with_bboxes.jpg", img)

True