In [None]:
import copy
import json

import easyocr
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

In [None]:
# Sample receipt image stored under the ../data/wildreceipt directory.
TEST_IMAGE = '../data/wildreceipt/image_files/Image_1/0/0ea337776eb4a57010accaf2814ea7351770819b.jpeg'
# Second sample image, expected next to this notebook.
TEST_IMAGE_2 = 'test-2.png'

# Preview the first sample (the last expression of a cell is rendered inline).
Image.open(TEST_IMAGE)

# Refs

- Experiment-1: IMAGE → OCR Engine → text boxes → LLM → JSON object https://mychen76.medium.com/finetune-llm-to-convert-a-receipt-image-to-json-or-xml-3f9a6237e991

# EasyOCR

### IMAGE → OCR Engine → Text Boxes

In [None]:
# EasyOCR reader for Simplified Chinese + English.
# This needs to run only once: it loads the model into memory and `reader` is reused below.
reader = easyocr.Reader(['ch_sim','en'])


In [None]:

# Values to try for `output_format`: 'standard', 'dict', 'json', 'free_merge'.

result = reader.readtext('test.png', output_format='standard')

result

In [None]:
# Same image with output_format='dict'; the result is handed to pandas for a tabular view.
result = reader.readtext('test.png', output_format='dict')

pd.DataFrame(result)

In [None]:
# Preview the second sample; use the shared constant rather than repeating the path.
Image.open(TEST_IMAGE_2)

In [None]:
# OCR the second sample (path taken from the TEST_IMAGE_2 constant).
# This `result` is the value interpolated into the LLM prompt further down.
result = reader.readtext(TEST_IMAGE_2, output_format='dict')

pd.DataFrame(result)

In [None]:
# Show the raw (non-tabular) OCR output exactly as it will be embedded in the prompt.
result

### Text boxes → LLM → JSON object

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, BitsAndBytesConfig

# Optional quantisation settings, kept for reference (currently disabled).
# quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
# bnb_config = BitsAndBytesConfig(
#     llm_int8_enable_fp32_cpu_offload=True,
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# control model memory allocation between devices for low GPU resource (0,cpu)
# Only takes effect if the commented-out `device_map=device_map` argument below is re-enabled.
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": 0,
    "transformer.h": 0,
    "transformer.ln_f": 0,
    "model.embed_tokens": 0,
    "model.layers":0,
    "model.norm":0
}

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Set `device = "cpu"` explicitly here if the GPU is too small for the float16 weights.
device = "cuda" if torch.cuda.is_available() else "cpu"

# model use for inference
model_id="mychen76/mistral7b_ocr_to_json_v1"
# NOTE: trust_remote_code=True allows Python code shipped with the model repo to run locally;
# only keep it for repositories you trust.
# NOTE(review): weights are loaded as float16; on a CPU-only machine float32 may be required.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    # quantization_config=quantization_config,
    # device_map=device_map
)
# The prompt tensors are sent to `device` before generation, so the weights must live there too.
model = model.to(device)

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [None]:
# !pip install accelerate 

In [None]:
# Prompt with Instruction / Input / Output sections; the current OCR `result` is
# interpolated into the Input section.
prompt=f"""### Instruction:
You are POS receipt data expert, parse, detect, recognize and convert following receipt OCR image result into structure receipt data object. 
Don't make up value not in the Input. Output must be a well-formed JSON object.```json

### Input:
{result}

### Output:
"""

# Tokenise, generate and decode inside inference mode.
# Sampling options that can be added to generate():
#   use_cache=True, do_sample=True, temperature=0.1, top_p=0.95
with torch.inference_mode():
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(max_new_tokens=512, **inputs)
    result_text = tokenizer.batch_decode(outputs)[0]

print(result_text)

# python-doctr

In [None]:
# !pip install python-doctr==0.8.1

### IMAGE → OCR Engine → Text Boxes

In [None]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor



In [None]:
# NOTE: `model` and `result` are rebound here; from this cell on they refer to the
# docTR predictor and its output, not to the LLM / EasyOCR objects created earlier.
model = ocr_predictor(
    det_arch='db_resnet50',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)

# Wrap the image file as a docTR document and run the predictor on it.
doc = DocumentFile.from_images("test.png")
result = model(doc)

In [None]:
# Display the predictor output returned for "test.png".
result

In [None]:
# Visualise the OCR output through the result object's own show() helper.
result.show()

In [None]:
# Export the OCR output for inspection.
# Later cells call result.export() again instead of reusing `json_output`.
json_output = result.export()
json_output

### Post-Processing - Eliminate irrelevant information

In [None]:
# https://djajafer.medium.com/create-a-receipt-parsing-using-ocr-and-a-large-language-model-7aa733d5e335

In [None]:
def remove_fields(obj, fields):
    """Strip unwanted keys from a nested structure of dicts and lists, in place.

    Args:
        obj: Value to clean. Lists and dicts are walked recursively; any other
            value is left untouched.
        fields: Container of key names to delete wherever they occur in a dict.

    Returns:
        None. ``obj`` is modified in place. The value stored under a removed
        key is discarded without being visited.
    """
    if isinstance(obj, list):
        for element in obj:
            remove_fields(element, fields)
        return
    if not isinstance(obj, dict):
        return

    # Collect the doomed keys first so the dict is not resized while being iterated.
    for name in [key for key in obj if key in fields]:
        del obj[name]
    for child in obj.values():
        remove_fields(child, fields)

def remove_geometry(data):
    """Delete every 'geometry' key found anywhere in a nested dict/list structure.

    The structure is modified in place and nothing is returned. Dicts lose
    their own 'geometry' entry (if any) and their remaining values are then
    visited; lists are visited element by element; other values are ignored.
    """
    if isinstance(data, dict):
        data.pop('geometry', None)
        for child in data.values():
            remove_geometry(child)
    elif isinstance(data, list):
        for element in data:
            remove_geometry(element)



In [None]:
from pprint import pprint

# Pristine export: it keeps 'dimensions' and every 'geometry' entry, which the
# coordinate-conversion cells read from `json_export_raw`.
json_export_raw = result.export()

# Independent working copy for the clean-up cells to mutate; deep-copied so that
# stripping fields from it can never affect `json_export_raw`.
json_export = copy.deepcopy(json_export_raw)

pprint(json_export)

In [None]:
# Fields to remove
fields_to_remove = ['confidence', 'page_idx', 'dimensions', 'orientation', 'language', 'artefacts']

# Remove the specified fields at every nesting level (in place).
remove_fields(json_export, fields_to_remove)

# Remove 'geometry' from blocks and lines only; word-level geometry is kept
# because it is printed in a later cell.
for page in json_export['pages']:
    for block in page['blocks']:
        block.pop('geometry', None)
        for line in block.get('lines', []):
            line.pop('geometry', None)

# Compact JSON string of the cleaned data (no spaces after separators).
modified_json = json.dumps(json_export, separators=(',', ':'))

# Pretty-print the cleaned structure; `modified_json` holds its compact string form.
pprint(json_export)

In [None]:
# Tabular view of the export: with orient='index' each top-level key becomes a row.
pd.DataFrame.from_dict(json_export, orient='index')

In [None]:
# Ask the OCR result to synthesize page images, then show the first one without axes.
synthetic_pages = result.synthesize()
plt.imshow(synthetic_pages[0])
plt.axis('off')
plt.show()

In [None]:
# Walk blocks -> lines -> words of the first page and print each word's
# geometry next to its text.
for block in json_export['pages'][0]["blocks"]:
    for line in block["lines"]:
        for word in line["words"]:
            print("{}: {}".format(word["geometry"], word["value"]))

In [None]:
# Inspect the list of pages held by the working export.
json_export['pages']

In [None]:
# https://medium.com/quantrium-tech/text-extraction-using-doctr-ocr-471e417764d5 

import math
def convert_coordinates(geometry, page_dim):
    """Scale a fractional two-corner box to integer page coordinates.

    Args:
        geometry: Indexable whose first two items are ``(x_min, y_min)`` and
            ``(x_max, y_max)``, expressed as fractions of the page size.
        page_dim: Indexable where ``page_dim[1]`` scales x values and
            ``page_dim[0]`` scales y values.

    Returns:
        ``[x_min, x_max, y_min, y_max]`` as ints. Minimums are rounded down and
        maximums rounded up, so the integer box always covers the original one.
    """
    y_extent, x_extent = page_dim[0], page_dim[1]
    (x_lo, y_lo), (x_hi, y_hi) = geometry[0], geometry[1]
    return [
        math.floor(x_lo * x_extent),
        math.ceil(x_hi * x_extent),
        math.floor(y_lo * y_extent),
        math.ceil(y_hi * y_extent),
    ]

def get_coordinates(output):
    """Collect integer boxes for every word on the first page of an export.

    Args:
        output: Export mapping with a 'pages' list. Only ``output['pages'][0]``
            is read: its 'dimensions' and its 'blocks' -> 'lines' -> 'words'
            hierarchy, where each word has 'geometry' and 'value'.

    Returns:
        A list with one ``convert_coordinates`` result per word, in reading
        order of the nested structure. Each box is also printed with its text.
    """
    first_page = output['pages'][0]
    page_dim = first_page["dimensions"]

    # Lazily flatten blocks -> lines -> words into a single stream of words.
    words = (
        word
        for block in first_page["blocks"]
        for line in block["lines"]
        for word in line["words"]
    )

    text_coordinates = []
    for word in words:
        box = convert_coordinates(word["geometry"], page_dim)
        print("{}: {}".format(box, word["value"]))
        text_coordinates.append(box)
    return text_coordinates


In [None]:
# Use the raw export here: get_coordinates reads the page 'dimensions',
# which is one of the fields stripped from the cleaned `json_export`.
graphical_coordinates = get_coordinates(json_export_raw)

graphical_coordinates

In [None]:
import PIL
from PIL import ImageDraw
import matplotlib.pyplot as plt

def draw_bounds(image, bound):
    """Outline each box on ``image`` with a blue, 2-pixel-wide closed line.

    Args:
        image: Image to draw on; it is modified in place.
        bound: Iterable of boxes indexable as ``[x_min, x_max, y_min, y_max]``
            (the layout returned by ``convert_coordinates``).

    Returns:
        The same ``image`` object that was passed in.
    """
    pen = ImageDraw.Draw(image)
    for box in bound:
        x_min, x_max, y_min, y_max = box[0], box[1], box[2], box[3]
        # Flat x, y sequence visiting the four corners and returning to the first.
        outline = [
            x_min, y_min,
            x_max, y_min,
            x_max, y_max,
            x_min, y_max,
            x_min, y_min,
        ]
        pen.line(outline, fill='blue', width=2)
    return image

# Open the source image and outline every converted word box on it.
image = PIL.Image.open("test.png")
# draw_bounds draws directly onto `image` and returns that same object.
result_image = draw_bounds(image, graphical_coordinates)

# Large figure so individual boxes stay legible.
plt.figure(figsize=(15,15))
plt.imshow(result_image)

### Text boxes → LLM → JSON object

### Load Data - To train model


In [None]:
from doctr.datasets import CORD
# Load straight boxes
# NOTE(review): this dataset object is discarded; `train_set` is rebound by the next assignment.
train_set = CORD(train=True, download=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
# Take the first sample of the dataset that `train_set` now refers to.
img, target = train_set[0]

In [None]:
from doctr.datasets import WILDRECEIPT

# Local copy of the dataset; images and the train/test label files are read from here.
WILDRECEIPT_DIR = "../data/wildreceipt"

# NOTE: rebinds `train_set`, replacing the CORD dataset loaded earlier.
train_set = WILDRECEIPT(train=True,img_folder=WILDRECEIPT_DIR,
                        label_path=f"{WILDRECEIPT_DIR}/train.txt")

test_set = WILDRECEIPT(train=False, img_folder=WILDRECEIPT_DIR,
                   label_path=f"{WILDRECEIPT_DIR}/test.txt")

# NOTE(review): the train sample is overwritten immediately; after this cell
# `img` and `target` hold the first *test* sample.
img, target = train_set[0]
img, target = test_set[0]


In [None]:
# Inspect the first training sample.
train_set[0]

In [None]:
# Create a drawing context on the most recently loaded sample image.
# NOTE(review): ImageDraw.Draw presumably needs a PIL image; confirm what type the
# dataset returns for `img` and convert first if it is not one.
ImageDraw.Draw(img)
