In [None]:
# This code is best run inside Google Colab; running it elsewhere requires some alterations to the code.
# For accurate output, make sure labeling is meticulous, especially for line_items.
# Label all descriptions together, all codes together, and all amounts together, each in the same order in which the items are intended to appear in the JSON.


In [None]:
!pip install pdfplumber

In [None]:
from google.colab import files
import json

# Upload your Label Studio JSON.
# The result is used below as a mapping keyed by the uploaded filename.
uploaded = files.upload()

# A cancelled or empty upload leaves nothing to pick a filename from; fail with
# a clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError(
        "No file was uploaded; re-run this cell and select a Label Studio JSON export."
    )

# Get the uploaded filename (only the first file is used if several were selected)
filename = next(iter(uploaded))

# Load the JSON file. The encoding is pinned to UTF-8 so the result does not
# depend on the platform default when the notebook is run outside Colab.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Normalize to a list of annotations: a single exported object becomes a
# one-element list, a list is used as-is, anything else is rejected.
if isinstance(data, dict):
    annotations = [data]
elif isinstance(data, list):
    annotations = data
else:
    raise ValueError("Unexpected JSON format")

# Preview first annotation
annotations[:1]

In [None]:
def schema_to_standard_json(entry):
    """Convert one annotation entry into the standardized invoice dictionary.

    Args:
        entry: Mapping for a single annotated document. Its "label" value is
            read as a list of regions; each region contributes the first
            element of its "labels" list as the field name and its "text" as
            the field value. A missing or null "label" is treated as no regions.

    Returns:
        A dict containing the fixed invoice keys (left as None when nothing
        was labeled for them), any additional non-line-item label names found
        in the entry, and "line_items": a list of
        {"description", "code", "amount"} dicts.

        Line items are assembled by position: the i-th description, i-th code
        and i-th amount form item i, so labels must be created in a consistent
        order. When the three kinds differ in count, missing positions are
        filled with None. Amounts are converted to float when possible and
        otherwise kept as the raw labeled value.
    """
    output = {
        "invoice_number": None,
        "patient_id": None,
        "invoice_date": None,
        "due_date": None,
        "patient_name": None,
        "patient_age": None,
        "patient_address": None,
        "patient_phone": None,
        "patient_email": None,
        "admission_date": None,
        "discharge_date": None,
        "subtotal_amount": None,
        "discount_amount": None,
        "total_amount": None,
        "provider_name": None,
        "bed_id": None,
        "line_items": []
    }

    # Line-item values are collected per kind and paired by position afterwards.
    # The tuple order is also the key order of every emitted line item.
    item_columns = ("description", "code", "amount")
    line_item_fields = {column: [] for column in item_columns}

    # `or []` covers both a missing "label" key and an explicit null value.
    for label in entry.get("label") or []:
        label_names = label.get("labels") or []
        if not label_names:
            # A region without a label name cannot be mapped to any field.
            continue
        label_type = label_names[0]
        value = label.get("text")

        if label_type == "line_item_description":
            line_item_fields["description"].append(value)
        elif label_type == "line_item_code":
            line_item_fields["code"].append(value)
        elif label_type == "line_item_amount":
            try:
                value = float(value)
            except (TypeError, ValueError):
                # Not numeric (e.g. "N/A" or missing text): keep the raw value
                # so the position in the amounts list is still occupied.
                pass
            line_item_fields["amount"].append(value)
        else:
            # Any other label name is stored directly under that name; a
            # repeated label overwrites the earlier value.
            output[label_type] = value

    # Combine line item fields into structured entries, padding short lists with None.
    item_count = max(len(line_item_fields[column]) for column in item_columns)
    for i in range(item_count):
        output["line_items"].append({
            column: (
                line_item_fields[column][i]
                if i < len(line_item_fields[column])
                else None
            )
            for column in item_columns
        })

    return output

# Run the converter over every annotation, keeping the input order.
converted = list(map(schema_to_standard_json, annotations))

# Persist the standardized records as indented JSON in the working directory.
with open("standardized_invoice.json", mode="w") as out_file:
    json.dump(converted, fp=out_file, indent=2)

print("Standardized JSON has been saved to 'standardized_invoice.json'.")

In [None]:
# Write the converted records to the file that is offered for download.
with open("converted.json", mode="w") as download_file:
    json.dump(converted, fp=download_file, indent=2)

# Hand the written file to Colab's download helper.
files.download("converted.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>