In [1]:
# This code is best run inside Google Colab; running it elsewhere requires some alterations to the code.
# For accurate output, make sure the labeling is meticulous, especially for line_items.
# Label all descriptions together, all codes together, and all amounts together, each in the same order they are intended to appear in the JSON.


In [2]:
!pip install pdfplumber
from google.colab import files
import json, os, shutil

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [33]:
# === Step 1. Upload your Label Studio JSON ===
uploaded = files.upload()

# If nothing was uploaded (for example the dialog was dismissed), stop here
# with a readable message instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded — re-run this cell and choose a JSON file")

# Only the first uploaded file is used by the rest of the notebook.
filename = next(iter(uploaded))

Saving manualLabelingFirst8 (1).json to manualLabelingFirst8 (1) (1).json


In [34]:
# === Step 2. Load the JSON file ===
# JSON text is conventionally UTF-8. Passing the encoding explicitly keeps the
# load independent of the platform's default locale encoding.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Normalize data to a list: a single exported object becomes a one-element
# list so the later cells can always iterate over `annotations`.
if isinstance(data, dict):
    annotations = [data]
elif isinstance(data, list):
    annotations = data
else:
    raise ValueError("Unexpected JSON format — must be list or dict")

print(f"✅ Loaded {len(annotations)} annotations from {filename}")

✅ Loaded 8 annotations from manualLabelingFirst8 (1) (1).json


In [35]:
def _collect_label_values(entry):
    """Return the raw label dicts stored in one exported entry, or [] if none."""
    if "annotations" in entry:
        # Nested format: annotations -> result -> value.
        # Only the first annotation of the entry is read.
        annotation_list = entry.get("annotations") or []
        if not annotation_list:
            # An entry with an empty annotation list simply has no labels.
            return []
        results = (annotation_list[0] or {}).get("result") or []
        return [r["value"] for r in results if isinstance(r, dict) and "value" in r]
    if "label" in entry:
        # Simple format: the labels sit directly under "label".
        return entry["label"] or []
    return []


def schema_to_standard_json(entry):
    """Convert one labeled entry into the standardized invoice dictionary.

    Two input layouts are accepted:

    * nested: ``entry["annotations"][0]["result"][i]["value"]`` holds each label
    * simple: ``entry["label"][i]`` holds each label

    Each label is expected to be a dict with a ``"labels"`` list (whose first
    element names the field) and a ``"text"`` string. Items that do not have
    that shape (no ``"labels"``, an empty ``"labels"`` list, or not a dict)
    are skipped instead of raising.

    Args:
        entry: dict for a single labeled document.

    Returns:
        dict with the fixed invoice keys (``None`` when a field was not
        labeled) plus ``"line_items"``, a list of
        ``{"description", "code", "amount"}`` dicts. Line items are built by
        pairing the i-th description, code and amount in labeling order; a
        missing part is ``None``. Amounts are converted to ``float`` when the
        text parses as one, otherwise the stripped text is kept. A label type
        that is not one of the predefined keys is still stored under its own
        name, and a repeated label type keeps the last text seen.
    """
    output = {
        "invoice_number": None,
        "patient_id": None,
        "invoice_date": None,
        "due_date": None,
        "patient_name": None,
        "patient_age": None,
        "patient_address": None,
        "patient_phone": None,
        "patient_email": None,
        "admission_date": None,
        "discharge_date": None,
        "subtotal_amount": None,
        "discount_amount": None,
        "total_amount": None,
        "provider_name": None,
        "bed_id": None,
        "line_items": []
    }

    line_item_fields = {"description": [], "amount": [], "code": []}

    # --- Process all labels uniformly ---
    for label in _collect_label_values(entry):
        if not isinstance(label, dict):
            continue

        # Normalize key access between formats
        value = label if "labels" in label else (label.get("value") or {})
        if not isinstance(value, dict):
            continue

        label_names = value.get("labels") or []
        if not label_names:
            # Not a labeled text span (nothing names a target field) - skip it.
            continue
        label_type = label_names[0]

        # "text" may be absent or null; treat both as an empty string.
        raw_text = value.get("text")
        text = "" if raw_text is None else str(raw_text).strip()

        if label_type == "line_item_description":
            line_item_fields["description"].append(text)
        elif label_type == "line_item_code":
            line_item_fields["code"].append(text)
        elif label_type == "line_item_amount":
            try:
                line_item_fields["amount"].append(float(text))
            except ValueError:
                # Keep the raw text when it is not a plain number.
                line_item_fields["amount"].append(text)
        else:
            output[label_type] = text

    # --- Combine structured line items ---
    # Rows are aligned purely by position, so the longest column decides how
    # many line items are produced and shorter columns are padded with None.
    columns = ("description", "code", "amount")
    row_count = max(len(line_item_fields[column]) for column in columns)
    for i in range(row_count):
        output["line_items"].append({
            column: line_item_fields[column][i] if i < len(line_item_fields[column]) else None
            for column in columns
        })

    return output


In [36]:
# === Step 4. Create output folder ===
# Re-running this cell is safe: an already existing folder is not an error.
os.makedirs(name="standardized_invoices", exist_ok=True)

In [37]:
# === Step 5. Convert and save each invoice ===
# Entries that share a filename (or that have neither "filename" nor "id")
# would otherwise resolve to the same output path and silently overwrite one
# another. Every name handed out is remembered, and a later duplicate gets a
# numeric suffix (_2, _3, ...) so each entry ends up in its own file.
used_names = set()

for entry in annotations:
    converted_entry = schema_to_standard_json(entry)

    # Use filename or ID for naming
    base_name = entry.get("filename") or f"invoice_{entry.get('id', 'unknown')}.json"
    # Drop any directory part and the original extension; output is always .json
    stem = os.path.splitext(os.path.basename(base_name))[0]

    base_name = f"{stem}.json"
    suffix = 2
    while base_name in used_names:
        base_name = f"{stem}_{suffix}.json"
        suffix += 1
    used_names.add(base_name)

    out_path = os.path.join("standardized_invoices", base_name)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(converted_entry, f, indent=2)

# Report the number of distinct files actually written by this run.
print(f"✅ Saved {len(used_names)} JSON files in 'standardized_invoices/'")

✅ Saved 8 JSON files in 'standardized_invoices/'


In [39]:
# === Step 6. Zip and download all files ===
# Archive the contents of the output folder as standardized_invoices.zip,
# then hand that archive to the browser for download.
shutil.make_archive(
    base_name="standardized_invoices",
    format="zip",
    root_dir="standardized_invoices",
)
files.download("standardized_invoices.zip")

print("🎉 All done! Your ZIP file is ready for download.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎉 All done! Your ZIP file is ready for download.
