In [None]:
# This notebook is meant to be run inside Google Colab; running it elsewhere requires some alterations to the code (e.g. the `google.colab` upload/download calls).

In [None]:
!pip install pdfplumber
from google.colab import files
import json, os, shutil

In [None]:
# === Step 1. Upload your Label Studio JSON ===
# The names of the uploaded files are the keys of the returned mapping.
uploaded = files.upload()

# Cancelling the dialog leaves nothing to index; fail with an actionable
# message instead of a bare IndexError.
if not uploaded:
    raise ValueError("No file was uploaded — re-run this cell and select your Label Studio JSON export")

# Only the first uploaded file is processed by the following steps.
filename = next(iter(uploaded))

In [None]:
# === Step 2. Load the JSON file ===
# Decode explicitly as UTF-8 so non-ASCII text in the export is read the same
# way regardless of the platform's default locale encoding.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Normalize data to a list: a single exported object becomes a one-element
# list so later steps can always iterate over `annotations`.
if isinstance(data, dict):
    annotations = [data]
elif isinstance(data, list):
    annotations = data
else:
    raise ValueError("Unexpected JSON format — must be list or dict")

print(f"✅ Loaded {len(annotations)} annotations from {filename}")

In [None]:
# === Step 3. Define the converter to the standard consent schema ===
def _label_text(raw_text):
    """Return a label's text as a single stripped string.

    ``None`` becomes ``""`` (the same value a missing ``"text"`` key yields),
    and a list/tuple of pieces is joined with single spaces.
    """
    if raw_text is None:
        return ""
    if isinstance(raw_text, (list, tuple)):
        # NOTE(review): list-valued text is presumably what text-area style
        # results look like in some exports — confirm against your export.
        raw_text = " ".join(str(part) for part in raw_text if part is not None)
    return str(raw_text).strip()


def schema_to_standard_json(entry):
    """Flatten one exported annotation entry into the standard consent schema.

    Two entry layouts are recognised:

    * ``entry["annotations"]`` present: the ``"result"`` list of the *first*
      annotation is read, and the ``"value"`` dict of each result is treated
      as a label.
    * ``entry["label"]`` present: that list is used as the labels directly.

    For every label, the first item of its ``"labels"`` list names the target
    field and its ``"text"`` is the value. Labels whose name is not one of the
    schema fields are ignored. If the same field is labelled more than once,
    the last occurrence wins.

    Malformed pieces (empty or null ``annotations`` / ``result`` / ``label`` /
    ``labels``, non-dict labels, null or list-valued ``text``) are tolerated
    rather than raising.

    Args:
        entry: dict for a single exported task.

    Returns:
        dict containing every schema field; fields with no matching label
        stay ``None``, matched fields hold the whitespace-stripped text.
    """
    output = {
        "patient_name": None,
        "patient_first_name": None,
        "patient_middle_name": None,
        "patient_last_name": None,
        "patient_address_name": None,
        "patient_id": None,
        "patient_dob": None,
        "patient_signature": None,
        "patient_state": None,
        "patient_city": None,
        "patient_zip_code": None,
        "provider_name": None,
        "provider_address_name": None,
        "provider_phone": None,
        "provider_fax": None,
        "provider_state": None,
        "provider_city": None,
        "provider_zip_code": None,
        "family_name": None,
        "family_relation": None,
        "family_phone": None,
        "family_address_name": None,
        "family_state": None,
        "family_city": None,
        "family_zip_code": None,
        "guardian_name": None,
        "guardian_signature": None,
        "guardian_relation": None,
        "date": None,
        "expiration_date": None,
        "expiration_event": None,
        "translator_name": None,
        "translator_signature": None
    }

    # --- Detect which format we're working with ---
    if "annotations" in entry:
        annotation_list = entry.get("annotations") or []
        # Only the first annotation is used; a task without any annotation
        # simply contributes no labels.
        first_annotation = annotation_list[0] if annotation_list else {}
        results = first_annotation.get("result") or []
        labels = [
            r.get("value", {})
            for r in results
            if isinstance(r, dict) and "value" in r
        ]
    elif "label" in entry:
        labels = entry["label"] or []
    else:
        labels = []

    # --- Process all labels uniformly ---
    for label in labels:
        if not isinstance(label, dict):
            continue

        # A missing or empty "labels" list means there is no field name.
        label_names = label.get("labels") or [None]
        label_type = label_names[0]

        if label_type and label_type in output:
            output[label_type] = _label_text(label.get("text", ""))

    return output

In [None]:
# === Step 4. Create output folder ===
# exist_ok=True lets this cell be re-run without failing when the folder
# is already there.
os.makedirs(name="standardized_consents", exist_ok=True)

In [None]:
# === Step 5. Convert and save each consent form ===
# Output names already written in this run, used to keep two entries from
# landing on the same file.
written_names = set()

for entry in annotations:
    converted_entry = schema_to_standard_json(entry)

    # Use filename or ID for naming; any directory part and the original
    # extension are dropped so the result is always "<stem>.json".
    source_name = entry.get("filename") or f"consent_{entry.get('id', 'unknown')}.json"
    stem = os.path.splitext(os.path.basename(source_name))[0]

    # Entries that resolve to the same stem (e.g. several without a filename
    # or id) would otherwise overwrite each other silently; add a counter.
    base_name = f"{stem}.json"
    duplicate_index = 2
    while base_name in written_names:
        base_name = f"{stem}_{duplicate_index}.json"
        duplicate_index += 1
    written_names.add(base_name)

    out_path = os.path.join("standardized_consents", base_name)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(converted_entry, f, indent=2)

# Report the number of files actually written.
print(f"✅ Saved {len(written_names)} JSON files in 'standardized_consents/'")

In [None]:
# === Step 6. Zip and download all files ===
# Pack the contents of the output folder into "standardized_consents.zip"
# (the ".zip" extension is appended by make_archive).
shutil.make_archive(
    base_name="standardized_consents",
    format='zip',
    root_dir="standardized_consents",
)

# Hand the archive to the browser as a download.
files.download("standardized_consents.zip")

print("🎉 All done! Your ZIP file is ready for download.")