# Qwen2.5-VL Demand Letter Spark Notebook

In [1]:
# Cell 1
import json
import os
import threading
import uuid

import fitz
import numpy as np
import torch
from pydantic import BaseModel, Field
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

spark = SparkSession.builder.appName("DemandLetterQwen2_5_VL").getOrCreate()
print("Spark version:", spark.version)
print("CUDA:", torch.cuda.is_available())



Spark version: 3.5.0
CUDA: True


In [2]:
# Cell 2
class DemandLetterExtract(BaseModel):
    is_demand_letter: bool|None=None
    claim_number: str|None=None
    claimant_name: str|None=None
    claimant_address: str|None=None
    claimant_contact_phone: str|None=None
    claimant_contact_email: str|None=None
    claimant_contact_facsimile: str|None=None
    claimant_legal_office_information: str|None=None
    insurance_company_representative: str|None=None
    claim_amount: str|None=None
    demand_letter_date: str|None=None
    response_deadline_date: str|None=None
    evidence_attached: list = Field(default_factory=list)
    date_of_loss: str|None=None
    insured_property_address: str|None=None
    insured_asset_description: str|None=None
    policy_number: str|None=None
    referenced_policy_language: str|None=None
    threats_of_legal_action: str|None=None
    requested_resolution: str|None=None
    tone_of_letter: str|None=None
    claimant_stated_cause_of_loss: str|None=None
    letter_response_markdown: str|None=None

MASTER_PROMPT = """You are an insurance claims adjuster analyzing a document containing both text and images (PDF page snapshots). 
Use ALL provided content — extracted text AND page images — to determine if the document is a demand letter and to extract structured data.  
Images always override text in cases of conflict.

Your job is to produce ONLY a JSON object matching the exact schema below, with no commentary, no Markdown, and no extra fields:

{{
    "is_demand_letter": boolean | false,
    "claim_number": string | null,
    "claimant_name": string | null,
    "claimant_address": string | null,
    "claimant_contact_phone": string | null,
    "claimant_contact_email": string | null,
    "claimant_contact_facsimile": string | null,
    "claimant_legal_office_information": string | null,
    "insurance_company_representative": string | null,
    "claim_amount": string | null,
    "demand_letter_date": string | null,
    "response_deadline_date": string | null,
    "evidence_attached": array,
    "date_of_loss": string | null,
    "insured_property_address": string | null,
    "insured_asset_description": string | null,
    "policy_number": string | null,
    "referenced_policy_language": string | null,
    "threats_of_legal_action": string | null,
    "requested_resolution": string | null,
    "tone_of_letter": string | null,
    "claimant_stated_cause_of_loss": string | null,
    "letter_response_markdown": string | null
}}

Rules:

1. Use BOTH:  
   - The extracted TEXT of the PDF pages  
   - The IMAGE representations of the pages  
   If there is any discrepancy, treat the IMAGE as the authoritative source.

2. First determine whether the document IS a demand letter.  
   If NOT a demand letter:  
   - "is_demand_letter" = false  
   - All other fields MUST be null (including "letter_response_markdown").  
   - Return the JSON and stop.

3. If it IS a demand letter:  
   - Extract all fields strictly from the provided content.  
   - Use null for any field not explicitly present.  
   - Do not infer, guess, or hallucinate any information.

4. For "evidence_attached":  
   - Provide an array of evidence types ONLY if explicitly attached or referenced.  
   - Otherwise leave as an empty array.

5. The field "letter_response_markdown" must contain a short, professional acknowledgement letter from the insurance adjuster.  
   Requirements:  
   - No salutation (“Dear…”).  
   - No names of adjusters.  
   - Must acknowledge receipt of the demand.  
   - Must state the claim is under review.  
   - Must summarize ONLY verifiable facts present in the letter.  
   - Must mention a 30-day review/response timeline.  
   - No additional claims, legal commentary, or invented facts.

6. Use consistent, neutral, factual language.  
   Never fabricate legal, financial, or policy details.

7. When reading images, pay attention to:  
   - Headers (law office, claimant, insurer)  
   - Stamps, dates, letterheads  
   - Signatures  
   - Embedded exhibits  
   - Handwritten annotations  
   - Policy numbers, claim numbers  
   - Amounts demanded  

8. Your output must be **strict JSON**, parsable with no trailing text or commentary.

Now analyze the provided text and images and return ONLY the JSON object.
""" 

In [3]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def get_model_and_processor():
    if not hasattr(get_model_and_processor, "_model_instance"):

        from transformers import BitsAndBytesConfig

        bnb = BitsAndBytesConfig(load_in_4bit=True)

        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct",
            torch_dtype="auto",
            device_map="balanced",
            quantization_config=bnb,
        )

        processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

        setattr(get_model_and_processor, "_model_instance", model)
        setattr(get_model_and_processor, "_processor_instance", processor)

    return (
        getattr(get_model_and_processor, "_model_instance"),
        getattr(get_model_and_processor, "_processor_instance"),
    )

In [4]:
# Cell 4
import io
import uuid

import fitz
from PIL import Image


def extract_pdf_pages(pdf_path: str):
    pdf = fitz.open(pdf_path)
    out = []

    for page in pdf:
        # Extract text
        text = page.get_text()

        # Render at high DPI
        pix = page.get_pixmap(dpi=200)

        # Convert pixmap → PIL image
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # Convert to grayscale (mode "L")
        gray = img.convert("L")

        # Save grayscale PNG
        fn = f"/tmp/page_{uuid.uuid4()}.png"
        gray.save(fn)

        out.append({"text": text, "image_path": fn})

    return out


def analyze_pdf_local(pdf_path):
    model, processor = get_model_and_processor()

    pages = extract_pdf_pages(pdf_path)
    all_outputs = []

    for p in pages:
        try:
            # Build correct chat format for Qwen2.5-VL
            msgs = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": MASTER_PROMPT},
                        {"type": "image", "image": p["image_path"]},
                        {"type": "text", "text": p["text"]},
                    ],
                }
            ]

            # Prepare text template
            prompt_text = processor.apply_chat_template(
                msgs, tokenize=False, add_generation_prompt=True
            )

            # Extract vision tensors
            imgs, vids = process_vision_info(msgs)

            inputs = processor(
                text=[prompt_text],
                images=imgs,
                videos=vids,
                padding=True,
                return_tensors="pt",
            ).to(model.device)

            with torch.inference_mode():
                gen = model.generate(**inputs, max_new_tokens=1024)

            trimmed = [o[len(i) :] for i, o in zip(inputs.input_ids, gen)]
            page_output = processor.batch_decode(trimmed, skip_special_tokens=True)[0]

            all_outputs.append(page_output)

        finally:
            # Free VRAM between pages
            # torch.cuda.empty_cache()
            # torch.cuda.ipc_collect()
            pass

    # Final assembly → one JSON to parse
    full_text = "\n\n".join(all_outputs)

    return full_text

In [5]:
# Cell 5
def analyze_pdf_udf_impl(p):
    try:
        if p is None:
            return json.dumps({"error": "null path"})
        r = analyze_pdf_local(p)
        return r
    except Exception as e:
        return json.dumps({"error": str(e), "path": p})


analyze_pdf_udf = udf(analyze_pdf_udf_impl, StringType())

In [6]:
ls /data/docs

Insurance_Demand_Letter.pdf


In [7]:
# Cell 6
import glob

pdf_files = glob.glob("/data/docs/*.pdf")
df = spark.createDataFrame([(f,) for f in pdf_files], ["pdf_path"])

scored = df.withColumn("analysis", analyze_pdf_udf("pdf_path"))
pandas_df = scored.toPandas()
pandas_df

Unnamed: 0,pdf_path,analysis
0,/data/docs/Insurance_Demand_Letter.pdf,"```json\n{\n ""is_demand_letter"": true,\n ..."


In [8]:
pandas_df.to_csv('extracted_info.csv', index=False)