In [None]:
import os
import json
from openai import OpenAI
import base64
from mimetypes import guess_type
from PIL import Image
import fitz
import pytesseract
import tiktoken
from unstructured.partition.pdf import partition_pdf

# Set your OpenAI API key 
os.environ["OPENAI_API_KEY"] = ""
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# File names for caching
CHUNKS_CACHE_FILE = "pci_processed_chunks.json"
TOKEN_THRESHOLD = 10000

# --------------------------
# PDF Partitioning and Caching
# --------------------------

def partition_pdf_by_title(pdf_path):
    """
    Use unstructured to partition the PDF into sections based on titles/headers.
    Returns a dict mapping section titles to their full text.
    """
    elements = partition_pdf(filename=pdf_path)
    sections = {}
    current_title = "General"
    sections[current_title] = ""
    for element in elements:
        text = element.text.strip()
        # Use getattr to safely access metadata (which is not a dict)
        category = getattr(element.metadata, "category", "") if element.metadata else ""
        if category in ["Title", "Header"] or (text and text.isupper() and len(text.split()) < 10):
            current_title = text
            if current_title not in sections:
                sections[current_title] = ""
        else:
            sections[current_title] += "\n" + text
    return sections

def count_tokens(text: str, model="gpt-4o-mini"):
    """Count tokens using tiktoken for a given model."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def split_text_into_chunks(text: str, max_tokens: int = TOKEN_THRESHOLD, overlap: int = 100) -> list:
    """
    Split a long text into overlapping chunks based on token count.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk = encoding.decode(tokens[start:end])
        chunks.append(chunk)
        start = end - overlap  # maintain overlap for context continuity
    return chunks

def process_pdf_sections(pdf_path):
    """
    Partition the PDF by title and, for each section that exceeds TOKEN_THRESHOLD,
    further split it into sub-chunks.
    Returns a dictionary mapping section titles to lists of text chunks.
    """
    raw_sections = partition_pdf_by_title(pdf_path)
    processed_sections = {}
    for title, content in raw_sections.items():
        full_section = f"{title}\n{content}"
        tokens = count_tokens(full_section)
        if tokens > TOKEN_THRESHOLD:
            print(f"Section '{title}' is very large ({tokens} tokens); splitting further.")
            sub_chunks = split_text_into_chunks(full_section, max_tokens=TOKEN_THRESHOLD, overlap=100)
            processed_sections[title] = sub_chunks
        else:
            processed_sections[title] = [full_section]
    return processed_sections

def load_or_process_pdf(pdf_path):
    """
    Load cached processed PDF sections if available; otherwise, process the PDF and cache the result.
    """
    if os.path.exists(CHUNKS_CACHE_FILE):
        with open(CHUNKS_CACHE_FILE, "r", encoding="utf-8") as f:
            processed_sections = json.load(f)
        print("Loaded cached processed PDF sections.")
    else:
        processed_sections = process_pdf_sections(pdf_path)
        with open(CHUNKS_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(processed_sections, f, ensure_ascii=False, indent=2)
        print("Processed PDF and saved cache.")
    return processed_sections

# --------------------------
# Image and OCR Utilities
# --------------------------

def ocr_image(image_path):
    """Extract text from an image using pytesseract."""
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)

def image_to_data_url(image_path: str) -> str:
    """Convert an image file to a base64-encoded data URL."""
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"

# --------------------------
# Token Counting Utility
# --------------------------

def count_tokens(text: str, model="gpt-4o-mini"):
    """Count tokens using tiktoken for a given model."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def split_text_into_chunks(text: str, max_tokens: int = TOKEN_THRESHOLD, overlap: int = 100) -> list:
    """
    Split a long text into overlapping chunks based on token count.
    Uses tiktoken for accurate token counting.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk = encoding.decode(tokens[start:end])
        chunks.append(chunk)
        start = end - overlap  # add overlap for context continuity
    return chunks
# --------------------------
# Relevance Check for Each Chunk
# --------------------------

def check_chunk_relevance(chunk_text: str, image_query: str) -> bool:
    """
    Ask GPT-4 if a given chunk (section) is relevant to the client's image.
    Returns True if the answer starts with "yes".
    """
    system_message = f"""
    You are an expert in PCI-DSS compliance. 
    You are given a section extracted from the PCI-DSS Report on Compliance Template containing the controls and requirements.
    A client has provided a screenshot showing details of their network and security configuration.
    Analyze the image and identify which specific control requirement is being addressed.
    Please be specific in your mapping.
    """
    prompt = f"""
    You are an expert in PCI-DSS compliance.
    Below is a section from the PCI-DSS Report on Compliance Template:
    
    \"\"\"{chunk_text}\"\"\"
    
    Also, consider the following text extracted from a client's screenshot:
    \"\"\"{image_query}\"\"\"

    Task:
    1. Does this image provide evidence for a specific control(s) and/or 
    requirements in this section?
    2. Identify the EXACT control references from text.
    3. Match image features to control requirements.

    Is this section relevant to mapping the client's network/security information (from the screenshot) to a PCI-DSS control?
    Identify if the image shows implementation of ANY control from this section.
     
    Answer with a single word: "Yes" or "No".
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]
    try:
        response = client.chat.completions.create(
            model="gpt-4o", 
            messages=messages,
            max_tokens=10,
            temperature=0  # deterministic
        )
        answer = response.choices[0].message.content.strip().lower()
        return answer.startswith("yes")
    except Exception as e:
        print(f"Relevance check error: {e}")
        return False

# --------------------------
# Final Mapping Call
# --------------------------

def map_chunk_to_control(aggregated_context: str, image_path: str, image_text: str) -> str:
    """
    Use the aggregated context (all relevant sections), the OCR text, and the image
    to have GPT-4 Vision map the screenshot to specific PCI-DSS rule(s).
    """
    system_message = f"""
    You are an expert in PCI-DSS compliance. 
    You are given a section extracted from the PCI-DSS Report on Compliance Template containing the controls and requirements.
    A client has provided a screenshot showing details of their network and security configuration.
    Analyze the image and identify which specific control requirement is being addressed.
    Please be specific in your mapping.
    """
    prompt = f"""
    You are an expert in PCI-DSS compliance.
    Below is the full context from the PCI-DSS Report on Compliance Template (all sections containing specific rules):
    {aggregated_context}
    
    Also, here is text extracted from a client's network/security screenshot:
    {image_text}
    
    Provide only the control requirement code(s) (e.g., "Requirement 1.1.1") and a brief explanation.
    Do not include any extraneous text.
    """
    image_data_url = image_to_data_url(image_path)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_data_url}}
        ]}
    ]
    response = client.chat.completions.create(
        model="gpt-4o",  
        messages=messages,
        max_tokens=500
    )
    return response.choices[0].message.content

# --------------------------
# Main Application Flow
# --------------------------

def main():
    pdf_path = "PCI-DSS-ROC-Template.pdf"       # Path to your PCI-DSS ROC PDF
    image_path = "Connfido Network Diagram.png"  # Client screenshot image path

    print("Processing PDF and loading cached sections...")
    processed_sections = load_or_process_pdf(pdf_path)
    if not processed_sections:
        print("No processed sections found. Exiting.")
        return

    print("Found sections:")
    for title in processed_sections:
        print(f"--- {title} ---")

    print("Extracting OCR text from image...")
    image_query = ocr_image(image_path)

    mapped_controls = []
    # Loop over each section and its sub-chunks
    for title, chunk_list in processed_sections.items():
        for chunk in chunk_list:
            # Check if this chunk is relevant
            if check_chunk_relevance(chunk, image_query):
                print(f"Chunk from section '{title}' is relevant. Mapping control...")
                mapping = map_chunk_to_control(chunk, image_path, image_query)
                if mapping:
                    mapped_controls.append(mapping)

    if not mapped_controls:
        print("No mapped controls found for the given image. Exiting.")
        return

    # Aggregate only the mapping outputs (control codes and explanations)
    aggregated_mapping = "\n".join(mapped_controls)
    print("Aggregated Mapped Controls:")
    print(aggregated_mapping)

if __name__ == "__main__":
    main()

Processing PDF and loading cached sections...
Loaded cached processed PDF sections.
Found sections:
--- General ---
--- PCI DSS 4.0 ---
--- DO: ---
--- DON’T: ---
--- DSS. ---
--- OR • ---
--- CDE. ---
--- IDS/IPS. ---
--- FIM. ---
--- OR ---
Extracting OCR text from image...
Chunk from section 'General' is relevant. Mapping control...
Chunk from section 'PCI DSS 4.0' is relevant. Mapping control...
Chunk from section 'DO:' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section

In [5]:
def main():
    pdf_path = "PCI-DSS-ROC-Template.pdf"       # Path to your PCI-DSS ROC PDF
    image_path = "Connfido Network Diagram.png"  # Client screenshot image path

    print("Processing PDF and loading cached sections...")
    processed_sections = load_or_process_pdf(pdf_path)
    if not processed_sections:
        print("No processed sections found. Exiting.")
        return

    print("Found sections:")
    for title in processed_sections:
        print(f"--- {title} ---")

    print("Extracting OCR text from image...")
    image_query = ocr_image(image_path)

    mapped_controls = []
    # Loop over each section and its sub-chunks
    for title, chunk_list in processed_sections.items():
        for chunk in chunk_list:
            if check_chunk_relevance(chunk, image_query):
                print(f"Chunk from section '{title}' is relevant. Mapping control...")
                mapping = map_chunk_to_control(chunk, image_path, image_query)
                if mapping:
                    mapped_controls.append(mapping)

    if not mapped_controls:
        print("No mapped controls found for the given image. Exiting.")
        return

    # Deduplicate the mapping outputs (preserving order)
    unique_mapped_controls = list(dict.fromkeys(mapped_controls))
    aggregated_mapping = "\n".join(unique_mapped_controls)
    print("Aggregated Mapped Controls:")
    print(aggregated_mapping)

if __name__ == "__main__":
    main()

Processing PDF and loading cached sections...
Loaded cached processed PDF sections.
Found sections:
--- General ---
--- PCI DSS 4.0 ---
--- DO: ---
--- DON’T: ---
--- DSS. ---
--- OR • ---
--- CDE. ---
--- IDS/IPS. ---
--- FIM. ---
--- OR ---
Extracting OCR text from image...
Chunk from section 'General' is relevant. Mapping control...
Chunk from section 'PCI DSS 4.0' is relevant. Mapping control...
Chunk from section 'DO:' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'DSS.' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section 'OR •' is relevant. Mapping control...
Chunk from section

In [None]:

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": f"""
            PCI-DSS Section: {chunk['title']}
            Document Text: {chunk['text'][:20000]}
            Image Context: {ocr_text}

            Task:
            1. Does this image provide evidence for a specific control(s) and/or 
            requirements in this section?
            2. Identify the EXACT control references from text.
            3. Match image features to control requirements.
            
            Identify if the image shows implementation of ANY control from this section.
            Respond ONLY with matching control/requirement number, name, and description (e.g., '7.1 Processes and mechanisms for restricting access to system components and cardholder data by business need to know are defined and understood.') or 'None'"""},
            {"type": "image_url", "image_url": {"url": image_data}}

        ]
    }])

prompt = f"""
You are an expert in PCI-DSS compliance. 
You are given a section extracted from the PCI-DSS Report on Compliance Template containing the controls and requirements.
A client has provided a screenshot showing details of their network and security configuration.
Analyze the image and identify which specific control requirement is being addressed.
Provide the control requirement code along with a detailed explanation of 
how the information in the given image satisfies that requirement.
Please be as specific as possible in your mapping.
"""
    
detailed_response = client.chat.completions.create(
model="gpt-4o",
messages=[
    {"role": "system", "content": prompt},
    {
    "role": "user",
    "content": [
        {"type": "text", "text": f"""
        Potential Controls: {', '.join(controls)}
        Full OCR Text: {ocr_text}
        
        Generate final mapping report with:
        1. Controls/requirements from the PCI-DSS standard
        2. Implementation evidence from the image
        3. Relevant requirement text from document"""},
        {"type": "image_url", "image_url": {"url": image_data}}
    ]
    }])


In [8]:
# Usage
def main():
    # First run - processes and caches PDF
    get_pdf_chunks()  
    
    # Process multiple images
    for img in ["card_decryption_flow.jpg"]:
        print(f"\nProcessing {img}...")
        print(process_image(img))

In [3]:
# Analyze section of the document to map given image
def analyze_section(section: dict, image_data_url: str, ocr_text: str) -> dict:
    """Analyze one document section with image context"""
    instruct_prompt = f"""
    PCI-DSS Document Section Analysis
    Section Title: {section['title']}
    Section Content: {section['content'][:2500]}
    
    OCR Context from Image: {ocr_text}
    
    Task:
    1. Does this image provide evidence for this specific control?
    2. Identify EXACT control references from text
    3. Match image features to control requirements
    
    Respond STRICTLY in format:
    Relevant: [Yes/No]
    Control: [exact control text from document]
    Match Confidence: [High/Medium/Low]
    Evidence: [specific matching details from image]"""

    prompt = f"""
    You are an expert in PCI-DSS compliance. 
    You are given a section extracted from the PCI-DSS Report on Compliance Template containing the controls and requirements.
    A client has provided a screenshot showing details of their network and security configuration.
    Analyze the image and identify which specific control requirement is being addressed.
    Provide the control requirement code along with a detailed explanation of 
    how the information in the given image satisfies that requirement.
    Please be as specific as possible in your mapping.
    """
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": [
                    {"type": "text", "text": instruct_prompt},
                    {"type": "image_url", "image_url": {"url": image_data_url}}
                ]}
            ],
            max_tokens=400
        )
        return parse_response(response.choices[0].message.content, section)
    except Exception as e:
        print(f"Error analyzing section: {e}")
        return None


In [4]:
def parse_response(response: str, section: dict) -> dict:
    lines = response.split("\n")
    result = {
        "relevant": False,
        "control": "",
        "confidence": "",
        "evidence": "",
        "section_title": section["title"],
        "section_content": section["content"][:1000] + "..." 
    }
    
    for line in lines:
        if "Relevant: Yes" in line:
            result["relevant"] = True
        elif "Control:" in line:
            result["control"] = line.split("Control:")[-1].strip()
        elif "Match Confidence:" in line:
            result["confidence"] = line.split("Match Confidence:")[-1].strip()
        elif "Evidence:" in line:
            result["evidence"] = line.split("Evidence:")[-1].strip()
    
    return result


In [5]:
def cross_validate_results(findings: list, image_data_url: str) -> str:
    """Final validation with all positive matches"""
    validation_prompt = """
    Cross-Validate PCI-DSS Matches
    
    Positive Matches:
    """ + "\n".join(
        f"{i+1}. {f['control']} (Confidence: {f['confidence']})"
        for i, f in enumerate(findings)
    ) + """
    
    Task:
    1. Eliminate false positives
    2. Rank matches by relevance
    3. Combine supporting evidence
    4. Cite exact requirement text
    
    Final report format:
    ## PCI-DSS Compliance Mapping
    **Primary Control**: [control code] - [description]
    - Image Evidence: [specific features]
    - Document Reference: [exact text snippet]
    - Compliance Status: [Met/Partial/Not Met]
    
    [Repeat for secondary controls]"""
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a senior PCI-DSS auditor."},
            {"role": "user", "content": [
                {"type": "text", "text": validation_prompt},
                {"type": "image_url", "image_url": {"url": image_data_url}}
            ]}
        ],
        max_tokens=600
    )
    return response.choices[0].message.content


In [6]:
def main():
    pdf_path = "PCI-DSS-ROC-Template.pdf"
    image_path = "Connfido Network Diagram.png"
    
    print("Processing document structure...")
    sections = extract_sections_from_pdf(pdf_path)
    print(f"Identified {len(sections)} logical sections")
    
    ocr_text = ocr_image(image_path)
    image_data_url = image_to_data_url(image_path)
    
    findings = []
    for idx, section in enumerate(sections):
        print(f"Analyzing section {idx+1}/{len(sections)}: {section['title'][:50]}...")
        result = analyze_section(section, image_data_url, ocr_text)
        if result and result["relevant"]:
            findings.append(result)
    
    if findings:
        print("\nCross-validating results...")
        final_report = cross_validate_results(findings, image_data_url)
        print("\nFINAL COMPLIANCE MAPPING:")
        print(final_report)
    else:
        print("No PCI-DSS controls matched to the image content")


In [7]:
if __name__ == "__main__":
    main()

Processing document structure...
Identified 1 logical sections
Analyzing section 1/1: Document Start...

Cross-validating results...

FINAL COMPLIANCE MAPPING:
## PCI-DSS Compliance Mapping

**Primary Control**: 4.1 - Use strong cryptography and security protocols to protect sensitive cardholder data during transmission over open, public networks.

- **Image Evidence**: The image shows network diagrams, which seem to include components indicating encryption protocols, such as VPNs or SSL/TLS labels. The lines between systems might denote encrypted communication paths.

- **Document Reference**: "Use strong cryptography and security protocols to protect sensitive cardholder data during transmission over open, public networks."

- **Compliance Status**: Met

**Secondary Control**: (If applicable, repeat the format above for additional controls identified in the documentation or network diagram.)


In [1]:
import os
import json
from openai import OpenAI
import base64
from mimetypes import guess_type
from PIL import Image
import fitz
import pytesseract
import tiktoken
from unstructured.partition.pdf import partition_pdf

# Configuration
os.environ["OPENAI_API_KEY"] = "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"  # Set your API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

CHUNKS_CACHE_FILE = "pci_processed_chunks.json"
TOKEN_THRESHOLD = 10000
MIN_CHUNK_TOKENS = 300  # Minimum tokens per chunk before merging

# --------------------------
# PDF Processing Utilities
# --------------------------

def partition_pdf_by_title(pdf_path):
    """
    Partition the PDF into ordered sections with titles using metadata.
    Returns a list of dicts: [{'title': <str>, 'content': <str>}, ...]
    """
    elements = partition_pdf(filename=pdf_path)
    sections = []
    current_title = "General"
    current_content = []
    
    for element in elements:
        text = element.text.strip()
        category = getattr(element.metadata, "category", "") if element.metadata else ""
        
        if category in ["Title", "Header"]:
            if current_content:
                sections.append({
                    "title": current_title,
                    "content": "\n".join(current_content)
                })
                current_content = []
            current_title = text
        else:
            current_content.append(text)
    
    if current_content:
        sections.append({
            "title": current_title,
            "content": "\n".join(current_content)
        })
    
    return sections

def count_tokens(text: str) -> int:
    """Count tokens using tiktoken."""
    return len(tiktoken.get_encoding("cl100k_base").encode(text))


def split_text_into_chunks(text: str, max_tokens: int = TOKEN_THRESHOLD, overlap: int = 100) -> list:
    """Split text into overlapping chunks based on token count."""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk = encoding.decode(tokens[start:end])
        chunks.append(chunk)
        start = end - overlap  # maintain overlap for continuity
    return chunks

def merge_small_chunks(chunks: list, min_tokens: int = MIN_CHUNK_TOKENS) -> list:
    """Merge adjacent chunks if a chunk is very small."""
    merged = []
    encoding = tiktoken.get_encoding("cl100k_base")
    i = 0
    while i < len(chunks):
        current = chunks[i]
        current_tokens = len(encoding.encode(current))
        if current_tokens < min_tokens and i + 1 < len(chunks):
            merged_chunk = current + "\n" + chunks[i+1]
            merged.append(merged_chunk)
            i += 2
        else:
            merged.append(current)
            i += 1
    return merged

def process_sections(pdf_path):
    """
    Partition the PDF by title and, for each section that exceeds TOKEN_THRESHOLD,
    split it into sub-chunks and merge small chunks.
    Returns a list of dicts: [{'title': <str>, 'text': <str>}, ...]
    """
    raw_sections = partition_pdf_by_title(pdf_path)
    all_chunks = []
    for section in raw_sections:
        full_text = f"{section['title']}\n{section['content']}"
        tokens = count_tokens(full_text)
        if tokens <= TOKEN_THRESHOLD:
            all_chunks.append({
                "title": section['title'],
                "text": full_text,
                "tokens": tokens
            })
        else:
            print(f"Section '{section['title']}' is very large ({tokens} tokens); splitting further.")
            sub_chunks = split_text_into_chunks(full_text, max_tokens=TOKEN_THRESHOLD, overlap=100)
            merged_chunks = merge_small_chunks(sub_chunks, min_tokens=MIN_CHUNK_TOKENS)
            for chunk in merged_chunks:
                all_chunks.append({
                    "title": section['title'],
                    "text": chunk,
                    "tokens": count_tokens(chunk)
                })
    return all_chunks


def load_or_process_pdf(pdf_path):
    """
    Load cached processed PDF sections if available; otherwise, process the PDF and cache the result.
    Expected cache format: list of dicts with keys "title" and "text".
    """
    if os.path.exists(CHUNKS_CACHE_FILE):
        with open(CHUNKS_CACHE_FILE, "r", encoding="utf-8") as f:
            chunks = json.load(f)
        print("Loaded cached processed PDF sections.")
    else:
        chunks = process_sections(pdf_path)
        with open(CHUNKS_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)
        print("Processed PDF and saved cache.")
    return chunks


# --------------------------
# Image Processing
# --------------------------

def ocr_image(image_path):
    """Extract text from an image using pytesseract."""
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)

def image_to_data_url(image_path: str) -> str:
    """Convert an image file to a base64-encoded data URL."""
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'image/png'
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


# --------------------------
# API Interaction Handlers
# --------------------------

def check_chunk_relevance(chunk_text: str, image_ocr: str) -> bool:
    """Determine if chunk is relevant using GPT-4 with batch optimization."""
    
    system_message = f"""
    You are an expert in PCI-DSS compliance. 
    Analyze if the document section contains PCI-DSS controls relevant to the information in the given image.
    """
    prompt = f"""
    Document chunk:
    \"\"\"{chunk_text}\"\"\"
    
    Client screenshot text:
    \"\"\"{image_ocr}\"\"\"
    
    Is this chunk relevant for mapping the client's network/security information to a PCI-DSS control? 
    Answer only "Yes" or "No".
    """
    messages=[
            {"role": "system", "content": system_message}, 
            {"role": "user", "content": prompt}
        ]
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=10,
            temperature=0
        )
        answer = response.choices[0].message.content.strip().lower()
        return answer.startswith("yes")
    except Exception as e:
        print(f"Relevance check error: {e}")
        return False


def map_chunk_to_control(chunk_text: str, image_ocr: str, image_path: str) -> dict:
    """Get structured control mappings from GPT-4 Vision."""

    system_message = "You are an expert in PCI-DSS compliance."
    prompt = f"""
    Document chunk:
    \"\"\"{chunk_text}\"\"\"
    
    Client screenshot text:
    \"\"\"{image_ocr}\"\"\"
    
    Based solely on the above, identify which specific control requirement the screenshot addresses.
    Return your answer as a single-line JSON object with exactly these keys:
      "control_code": string (e.g., "Requirement 1.1.2a"),
      "description": string (an excerpt from the document describing the requirement),
      "explanation": string (a brief explanation of why the image satisfies this requirement).
    If no control applies, return empty JSON object.
    Output only the JSON with no extra text.
    """
    image_data_url = image_to_data_url(image_path)

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_data_url}}
        ]}
    ]
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=200,
            temperature=0.2
        )
        output = response.choices[0].message.content.strip()
        print("Raw mapping output:", output)  # Debug print
        if not output:
            return {}
        mapping = json.loads(output)
        if mapping.get("control_code"):
            return mapping
        else:
            return {}
    except Exception as e:
        print(f"Mapping error for chunk: {e}")
        return {}

# --------------------------
# Aggregation and Deduplication
# --------------------------

def aggregate_mappings(mappings: list) -> dict:
    """
    Aggregate a list of mapping dictionaries into a deduplicated dict keyed by control code.
    Merge explanations for duplicate control codes.
    """
    aggregated = {}
    for mapping in mappings:
        code = mapping.get("control_code")
        if not code:
            continue
        if code in aggregated:
            existing_expl = aggregated[code]["explanation"]
            new_expl = mapping.get("explanation", "")
            if new_expl and new_expl not in existing_expl:
                aggregated[code]["explanation"] += " " + new_expl
        else:
            aggregated[code] = {
                "description": mapping.get("description", ""),
                "explanation": mapping.get("explanation", "")
            }
    return aggregated

# --------------------------
# Main Workflow
# --------------------------

def main():
    pdf_path = "PCI-DSS-ROC-Template.pdf"       # Path to your PCI-DSS ROC PDF
    image_path = "Connfido Network Diagram.png"  # Client screenshot image path

    print("Processing PDF and loading cached sections...")
    chunks = load_or_process_pdf(pdf_path)
    if not chunks:
        print("No processed sections found. Exiting.")
        return

    print("Found sections:")
    for item in chunks:
        print(f"--- {item['title']} ---")

    print("Extracting OCR text from image...")
    image_ocr = ocr_image(image_path)

    mapping_outputs = []
    # Loop over each chunk in the processed sections
    for item in chunks:
        text = item['text']
        if not text.strip():
            continue
        if check_chunk_relevance(text, image_ocr):
            print(f"Chunk from section '{item['title']}' is relevant. Mapping control...")
            mapping = map_chunk_to_control(text, image_ocr, image_path)
            if mapping and mapping.get("control_code"):
                mapping_outputs.append(mapping)

    if not mapping_outputs:
        print("No mapped controls found for the given image. Exiting.")
        return

    aggregated = aggregate_mappings(mapping_outputs)
    print("Final Aggregated Mapped Controls:")
    for code, details in aggregated.items():
        print(f"\nControl: {code}")
        print(f"Description: {details['description']}")
        print(f"Explanation: {details['explanation']}")

if __name__ == "__main__":
    main()

Processing PDF and loading cached sections...
Section 'General' is very large (150575 tokens); splitting further.


TypeError: merge_small_chunks() got an unexpected keyword argument 'min_tokens'