In [1]:
from openai import OpenAI
import base64
from mimetypes import guess_type
from PIL import Image
import fitz
import pytesseract
from mimetypes import guess_type
import tiktoken

In [None]:
client = OpenAI(api_key="") 

In [3]:
def extract_text_from_pdf(pdf_path):
    
    # Open the PDF and extract full text
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()

    return full_text

In [13]:
def split_text_into_chunks(text: str, max_tokens: int = 2000, overlap: int = 100) -> list:
    """
    Splits a long text into overlapping chunks based on token count.
    Uses the tiktoken library for accurate token counting.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk_tokens = tokens[start:end]
        chunk_text = encoding.decode(chunk_tokens)
        chunks.append(chunk_text)
        start = end - overlap  # overlap for context continuity
    return chunks

In [14]:
def get_full_context_text(text: str, max_tokens: int = 2000, overlap: int = 100) -> str:
    """
    If the text is too long, splits it into chunks and then concatenates them
    with clear markers so that the model sees the entire content in an organized form.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    else:
        chunks = split_text_into_chunks(text, max_tokens, overlap)
        # Join all chunks with clear markers
        combined_text = "\n\n".join(f"--- Chunk {i+1} ---\n{chunk}" for i, chunk in enumerate(chunks))
        return combined_text

In [4]:
# Extract text from image by ocr
def ocr_image(image_path):
    image = Image.open(image_path) # Open image using Pillow
    ocr_text = pytesseract.image_to_string(image)
    
    return ocr_text

In [5]:
def image_to_data_url(image_path: str) -> str:
    """
    Converts a local image file to a base64 data URL.
    """
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode("utf-8")
    data_url = f"data:{mime_type};base64,{base64_encoded_data}"
    return data_url

In [6]:
# Extract text from the PCI-DSS template PDF
print("Extracting PCI-DSS template text from PDF.")
pdf_path = "PCI-DSS-ROC-Template.pdf"  
pci_template_text = extract_text_from_pdf(pdf_path)
if not pci_template_text.strip():
    print("No text could be extracted from the PDF. Please check the file.")

Extracting PCI-DSS template text from PDF.


In [16]:
# Use the chunking function to get full context text without crossing token limits.
full_context_text = get_full_context_text(pci_template_text, max_tokens=2000, overlap=100)

In [7]:
def map_image_to_pci_requirement(pdf_text: pci_template_text, image_path: str) -> str:
    """
    Sends a prompt with the extracted PCI-DSS template text and an image
    (in data URL format) to GPT-4 Vision and returns the model's response.
    """
    # Extracting text from image:
    image_text = ocr_image(image_path)
    
    # Preprocess the client screenshot image
    print("Converting image to data URL...")
    image_data_url = image_to_data_url(image_path)
    
    # Detailed prompt using the PCI-DSS controls text
    prompt = f"""
    You are an expert in PCI-DSS compliance. 
    Below is an excerpt from a PCI-DSS Report on Compliance Template containing the controls and requirements.
    A client has provided a screenshot showing details of their network and security configuration.
    Analyze the image and identify which specific control requirement is being addressed.
    Provide the control requirement code (e.g., 'Requirement 1.1.1') along with a detailed explanation of 
    how the information in the given image satisfies that requirement.
    PCI-DSS Template Excerpt: \n
    {pdf_text}...\n

    Context text from image: \n
    {image_text}
    Please be as specific as possible in your mapping.
    """

    # Construct the messages for the ChatCompletion API.
    # The user message is given as an array with both a text segment and the image.
    messages = [
        {"role": "system", "content": "You are a PCI-DSS compliance expert."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_data_url}}
            ]
        }
    ]

    # Call the GPT-4 Vision API (model name may vary, e.g., "gpt-4-vision-preview")
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # adjust to your available model identifier
        messages=messages,
        max_tokens=500
    )

    # Extract and return the answer text from the response.
    answer = response.choices[0].message.content
    return answer


In [8]:
def main():
    # File paths for your PCI-DSS ROC template PDF and client screenshot
    # "card_tokenization_flow.jpg"
    # "card_decryption_flow.jpg"
    # "transaction_data_flow.jpg"
    # "Connfido Network Diagram.png"
    image_path = "Connfido Network Diagram.png" # screenshot file path

    # Map the screenshot to a specific PCI-DSS control using GPT-4 Vision
    print("Mapping image to PCI-DSS requirement...")
    result = map_image_to_pci_requirement(pci_template_text, image_path)

    # Print the result
    print("GPT Response:")
    print(result)


In [9]:
if __name__ == "__main__":
    main()

Mapping image to PCI-DSS requirement...
Converting image to data URL...


BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 165779 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}