In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install pdf2image opencv-python ipywidgets ultralytics scikit-image pymupdf pygments

In [None]:
# System-level packages needed by the Python libraries installed in the previous cell.
!sudo apt-get update 
# presumably libgl1 provides the libGL shared library that opencv-python (cv2) loads on import - TODO confirm
!sudo apt-get install -y libgl1
# presumably poppler-utils provides the Poppler command-line tools pdf2image uses to rasterize PDFs - TODO confirm
!sudo apt-get install -y poppler-utils

In [None]:
import base64
import difflib
import html
import io
import os
import re
import shutil
from datetime import datetime

import cv2
import fitz
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pygments
from IPython.display import display, HTML
from pdf2image import convert_from_bytes
from PIL import Image, ImageChops, ImageEnhance
from pygments.formatters import HtmlFormatter
from pygments.lexers import DiffLexer
from skimage.metrics import structural_similarity as ssim
from ultralytics import YOLO

In [None]:
import ipywidgets as widgets

In [None]:
# Checkpoint locations for the two pretrained YOLO models used in this notebook
FORGERY_WEIGHTS = "../shared/model_training_scripts/register_models/forgery_detect_model.pt"
SIGNATURE_WEIGHTS = "../shared/model_training_scripts/register_models/signature_detect_model.pt"

# Model used later to judge the side-by-side genuine/questioned signature image
forgery_model = YOLO(FORGERY_WEIGHTS)

# Model used later to find signature bounding boxes on rendered PDF pages
signature_model = YOLO(SIGNATURE_WEIGHTS)



In [None]:
### USE VISION MODEL TO LOCATE AND EXTRACT SIGNATURES FROM A PDF DOCUMENT ###

In [None]:
# Upload widget: pick the single PDF whose signatures should be located
uploader = widgets.FileUpload(accept='.pdf', multiple=False)
display(uploader)

In [None]:
# Render every page of the uploaded PDF, detect signatures on it, save a crop of each
# confident detection under temp/, and plot the page with the kept detections outlined.

CONF_THRESHOLD = 0.5  # detections at or below this confidence are ignored

if not uploader.value:
    raise ValueError("No PDF uploaded - use the upload widget above before running this cell.")

# The scratch folder is emptied by the cleanup cell and may not exist on a fresh run.
os.makedirs("temp", exist_ok=True)

pdf_images = convert_from_bytes(uploader.value[0].content.tobytes(), fmt="jpeg")
print(f"PDF contains {len(pdf_images)} pages")

# page number -> list of (crop_path, confidence) for detections above the threshold
signatures_by_page = {}

for page_num, page_image in enumerate(pdf_images, 1):
    # The model is run on the saved file, so crops are taken from that same file.
    page_filename = f"temp/pdfimg_page{page_num}.jpeg"
    page_image.save(page_filename)
    print(f"Processing page {page_num}...")

    pred = signature_model(page_filename)
    signatures_this_page = []
    for p in pred:
        boxes = p.boxes

        if len(boxes) == 0:
            print(f"  No signatures detected on page {page_num}")
            continue

        # Results may be tensors; move them to host numpy arrays when they are.
        conf_scores = boxes.conf
        if hasattr(conf_scores, 'cpu'):
            conf_scores = conf_scores.cpu().numpy()

        bboxes = boxes.xyxy
        if hasattr(bboxes, 'cpu'):
            bboxes = bboxes.cpu().numpy()

        print(f"  Found {len(conf_scores)} potential signatures on page {page_num}")

        # Open the page once and reuse it for every crop on this page.
        with Image.open(page_filename) as saved_page:
            for j, conf in enumerate(conf_scores):
                if conf <= CONF_THRESHOLD:
                    continue

                # File name encodes page, 1-based detection index and confidence.
                crop_filename = f"signature_page{page_num}_det{j+1}_conf{conf:.2f}"
                x1, y1, x2, y2 = bboxes[j]
                crop_img = saved_page.crop((int(x1), int(y1), int(x2), int(y2)))

                crop_path = os.path.join('temp', f"{crop_filename}.jpg")
                crop_img.save(crop_path)

                signatures_this_page.append((crop_path, conf))
                print(f"  Saved signature {j+1} with confidence: {conf:.2f}")

        # Plot the page with a labelled green box around each kept detection.
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.imshow(plt.imread(page_filename))

        for j, (conf, box) in enumerate(zip(conf_scores, bboxes)):
            if conf <= CONF_THRESHOLD:
                continue  # below-threshold detections are not drawn
            x1, y1, x2, y2 = box
            ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       linewidth=2, edgecolor='green', facecolor='none'))
            ax.text(x1, y1 - 5, f"{j+1}: {conf:.2f}",
                    color='white', fontsize=12,
                    bbox=dict(facecolor='green', alpha=0.7))

        ax.set_title(f"Page {page_num} - Signatures (green: conf > 50%)")
        ax.axis('off')
        fig.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure so multi-page PDFs do not accumulate open figures

    # Store results for this page
    signatures_by_page[page_num] = signatures_this_page

    high_conf_count = len(signatures_this_page)


# Total number of kept signatures across all pages
total_signatures = sum(len(sigs) for sigs in signatures_by_page.values())


In [None]:
# Display the cropped signature images

def display_only_crops(signatures_by_page):
    """
    Display only the cropped signature images - nothing else.

    The images are embedded in the HTML as base64 data URIs, so the rendered
    output does not depend on the crop files still existing on disk (the
    cleanup cell deletes them) or on how the notebook front end serves files.

    Args:
        signatures_by_page: mapping whose values are lists of
            ``(crop_path, confidence)`` tuples; only the paths are used.
            Paths that do not exist on disk are skipped.

    Returns:
        None. Prints "No signature crops found" when no crop file exists,
        otherwise displays one HTML strip containing every crop.
    """
    crop_paths = [
        sig_path
        for page_signatures in signatures_by_page.values()
        for sig_path, _ in page_signatures
        if os.path.exists(sig_path)
    ]

    if not crop_paths:
        print("No signature crops found")
        return

    img_tags = []
    for sig_path in crop_paths:
        with open(sig_path, 'rb') as crop_file:
            encoded = base64.b64encode(crop_file.read()).decode('ascii')
        # Crops are written as .jpg by the extraction cell; allow .png as well.
        mime = 'image/png' if sig_path.lower().endswith('.png') else 'image/jpeg'
        img_tags.append(
            f'<img src="data:{mime};base64,{encoded}" '
            'style="max-width: 200px; max-height: 100px; margin: 5px;">'
        )

    html_content = (
        '<div style="display: flex; flex-wrap: wrap; gap: 10px;">'
        + ''.join(img_tags)
        + '</div>'
    )
    display(HTML(html_content))

# Show every signature crop recorded in signatures_by_page (built by the extraction cell)
display_only_crops(signatures_by_page)

In [None]:
### ANALYZE PDF FILES FOR EDITING OR MANIPULATIONS ###

In [None]:
# Upload widget: pick the single PDF to inspect for incremental edits
pdf_uploader = widgets.FileUpload(accept='.pdf', multiple=False)
display(pdf_uploader)

In [None]:
# Extract and save all versions of PDF
#
# Each saved revision of a PDF ends with a "startxref ... %%EOF" trailer, so the
# bytes from the start of the file up to each %%EOF form one earlier version.

if not pdf_uploader.value:
    raise ValueError("No PDF uploaded - use the upload widget above before running this cell.")

pdf_content = pdf_uploader.value[0].content.tobytes()

# The scratch folder is emptied by the cleanup cell and may not exist on a fresh run.
os.makedirs("temp", exist_ok=True)

startxref_positions = [m.start() for m in re.finditer(b'startxref', pdf_content)]

if len(startxref_positions) <= 1:
    print("No incremental updates detected in this PDF.")

print(f"Found {len(startxref_positions)} potential versions")

# End offset (exclusive) of each version: just past the %%EOF that follows its startxref
EOF_MARKER = b'%%EOF'
eof_positions = []
for pos in startxref_positions:
    eof_start = pdf_content.find(EOF_MARKER, pos)
    if eof_start == -1:
        # Without this guard, -1 + 5 would be treated as a 4-byte "version".
        print(f"  Skipping startxref at byte {pos}: no %%EOF marker follows it")
        continue
    eof_end = eof_start + len(EOF_MARKER)
    if eof_end not in eof_positions:  # two startxref hits before one %%EOF are the same version
        eof_positions.append(eof_end)

# Extract and save all versions
version_paths = []

for version_num, eof_end in enumerate(eof_positions, 1):
    output_path = f"temp/version_{version_num}.pdf"

    # Each version includes everything up to its EOF marker
    with open(output_path, 'wb') as f:
        f.write(pdf_content[:eof_end])

    version_paths.append(output_path)
    print(f"Version {version_num} saved as {output_path}")

In [None]:
# Compare the extracted text of consecutive PDF versions and save one unified diff per pair.

# Reset so the display cell never picks up a diff path left over from an earlier run.
diff_path = None

if len(version_paths) <= 1:
    print("Need at least two versions to compare")
else:
    # Extract the full text of every version first; a failed extraction is recorded as "".
    version_texts = []
    for path in version_paths:
        try:
            # The context manager closes the document even when get_text() raises.
            with fitz.open(path) as doc:
                version_texts.append("".join(page.get_text() for page in doc))
        except Exception as e:
            print(f"Error extracting text from {path}: {e}")
            version_texts.append("")

    # Compare consecutive versions
    for i in range(len(version_texts) - 1):
        prev_text = version_texts[i]
        curr_text = version_texts[i + 1]

        print(f"\nComparing Version {i+1} with Version {i+2}:")

        # An empty text means extraction failed or the version has no text layer.
        if not prev_text.strip() or not curr_text.strip():
            print(f"  Skipping comparison - text extraction failed or empty for Version {i+1} or Version {i+2}")
            continue

        if prev_text == curr_text:
            print("  No text differences detected")
            continue

        diff = list(difflib.unified_diff(
            prev_text.splitlines(),
            curr_text.splitlines(),
            fromfile=f'Version {i+1}',
            tofile=f'Version {i+2}',
            lineterm=''
        ))

        # The first two diff lines are the '---'/'+++' file headers, not changes,
        # so they are excluded from the counts.
        changed_lines = diff[2:]
        additions = sum(1 for line in changed_lines if line.startswith('+'))
        removals = sum(1 for line in changed_lines if line.startswith('-'))
        print(f"  Changes: {additions} additions, {removals} removals")

        # Save detailed diff next to the extracted versions
        output_dir = os.path.dirname(version_paths[0])
        diff_path = os.path.join(output_dir, f"diff_v{i+1}_v{i+2}.txt")

        with open(diff_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(diff))
        print(f"  Detailed differences saved to {diff_path}")

In [None]:

# Render the most recently saved diff as colour-coded HTML.

latest_diff_path = globals().get("diff_path")  # absent/None when no diff was written

if not latest_diff_path or not os.path.exists(latest_diff_path):
    print("No diff file to display - no text differences were saved by the comparison cell.")
else:
    with open(latest_diff_path, 'r', encoding='utf-8') as f:
        diff_content = f.read()

    html_lines = []
    for idx, line in enumerate(diff_content.split('\n')):
        # The diff holds text extracted from an uploaded PDF: escape it so it
        # cannot inject markup into the notebook output.
        safe_line = html.escape(line)
        if idx < 2 and line.startswith(('---', '+++')):
            # File headers must be tested before the generic '+'/'-' checks,
            # otherwise they are coloured as ordinary additions/removals.
            html_lines.append(f'<span style="font-weight: bold">{safe_line}</span>')
        elif line.startswith('@@'):
            html_lines.append(f'<span style="color: #6f42c1">{safe_line}</span>')
        elif line.startswith('+'):
            html_lines.append(f'<span style="background-color: #e6ffec; color: #22863a">{safe_line}</span>')
        elif line.startswith('-'):
            html_lines.append(f'<span style="background-color: #ffebe9; color: #cb2431">{safe_line}</span>')
        else:
            html_lines.append(safe_line)

    formatted_diff = '<pre style="font-family: monospace;">' + '<br>'.join(html_lines) + '</pre>'
    display(HTML(formatted_diff))

In [None]:
### Check a PDF for physical changes ###

In [None]:
# Upload widget: pick the single PDF to check for image-level alterations
pdf2_uploader = widgets.FileUpload(accept='.pdf', multiple=False)
display(pdf2_uploader)

In [None]:
# Rasterize the PDF chosen in pdf2_uploader (the widget for this section); the
# analysis below highlights and/or outlines areas of deviation on each page.
# The previous code read `uploader`, i.e. the PDF from the signature section.

if not pdf2_uploader.value:
    raise ValueError("No PDF uploaded - use the upload widget above before running this cell.")

pdf_images = convert_from_bytes(pdf2_uploader.value[0].content.tobytes(), fmt="jpeg")

class ModdedDocAnalyzer:
    """Error Level Analysis (ELA) helper for spotting altered regions in a page image.

    A page is re-compressed as JPEG and compared with itself; the amplified
    difference image is then thresholded and scanned for contiguous regions.
    """

    def __init__(self):
        # HSV range passed to cv2.inRange in detect_highlighted_areas: any hue,
        # saturation >= 10, value between 10 and 245.
        self.lower_bound = np.array([0, 10, 10])
        self.upper_bound = np.array([179, 255, 245])

    def convert_to_ela_image(self, image, quality=90):
        """Performs Error Level Analysis on an image.

        Args:
            image: PIL image to analyse. Non-RGB images are converted to RGB
                first, because JPEG cannot store alpha/palette modes and the
                scaling below expects one (min, max) pair per band.
            quality: JPEG quality used for the re-compression.

        Returns:
            PIL RGB image of the absolute difference between the input and its
            re-compressed copy, brightened so the largest difference maps to 255.
        """
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")

        # Re-compress in memory rather than through a temporary file
        recompressed_buffer = io.BytesIO()
        rgb_image.save(recompressed_buffer, format="JPEG", quality=quality)
        recompressed_buffer.seek(0)
        recompressed = Image.open(recompressed_buffer)

        ela_image = ImageChops.difference(rgb_image, recompressed)

        # Scale the differences to make them visible; guard against an all-zero difference
        max_diff = max(high for _, high in ela_image.getextrema()) or 1
        scale = 255.0 / max_diff

        return ImageEnhance.Brightness(ela_image).enhance(scale)

    def highlight_deviations(self, ela_image, threshold=20):
        """Highlights deviations in an ELA image based on a threshold.

        Returns a PIL image that is 255 wherever the corresponding ELA value
        (per channel) exceeds ``threshold`` and 0 elsewhere.
        """
        ela_array = np.array(ela_image)
        mask = (ela_array > threshold).astype(np.uint8) * 255
        return Image.fromarray(mask)

    def detect_highlighted_areas(self, image, min_area=50):
        """Detect and mark suspicious areas in the image.

        Args:
            image: BGR numpy array. It is modified in place: a green rectangle
                is drawn around every region that is kept.
            min_area: contours whose area is not greater than this are ignored.

        Returns:
            ``(suspicious_areas, image)`` where ``suspicious_areas`` is a list
            of ``(x, y, w, h)`` bounding boxes and ``image`` is the marked input.
        """
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

        mask = cv2.inRange(hsv, self.lower_bound, self.upper_bound)

        # Erode then dilate with a 5x5 kernel to remove small specks from the mask
        kernel = np.ones((5, 5), np.uint8)
        mask = cv2.erode(mask, kernel, iterations=1)
        mask = cv2.dilate(mask, kernel, iterations=1)

        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        suspicious_areas = []
        for contour in contours:
            if cv2.contourArea(contour) > min_area:
                x, y, w, h = cv2.boundingRect(contour)
                suspicious_areas.append((x, y, w, h))
                cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

        return suspicious_areas, image

    @staticmethod
    def _pil_to_base64(pil_img):
        """Encode a PIL image as a base64 PNG string."""
        buffered = io.BytesIO()
        pil_img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    @staticmethod
    def _cv_to_base64(cv_img):
        """Encode an OpenCV (numpy) image as a base64 PNG string."""
        _, buffer = cv2.imencode('.png', cv_img)
        return base64.b64encode(buffer).decode('utf-8')

    def analyze_pdf(self, pdf_img, quality=90, threshold=20):
        """Run the full ELA pipeline on one rendered PDF page.

        Args:
            pdf_img: PIL image of a single page.
            quality: JPEG quality forwarded to ``convert_to_ela_image``.
            threshold: cut-off forwarded to ``highlight_deviations``.

        Returns:
            A dict with ``suspicious_areas_count``, ``suspicious_areas`` (list
            of ``(x, y, w, h)``) and ``images`` (base64 PNG strings under
            ``ela_analysis``, ``deviation_mask`` and ``marked_areas``), or
            ``None`` when any step raises - the error is printed, not raised,
            so callers must check for ``None``.
        """
        try:
            ela_image = self.convert_to_ela_image(pdf_img, quality=quality)
            deviation = self.highlight_deviations(ela_image, threshold=threshold)

            # OpenCV works on BGR numpy arrays, PIL hands back RGB
            ela_array = np.array(ela_image)
            if len(ela_array.shape) == 3 and ela_array.shape[2] == 3:
                ela_array = cv2.cvtColor(ela_array, cv2.COLOR_RGB2BGR)

            suspicious_areas, marked_image = self.detect_highlighted_areas(ela_array)

            return {
                "suspicious_areas_count": len(suspicious_areas),
                "suspicious_areas": suspicious_areas,
                "images": {
                    "ela_analysis": self._pil_to_base64(ela_image),
                    "deviation_mask": self._pil_to_base64(deviation),
                    "marked_areas": self._cv_to_base64(marked_image)
                }
            }
        except Exception as e:
            print(f"Error analyzing PDF: {e}")
            return None

# Run the error-level analysis on every rendered page and show the deviation mask
# for pages where suspicious areas were found.
analyzer = ModdedDocAnalyzer()

for page_num, img in enumerate(pdf_images, 1):
    results = analyzer.analyze_pdf(img)

    # analyze_pdf prints the error and returns None when a page cannot be analysed
    if results is None:
        print(f"Page {page_num}: analysis failed, skipping.")
        continue

    if results.get('suspicious_areas_count') == 0:
        print("No suspicious areas found...")
    else:
        print(f"{results.get('suspicious_areas_count')} suspicious areas found. View below.")
        deviation_img = results.get('images').get('deviation_mask')
        image_d = base64.b64decode(deviation_img)
        dev_img = Image.open(io.BytesIO(image_d))

        # display() renders inline in the notebook; Image.show() would try to
        # launch an external viewer on the machine running the kernel.
        display(dev_img)


In [None]:
#### Forgery Detection ####

In [None]:
# 1. Upload widget: pick the image of the known genuine signature
sig_uploader = widgets.FileUpload(accept='image/*', multiple=False)
display(sig_uploader)

In [None]:
# Decode the uploaded genuine signature and save it as a JPEG for the comparison step

if not sig_uploader.value:
    raise ValueError("No genuine signature uploaded - use the upload widget above first.")

# The scratch folder is emptied by the cleanup cell and may not exist on a fresh run.
os.makedirs("temp", exist_ok=True)

genuine_sig = sig_uploader.value[0].content.tobytes()
# The widget accepts any image type, but JPEG cannot store alpha or palette
# modes (e.g. transparent PNGs), so normalise to RGB before saving.
image = Image.open(io.BytesIO(genuine_sig)).convert("RGB")
image.save("temp/genuine_sig.jpeg")

In [None]:
# 2. Upload widget: pick the image of the questioned signature
compare_uploader = widgets.FileUpload(accept='image/*', multiple=False)
display(compare_uploader)


In [None]:
# Decode the uploaded questioned signature and save it as a JPEG for the comparison step

if not compare_uploader.value:
    raise ValueError("No questioned signature uploaded - use the upload widget above first.")

# The scratch folder is emptied by the cleanup cell and may not exist on a fresh run.
os.makedirs("temp", exist_ok=True)

questioned_sig = compare_uploader.value[0].content.tobytes()
# The widget accepts any image type, but JPEG cannot store alpha or palette
# modes (e.g. transparent PNGs), so normalise to RGB before saving.
image = Image.open(io.BytesIO(questioned_sig)).convert("RGB")
image.save("temp/questioned_sig.jpeg")

In [None]:
# Combine images into a single image for prediction

def create_comparison_image(img1_path, img2_path):
    """Build one RGB image with the two input images placed side by side.

    Both images are scaled (keeping aspect ratio) to the height of the taller
    one; the first image is pasted on the left and the second immediately to
    its right.

    Args:
        img1_path: path of the image placed on the left.
        img2_path: path of the image placed on the right.

    Returns:
        A new PIL RGB image whose width is the sum of the two scaled widths.
    """
    left, right = (Image.open(path).convert("RGB") for path in (img1_path, img2_path))
    target_height = max(left.height, right.height)

    def _scale_to_target(picture):
        # Width follows the height change so the aspect ratio is preserved
        scaled_width = int(picture.width * (target_height / picture.height))
        return picture.resize((scaled_width, target_height), Image.LANCZOS)

    left = _scale_to_target(left)
    right = _scale_to_target(right)

    canvas = Image.new('RGB', (left.width + right.width, target_height))
    canvas.paste(left, (0, 0))
    canvas.paste(right, (left.width, 0))
    return canvas

# Genuine signature on the left, questioned signature on the right
img1_path, img2_path = 'temp/genuine_sig.jpeg', 'temp/questioned_sig.jpeg'

comparison = create_comparison_image(img1_path, img2_path)

In [None]:
#Step 3 - Compare for forgery with custom trained vision model

In [None]:
# Get Prediction
#
# Run the forgery model on the side-by-side image and render the top result
# as a small HTML verdict card.

# Fallback verdict, used when the model returns no usable summary; without it
# the HTML template below would fail with a NameError.
ccolor = '#ffc107'  # Amber
confidence_text = "UNCERTAIN"
conf_format = "0.0"

prediction = forgery_model.predict(comparison)
for p in prediction:
    prob = p.summary()
    if prob:
        # Only the first summary entry is used for the verdict
        pred = prob[0].get('name')
        confidence = prob[0].get('confidence', 0)
        conf_format = f"{confidence * 100:.1f}"

        label = pred.lower() if pred else ""
        if 'genuine' in label or 'authentic' in label:
            ccolor = '#28a745'  # Green
            confidence_text = "GENUINE"
        elif 'forg' in label or 'fake' in label:
            ccolor = '#dc3545'  # Red
            confidence_text = "FORGERY"
        else:
            ccolor = '#ffc107'  # Amber
            confidence_text = pred.upper() if pred else "UNCERTAIN"


conf_html = f"""
<div style="font-family: 'Segoe UI', Arial, sans-serif; max-width: 500px; margin: 20px auto; 
            padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); 
            background: #f8f9fa;">
    <h2 style="text-align: center; color: #343a40; margin-top: 0;">Signature Verification</h2>
    <hr style="border: 0; height: 1px; background-image: linear-gradient(to right, rgba(0,0,0,0), rgba(0,0,0,0.2), rgba(0,0,0,0));">
    
    <div style="display: flex; align-items: center; justify-content: center; margin: 25px 0;">
        <div style="width: 80px; height: 80px; border-radius: 50%; background-color: {ccolor}; 
                  display: flex; align-items: center; justify-content: center; color: white; 
                  font-size: 38px; font-weight: bold;"></div>
    </div>
    
    <div style="text-align: center; margin: 20px 0;">
        <h3 style="font-size: 24px; margin-bottom: 5px; color: {ccolor};">{confidence_text}</h3>
        <p style="font-size: 18px; margin-top: 5px; color: #6c757d;">
            with <span style="color: {ccolor}; font-weight: bold;">{conf_format}%</span> confidence
        </p>
    </div>
    
    <div style="background-color: rgba(0,0,0,0.05); border-radius: 5px; padding: 10px; margin-top: 15px;">
        <p style="margin: 0; color: #6c757d; font-size: 14px;">
            <strong>Confidence Level:</strong> {confidence_text}
        </p>
        <div style="height: 6px; background-color: #e9ecef; border-radius: 3px; margin-top: 8px;">
            <div style="width: {conf_format}%; height: 100%; background-color: {ccolor}; border-radius: 3px;"></div>
        </div>
    </div>
    
    <div style="margin-top: 20px; font-size: 12px; color: #adb5bd; text-align: center;">
        Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
    </div>
</div>
"""

# HTML and datetime come from the notebook's import cell; the card is the cell's output
HTML(conf_html)

In [None]:
#Clean Temp folder for next run
#
# Removes everything inside temp/ but keeps the folder itself. Deletion is
# best-effort: a failure on one entry is reported and the loop continues.

folder = 'temp'
if not os.path.isdir(folder):
    # Nothing to clean on a fresh checkout or after the folder was removed
    print(f"Nothing to clean - '{folder}' does not exist.")
else:
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            print(f"Failed to delete {file_path}. Reason: {e}")