# TNQ PDF Visuals Pipeline
**Author:** Mohankumar E M

## Overview
This notebook implements a comprehensive pipeline for extracting, analyzing, and verifying visual content from PDF documents. It is designed to automate the processing of scientific papers or technical documents.

## Pipeline Modules
1. **PDF Figure Extraction**: Extracts images and their corresponding captions.
2. **Table Detection & Parsing**: Identifies tables and converts them into structured CSV data.
3. **OCR & Text Extraction**: Extracts embedded text from non-table figures.
4. **Metadata Enrichment**: Categorizes figures and generates keywords using NLP.
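5. **Figure Complexity Estimator**: Estimates the visual complexity of figures on a 1–5 scale. The current implementation is a simplified rule-based score (table flag, amount of extracted text, caption length) that stands in for a trained Machine Learning model.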
6. **AI-Generated Content Verification**: Flags captions that may be AI-generated. The current implementation is a placeholder that checks each caption for a small set of indicator words.

## Usage
1. Set the `PDF_PATH` variable in the **Setup** section to your target PDF file.
2. Run all cells to execute the pipeline.
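3. Check the `output_results` directory for the extracted figure images (`.png`), the parsed table `.csv` files, and the summary file `final_pipeline_results.csv`.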

## 1. Setup & Dependencies

In [None]:
# Install necessary packages
!pip install PyMuPDF pytesseract pdf2image pandas pillow opencv-python scikit-learn transformers nltk
!apt-get install poppler-utils tesseract-ocr

import fitz  # PyMuPDF
import re
import os
import cv2
import json
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from transformers import pipeline
import nltk
from nltk.corpus import stopwords

# Load the English stop-word list, downloading the NLTK corpus if needed.
# If NLTK or the corpus is unavailable, fall back to a small built-in list so
# keyword extraction can still run. `except Exception` (rather than a bare
# `except:`) keeps KeyboardInterrupt/SystemExit from being swallowed.
try:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
except Exception:
    stop_words = {'the', 'and', 'of', 'in', 'to', 'a', 'is', 'for', 'on', 'with'}

# Configuration
PDF_PATH = "sample_paper.pdf"  # Change this to your PDF file path
OUTPUT_DIR = "output_results"  # Extracted images and CSV files are written here

# Create the output directory on first run; exist_ok guards against the
# directory appearing between the check and the creation.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Created output directory: {OUTPUT_DIR}")

## 2. Module 1: PDF Figure Extraction
Extracts figures and captions from the PDF document.

In [None]:
def extract_figures_with_captions(pdf_path, output_dir):
    """
    Extracts all figures with their captions from a PDF file.

    Captions are collected from the document text with a "Figure N: ..." /
    "Figure N- ..." pattern. Embedded images larger than 50x50 pixels are then
    saved as PNG files into output_dir. The n-th saved image is paired with
    the caption numbered n, so the pairing assumes images are encountered in
    figure order.

    Args:
        pdf_path: Path to the PDF document to scan.
        output_dir: Existing directory that receives the extracted PNG files.

    Returns:
        A list of dicts, one per saved image, with the keys "Figure_ID",
        "Filename", "Caption", "Page" (1-based) and "Image_Index" (1-based
        position within the page's image list). Mock data is returned when
        pdf_path does not exist, and an empty list when the PDF cannot be
        opened.
    """
    print("="*60)
    print("MODULE 1: PDF FIGURE EXTRACTION")
    print("="*60)

    if not os.path.exists(pdf_path):
        print(f"Error: File {pdf_path} not found! Please upload a PDF or check the path.")
        # Returning mock data for demonstration if file is missing
        return get_mock_figures_data()

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f" Error opening PDF: {e}")
        return []

    figures = []
    figure_count = 0

    # try/finally so the document handle is released even when scanning fails.
    try:
        print(f" PDF opened successfully: {pdf_path}")
        print(f" Total pages: {len(doc)}")

        # 1. Extract Captions from Text
        print("\n Scanning for captions in document...")
        full_text = "".join(page.get_text() for page in doc)

        # Regex to find Figure captions (e.g., "Figure 1: ...")
        figure_pattern = r'Figure\s+(\d+)[:\-]\s*([^\n\r]+)'
        figure_matches = re.findall(figure_pattern, full_text, re.IGNORECASE)
        print(f" Found {len(figure_matches)} figure captions in text")

        # Later matches for the same figure number overwrite earlier ones.
        caption_dict = {fig_num: text.strip() for fig_num, text in figure_matches}

        # 2. Extract Images
        print("\n Extracting images from pages...")
        for page_num, page in enumerate(doc, start=1):
            image_list = page.get_images()

            if image_list:
                print(f" Page {page_num}: Found {len(image_list)} images")

            for img_index, img in enumerate(image_list):
                pix = None
                try:
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # Filter out small icons or artifacts
                    if pix.width <= 50 or pix.height <= 50:
                        continue

                    # Four or more colour components (alpha excluded) means
                    # CMYK, which must be converted to RGB before saving as
                    # PNG. Previously such images were skipped entirely.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    # Only consume a figure number once the file is written,
                    # so a failed save does not shift the caption pairing.
                    candidate = figure_count + 1
                    filename = os.path.join(output_dir, f"Figure_{candidate}.png")
                    pix.save(filename)
                    figure_count = candidate

                    # Match with caption
                    caption = caption_dict.get(str(figure_count), "Caption not found")

                    figures.append({
                        "Figure_ID": f"Figure_{figure_count}",
                        "Filename": filename,
                        "Caption": caption,
                        "Page": page_num,
                        "Image_Index": img_index + 1
                    })
                    print(f"    Saved {filename} | Caption: \"{caption[:30]}...\"")
                except Exception as e:
                    # Best effort: report the failure and move on to the next image.
                    print(f"    Error extracting image {img_index} on page {page_num}: {e}")
                finally:
                    pix = None  # drop the pixmap reference before the next image
    finally:
        doc.close()

    print(f"\n EXTRACTION COMPLETE! Total figures: {len(figures)}")
    return figures

def get_mock_figures_data():
    """Build the two placeholder figure records used when no PDF is available.

    Returns:
        A new list of dicts with the same keys as real extraction results:
        "Figure_ID", "Filename", "Caption", "Page" and "Image_Index".
    """
    print(" Generating MOCK DATA for demonstration...")

    # (figure number, page, caption) for each sample record.
    samples = (
        (1, 1, "Map showing the Yeshan iron tailings location and sample collection sites"),
        (2, 2, "Particle size distributions of mineralogical phases"),
    )

    return [
        {
            "Figure_ID": f"Figure_{number}",
            "Filename": f"Figure_{number}.png",
            "Caption": caption_text,
            "Page": page_number,
            "Image_Index": 1,
        }
        for number, page_number, caption_text in samples
    ]

## 3. Module 2: Table Detection & Parsing
Detects if an extracted figure is a table and parses it into a CSV file.

In [None]:
def detect_table_in_image(image_path):
    """Check whether an image shows table-like ruling lines, using OpenCV.

    The image is binarised (inverted threshold at 150) and morphologically
    opened with a 25-pixel-wide and a 25-pixel-tall kernel. The non-zero
    pixels that survive each opening are counted.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A (flag, message) tuple. The flag is True when more than 100
        horizontal and more than 50 vertical pixels survive. The message
        holds the two counts, or the reason the check could not be run.
    """
    try:
        if not os.path.exists(image_path):
            return False, "File not found"

        image = cv2.imread(image_path)
        if image is None:
            return False, "Cannot read image"

        grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)[1]

        def surviving_pixels(kernel_size):
            # Opening keeps only strokes at least as long as the kernel.
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
            opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=2)
            return cv2.countNonZero(opened)

        horizontal_lines = surviving_pixels((25, 1))
        vertical_lines = surviving_pixels((1, 25))

        has_structure = horizontal_lines > 100 and vertical_lines > 50
        summary = f"H-lines: {horizontal_lines}, V-lines: {vertical_lines}"
        return has_structure, summary
    except Exception as e:
        return False, f"Error: {e}"

def parse_table_to_csv(image_path, output_csv_path):
    """OCR a table image with Tesseract and write the recognised cells to CSV.

    Each non-empty OCR line becomes a row; cells are split on tabs or runs of
    two or more whitespace characters. Short rows are padded with empty
    strings and the columns are named Column_1, Column_2, ...

    Args:
        image_path: Image to run OCR on.
        output_csv_path: Destination path of the CSV file.

    Returns:
        (True, DataFrame) when at least one row was recognised and written,
        otherwise (False, None). Any error is printed and reported as
        (False, None).
    """
    try:
        ocr_output = pytesseract.image_to_string(image_path, config=r'--oem 3 --psm 6')

        rows = []
        for raw_line in ocr_output.strip().split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue
            pieces = (part.strip() for part in re.split(r'\s{2,}|\t', stripped))
            fields = [piece for piece in pieces if piece]
            if fields:
                rows.append(fields)

        if not rows:
            return False, None

        # Pad every row to the widest one so the frame is rectangular.
        width = max(map(len, rows))
        padded_rows = [fields + [''] * (width - len(fields)) for fields in rows]
        header = [f'Column_{position}' for position in range(1, width + 1)]

        frame = pd.DataFrame(padded_rows, columns=header)
        frame.to_csv(output_csv_path, index=False)
        return True, frame
    except Exception as e:
        print(f"Error parsing table: {e}")
        return False, None

def process_tables(figures_data, output_dir):
    """Flag which figures are tables and parse those into CSV files.

    A figure counts as a table when its caption contains one of the table
    keywords (substring match, case-insensitive) or when
    detect_table_in_image reports table structure in the image.

    Args:
        figures_data: List of figure dicts with at least "Figure_ID",
            "Filename" and "Caption".
        output_dir: Directory that receives "<Figure_ID>_table.csv" files.

    Returns:
        A new list of shallow copies of the input dicts. Every copy has
        "Is_Table" (bool) and "CSV_File" (path of the parsed CSV, or None
        when the figure is not a table or parsing failed). The input dicts
        are not modified.
    """
    print("="*60)
    print("MODULE 2: TABLE DETECTION & PARSING")
    print("="*60)
    results = []
    table_keywords = ('table', 'tabular', 'data', 'summary', 'results')

    for figure in figures_data:
        # Lowercase once per figure rather than once per keyword.
        caption = figure['Caption'].lower()
        caption_is_table = any(k in caption for k in table_keywords)
        structure_is_table, _ = detect_table_in_image(figure['Filename'])
        is_table = caption_is_table or structure_is_table

        result = figure.copy()
        result['Is_Table'] = is_table
        # Always present so every record carries the same set of keys.
        result['CSV_File'] = None

        if is_table:
            csv_name = os.path.join(output_dir, f"{figure['Figure_ID']}_table.csv")
            success, _ = parse_table_to_csv(figure['Filename'], csv_name)
            if success:
                result['CSV_File'] = csv_name
            print(f" {figure['Figure_ID']} -> Detected as Table. Parsed: {success}")
        else:
            print(f" {figure['Figure_ID']} -> Not a table")

        results.append(result)
    return results

## 4. Module 3: OCR & Text Extraction
Extracts text from figures that are NOT tables.

In [None]:
def extract_text_from_figure(image_path):
    """Run Tesseract OCR on a figure image and return its text lines.

    The image is converted to grayscale and binarised (threshold 150) before
    OCR. This is best effort: a missing file, an unreadable image or an OCR
    failure yields an empty list instead of raising.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list of stripped text lines longer than one character; empty when
        nothing could be extracted.
    """
    try:
        if not os.path.exists(image_path):
            return []

        img = cv2.imread(image_path)
        if img is None:
            # imread returns None rather than raising for unreadable files.
            print(f"Error extracting text: cannot read image {image_path}")
            return []

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
        text = pytesseract.image_to_string(thresh, config=r'--oem 3 --psm 6')
        return [line.strip() for line in text.split('\n') if len(line.strip()) > 1]
    except Exception as e:
        # Was a bare `except:`; report the failure instead of hiding it, and
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"Error extracting text from {image_path}: {e}")
        return []

def process_ocr(figures_data):
    """Attach OCR text to every figure that is not flagged as a table.

    Args:
        figures_data: List of figure dicts; entries whose "Is_Table" is
            truthy are left untouched.

    Returns:
        The same list object. Non-table entries gain an "Extracted_Text"
        list, updated in place.
    """
    rule = "="*60
    print(rule)
    print("MODULE 3: OCR & TEXT EXTRACTION")
    print(rule)

    for entry in figures_data:
        if entry.get('Is_Table', False):
            continue  # tables are handled by the table parser instead
        lines = extract_text_from_figure(entry['Filename'])
        entry['Extracted_Text'] = lines
        print(f" {entry['Figure_ID']} -> Extracted {len(lines)} text lines")

    return figures_data

## 5. Module 4: Metadata Enrichment
Categorizes figures and extracts keywords.

In [None]:
def categorize_figure(caption, is_table):
    """Assign a coarse category to a figure from its caption.

    Matching is a case-insensitive substring test, checked in the order
    table, map, diagram, chart.

    Args:
        caption: Caption text of the figure.
        is_table: True when the figure was already detected as a table.

    Returns:
        One of "table", "map", "diagram", "chart" or "image".
    """
    if is_table:
        return "table"

    caption = caption.lower()
    if any(w in caption for w in ('map', 'location')):
        return "map"
    # Must run before the chart test: 'chart' is a substring of 'flowchart',
    # so checking charts first would label every flowchart as a chart.
    if any(w in caption for w in ('diagram', 'flowchart')):
        return "diagram"
    if any(w in caption for w in ('chart', 'plot', 'graph')):
        return "chart"
    return "image"

def enrich_metadata(figures_data):
    """Add keywords and a category to every figure record.

    Keywords are the five most frequent alphabetic words of three or more
    letters in the caption and any extracted text, after removing the
    module-level stop_words. The category comes from categorize_figure.

    Args:
        figures_data: List of figure dicts with at least "Figure_ID" and
            "Caption"; "Extracted_Text" and "Is_Table" are optional.

    Returns:
        The same list object. Each entry gains "Keywords" (list of str) and
        "Category" (str), updated in place.
    """
    # Imported once per call instead of on every loop iteration.
    from collections import Counter

    print("="*60)
    print("MODULE 4: METADATA ENRICHMENT")
    print("="*60)

    word_pattern = re.compile(r'\b[a-zA-Z]{3,}\b')

    for figure in figures_data:
        # Keyword Extraction
        text_content = figure['Caption'] + " " + " ".join(figure.get('Extracted_Text', []))
        words = word_pattern.findall(text_content.lower())
        keywords = [w for w in words if w not in stop_words]
        figure['Keywords'] = [w for w, _ in Counter(keywords).most_common(5)]

        # Categorization
        figure['Category'] = categorize_figure(figure['Caption'], figure.get('Is_Table', False))
        print(f" {figure['Figure_ID']} -> Category: {figure['Category']} | Keywords: {figure['Keywords']}")
    return figures_data

## 6. Module 5 & 6: Complexity & Verification
Estimates complexity and verifies content authenticity.

In [None]:
def estimate_complexity(figures_data):
    """Score each figure's complexity from 1 to 5 with a simple heuristic.

    Every figure starts at 1 and gains 2 points when flagged as a table,
    1 point for more than 10 extracted text lines, and 1 point for a caption
    longer than 20 words. This is simplified demonstration logic, not a
    trained model.

    Args:
        figures_data: List of figure dicts with at least "Figure_ID" and
            "Caption".

    Returns:
        The same list object. Each entry gains "Complexity_Score" (int) and
        "Complexity_Level" (str), updated in place.
    """
    rule = "="*60
    print(rule)
    print("MODULE 5: COMPLEXITY ESTIMATION")
    print(rule)

    level_names = ("Very Simple", "Simple", "Medium", "Complex", "Very Complex")

    for fig in figures_data:
        points = 1
        points += 2 if fig.get('Is_Table') else 0
        points += int(len(fig.get('Extracted_Text', [])) > 10)
        points += int(len(fig['Caption'].split()) > 20)

        capped = min(points, 5)
        fig['Complexity_Score'] = capped
        fig['Complexity_Level'] = level_names[capped - 1]
        print(f" {fig['Figure_ID']} -> Score: {fig['Complexity_Score']} ({fig['Complexity_Level']})")

    return figures_data

def verify_ai_content(figures_data):
    """Flag captions that contain words treated as AI-writing indicators.

    This is placeholder logic: a caption is suspected when its lowercase
    form contains any of the indicator words as a substring.

    Args:
        figures_data: List of figure dicts with at least "Figure_ID" and
            "Caption".

    Returns:
        The same list object. Each entry gains "AI_Suspected" (bool),
        updated in place.
    """
    rule = "="*60
    print(rule)
    print("MODULE 6: AI CONTENT VERIFICATION")
    print(rule)

    indicator_words = ('moreover', 'furthermore', 'comprehensive')

    for fig in figures_data:
        lowered = fig['Caption'].lower()
        flagged = False
        for indicator in indicator_words:
            if indicator in lowered:
                flagged = True
                break
        fig['AI_Suspected'] = flagged
        print(f" {fig['Figure_ID']} -> AI Suspected: {flagged}")

    return figures_data

## 7. Pipeline Execution

In [None]:
# Run the full pipeline end to end. Every stage receives the list of figure
# records from the previous stage and returns it with additional fields.
print("STARTING PIPELINE...\n")

# 1. Extract: save figure images and pair them with captions
#    (mock records are returned when PDF_PATH does not exist).
figures_data = extract_figures_with_captions(PDF_PATH, OUTPUT_DIR)

# 2. Table Detection: adds 'Is_Table' and a 'CSV_File' entry for figures
#    that were parsed as tables.
figures_data = process_tables(figures_data, OUTPUT_DIR)

# 3. OCR: adds 'Extracted_Text' to figures that are not tables.
figures_data = process_ocr(figures_data)

# 4. Metadata: adds 'Keywords' and 'Category'.
figures_data = enrich_metadata(figures_data)

# 5. Complexity: adds 'Complexity_Score' and 'Complexity_Level'.
figures_data = estimate_complexity(figures_data)

# 6. AI Verification: adds 'AI_Suspected'.
figures_data = verify_ai_content(figures_data)

# Save Final Results: one row per figure, written inside OUTPUT_DIR.
df = pd.DataFrame(figures_data)
csv_path = os.path.join(OUTPUT_DIR, "final_pipeline_results.csv")
df.to_csv(csv_path, index=False)
print(f"\nPipeline Completed! Results saved to {csv_path}")
# Final expression of the cell: previews the first rows of the results.
df.head()