<a href="https://colab.research.google.com/github/mohankumar-cybersec/mohankumar-cybersec/blob/main/TNQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install PyMuPDF pytesseract opencv-python pandas numpy matplotlib pillow

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.4 pytesseract-0.3.13


In [None]:
import fitz
import re

In [None]:

def extract_figures_simple(pdf_path):
    """Extract every embedded image of a PDF and save each one as a PNG file.

    Files are written to the current working directory and named
    ``figure_page<P>_<I>.png`` where ``P`` is the 1-based page number and
    ``I`` the 1-based position of the image in that page's image list.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        A list with one dict per successfully saved image, holding the keys
        ``"id"`` (``fig_<P>_<I>``), ``"filename"`` and ``"page"`` (1-based).
        Images that fail to convert are reported on stdout and skipped.
    """
    print("🔍 Extracting figures from PDF...")

    figures = []
    # The context manager closes the document even if an unexpected error
    # escapes the per-image handler below.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(), start=1):
                try:
                    xref = img[0]  # first entry of an image-list item is its xref
                    pix = fitz.Pixmap(doc, xref)

                    # PNG can only hold gray or RGB data. Pixmaps with four or
                    # more colour components (e.g. CMYK) must be converted to
                    # RGB first, otherwise the export raises and the figure is lost.
                    if pix.n - pix.alpha > 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    pix = None  # release the pixmap's native memory promptly

                    # Save figure
                    filename = f"figure_page{page_num}_{img_index}.png"
                    with open(filename, "wb") as f:
                        f.write(img_data)

                    figures.append({
                        "id": f"fig_{page_num}_{img_index}",
                        "filename": filename,
                        "page": page_num
                    })
                    print(f"✅ Extracted: {filename}")

                except Exception as e:
                    # Best effort: one broken image must not abort the whole run.
                    print(f"❌ Error: {e}")

    return figures

# Run the extraction on the sample paper. The PNG files are written to the
# current working directory, and `figures` is reused by the later cells.
figures = extract_figures_simple("/Sample paper.pdf")
print(f"Total figures extracted: {len(figures)}")

🔍 Extracting figures from PDF...
✅ Extracted: figure_page3_1.png
✅ Extracted: figure_page6_1.png
✅ Extracted: figure_page6_2.png
✅ Extracted: figure_page8_1.png
✅ Extracted: figure_page9_1.png
✅ Extracted: figure_page9_2.png
✅ Extracted: figure_page11_1.png
✅ Extracted: figure_page12_1.png
Total figures extracted: 8


In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the visible cells read "/Sample paper.pdf", not a path under
# /content/drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def _find_captions(text, label_pattern):
    """Return the caption text following each ``<label> <number><sep>`` line start."""
    # Heuristic: a caption starts a line with the label ("Figure", "Fig.", ...),
    # a number, and one separator out of ":", ".", "-", en dash or em dash.
    # The look-ahead stops "Figure 1.5" from being read as caption "5".
    pattern = (
        r'^[ \t]*' + label_pattern +
        r'\s*\d+\s*[:.\-\u2013\u2014](?!\d)\s*([^\n]+)'
    )
    matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
    return [caption.strip() for caption in matches]


def extract_all_captions(pdf_path):
    """Collect figure and table caption texts from the plain text of a PDF.

    The previous pattern only accepted ``Figure N:`` / ``Figure N-`` and so
    missed the common ``Fig. N.`` and ``Figure N.`` caption styles. Captions
    are now recognised at the start of a line with any of the separators
    ``: . -`` or an en/em dash.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        A tuple ``(fig_matches, table_matches)`` of two lists of caption
        strings (label and number removed), in order of appearance.
    """
    print("\n Extracting captions...")

    # Context manager closes the document even if text extraction fails;
    # join() avoids repeated string concatenation and guarantees that the
    # first line of every page starts on a fresh line.
    with fitz.open(pdf_path) as doc:
        all_text = "\n".join(page.get_text() for page in doc)

    # Look for figure captions ("Figure 1: ...", "Fig. 1. ...")
    fig_matches = _find_captions(all_text, r'(?:Figure|Fig\.?)')

    # Look for table captions ("Table 1: ...", "Table 1. ...")
    table_matches = _find_captions(all_text, r'Table')

    print("Found figure captions:", fig_matches)
    print("Found table captions:", table_matches)

    return fig_matches, table_matches

# Re-opens the same PDF path used by the figure-extraction cell; `fig_captions`
# is consumed by the metadata and summary cells further down.
fig_captions, table_captions = extract_all_captions("/Sample paper.pdf")


 Extracting captions...
Found figure captions: []
Found table captions: []


In [None]:
import pandas as pd

def create_metadata(figures, fig_captions, csv_path="extracted_metadata.csv"):
    """Build a per-figure metadata table and save it as a CSV file.

    Captions are paired with figures purely by position (the i-th caption goes
    to the i-th figure); figures without a counterpart get the placeholder
    ``"Caption not found"``.

    The category is derived from whole words in the caption, checked in this
    order: "map(s)" -> ``map``, "table(s)" -> ``table``, "image(s)" or "SEM"
    -> ``microscopy_image``, otherwise ``chart``. Whole-word matching avoids
    the false hits of plain substring tests (e.g. "stable" -> table,
    "assembly" -> SEM, "heatmap" -> map).

    Args:
        figures: List of dicts with the keys ``"id"``, ``"page"`` and
            ``"filename"``.
        fig_captions: List of caption strings.
        csv_path: Destination of the CSV file; defaults to
            ``"extracted_metadata.csv"`` in the current working directory.

    Returns:
        A DataFrame with the columns ``Figure_ID``, ``Caption``, ``Category``,
        ``Page`` and ``Filename`` (present even when ``figures`` is empty).
    """
    print("\n Creating metadata...")

    # (category, whole-word pattern) pairs; the first match wins.
    category_rules = (
        ("map", re.compile(r'\bmaps?\b', re.IGNORECASE)),
        ("table", re.compile(r'\btables?\b', re.IGNORECASE)),
        ("microscopy_image", re.compile(r'\b(?:images?|sem)\b', re.IGNORECASE)),
    )
    columns = ["Figure_ID", "Caption", "Category", "Page", "Filename"]

    metadata = []
    for i, fig in enumerate(figures):
        # Assign caption if available
        caption = fig_captions[i] if i < len(fig_captions) else "Caption not found"

        # Simple category detection; anything unrecognised is treated as a chart.
        category = next(
            (name for name, pattern in category_rules if pattern.search(caption)),
            "chart",
        )

        metadata.append({
            "Figure_ID": fig['id'],
            "Caption": caption,
            "Category": category,
            "Page": fig['page'],
            "Filename": fig['filename']
        })

    # Explicit columns keep the schema stable when there are no figures.
    df = pd.DataFrame(metadata, columns=columns)

    # Save to CSV
    df.to_csv(csv_path, index=False)
    print(f"Metadata saved to '{csv_path}'")
    return df

# Build the metadata table (this call also writes the CSV file). The bare last
# expression makes the notebook render the DataFrame as the cell output.
metadata_df = create_metadata(figures, fig_captions)
metadata_df


 Creating metadata...
Metadata saved to 'extracted_metadata.csv'


Unnamed: 0,Figure_ID,Caption,Category,Page,Filename
0,fig_3_1,Caption not found,chart,3,figure_page3_1.png
1,fig_6_1,Caption not found,chart,6,figure_page6_1.png
2,fig_6_2,Caption not found,chart,6,figure_page6_2.png
3,fig_8_1,Caption not found,chart,8,figure_page8_1.png
4,fig_9_1,Caption not found,chart,9,figure_page9_1.png
5,fig_9_2,Caption not found,chart,9,figure_page9_2.png
6,fig_11_1,Caption not found,chart,11,figure_page11_1.png
7,fig_12_1,Caption not found,chart,12,figure_page12_1.png


In [None]:
# OCR is optional: the demo is skipped with a message when pytesseract/Pillow
# cannot be imported, and any runtime failure is reported instead of raised.
try:
    import pytesseract
    from PIL import Image

    def simple_ocr_demo():
        """Run Tesseract OCR on the first extracted figure and print a preview.

        Reads the module-level ``figures`` list, opens the file named by the
        first entry's ``'filename'`` and prints at most the first 200
        characters of the recognised text (followed by ``...`` when cut).
        Prints a notice and returns when ``figures`` is empty.
        """
        print("\n Testing OCR on first figure...")
        if not figures:
            print("No figures to process")
            return

        first_fig = figures[0]['filename']
        # The context manager closes the underlying image file after OCR.
        with Image.open(first_fig) as image:
            text = pytesseract.image_to_string(image)

        print(f"OCR Text from {first_fig}:")
        print(text[:200] + "..." if len(text) > 200 else text)

    simple_ocr_demo()
except ImportError as e:
    print(f"OCR not available: {e}")
except Exception as e:
    # A missing image file or Tesseract binary is a runtime failure, not a
    # missing Python dependency, so it is reported with a distinct message.
    print(f"OCR failed: {e}")


 Testing OCR on first figure...
OCR Text from figure_page3_1.png:
Shandong Provineg

Yellow Sea

Yellow River Bejing

Anhui Province

East Sea

[sting

Zhejiang Province

 



In [None]:
def estimate_complexity(metadata):
    """Assign each figure a heuristic 1-5 complexity score.

    The score is the sum of a length part (one point per 20 caption
    characters, at most 4) and a per-category bonus (1 for categories not
    listed), capped at 5. The resulting table is printed and returned.

    Args:
        metadata: Iterable of dicts with the keys ``'Caption'``,
            ``'Category'`` and ``'Figure_ID'``.

    Returns:
        A DataFrame with the columns ``Figure_ID``, ``Caption``,
        ``Complexity_Score`` and ``Level``.
    """
    print("\n🎯 Estimating complexity...")

    level_names = ("Very Simple", "Simple", "Medium", "Complex", "Very Complex")
    bonus_by_category = {"map": 2, "table": 3, "microscopy_image": 1, "chart": 2}

    def score_of(record):
        # Length part is 0-4, the category part defaults to 1, the total tops out at 5.
        from_length = min(len(record['Caption']) // 20, 4)
        from_category = bonus_by_category.get(record['Category'], 1)
        return min(from_length + from_category, 5)

    rows = []
    for record in metadata:
        score = score_of(record)
        rows.append({
            "Figure_ID": record['Figure_ID'],
            "Caption": record['Caption'],
            "Complexity_Score": score,
            "Level": level_names[score - 1],
        })

    complexity_df = pd.DataFrame(rows)
    print(complexity_df)
    return complexity_df

# to_dict('records') turns the metadata DataFrame into the list of per-figure
# dicts that estimate_complexity iterates over.
complexity_df = estimate_complexity(metadata_df.to_dict('records'))


🎯 Estimating complexity...
  Figure_ID            Caption  Complexity_Score   Level
0   fig_3_1  Caption not found                 2  Simple
1   fig_6_1  Caption not found                 2  Simple
2   fig_6_2  Caption not found                 2  Simple
3   fig_8_1  Caption not found                 2  Simple
4   fig_9_1  Caption not found                 2  Simple
5   fig_9_2  Caption not found                 2  Simple
6  fig_11_1  Caption not found                 2  Simple
7  fig_12_1  Caption not found                 2  Simple


In [None]:
def ai_verification_placeholder(metadata):
    """Produce a stub "AI verification" table with fixed values per figure.

    No detection is performed: every figure gets the same hard-coded
    ``Image_Type``, ``Confidence`` and ``Notes`` values. The table is printed
    and returned.

    Args:
        metadata: Iterable of dicts that each contain a ``'Figure_ID'`` key.

    Returns:
        A DataFrame with the columns ``Figure_ID``, ``Image_Type``,
        ``Confidence`` and ``Notes``.
    """
    print("\n🤖 AI Verification (Basic Implementation)")

    # Placeholder logic - a real implementation would call an AI model here.
    ai_results = [
        {
            "Figure_ID": record['Figure_ID'],
            "Image_Type": "Human-created (assumed)",
            "Confidence": "High",
            "Notes": "Full AI detection requires specialized models",
        }
        for record in metadata
    ]

    ai_df = pd.DataFrame(ai_results)
    print(ai_df)
    return ai_df

# Feed the same per-figure records (one dict per DataFrame row) to the placeholder check.
ai_df = ai_verification_placeholder(metadata_df.to_dict('records'))


🤖 AI Verification (Basic Implementation)
  Figure_ID               Image_Type Confidence  \
0   fig_3_1  Human-created (assumed)       High   
1   fig_6_1  Human-created (assumed)       High   
2   fig_6_2  Human-created (assumed)       High   
3   fig_8_1  Human-created (assumed)       High   
4   fig_9_1  Human-created (assumed)       High   
5   fig_9_2  Human-created (assumed)       High   
6  fig_11_1  Human-created (assumed)       High   
7  fig_12_1  Human-created (assumed)       High   

                                           Notes  
0  Full AI detection requires specialized models  
1  Full AI detection requires specialized models  
2  Full AI detection requires specialized models  
3  Full AI detection requires specialized models  
4  Full AI detection requires specialized models  
5  Full AI detection requires specialized models  
6  Full AI detection requires specialized models  
7  Full AI detection requires specialized models  


In [None]:
# Final recap of the run. Only the figure and caption counts are computed from
# the kernel state (`figures`, `fig_captions`); the other lines are fixed text.
print(
    "=" * 50,
    "🚀 PIPELINE EXECUTION SUMMARY",
    "=" * 50,
    f"✅ Module 1: {len(figures)} figures extracted",
    f"✅ Module 2: {len(fig_captions)} captions found",
    "✅ Module 3: OCR demo implemented",
    "✅ Module 4: Metadata CSV created",
    "✅ Module 5: Complexity scores generated",
    "✅ Module 6: AI verification placeholder",
    "\n📁 Output Files:",
    "   - extracted_metadata.csv",
    "   - figure_*.png files",
    "\n🎯 Ready for submission!",
    sep="\n",
)

🚀 PIPELINE EXECUTION SUMMARY
✅ Module 1: 8 figures extracted
✅ Module 2: 0 captions found
✅ Module 3: OCR demo implemented
✅ Module 4: Metadata CSV created
✅ Module 5: Complexity scores generated
✅ Module 6: AI verification placeholder

📁 Output Files:
   - extracted_metadata.csv
   - figure_*.png files

🎯 Ready for submission!
