<a href="https://colab.research.google.com/github/michaelwnau/consequential-products/blob/main/mindflayer_v1_0_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup and Imports
!pip install pdfplumber transformers torch Pillow matplotlib

In [None]:
import pdfplumber
import os
import io
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import textwrap
from google.colab import drive, files

In [None]:
# Mount Google Drive at /content/drive using the `drive` helper imported from google.colab.
# The PDF path configured in the next cell points inside this mount.
drive.mount('/content/drive')

In [None]:
# Set the path to your PDF file in Google Drive or other repository.
# This value is what gets passed to extract_text_and_images() in the extraction cell;
# edit it to point at the document you want to process.
pdf_file_path = '/content/drive/MyDrive/DTRA243-003-RESEARCH/s41467-024-45563-x.pdf'

In [None]:
# 1. Extract Text and Images from PDF
def extract_text_and_images(pdf_path):
    """Collect the text and the embedded-image records of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; handed directly to ``pdfplumber.open``.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the extracted text of all pages
        joined with newlines (pages for which ``extract_text()`` returns nothing
        are skipped). ``images`` is a flat list holding every entry of each
        page's ``page.images``, in page order.
    """
    page_texts = []
    images = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            images.extend(page.images)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts), images

# Run the extraction once; `text` and `images` are reused by the later cells.
text, images = extract_text_and_images(pdf_file_path)

# Preview the first 500 characters as a quick sanity check of the extraction.
print("Extracted Text Snippet:\n", text[:500], sep="\n")

In [None]:
# 2. Save Extracted Images
# Each image record collected during extraction is decoded with Pillow and written to
# extracted_images/image_<index>.png. Records that fail are reported and skipped,
# so image_paths only lists files whose save call completed.
os.makedirs('extracted_images', exist_ok=True)
image_paths = []

for i, img in enumerate(images):
    image_path = f'extracted_images/image_{i}.png'
    try:
        raw_bytes = img['stream'].get_data()
        pil_image = Image.open(io.BytesIO(raw_bytes))

        # Flatten anything carrying transparency (alpha modes, or a palette
        # image with a transparency entry) down to plain RGB before saving.
        has_alpha_mode = pil_image.mode in ('RGBA', 'LA')
        has_palette_transparency = pil_image.mode == 'P' and 'transparency' in pil_image.info
        if has_alpha_mode or has_palette_transparency:
            pil_image = pil_image.convert('RGB')

        pil_image.save(image_path, 'PNG')
    except Exception as e:
        print(f"Error processing image {i}: {str(e)}")
    else:
        image_paths.append(image_path)
        print(f"Saved image {i}")

print(f"Extracted and saved {len(image_paths)} images")

In [None]:
# 3. Interpret Images Using a Multimodal Model (BLIP)
# Load the BLIP captioning checkpoint, then caption every saved image.
# Any failure for a single image is reported and recorded as a placeholder caption
# so the loop always continues with the next file.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

captions = {}
cpu_device = torch.device('cpu')

for image_path in image_paths:
    try:
        rgb_image = Image.open(image_path).convert('RGB')
        model_inputs = processor(rgb_image, return_tensors="pt").to(cpu_device)
        generated_ids = model.generate(**model_inputs)
        captions[image_path] = processor.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as err:
        print(f"Error processing {image_path}: {str(err)}")
        captions[image_path] = "Caption generation failed"
    else:
        print(f"Generated caption for {image_path}")

In [None]:
# 3. Interpret Images Using a Multimodal Model (BLIP)
# Variant of the captioning step that skips files Pillow cannot identify
# instead of recording a placeholder caption for them.
from PIL import Image, UnidentifiedImageError

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

captions = {}

for image_path in image_paths:
    try:
        raw_image = Image.open(image_path).convert('RGB')
    except UnidentifiedImageError:
        # Pillow identifies the format from the file's bytes, so re-reading the
        # same bytes through an in-memory buffer cannot succeed where opening
        # the path failed; skip the file straight away.
        print(f"Skipping {image_path}: Could not identify image format.")
        continue

    # Preprocess the image, generate caption token ids, and decode them to text.
    inputs = processor(raw_image, return_tensors="pt").to(torch.device('cpu'))
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    captions[image_path] = caption

In [None]:
# 4. Display Results
# Lay out one row per image: the image on the left, its wrapped caption on the right.
# The figure is also exported to a PDF and offered for download.
if not image_paths:
    # GridSpec requires a positive number of rows, so with no images the
    # layout below would raise ValueError before anything is drawn.
    print("No images were extracted; nothing to display.")
else:
    fig = plt.figure(figsize=(15, len(image_paths) * 3))
    gs = gridspec.GridSpec(len(image_paths), 2, width_ratios=[1, 1], hspace=0.5)

    for idx, image_path in enumerate(image_paths):
        # Image subplot
        ax_img = fig.add_subplot(gs[idx, 0])
        # Open inside a context manager so the file handle is released once
        # the pixels have been handed to matplotlib.
        with Image.open(image_path) as img:
            ax_img.imshow(img)
        ax_img.axis('off')

        # Caption subplot
        ax_text = fig.add_subplot(gs[idx, 1])
        ax_text.axis('off')

        # Images without a stored caption fall back to a fixed placeholder.
        caption = captions.get(image_path, 'No caption available')
        wrapped_caption = textwrap.fill(f"Caption: {caption}", width=60)

        ax_text.text(0.5, 0.5, wrapped_caption,
                     ha='center', va='center', fontsize=8,
                     bbox=dict(facecolor='white', alpha=0.5, edgecolor='none'),
                     wrap=True)

    plt.tight_layout()

    # Save the figure as a PDF before showing it, so the export does not depend
    # on what the backend does with the figure once it has been displayed.
    pdf_filename = 'results_matplotlib.pdf'
    fig.savefig(pdf_filename, bbox_inches='tight')

    plt.show()

    # Download the PDF file
    files.download(pdf_filename)

In [None]:
# 5. Advanced Interpretation (Optional): Summarize the Extracted Text
from transformers import pipeline, AutoTokenizer
import warnings

# Silence FutureWarning messages for the remainder of the session.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# One checkpoint name drives both objects: the standalone tokenizer (used further
# down to count tokens) and the summarization pipeline itself.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline(task="summarization", model=model_name)

def clean_text(text):
    """Normalise extracted text: collapse whitespace and drop non-printable characters.

    Whitespace is collapsed *before* non-printable characters are removed.
    Newlines and tabs are themselves non-printable, so filtering first would
    delete them and glue together the words on either side of every line break.

    Args:
        text: Raw text to clean.

    Returns:
        The text with every run of whitespace replaced by a single space,
        non-printable characters removed, and no leading or trailing whitespace.
    """
    # Turn every whitespace run (including newlines/tabs) into one space first.
    normalized = ' '.join(text.split())
    # Remove any remaining non-printable characters.
    printable_only = ''.join(char for char in normalized if char.isprintable())
    # Dropping a character that sat between two spaces can leave a double space.
    return ' '.join(printable_only.split())

def chunk_text(text, max_chunk_size):
    """Split text into chunks of whole words, each at most ``max_chunk_size`` characters.

    Words are never broken: a single word longer than ``max_chunk_size``
    becomes a chunk of its own (and is the only case in which a chunk exceeds
    the limit). Chunk length is measured in characters of the space-joined
    words.

    Args:
        text: Text to split; words are separated on any whitespace.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings, in order. Empty or whitespace-only
        input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_size = 0
    for word in text.split():
        # A separating space is only needed when the chunk already has a word.
        separator = 1 if current_chunk else 0
        # Only flush a chunk that actually holds words; this avoids emitting
        # an empty chunk when the very first word is longer than the limit.
        if current_chunk and current_size + separator + len(word) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = len(word)
        else:
            current_chunk.append(word)
            current_size += separator + len(word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

cleaned_text = clean_text(text)

# Defined before branching so that cells reading `summaries` afterwards do not
# hit a NameError when the extracted text turns out to be empty.
summaries = []

if not cleaned_text:
    print("The extracted text is empty or contains only non-printable characters.")
else:
    # Tokenize the whole text once, only to report how many tokens it contains
    inputs = tokenizer(cleaned_text, return_tensors='pt', truncation=False)
    input_ids = inputs['input_ids']
    num_tokens = input_ids.shape[1]
    print(f"Number of tokens in extracted text: {num_tokens}")

    # Model's maximum input length
    max_model_length = 1024  # BART models typically have a max length of 1024 tokens
    print(f"Model's maximum input length: {max_model_length} tokens")

    # Chunk the text
    # NOTE(review): chunk_text measures chunks in characters, while this limit is
    # expressed in tokens, so the chunks are likely much shorter than the model
    # allows — confirm whether token-based chunking is wanted.
    text_chunks = chunk_text(cleaned_text, max_model_length)
    print(f"Number of chunks: {len(text_chunks)}")

    # Summarize each chunk; a failing chunk is reported and skipped so the
    # remaining chunks are still processed.
    for i, chunk in enumerate(text_chunks):
        if chunk.strip():  # Ensure the chunk is not empty
            try:
                summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False, clean_up_tokenization_spaces=True)
                summaries.append(summary[0]['summary_text'])
                print(f"Summarized chunk {i+1}/{len(text_chunks)}")
            except Exception as e:
                print(f"Error summarizing chunk {i+1}: {str(e)}")
                # Show at most the first 100 characters of the offending chunk.
                print("Chunk content:", chunk[:100] + "..." if len(chunk) > 100 else chunk)

    # Combine summaries
    if summaries:
        full_summary = ' '.join(summaries)
        print("\nSummary of the Article:\n")
        print(full_summary)
    else:
        print("No summaries were generated. The text might be too short or couldn't be processed.")


In [None]:
# Combine summaries
# Stitch the per-chunk summaries into a single string and print it under a heading.
full_summary = ' '.join(summaries)
print("Summary of the Article:\n", full_summary, sep="\n")