In [None]:
# Python dependencies. %pip (rather than !pip) installs into the environment
# of the running kernel. The pinned versions are the ones this notebook's
# recorded run resolved; transformers and Pillow are left unpinned.
%pip install -q pdfplumber==0.11.1 python-pptx==0.6.23 python-docx==1.1.2 pytesseract==0.3.10 transformers Pillow
# System packages: poppler utilities and the Tesseract OCR engine with English data.
!apt-get install -y -qq poppler-utils tesseract-ocr tesseract-ocr-eng


Collecting pdfplumber
  Downloading pdfplumber-0.11.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m621.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloadi

In [None]:
import pdfplumber
from pptx import Presentation
from docx import Document
from PIL import Image
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
import io

# BLIP captioning setup. The processor and the model must be loaded from the
# same checkpoint, so the checkpoint id is defined once and shared by both.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def load_file(file_path):
    """Open a document with the library that matches its file extension.

    The extension is compared case-insensitively, so ``report.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        file_path: Path of the document as a string.

    Returns:
        The object produced by ``pdfplumber.open`` for ``.pdf`` files,
        ``Presentation`` for ``.pptx`` files or ``Document`` for ``.docx``
        files.

    Raises:
        ValueError: If the extension is not one of the three supported ones.
    """
    # Lower-case only the copy used for matching; the loaders receive the
    # path exactly as the caller supplied it.
    normalized = file_path.lower()

    if normalized.endswith('.pdf'):
        return pdfplumber.open(file_path)
    if normalized.endswith('.pptx'):
        return Presentation(file_path)
    if normalized.endswith('.docx'):
        return Document(file_path)
    raise ValueError("Unsupported file format!")

def extract_text_and_images(file):
    """Collect the text and the embedded images of an already opened document.

    Args:
        file: A ``pdfplumber.PDF``, a python-pptx presentation object or a
            python-docx document object (the objects ``load_file`` returns).

    Returns:
        A ``(text, images)`` tuple. ``text`` is a string in which the
        non-empty pieces (PDF pages, slide shapes, paragraphs) are joined by
        newlines. ``images`` is a list: for PDFs it holds the ``original``
        attribute of each rendered image region, for PPTX/DOCX it holds the
        raw image blobs.

    Raises:
        TypeError: If ``file`` is none of the supported document objects.
    """
    # ``pptx.Presentation`` and ``docx.Document`` are factory functions, not
    # classes, so isinstance() against them raises TypeError. The classes of
    # the objects they return live in these submodules.
    from pptx.presentation import Presentation as PptxPresentation
    from docx.document import Document as DocxDocument

    picture_shape_type = 13  # value the shape_type of a picture shape compares equal to

    text_parts = []
    images = []

    if isinstance(file, pdfplumber.PDF):
        for page in file.pages:
            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text)
            for img in page.images:
                # Clamp the image rectangle to the page so the crop below is
                # not rejected for lying partly outside of it.
                img_bbox = (
                    max(0, img['x0']),
                    max(0, img['top']),
                    min(page.width, img['x1']),
                    min(page.height, img['bottom'])
                )
                if img_bbox[0] < page.width and img_bbox[1] < page.height and img_bbox[2] > 0 and img_bbox[3] > 0:
                    try:
                        page_image = page.within_bbox(img_bbox).to_image()
                        images.append(page_image.original)
                    except ValueError as e:
                        # Best effort: one bad bounding box must not abort
                        # the extraction of the rest of the document.
                        print(f"Skipping image due to bounding box error: {e}")

    elif isinstance(file, PptxPresentation):
        for slide in file.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text:
                    text_parts.append(shape.text)
                if shape.shape_type == picture_shape_type:
                    images.append(shape.image.blob)

    elif isinstance(file, DocxDocument):
        for para in file.paragraphs:
            if para.text:
                text_parts.append(para.text)
        for rel in file.part.rels.values():
            # External relationships (e.g. hyperlinks whose URL happens to
            # contain "image") have no target part to read a blob from.
            if rel.is_external:
                continue
            if "image" in rel.target_ref:
                images.append(rel.target_part.blob)

    else:
        raise TypeError(f"Unsupported document object: {type(file).__name__}")

    return "\n".join(text_parts), images

def analyze_image(image):
    """Produce a caption for one image using the module-level BLIP objects.

    Args:
        image: The image handed to ``processor`` through its ``images``
            argument.

    Returns:
        The first generated sequence decoded by ``processor`` with special
        tokens removed.
    """
    # Ask the processor for "pt" tensors and feed them to the model as keywords.
    model_inputs = processor(images=image, return_tensors="pt")
    generated = model.generate(**model_inputs)

    # Only the first generated sequence is turned into the caption.
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

def main(file_path):
    """Extract text and images from a document and write ``report.txt``.

    The report starts with the extracted text, followed by one captioned
    section per image. It is written as UTF-8 to ``report.txt`` in the
    current working directory.

    Args:
        file_path: Path of the document; passed to ``load_file``.

    Raises:
        ValueError: Propagated from ``load_file`` for unsupported extensions.
    """
    file = load_file(file_path)
    try:
        text, images = extract_text_and_images(file)
    finally:
        # Release the document handle when the loaded object offers close()
        # (the python-pptx / python-docx objects are not expected to).
        close = getattr(file, "close", None)
        if callable(close):
            close()

    sections = [f"Extracted Text:\n{text}\n\n"]
    for number, img in enumerate(images, start=1):
        if isinstance(img, bytes):
            try:
                img = Image.open(io.BytesIO(img))
                # Image.open is lazy; load() forces decoding now so that an
                # unreadable blob is reported here instead of crashing the
                # captioning step.
                img.load()
            except OSError as exc:
                sections.append(
                    f"Image {number} Analysis:\n[image could not be decoded: {exc}]\n\n"
                )
                continue
        caption = analyze_image(img)
        sections.append(f"Image {number} Analysis:\n{caption}\n\n")

    # Explicit UTF-8 so non-ASCII document text does not depend on the
    # platform's default encoding.
    with open("report.txt", "w", encoding="utf-8") as f:
        f.write("".join(sections))

    print("Report generated as 'report.txt'")


In [None]:
from google.colab import files

# Prompt for the document(s) to analyse. The call must run in this cell:
# with it commented out, `uploaded` only exists as leftover kernel state and
# a fresh "Run all" fails with NameError.
uploaded = files.upload()

for file_name in uploaded:
    # Report the name that was actually uploaded rather than a fixed literal.
    print(f"Uploaded file: {file_name}")
    main(file_name)


Uploaded file: Chapter_3_v8.02.pptx.pdf




Report generated as 'report.txt'
