# In [4]:
from pptx import Presentation
import os
from pptx.enum.shapes import MSO_SHAPE_TYPE
from PIL import Image
import io

# In [5]:
# Directory that images extracted from slides are written to; the path is
# relative, so it resolves against the current working directory.
IMAGE_FOLDER = "./extracted_images"
# Create the directory up front; exist_ok=True makes re-running this a no-op.
os.makedirs(IMAGE_FOLDER, exist_ok=True)

def _save_picture(shape, source_file, slide_number, index):
    """Write the image of a picture shape into IMAGE_FOLDER as a PNG.

    Args:
        shape: A python-pptx shape whose ``shape_type`` is ``PICTURE``.
        source_file: Base name of the presentation; used as file-name prefix.
        slide_number: 1-based number of the slide holding the picture.
        index: Running counter that keeps file names unique within one call.

    Returns:
        The path of the written PNG, or ``None`` when the picture could not
        be read or converted (the failure is reported on stdout).
    """
    try:
        # Reading ``shape.image`` belongs inside the ``try``: python-pptx
        # raises when a picture has no embedded image (e.g. a linked
        # picture), and one such shape must not abort the whole extraction.
        img_bytes = shape.image.blob
        pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_name = f"{source_file}_slide{slide_number}_{index}.png"
        img_path = os.path.join(IMAGE_FOLDER, img_name)
        pil_image.save(img_path, format="PNG")
    except Exception as e:
        # Deliberately broad: image export is best-effort and Pillow can
        # fail in many ways on formats it cannot decode.
        print(f"Failed to convert image on slide {slide_number}: {e}")
        return None
    return img_path


def extract_text_and_images(pptx_path):
    """Extract text, tables and pictures from every slide of a .pptx file.

    For each slide the shapes are scanned once:

    * Text frames: every non-empty paragraph is collected. The first
      non-empty paragraph found in a shape whose name contains "title"
      (case-insensitive) becomes the slide title; all other paragraphs are
      appended to the body with their indentation level.
    * Tables: each table becomes a list of rows, each row a list of
      stripped cell strings.
    * Pictures: each picture is converted to RGB and saved as a PNG in
      ``IMAGE_FOLDER``; pictures that cannot be exported are skipped with a
      message on stdout.

    Args:
        pptx_path: Path of the presentation to read.

    Returns:
        A ``(texts, images)`` tuple. ``texts`` is a list with one dict per
        slide that has a title, body text or tables, with the keys
        ``"title"``, ``"body"``, ``"tables"``, ``"slide_number"`` (1-based)
        and ``"source"`` (base name of ``pptx_path``). Each body entry is a
        dict with ``"text"``, ``"is_bullet"`` (true when the paragraph level
        is above 0) and ``"level"``. ``images`` is a list of
        ``(png_path, slide_number)`` tuples.
    """
    presentation = Presentation(pptx_path)
    texts, images = [], []
    source_file = os.path.basename(pptx_path)

    # Re-ensure the output folder at call time so a folder that was removed
    # after import does not make every image save fail.
    os.makedirs(IMAGE_FOLDER, exist_ok=True)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        slide_data = {
            "title": "",
            "body": [],
            "tables": [],
            "slide_number": slide_number,
            "source": source_file,
        }

        for shape in slide.shapes:
            # Text content
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    text = para.text.strip()
                    if not text:
                        continue

                    # Only the first matching paragraph is taken as title;
                    # any further paragraphs fall through to the body.
                    if not slide_data["title"] and "title" in shape.name.lower():
                        slide_data["title"] = text
                        continue

                    slide_data["body"].append(
                        {
                            "text": text,
                            "is_bullet": para.level > 0,
                            "level": para.level,
                        }
                    )

            # Table content
            if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                table_data = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shape.table.rows
                ]
                slide_data["tables"].append(table_data)

            # Image content
            # NOTE(review): pictures placed inside a picture placeholder may
            # report a PLACEHOLDER shape type and be skipped here - confirm
            # against the python-pptx placeholder documentation.
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                img_path = _save_picture(
                    shape, source_file, slide_number, len(images)
                )
                if img_path is not None:
                    images.append((img_path, slide_number))

        if slide_data["title"] or slide_data["body"] or slide_data["tables"]:
            texts.append(slide_data)

    print(texts)
    print("-" * 20)
    print(images)
    return texts, images

# In [None]:
def test_extract_text():
    """Run the extractor on a sample deck and pretty-print what it found.

    Prints the number of slides that yielded text, then for every such slide
    its number, title, body paragraphs (indented by level, bulleted when
    flagged) and any tables as pipe-separated rows.
    """
    # The sample presentation must exist next to the working directory.
    pptx_file = "test_pp.pptx"
    text_chunks, _ = extract_text_and_images(pptx_file)
    print(f"Found {len(text_chunks)} slides with text.")

    for slide in text_chunks:
        print(f"\n--- Slide {slide['slide_number']} ---")
        print(f"Title: {slide['title']}")

        print("Body:")
        for item in slide["body"]:
            # Two spaces of indentation per outline level.
            indent = "  " * item["level"]
            bullet = "• " if item["is_bullet"] else ""
            print(f"{indent}{bullet}{item['text']}")

        # Slides without tables need no further output.
        if not slide["tables"]:
            continue

        print("\nTables:")
        for table in slide["tables"]:
            for row in table:
                print(" | ".join(row))
            # Separator line after each table.
            print("-" * 40)

# Run the manual extraction check only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    test_extract_text()