In [1]:
# System
!apt-get update && apt-get install -y poppler-utils tesseract-ocr

# Python
!pip install python-pptx pdfplumber pdf2image pillow pytesseract transformers torch


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [75.2 kB]                 
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease                                              
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]                           
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]                             
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]                                
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]              
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,842 kB]                       
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease                 
Get:1

In [2]:
# document_extractor.py

import os
import io
from typing import List, Dict

# PPTX support
from pptx import Presentation

# PDF support
import pdfplumber
from pdf2image import convert_from_path

# Image & OCR
from PIL import Image
import pytesseract

# Captioning
from transformers import pipeline
import torch

# -------------- Configuration --------------

# BLIP-2 image captioner, built once when this cell runs and shared by both
# extractors below. The original note says it requires a GPU/CUDA runtime
# (e.g. Colab with a GPU).
# NOTE(review): the recorded cell output shows two weight shards (~10 GB and
# ~5 GB) being fetched when this pipeline is constructed.
CAPTIONER = pipeline(
    "image-to-text",
    model="Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,  # half-precision weights
    device_map="auto",  # presumably lets the library pick device placement; output reports cuda:0
)

# File extensions the dispatcher (extract_document_content) recognises; they
# are compared against the lower-cased suffix of the input path.
PPTX_EXT = {".pptx"}
PDF_EXT  = {".pdf"}


# -------------- Core Extractors --------------

def extract_pptx_content(pptx_path: str) -> List[Dict]:
    """
    Extract text lines, image captions and OCR text from each slide of a PPTX.

    Args:
        pptx_path: Path of the .pptx file to read.

    Returns:
        One dict per slide, in deck order, with the keys:
          - "id": 1-based slide number
          - "type": always "slide"
          - "texts": non-empty, stripped lines from every shape with a text frame
          - "image_captions": one entry per picture shape (caption text, or a
            "[caption_error: ...]" / "[image_error: ...]" marker)
          - "ocr_texts": non-empty OCR results (or "[ocr_error: ...]" markers)

    A picture that cannot be read or decoded is recorded as an
    "[image_error: ...]" caption instead of aborting the whole deck, matching
    how caption and OCR failures are already reported.
    """
    prs = Presentation(pptx_path)
    pages = []
    for idx, slide in enumerate(prs.slides, start=1):
        texts, caps, ocrs = [], [], []
        # — text shapes
        for shape in slide.shapes:
            if shape.has_text_frame:
                for line in shape.text_frame.text.splitlines():
                    txt = line.strip()
                    if txt:
                        texts.append(txt)
        # — image shapes
        for shape in slide.shapes:
            try:
                # hasattr() only swallows AttributeError; any other error raised
                # while resolving `.image` is handled by the except below.
                # NOTE(review): python-pptx is understood to raise ValueError for
                # pictures without embedded image data — confirm.
                if not hasattr(shape, "image"):
                    continue
                img = Image.open(io.BytesIO(shape.image.blob)).convert("RGB")
            except Exception as e:
                # Unreadable / undecodable picture: note it and keep going.
                caps.append(f"[image_error: {e}]")
                continue
            # caption
            try:
                cap = CAPTIONER(img, max_new_tokens=50)[0]["generated_text"].strip()
            except Exception as e:
                cap = f"[caption_error: {e}]"
            caps.append(cap)
            # OCR
            try:
                ocr = pytesseract.image_to_string(img).strip()
            except Exception as e:
                ocr = f"[ocr_error: {e}]"
            # Empty OCR output (no text found in the picture) is not recorded.
            if ocr:
                ocrs.append(ocr)
        pages.append({
            "id": idx,
            "type": "slide",
            "texts": texts,
            "image_captions": caps,
            "ocr_texts": ocrs,
        })
    return pages


def extract_pdf_content(pdf_path: str, dpi: int = 200) -> List[Dict]:
    """
    Extract embedded text, a page-level caption and OCR lines from each PDF page.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used when rendering a page to an image for
            captioning and OCR.

    Returns:
        One dict per page, in document order, with the keys:
          - "id": 1-based page number
          - "type": always "page"
          - "texts": non-empty, stripped lines from pdfplumber's text extraction
          - "image_captions": single-element list holding the caption of the
            rendered page (or a "[caption_error: ...]" marker)
          - "ocr_texts": non-blank OCR lines of the rendered page (or a single
            "[ocr_error: ...]" marker)

    Pages are rendered one at a time so that only a single page image is held
    in memory, and a page that fails to render is still reported (with error
    markers) instead of being silently dropped.
    """
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            # 1) embedded text of this page
            txt = page.extract_text() or ""
            texts = [line.strip() for line in txt.splitlines() if line.strip()]

            # 2) render only this page (first_page/last_page are 1-based)
            rendered = convert_from_path(
                pdf_path, dpi=dpi, first_page=idx, last_page=idx
            )

            # 3) caption + OCR on the rendered image
            if rendered:
                img = rendered[0]
                try:
                    cap = CAPTIONER(img, max_new_tokens=50)[0]["generated_text"].strip()
                except Exception as e:
                    cap = f"[caption_error: {e}]"
                try:
                    ocr = pytesseract.image_to_string(img).strip()
                except Exception as e:
                    ocr = f"[ocr_error: {e}]"
            else:
                # No image came back for this page: keep its text and flag it.
                cap = "[caption_error: page could not be rendered]"
                ocr = "[ocr_error: page could not be rendered]"
            ocrs = [line for line in ocr.splitlines() if line.strip()]

            pages.append({
                "id": idx,
                "type": "page",
                "texts": texts,
                "image_captions": [cap],
                "ocr_texts": ocrs,
            })
    return pages


# -------------- Chunking & Dispatch --------------

def chunk_items(items: List[Dict], chunk_size: int = 5) -> List[List[Dict]]:
    """
    Split `items` into consecutive slices of at most `chunk_size` elements.

    Args:
        items: Sequence to split; order is preserved.
        chunk_size: Maximum number of elements per chunk; must be >= 1.

    Returns:
        A list of chunks. The last chunk may be shorter than `chunk_size`;
        an empty `items` yields an empty list.

    Raises:
        ValueError: If `chunk_size` is smaller than 1. Without this check a
            negative size would make the range empty and silently discard
            every item.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [items[i : i + chunk_size] for i in range(0, len(items), chunk_size)]


def extract_document_content(path: str, chunk_size: int = 5) -> List[List[Dict]]:
    """
    Route `path` to the PPTX or PDF extractor according to its file suffix
    (case-insensitive) and return the resulting page-dicts grouped into
    chunks of `chunk_size`.

    Raises ValueError when the suffix is neither a PPTX nor a PDF extension.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()

    # Guard clause: reject anything neither extractor understands.
    if ext not in PPTX_EXT and ext not in PDF_EXT:
        raise ValueError(f"Unsupported extension {ext!r}")

    # PPTX is checked first, so it wins if a suffix were ever in both sets.
    extractor = extract_pptx_content if ext in PPTX_EXT else extract_pdf_content
    return chunk_items(extractor(path), chunk_size=chunk_size)


# -------------- Example Usage --------------


2025-04-22 08:09:45.708588: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745309385.861475      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745309385.907675      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


In [3]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
# The recorded output shows the guarded branch ran when the cell was executed.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Lora&Qlora/Finetuning.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("doc_chunks.json", "w") as f:
        json.dump(chunks, f, indent=2)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



=== Chunk 1 (IDs 1–5) ===
 • Page 1: 9 texts, 1 captions, 9 OCR
 • Page 2: 14 texts, 1 captions, 18 OCR
 • Page 3: 13 texts, 1 captions, 15 OCR
 • Page 4: 2 texts, 1 captions, 16 OCR
 • Page 5: 14 texts, 1 captions, 16 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 13 texts, 1 captions, 14 OCR
 • Page 7: 16 texts, 1 captions, 19 OCR
 • Page 8: 15 texts, 1 captions, 19 OCR
 • Page 9: 15 texts, 1 captions, 17 OCR
 • Page 10: 14 texts, 1 captions, 16 OCR

=== Chunk 3 (IDs 11–15) ===
 • Page 11: 14 texts, 1 captions, 15 OCR
 • Page 12: 12 texts, 1 captions, 14 OCR
 • Page 13: 2 texts, 1 captions, 13 OCR
 • Page 14: 2 texts, 1 captions, 18 OCR
 • Page 15: 10 texts, 1 captions, 18 OCR

=== Chunk 4 (IDs 16–20) ===
 • Page 16: 9 texts, 1 captions, 24 OCR
 • Page 17: 11 texts, 1 captions, 13 OCR
 • Page 18: 14 texts, 1 captions, 15 OCR
 • Page 19: 9 texts, 1 captions, 12 OCR
 • Page 20: 2 texts, 1 captions, 17 OCR

=== Chunk 5 (IDs 21–23) ===
 • Page 21: 12 texts, 1 captions, 13 OCR
 • Page 22: 8 

In [4]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/BST/Class2_Unit3_Tree_BST_DynamicInsert.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class2_Unit3_Tree_BST_DynamicInsert-1.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 8 texts, 1 captions, 1 OCR
 • Slide 4: 5 texts, 1 captions, 1 OCR
 • Slide 5: 4 texts, 1 captions, 1 OCR

=== Chunk 2 (IDs 6–8) ===
 • Slide 6: 10 texts, 1 captions, 1 OCR
 • Slide 7: 26 texts, 6 captions, 1 OCR
 • Slide 8: 6 texts, 1 captions, 1 OCR


In [5]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/BST/Class3_Unit3_Trees_BSTDeletion.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class3_Unit3_Trees_BSTDeletion-2.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 6 texts, 1 captions, 1 OCR
 • Slide 4: 9 texts, 1 captions, 1 OCR
 • Slide 5: 8 texts, 1 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 9 texts, 1 captions, 1 OCR
 • Slide 7: 11 texts, 1 captions, 1 OCR
 • Slide 8: 14 texts, 3 captions, 1 OCR
 • Slide 9: 12 texts, 3 captions, 1 OCR
 • Slide 10: 6 texts, 1 captions, 1 OCR


In [6]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/BST/Class4_Unit3_Trees_BST_ArrayInsert.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class4_Unit3_Trees_BST_ArrayInsert.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 3 texts, 1 captions, 1 OCR
 • Slide 4: 11 texts, 1 captions, 1 OCR
 • Slide 5: 16 texts, 1 captions, 1 OCR

=== Chunk 2 (IDs 6–7) ===
 • Slide 6: 35 texts, 15 captions, 1 OCR
 • Slide 7: 6 texts, 1 captions, 1 OCR


In [7]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Multimodal/MAMBA.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("MAMBA.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Page 1: 9 texts, 1 captions, 9 OCR
 • Page 2: 15 texts, 1 captions, 18 OCR
 • Page 3: 16 texts, 1 captions, 17 OCR
 • Page 4: 5 texts, 1 captions, 13 OCR
 • Page 5: 15 texts, 1 captions, 17 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 16 texts, 1 captions, 19 OCR
 • Page 7: 15 texts, 1 captions, 18 OCR
 • Page 8: 14 texts, 1 captions, 15 OCR
 • Page 9: 16 texts, 1 captions, 17 OCR
 • Page 10: 15 texts, 1 captions, 18 OCR

=== Chunk 3 (IDs 11–15) ===
 • Page 11: 14 texts, 1 captions, 17 OCR
 • Page 12: 12 texts, 1 captions, 13 OCR
 • Page 13: 2 texts, 1 captions, 89 OCR
 • Page 14: 19 texts, 1 captions, 25 OCR
 • Page 15: 18 texts, 1 captions, 30 OCR

=== Chunk 4 (IDs 16–20) ===
 • Page 16: 16 texts, 1 captions, 21 OCR
 • Page 17: 19 texts, 1 captions, 30 OCR
 • Page 18: 21 texts, 1 captions, 35 OCR
 • Page 19: 12 texts, 1 captions, 14 OCR
 • Page 20: 9 texts, 1 captions, 9 OCR


In [11]:
pip install python-pptx reportlab


Collecting reportlab
  Downloading reportlab-4.4.0-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
from pptx import Presentation
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Extract text from the PPT file
def extract_text_from_ppt(ppt_file):
    """
    Collect the text of every slide in a PowerPoint file.

    Returns a list with one string per slide, in deck order. Each string is
    the concatenation of the `.text` of every shape exposing that attribute,
    with a newline appended after each shape's text.
    """
    deck = Presentation(ppt_file)
    per_slide = []

    for slide in deck.slides:
        # Shapes without a `text` attribute are skipped.
        pieces = [shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")]
        per_slide.append("".join(pieces))

    return per_slide

# Generate PDF from extracted text
def generate_pdf_from_text(text, pdf_file="output.pdf"):
    """
    Write a list of slide texts to a plain, text-only PDF.

    Args:
        text: Iterable of strings, one per slide; a string may span several
            lines separated by newlines.
        pdf_file: Output path of the PDF to create.

    Each line of each slide is drawn on its own row (Helvetica 10 pt, 15 pt
    row spacing, 40 pt margins on a letter-size page), and a new page is
    started whenever the next row would fall below the bottom margin. A slide
    with no text still takes up one empty row. Lines wider than the page are
    not wrapped.
    """
    font_name, font_size = "Helvetica", 10
    margin, line_height = 40, 15

    c = canvas.Canvas(pdf_file, pagesize=letter)
    _, height = letter

    # Set font and size
    c.setFont(font_name, font_size)

    y_position = height - margin
    for slide_text in text:
        # drawString() draws a single line and does not break on "\n", so a
        # multi-line slide must be emitted one line at a time.
        for line in slide_text.strip().splitlines() or [""]:
            if y_position < margin:
                c.showPage()
                # showPage() discards the canvas graphics state, so the font
                # has to be set again on every new page.
                c.setFont(font_name, font_size)
                y_position = height - margin
            c.drawString(margin, y_position, line)
            y_position -= line_height

    c.save()

# Usage: pull the text out of one deck and write it to a PDF in the working
# directory.
ppt_file = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Heap/Class8_Unit3_Trees_Heap.pptx"
text = extract_text_from_ppt(ppt_file)  # list with one string per slide
generate_pdf_from_text(text, "Class8_Unit3_Trees_Heap.pdf")


In [29]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Heap/Class8_Unit3_Trees_Heap.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class8_Unit3_Trees_Heap.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 5 texts, 1 captions, 1 OCR
 • Slide 4: 5 texts, 1 captions, 1 OCR
 • Slide 5: 6 texts, 1 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 6 texts, 1 captions, 1 OCR
 • Slide 7: 8 texts, 2 captions, 1 OCR
 • Slide 8: 10 texts, 2 captions, 1 OCR
 • Slide 9: 10 texts, 2 captions, 1 OCR
 • Slide 10: 14 texts, 3 captions, 1 OCR

=== Chunk 3 (IDs 11–15) ===
 • Slide 11: 6 texts, 1 captions, 1 OCR
 • Slide 12: 24 texts, 1 captions, 1 OCR
 • Slide 13: 3 texts, 1 captions, 1 OCR
 • Slide 14: 14 texts, 1 captions, 1 OCR
 • Slide 15: 10 texts, 1 captions, 1 OCR

=== Chunk 4 (IDs 16–20) ===
 • Slide 16: 7 texts, 1 captions, 1 OCR
 • Slide 17: 7 texts, 1 captions, 1 OCR
 • Slide 18: 8 texts, 1 captions, 1 OCR
 • Slide 19: 3 texts, 1 captions, 1 OCR
 • Slide 20: 6 texts, 1 captions, 1 OCR


In [None]:
# Usage: pull the text out of one deck and write it to a PDF in the working
# directory.
# NOTE(review): this cell has no recorded execution count and repeats the Heap
# conversion that is also run right after the helper definitions.
ppt_file = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Heap/Class8_Unit3_Trees_Heap.pptx"
text = extract_text_from_ppt(ppt_file)  # list with one string per slide
generate_pdf_from_text(text, "Class8_Unit3_Trees_Heap.pdf")


In [15]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Multimodal/MultiModal LLMs.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("MultiModal LLMs.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Page 1: 9 texts, 1 captions, 9 OCR
 • Page 2: 17 texts, 1 captions, 19 OCR
 • Page 3: 14 texts, 1 captions, 16 OCR
 • Page 4: 6 texts, 1 captions, 9 OCR
 • Page 5: 7 texts, 1 captions, 10 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 2 texts, 1 captions, 20 OCR
 • Page 7: 26 texts, 1 captions, 16 OCR
 • Page 8: 11 texts, 1 captions, 14 OCR
 • Page 9: 3 texts, 1 captions, 9 OCR
 • Page 10: 9 texts, 1 captions, 23 OCR

=== Chunk 3 (IDs 11–15) ===
 • Page 11: 6 texts, 1 captions, 7 OCR
 • Page 12: 6 texts, 1 captions, 23 OCR
 • Page 13: 13 texts, 1 captions, 13 OCR
 • Page 14: 13 texts, 1 captions, 13 OCR
 • Page 15: 13 texts, 1 captions, 13 OCR

=== Chunk 4 (IDs 16–20) ===
 • Page 16: 16 texts, 1 captions, 14 OCR
 • Page 17: 9 texts, 1 captions, 11 OCR
 • Page 18: 7 texts, 1 captions, 10 OCR
 • Page 19: 10 texts, 1 captions, 12 OCR
 • Page 20: 6 texts, 1 captions, 8 OCR

=== Chunk 5 (IDs 21–21) ===
 • Page 21: 9 texts, 1 captions, 9 OCR


In [16]:
# Usage: pull the text out of one deck and write it to a PDF in the working
# directory.
ppt_file = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/TBT/Class6_Unit3_Trees_ThreadBST.pptx"
text = extract_text_from_ppt(ppt_file)  # list with one string per slide
generate_pdf_from_text(text, "Class6_Unit3_Trees_ThreadBST.pdf")


In [30]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/TBT/Class6_Unit3_Trees_ThreadBST.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class6_Unit3_Trees_ThreadBST.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 8 texts, 1 captions, 1 OCR
 • Slide 4: 5 texts, 1 captions, 1 OCR
 • Slide 5: 7 texts, 1 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 7 texts, 1 captions, 1 OCR
 • Slide 7: 5 texts, 1 captions, 1 OCR
 • Slide 8: 11 texts, 1 captions, 1 OCR
 • Slide 9: 23 texts, 8 captions, 1 OCR
 • Slide 10: 11 texts, 1 captions, 1 OCR

=== Chunk 3 (IDs 11–15) ===
 • Slide 11: 19 texts, 1 captions, 1 OCR
 • Slide 12: 28 texts, 1 captions, 1 OCR
 • Slide 13: 27 texts, 1 captions, 1 OCR
 • Slide 14: 36 texts, 1 captions, 1 OCR
 • Slide 15: 43 texts, 15 captions, 5 OCR

=== Chunk 4 (IDs 16–16) ===
 • Slide 16: 6 texts, 1 captions, 1 OCR


In [18]:
# Usage: pull the text out of one deck and write it to a PDF in the working
# directory.
ppt_file = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Tree_traversal/class9_Unit3_Trees_naryTraversal.pptx"
text = extract_text_from_ppt(ppt_file)  # list with one string per slide
generate_pdf_from_text(text, "class9_Unit3_Trees_naryTraversal.pdf")


In [31]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Tree_traversal/class9_Unit3_Trees_naryTraversal.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("class9_Unit3_Trees_naryTraversal.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 8 texts, 1 captions, 1 OCR
 • Slide 4: 7 texts, 1 captions, 1 OCR
 • Slide 5: 11 texts, 9 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 11 texts, 1 captions, 1 OCR
 • Slide 7: 6 texts, 1 captions, 1 OCR
 • Slide 8: 11 texts, 9 captions, 1 OCR
 • Slide 9: 11 texts, 1 captions, 1 OCR
 • Slide 10: 6 texts, 1 captions, 1 OCR

=== Chunk 3 (IDs 11–13) ===
 • Slide 11: 11 texts, 9 captions, 1 OCR
 • Slide 12: 11 texts, 1 captions, 1 OCR
 • Slide 13: 5 texts, 1 captions, 1 OCR


In [21]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/agentic/Agentic Workflow.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Agentic Workflow.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Page 1: 9 texts, 1 captions, 9 OCR
 • Page 2: 16 texts, 1 captions, 20 OCR
 • Page 3: 17 texts, 1 captions, 22 OCR
 • Page 4: 11 texts, 1 captions, 12 OCR
 • Page 5: 3 texts, 1 captions, 15 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 14 texts, 1 captions, 17 OCR
 • Page 7: 13 texts, 1 captions, 23 OCR
 • Page 8: 17 texts, 1 captions, 25 OCR
 • Page 9: 16 texts, 1 captions, 31 OCR
 • Page 10: 15 texts, 1 captions, 17 OCR

=== Chunk 3 (IDs 11–15) ===
 • Page 11: 11 texts, 1 captions, 12 OCR
 • Page 12: 12 texts, 1 captions, 22 OCR
 • Page 13: 18 texts, 1 captions, 25 OCR
 • Page 14: 11 texts, 1 captions, 12 OCR
 • Page 15: 12 texts, 1 captions, 13 OCR

=== Chunk 4 (IDs 16–16) ===
 • Page 16: 9 texts, 1 captions, 9 OCR


In [22]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/agentic/AutoGen CrewAI.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("AutoGen CrewAI.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Page 1: 5 texts, 1 captions, 7 OCR
 • Page 2: 3 texts, 1 captions, 4 OCR
 • Page 3: 16 texts, 1 captions, 17 OCR
 • Page 4: 12 texts, 1 captions, 14 OCR
 • Page 5: 10 texts, 1 captions, 12 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 10 texts, 1 captions, 11 OCR
 • Page 7: 15 texts, 1 captions, 16 OCR
 • Page 8: 14 texts, 1 captions, 15 OCR
 • Page 9: 19 texts, 1 captions, 17 OCR
 • Page 10: 11 texts, 1 captions, 13 OCR

=== Chunk 3 (IDs 11–11) ===
 • Page 11: 4 texts, 1 captions, 6 OCR


In [25]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/binary_tree_traversal/Class5_Unit3_BST_Traversal.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class5_Unit3_BST_Traversal.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 10 texts, 1 captions, 1 OCR
 • Slide 4: 24 texts, 18 captions, 1 OCR
 • Slide 5: 24 texts, 18 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 24 texts, 18 captions, 10 OCR
 • Slide 7: 15 texts, 1 captions, 1 OCR
 • Slide 8: 19 texts, 6 captions, 1 OCR
 • Slide 9: 19 texts, 4 captions, 1 OCR
 • Slide 10: 20 texts, 4 captions, 1 OCR

=== Chunk 3 (IDs 11–15) ===
 • Slide 11: 20 texts, 3 captions, 1 OCR
 • Slide 12: 26 texts, 5 captions, 1 OCR
 • Slide 13: 27 texts, 7 captions, 1 OCR
 • Slide 14: 24 texts, 4 captions, 1 OCR
 • Slide 15: 27 texts, 7 captions, 1 OCR

=== Chunk 4 (IDs 16–20) ===
 • Slide 16: 28 texts, 7 captions, 1 OCR
 • Slide 17: 25 texts, 4 captions, 1 OCR
 • Slide 18: 27 texts, 7 captions, 1 OCR
 • Slide 19: 17 texts, 1 captions, 1 OCR
 • Slide 20: 15 texts, 5 captions, 2 OCR

=== Chunk 5 (IDs 21–25) ===
 • Slide 21: 20 texts, 4 captions, 3 OCR
 • S

In [26]:

# Driver cell: run the extraction pipeline on one deck, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/expn_tree/Class7_Unit3_Trees_ExprTree.pptx"
    # Extract per-slide content and group it into chunks of five slides.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last slide id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per slide: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Class7_Unit3_Trees_ExprTree.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Slide 1: 5 texts, 1 captions, 1 OCR
 • Slide 2: 4 texts, 1 captions, 1 OCR
 • Slide 3: 5 texts, 1 captions, 1 OCR
 • Slide 4: 6 texts, 1 captions, 1 OCR
 • Slide 5: 22 texts, 5 captions, 1 OCR

=== Chunk 2 (IDs 6–10) ===
 • Slide 6: 10 texts, 1 captions, 1 OCR
 • Slide 7: 19 texts, 4 captions, 1 OCR
 • Slide 8: 20 texts, 4 captions, 1 OCR
 • Slide 9: 21 texts, 4 captions, 1 OCR
 • Slide 10: 24 texts, 6 captions, 1 OCR

=== Chunk 3 (IDs 11–15) ===
 • Slide 11: 23 texts, 7 captions, 1 OCR
 • Slide 12: 15 texts, 7 captions, 1 OCR
 • Slide 13: 17 texts, 5 captions, 1 OCR
 • Slide 14: 23 texts, 7 captions, 1 OCR
 • Slide 15: 16 texts, 4 captions, 1 OCR

=== Chunk 4 (IDs 16–20) ===
 • Slide 16: 15 texts, 3 captions, 1 OCR
 • Slide 17: 13 texts, 1 captions, 1 OCR
 • Slide 18: 5 texts, 1 captions, 1 OCR
 • Slide 19: 14 texts, 1 captions, 1 OCR
 • Slide 20: 15 texts, 1 captions, 1 OCR

=== Chunk 5 (IDs 21–25) ===
 • Slide 21: 8 texts, 1 captions, 1 OCR
 • Slide 22:

In [27]:

# Driver cell: run the extraction pipeline on one document, print a per-chunk
# summary, and write the chunks to a JSON file in the working directory.
if __name__ == "__main__":
    import json

    # path = "lecture.pptx" or "document.pdf"
    path = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/stable_diffusion/Stable Diffusion.pdf"
    # Extract per-page content and group it into chunks of five pages.
    chunks = extract_document_content(path, chunk_size=5)

    # print summary
    for ci, chunk in enumerate(chunks, start=1):
        ids = [p["id"] for p in chunk]
        # Header shows the first and last page id held by this chunk.
        print(f"\n=== Chunk {ci} (IDs {ids[0]}–{ids[-1]}) ===")
        for p in chunk:
            # One line per page: counts of text lines, captions and OCR entries.
            print(f" • {p['type'].capitalize()} {p['id']}: "
                  f"{len(p['texts'])} texts, "
                  f"{len(p['image_captions'])} captions, "
                  f"{len(p['ocr_texts'])} OCR")


    # optionally, save to JSON for downstream steps
    with open("Stable Diffusion.json", "w") as f:
        json.dump(chunks, f, indent=2)




=== Chunk 1 (IDs 1–5) ===
 • Page 1: 0 texts, 1 captions, 7 OCR
 • Page 2: 1 texts, 1 captions, 1 OCR
 • Page 3: 5 texts, 1 captions, 5 OCR
 • Page 4: 3 texts, 1 captions, 7 OCR
 • Page 5: 9 texts, 1 captions, 7 OCR

=== Chunk 2 (IDs 6–10) ===
 • Page 6: 5 texts, 1 captions, 5 OCR
 • Page 7: 6 texts, 1 captions, 6 OCR
 • Page 8: 6 texts, 1 captions, 6 OCR
 • Page 9: 4 texts, 1 captions, 6 OCR
 • Page 10: 5 texts, 1 captions, 5 OCR

=== Chunk 3 (IDs 11–15) ===
 • Page 11: 5 texts, 1 captions, 5 OCR
 • Page 12: 6 texts, 1 captions, 6 OCR
 • Page 13: 8 texts, 1 captions, 11 OCR
 • Page 14: 4 texts, 1 captions, 6 OCR
 • Page 15: 6 texts, 1 captions, 7 OCR

=== Chunk 4 (IDs 16–20) ===
 • Page 16: 3 texts, 1 captions, 6 OCR
 • Page 17: 9 texts, 1 captions, 10 OCR
 • Page 18: 8 texts, 1 captions, 8 OCR
 • Page 19: 9 texts, 1 captions, 10 OCR
 • Page 20: 2 texts, 1 captions, 7 OCR

=== Chunk 5 (IDs 21–25) ===
 • Page 21: 10 texts, 1 captions, 10 OCR
 • Page 22: 8 texts, 1 captions, 9 OCR
 • P

In [33]:
!pip install -q python-pptx pymupdf

In [34]:
# Colab cell 3
import os
from pptx import Presentation
import fitz  # PyMuPDF

def save_pptx_images(pptx_path: str, out_dir: str):
    """
    Save the pictures embedded in a PPTX to `out_dir`, leaving out the first
    picture found on each slide.

    Files are named ``slideNN_imgMM.<ext>``, where NN is the 1-based slide
    number and MM counts the pictures saved for that slide. `out_dir` is
    created if it does not exist. Progress is printed per slide, followed by
    a final total.
    """
    os.makedirs(out_dir, exist_ok=True)
    deck = Presentation(pptx_path)
    saved_total = 0

    for slide_no, slide in enumerate(deck.slides, start=1):
        # Lazy stream of the shapes that expose an `image`, in slide order.
        picture_shapes = (shp for shp in slide.shapes if hasattr(shp, "image"))
        # Discard the first picture of the slide (no-op when there is none).
        next(picture_shapes, None)

        saved_here = 0
        for saved_here, shp in enumerate(picture_shapes, start=1):
            picture = shp.image
            payload = picture.blob
            suffix = picture.ext  # e.g. 'jpeg' or 'png'
            saved_total += 1

            target = os.path.join(
                out_dir, f"slide{slide_no:02d}_img{saved_here:02d}.{suffix}"
            )
            with open(target, "wb") as fh:
                fh.write(payload)

        if saved_here:
            print(f"✔️  Slide {slide_no}: saved {saved_here} image(s)")

    print(f"\n✅  Done—saved {saved_total} image(s) to\n   {out_dir}")


def save_pdf_images(pdf_path: str, out_dir: str):
    """
    Extracts all but the first image from each PDF page and saves them.

    Args:
        pdf_path: Path of the PDF to read.
        out_dir: Directory that receives the image files; created if missing.

    Files are named ``pageNN_imgMM.<ext>``, where NN is the 1-based page
    number and MM is the image's position on the page minus one (the first
    listed image is skipped). A line is printed for every page that lists at
    least one image, followed by a final total.

    The document is opened in a ``with`` block so its file handle is released
    even when extraction or writing fails part-way through.
    """
    os.makedirs(out_dir, exist_ok=True)
    total = 0

    with fitz.open(pdf_path) as doc:
        for page_idx in range(len(doc)):
            imgs = doc[page_idx].get_images(full=True)
            if not imgs:
                continue

            print(f"✔️  Page {page_idx+1}: found {len(imgs)} image(s), skipping the first (logo)")

            for img_num, img_info in enumerate(imgs, start=1):
                # skip the first image per page
                if img_num == 1:
                    continue

                # First field of each image entry is its xref number.
                xref = img_info[0]
                base_image = doc.extract_image(xref)
                img_bytes = base_image["image"]
                ext = base_image["ext"]  # e.g. 'png', 'jpeg'
                total += 1

                fname = f"page{page_idx+1:02d}_img{img_num-1:02d}.{ext}"
                with open(os.path.join(out_dir, fname), "wb") as f:
                    f.write(img_bytes)

    print(f"\n✅  Done—saved {total} image(s) to\n   {out_dir}")

In [36]:
# The triple-quoted block below is a bare string expression, so the PPTX
# example inside it is not executed; only the PDF example further down runs.
'''# — PPTX example
pptx_path   = "/content/drive/MyDrive/lectures/lecture1.pptx"
out_pptx_dir = "/content/drive/MyDrive/lectures/lecture1_images"
save_pptx_images(pptx_path, out_pptx_dir)'''

# — PDF example
# Saves every image except the first one listed on each page into the working
# directory.
pdf_path    = "/kaggle/input/llm-data/LLM_DATASET/LLM_DATASET/Lora&Qlora/Finetuning.pdf"
out_pdf_dir  = "/kaggle/working"
save_pdf_images(pdf_path, out_pdf_dir)

✔️  Page 1: found 1 image(s), skipping the first (logo)
✔️  Page 2: found 2 image(s), skipping the first (logo)
✔️  Page 3: found 1 image(s), skipping the first (logo)
✔️  Page 4: found 2 image(s), skipping the first (logo)
✔️  Page 5: found 2 image(s), skipping the first (logo)
✔️  Page 6: found 2 image(s), skipping the first (logo)
✔️  Page 7: found 2 image(s), skipping the first (logo)
✔️  Page 8: found 2 image(s), skipping the first (logo)
✔️  Page 9: found 1 image(s), skipping the first (logo)
✔️  Page 10: found 1 image(s), skipping the first (logo)
✔️  Page 11: found 1 image(s), skipping the first (logo)
✔️  Page 12: found 1 image(s), skipping the first (logo)
✔️  Page 13: found 2 image(s), skipping the first (logo)
✔️  Page 14: found 2 image(s), skipping the first (logo)
✔️  Page 15: found 2 image(s), skipping the first (logo)
✔️  Page 16: found 1 image(s), skipping the first (logo)
✔️  Page 17: found 1 image(s), skipping the first (logo)
✔️  Page 18: found 1 image(s), skipping 