In [2]:
# 🧠 Transformers and Accelerate for model loading
!pip install -q transformers==4.37.2 accelerate==0.26.1

# ⚙️ Required for LLaVA (vision + language)
!pip install -q sentencepiece git+https://github.com/huggingface/peft.git

# 📄 For reading/writing .docx files
!pip install -q python-docx


# 🖼️ For image handling
!pip install -q pillow

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [8]:
import os
from docx import Document
from docx.shared import Inches
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch

# --- Paths ---
NOTES_PATH = "/content/lecture_notes.docx"
IMAGES_DIR = "/content/images"
OUTPUT_DOCX = "/content/lecture_notes_with_images.docx"

# --- Load CLIP model and processor ---
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")

# --- Load docx notes ---
doc = Document(NOTES_PATH)
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]

# --- Combine paragraphs into sections by heading ---
sections = []
current_section = {"title": "Introduction", "content": ""}
for p in paragraphs:
    if p.startswith("Part") or p.lower().startswith("section") or p.startswith("##"):
        sections.append(current_section)
        current_section = {"title": p, "content": ""}
    else:
        current_section["content"] += p + "\n"
sections.append(current_section)

# --- Function to encode both text and image ---
def encode_text_and_image(text, image_path):
    # Open image
    image = Image.open(image_path)

    # Preprocess text and image using CLIPProcessor, this automatically handles truncation
    inputs = clip_processor(text=text, images=image, return_tensors="pt", padding=True, truncation=True)

    # Get embeddings for both text and image
    text_features = clip_model.get_text_features(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    image_features = clip_model.get_image_features(pixel_values=inputs['pixel_values'])

    return text_features, image_features

# --- Function to determine best section for image ---
def find_best_section_for_image(image_path, sections):
    image = Image.open(image_path).convert("RGB")
    scores = []

    for section in sections:
        prompt = f"Which section best matches this image?\n\nSection:\n{section['content'][:500]}"

        # Truncate the prompt text to ensure it fits within the CLIP model's input size
        # Tokenize the text manually before encoding
        inputs = clip_processor(text=prompt, return_tensors="pt", padding=True, truncation=True)

        # Get features for text and image
        text_features, image_features = encode_text_and_image(prompt, image_path)

        # Calculate similarity (cosine similarity between text and image features)
        similarity = torch.cosine_similarity(text_features, image_features)

        # Store the section with the highest similarity score
        scores.append((section["title"], similarity.item()))

    if scores:
        # Sort by similarity score and return the section with the highest score
        best_section = max(scores, key=lambda x: x[1])
        return best_section[0]
    else:
        return "Introduction"

# --- Map images to best-fit section ---
image_placement_map = {}
for img_file in os.listdir(IMAGES_DIR):
    if img_file.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
        image_path = os.path.join(IMAGES_DIR, img_file)
        best_section = find_best_section_for_image(image_path, sections)
        image_placement_map[best_section] = image_path
        print(f"🖼️ {img_file} → 📌 {best_section}")

# --- Create new docx with images inserted ---
new_doc = Document()

for para in doc.paragraphs:
    new_doc.add_paragraph(para.text, style=para.style)

    if para.text.strip() in image_placement_map:
        image_path = image_placement_map[para.text.strip()]
        new_doc.add_picture(image_path, width=Inches(5))
        new_doc.add_paragraph("")  # spacing

new_doc.save(OUTPUT_DOCX)
print(f"✅ New lecture notes saved to {OUTPUT_DOCX}")

🖼️ page04_img01.png → 📌 Part 15
🖼️ page13_img01.png → 📌 Part 2
🖼️ page06_img01.png → 📌 Part 11
🖼️ page05_img01.png → 📌 Part 15
🖼️ page01_img01.png → 📌 Part 15
🖼️ page03_img01.png → 📌 Part 15
🖼️ page14_img01.png → 📌 Part 1
🖼️ page19_img01.png → 📌 Part 5
🖼️ page07_img01.png → 📌 Part 15
🖼️ page12_img01.png → 📌 Part 10
✅ New lecture notes saved to /content/lecture_notes_with_images.docx
