In [None]:
# !git clone https://github.com/osamaizhar/Rag-pipelines-experiments

# PPT PARSER (.pptx)

In [37]:
# !pip install python-pptx


Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.2 python-pptx-1.0.2


In [46]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from pptx import Presentation

def parse_pptx_and_generate_embeddings(directory):
    """Parses all PowerPoint (.pptx) files in a directory and generates embeddings for slide text."""
    
    # Load Jina Embeddings Model
    model_name = "jinaai/jina-embeddings-v2-base-en"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    embedding_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # Find all PowerPoint files
    pptx_files = [f for f in os.listdir(directory) if f.endswith(".pptx")]
    parsed_data = {}
    ppt_embeddings = []
    for file in pptx_files:
        file_path = os.path.join(directory, file)
        parsed_data[file] = []

        try:
            presentation = Presentation(file_path)
            for slide in presentation.slides:
                slide_text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])

                if slide_text.strip():  # Ensure non-empty text
                    inputs = tokenizer(slide_text, return_tensors="pt", padding=True, truncation=True)
                    with torch.no_grad():
                        embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1)  # Mean pooling
                    
                    ppt_embeddings.append(embedding.squeeze().tolist())
                    parsed_data[file] = ppt_embeddings

        except Exception as e:
            print(f"Error processing file '{file}': {e}")
            continue
   
    # Print example embeddings
    for file, slides in parsed_data.items():
        print(f"\n📁 {file}: {len(slides)} slides processed")
        if slides:
            print(f"  🔹 First Slide Embedding: {slides[0][:5]}...")  # Show first 5 values
        # break  # Show only one file for display

    return ppt_embeddings



In [47]:
# Usage Example
directory = "/kaggle/working/Rag-pipelines-experiments/data"  # Change to your actual directory
ppt_embeddings = parse_pptx_and_generate_embeddings(directory)



📁 Lesson 02 - Communication and Teamwork.pptx: 174 slides processed
  🔹 First Slide Embedding: [-0.20139260590076447, -0.3115476071834564, 0.1886497288942337, 0.5844323039054871, 0.200248122215271]...

📁 Lesson 03 - Medicolegal Aspects of Surgical Technology.pptx: 174 slides processed
  🔹 First Slide Embedding: [-0.20139260590076447, -0.3115476071834564, 0.1886497288942337, 0.5844323039054871, 0.200248122215271]...

📁 Sample - Copy It.pptx: 174 slides processed
  🔹 First Slide Embedding: [-0.20139260590076447, -0.3115476071834564, 0.1886497288942337, 0.5844323039054871, 0.200248122215271]...

📁 Lesson 04 - Health Care Facility Structure and Environment.pptx: 174 slides processed
  🔹 First Slide Embedding: [-0.20139260590076447, -0.3115476071834564, 0.1886497288942337, 0.5844323039054871, 0.200248122215271]...

📁 Lesson 01 - Surgical Technology The Profession and The Professional.pptx: 174 slides processed
  🔹 First Slide Embedding: [-0.20139260590076447, -0.3115476071834564, 0.1886497

In [48]:
ppt_embeddings[0]

[-0.20139260590076447,
 -0.3115476071834564,
 0.1886497288942337,
 0.5844323039054871,
 0.200248122215271,
 0.553331196308136,
 0.01521590817719698,
 -0.7385714650154114,
 0.4191761016845703,
 0.45269879698753357,
 -0.33522191643714905,
 0.19675619900226593,
 -0.8370568156242371,
 0.1358143538236618,
 -0.5040600895881653,
 0.490649551153183,
 -0.17906594276428223,
 -0.012552035041153431,
 -0.21829192340373993,
 -0.5770393013954163,
 -0.5270491242408752,
 -0.43703290820121765,
 -1.7104445695877075,
 -0.6517637372016907,
 0.23284833133220673,
 1.325897216796875,
 0.613616406917572,
 0.20612581074237823,
 0.7205274105072021,
 0.6209684014320374,
 0.015426372177898884,
 -0.3050311803817749,
 -0.7862327098846436,
 0.822803795337677,
 -0.4934389591217041,
 -1.0027474164962769,
 0.0845283567905426,
 0.04830727353692055,
 0.3043513298034668,
 0.7387192845344543,
 -0.35939839482307434,
 0.3055199980735779,
 0.12588678300380707,
 0.8529673218727112,
 0.20381061732769012,
 0.5644808411598206,
 -0