In [3]:
pip install python-pptx

Collecting python-pptx
  Using cached python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Using cached XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Collecting lxml>=3.1.0 (from python-pptx)
  Using cached lxml-5.3.1-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Using cached python_pptx-1.0.2-py3-none-any.whl (472 kB)
Downloading lxml-5.3.1-cp312-cp312-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 3.8/3.8 MB 28.4 MB/s eta 0:00:00
Using cached XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
Installing collected packages: XlsxWriter, lxml, python-pptx
Successfully installed XlsxWriter-3.2.2 lxml-5.3.1 python-pptx-1.0.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Using cached sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.4.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
pip install streamlit

Collecting streamlit
  Using cached streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Using cached pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Co


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import PyPDF2
from pptx import Presentation
from typing import List, Dict, Any, Optional
import logging
from sentence_transformers import SentenceTransformer
import streamlit as st

logger = logging.getLogger(__name__)

class DocumentProcessor:
    """
    Processes documents (PDF, PPTX) for the review chatbot.
    """
    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2", 
                 chunk_size: int = 500, chunk_overlap: int = 100):
        """
        Initialize the document processor.
        
        Args:
            embedding_model: Name of the sentence-transformers model to use
            chunk_size: Size of document chunks in characters
            chunk_overlap: Overlap between chunks in characters
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
    
    def process_document(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Process a document file and return chunks with embeddings.
        
        Args:
            file_path: Path to the document file
            
        Returns:
            List of document chunks with content, embeddings, and metadata
        """
        # Extract text based on file type
        file_extension = os.path.splitext(file_path)[1].lower()
        
        if file_extension == '.pdf':
            page_texts = self._extract_pdf_text(file_path)
            # Extract topics from the combined text of all pages
            full_text = ' '.join([page_info['text'] for page_info in page_texts])
            topics = self._extract_topics(full_text)
            
            # Process each page and track page numbers
            processed_chunks = []
            chunk_id = 0
            
            for page_info in page_texts:
                page_text = page_info['text']
                page_num = page_info['page_number']
                
                # Chunk the page text
                chunks = self._chunk_text(page_text)
                
                # Create embeddings for each chunk from this page
                for chunk in chunks:
                    embedding = self.embedding_model.encode(chunk, show_progress_bar=False)
                    
                    processed_chunks.append({
                        'content': chunk,
                        'embedding': embedding,
                        'metadata': {
                            'source': os.path.basename(file_path),
                            'chunk_id': chunk_id,
                            'topics': topics,
                            'page_number': page_num
                        }
                    })
                    chunk_id += 1
            
            return processed_chunks
            
        elif file_extension in ['.pptx', '.ppt']:
            slide_texts = self._extract_pptx_text(file_path)
            # Extract topics from the combined text of all slides
            full_text = ' '.join([slide_info['text'] for slide_info in slide_texts])
            topics = self._extract_topics(full_text)
            
            # Process each slide and track slide numbers
            processed_chunks = []
            chunk_id = 0
            
            for slide_info in slide_texts:
                slide_text = slide_info['text']
                slide_num = slide_info['slide_number']
                
                # Chunk the slide text
                chunks = self._chunk_text(slide_text)
                
                # Create embeddings for each chunk from this slide
                for chunk in chunks:
                    embedding = self.embedding_model.encode(chunk, show_progress_bar=False)
                    
                    processed_chunks.append({
                        'content': chunk,
                        'embedding': embedding,
                        'metadata': {
                            'source': os.path.basename(file_path),
                            'chunk_id': chunk_id,
                            'topics': topics,
                            'page_number': slide_num  # For slides, use slide number as page number
                        }
                    })
                    chunk_id += 1
            
            return processed_chunks
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
    
    def _extract_pdf_text(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Extract text from PDF file with page tracking.
        
        Returns:
            List of dictionaries with text content and page number
        """
        page_texts = []
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if text.strip():  # Only add non-empty pages
                    page_texts.append({
                        'text': text,
                        'page_number': page_num + 1  
                    })
        return page_texts
    
    def _extract_pptx_text(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Extract text from PowerPoint file with slide tracking.
        
        Returns:
            List of dictionaries with text content and slide number
        """
        slide_texts = []
        prs = Presentation(file_path)
        for slide_num, slide in enumerate(prs.slides):
            text = ""
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
            
            if text.strip():  # Only add non-empty slides
                slide_texts.append({
                    'text': text,
                    'slide_number': slide_num + 1  
                })
        return slide_texts
    
    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks with overlap."""
        chunks = []
        start = 0
        
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            
            # Adjust end to avoid splitting words
            if end < len(text):
                # Look for the last space within the chunk
                last_space = text.rfind(' ', start, end)
                if last_space != -1 and last_space > start:
                   end = last_space + 1  # Include the space

            # Add the chunk
            chunks.append(text[start:end])
            
            # Move the start position for the next chunk, considering overlap
            start = max(end - self.chunk_overlap, start + 1)
        
        return chunks
    
    def _extract_topics(self, text: str) -> List[str]:
        """
        Extract key topics from text (simple version).
        A more sophisticated topic extraction would be implemented here.
        """
        # Simple keyword extraction - in a real system, use TF-IDF or LDA
        common_words = ['the', 'and', 'or', 'to', 'a', 'in', 'that', 'it', 'with']
        words = [word.lower() for word in text.split() if len(word) > 4]
        word_counts = {}
        
        for word in words:
            if word not in common_words:
                word_counts[word] = word_counts.get(word, 0) + 1
        
        # Get the top 5 words as "topics"
        sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        topics = [word for word, count in sorted_words[:5]]
        
        return topics
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Test the processor
print("Initializing DocumentProcessor...")
processor = DocumentProcessor()

# Test with PDF file
pdf_path = r"C:\Users\karel\Downloads\answer-generation-for-retrieval-based-question-answering-systems.pdf"
print(f"Testing PDF processing: {pdf_path}")
try:
    pdf_chunks = processor.process_document(pdf_path)
    print(f"Successfully processed PDF into {len(pdf_chunks)} chunks")
    if pdf_chunks:
        print(f"First chunk content: {pdf_chunks[0]['content'][:100]}...")
        print(f"Topics identified: {pdf_chunks[0]['metadata']['topics']}")
        print(f"Page number: {pdf_chunks[0]['metadata']['page_number']}")
except Exception as e:
    print(f"Error processing PDF: {str(e)}")
    import traceback
    traceback.print_exc()

# Test with PowerPoint file
pptx_path = r"C:\Users\karel\Downloads\DTI 5125 Question Answering Group 1.pptx"
print(f"\nTesting PowerPoint processing: {pptx_path}")
try:
    pptx_chunks = processor.process_document(pptx_path)
    print(f"Successfully processed PowerPoint into {len(pptx_chunks)} chunks")
    if pptx_chunks:
        print(f"First chunk content: {pptx_chunks[0]['content'][:100]}...")
        print(f"Topics identified: {pptx_chunks[0]['metadata']['topics']}")
        print(f"Slide number: {pptx_chunks[0]['metadata']['page_number']}")
except Exception as e:
    print(f"Error processing PPTX: {str(e)}")
    import traceback
    traceback.print_exc()

Initializing DocumentProcessor...
Testing PDF processing: C:\Users\karel\Downloads\answer-generation-for-retrieval-based-question-answering-systems.pdf
Successfully processed PDF into 775 chunks
First chunk content: Answer Generation for Retrieval-based Question Answering Systems
Chao-Chun Hsu1, Eric Lind2, Luca S...
Topics identified: ['answer', 'model', 'genqa', 'association', 'pages']
Page number: 1

Testing PowerPoint processing: C:\Users\karel\Downloads\DTI 5125 Question Answering Group 1.pptx
Successfully processed PowerPoint into 1835 chunks
First chunk content: Article presentation:
Question answering SYSTEMS
Akshat Khare - 300342170
Laurie Kanga-Eba - 3004331...
Topics identified: ['answer', 'model', 'question', 'article', 'answering']
Slide number: 1
