# Document Processing Test
This notebook tests the DocumentLoader class for loading and processing PDF files.

In [13]:
# Setup: make the project root importable so that `src` can be imported from this notebook
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory
project_root = Path.cwd().parent

# Prepend it to sys.path only once, so re-running this cell does not add duplicates
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

Project root: c:\Users\kissa\OneDrive\Desktop\research-assistant


In [14]:
# Import the DocumentLoader class
from src.processing.document_loader import DocumentLoader

In [15]:
# Load a sample PDF
import os

# Build the path from project_root so it does not depend on the working directory
pdf_path = project_root / "data" / "samples" / "sample.pdf"

# Fail fast when the sample is missing: `loader` and `docs` are only created on the
# success path, so merely printing and carrying on would make the following cells
# fail with a NameError that hides the real cause.
if not pdf_path.exists():
    raise FileNotFoundError(f"❌ ERROR: PDF file not found at: {pdf_path}")

print(f"✅ Found PDF at: {pdf_path}")
loader = DocumentLoader()
# load_pdf is given the path as a plain string
docs = loader.load_pdf(str(pdf_path))
print(f"\n📄 Loaded {len(docs)} pages")

✅ Found PDF at: c:\Users\kissa\OneDrive\Desktop\research-assistant\data\samples\sample.pdf



📄 Loaded 9 pages


In [16]:
# Print the first 200 characters of the first loaded page's text
print("First 200 characters of page 1:")
first_page_text = docs[0].page_content
print(first_page_text[:200])

First 200 characters of page 1:
A Brief Introduction to Artificial Intelligence
What is AI and how is it going to shape the future 
By Dibbyo Saha, Undergraduate Student, Computer Science,
Ryerson University
What is Artificial Intel


In [17]:
# Print the metadata dictionary attached to the first loaded page
print("\nMetadata for page 1:")
first_page_metadata = docs[0].metadata
print(first_page_metadata)


Metadata for page 1:
{'source': 'c:\\Users\\kissa\\OneDrive\\Desktop\\research-assistant\\data\\samples\\sample.pdf', 'page': 0, 'filename': 'sample.pdf', 'upload_date': datetime.datetime(2026, 2, 5, 23, 31, 36, 334615)}


In [18]:
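# Test loading multiple PDFs (optional: uncomment once more sample files are available).
# Build the paths from project_root, as in the single-PDF cell, so they do not depend on the working directory.
# pdf_files = [str(project_root / "data" / "samples" / name) for name in ("sample.pdf", "another.pdf")]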
# all_docs = loader.load_multiple_pdfs(pdf_files)
# print(f"Total documents loaded: {len(all_docs)}")