In [3]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Downloading pillow-11.3.0-cp313-cp313-win_amd64.whl.metadata (9.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six==20250506->pdfplumber)
  Downloading charset_normalizer-3.4.3-cp313-cp313-win_amd64.whl.metadata (37 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20250506->pdfplumber)
  Downloading cryptography-46.0.1-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber)
  Downloading cffi-2.0.0-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting pycparser (from cffi>=2.0.0->cryptography>=36.0.0->pdfminer.six==20250506->pdfp



In [2]:
import pdfplumber
from pathlib import Path
import json

In [3]:
# Go one level up from Notebook/ → into Data/Cards
BASE_FOLDER = Path("..") / "Data" / "Cards"

In [4]:
# List all PDFs
for pdf_file in BASE_FOLDER.rglob("*.pdf"):
    print(pdf_file)

..\Data\Cards\Kohls\20250930\Cashback.pdf
..\Data\Cards\Discover\20250930\Cashback.pdf
..\Data\Cards\Citi\20250930\Additional Document.pdf
..\Data\Cards\Citi\20250930\Cashback.pdf
..\Data\Cards\BOA Allegiant\20250930\Cashback.pdf
..\Data\Cards\Apple\20250930\Cashback.pdf
..\Data\Cards\Amex\20250930\Cashback.pdf


In [5]:
# Output folder (relative to Notebook/)
OUTPUT_FOLDER = Path("..") / "output"
OUTPUT_FOLDER.mkdir(exist_ok=True)  # create folder if it doesn’t exist

# JSONL file path
JSONL_OUTPUT = OUTPUT_FOLDER / "documents.jsonl"

In [6]:
# Chunking parameters
CHUNK_SIZE = 300
CHUNK_OVERLAP = 100

In [7]:
def extract_text_by_page(pdf_path: Path):
    """Yield text page by page from a PDF file."""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            yield page_num, text.strip()

In [8]:
def chunk_text(text, chunk_size=300, overlap=100):
    """Split text into overlapping chunks."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i+chunk_size])
        if chunk:  # skip empty
            chunks.append(chunk)
    return chunks


In [9]:
def main():
    with open(JSONL_OUTPUT, "w", encoding="utf-8") as f:
        for pdf_file in BASE_FOLDER.rglob("*.pdf"):
            for page_num, page_text in extract_text_by_page(pdf_file):
                if not page_text:
                    continue

                chunks = chunk_text(page_text, CHUNK_SIZE, CHUNK_OVERLAP)

                for chunk_idx, chunk in enumerate(chunks):
                    card_name = pdf_file.parents[1].name  # e.g. DiscoverIt
                    date = pdf_file.parents[0].name       # e.g. 2025-09-01

                    record = {
                        "card": card_name,
                        "date": date,
                        "filename": pdf_file.name,
                        "path": str(pdf_file.resolve()),
                        "page": page_num,
                        "chunk_index": chunk_idx,
                        "text": chunk
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"✅ Extracted PDFs from {BASE_FOLDER}")
    print(f"📄 JSONL saved to {JSONL_OUTPUT.resolve()}")



In [10]:
if __name__ == "__main__":
    main()

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Cannot set gray stroke color because /'P133' is an invalid float value
Cannot set gray non-stroke color because /'P133' is an invalid float value
Cannot set gray stroke color because /'P153' is an invalid float value
Cannot set gray non-stroke color because /'P153' is an invalid float value
Cannot set gray stroke color because /'P13' is an invalid float value
Cannot set gray non-stroke color because /'P13' is an invalid float value
Cannot set gray stroke color because /'P14' is an invalid float value
Cannot set gray non-stroke color because /'P14' is an invalid float value
Cannot set gray stroke color because /'P15' is an invalid float value
Cannot set gray non-stroke color because /'P15' is an invalid float value
Cannot set gray stroke color because /'P16' is an invalid float value
Cannot set gray non-stroke color because /

✅ Extracted PDFs from ..\Data\Cards
📄 JSONL saved to C:\Users\soumy\OneDrive\Documents\IntelligentCardSelectorEngine\output\documents.jsonl
