In [4]:
import re
from PyPDF2 import PdfReader
from pathlib import Path

def clean_text(text):
    return re.sub(r'\s{2,}', ' ', text).strip()

def extract_pdf_info_cleaned(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text() or ""

    lines = [clean_text(line) for line in full_text.strip().splitlines() if line.strip()]

    # Heuristic: pick first line with >4 words and mixed casing as title
    title = "Untitled"
    for line in lines[:10]:
        if len(line.split()) > 4 and any(c.islower() for c in line) and any(c.isupper() for c in line):
            title = line
            break
    if title == "Untitled" and lines:
        title = lines[0]

    body_lines = [l for l in lines if l != title]
    body = "\n".join(body_lines)

    return {
        "filename": Path(pdf_path).name,
        "title": title,
        "body": body
    }

# Example for a folder of PDFs
def load_pdfs_info(folder_path):
    pdfs = Path(folder_path).glob("*.pdf")
    return [extract_pdf_info_cleaned(str(pdf)) for pdf in pdfs]

# Usage
pdf_infos = load_pdfs_info("./data/v1/docs")
for info in pdf_infos:
    print(f"Title: {info['title']}")
    print(f"Body (excerpt): {info['body'][:200]}...\n")


Title: United States Office of Personnel Management
Body (excerpt): Annual Performance Report
Fiscal Year 2014
February 2015The United States Office
of Personnel Management
Fiscal Year 2014
Annual Performance ReportOPM Fiscal Year 2014 Annual Performance ReportvTable ...

Title: Everyfouryears,HHSupdatesitsStrategic Plan,whichdescribes itsworktoaddresscomplex, multifaceted, andevolving
Body (excerpt): Page1Strategic PlanFY2022–2026
healthandhumanservicesissues.Anagencystrategicplanisoneofthreemainelements requiredbytheGovernment
Performance andResultsAct(GPRA)of1993(P.L.103-62)andtheGPRAModernizati...

Title: United StateS Office Of PerSOnnel M anageMent
Body (excerpt): Annual Performance Report
Fiscal Year 2018
OPM .GOV MARCH 2019Message from the Acting Director . . . . . . . . 3
OPM Overview . . . . . . . . . . . . . . . . . . . . . . . . . . 5
Key Functions . . . ...

Title: ThisStrategic Planmeetstherequirements oftheGovernment Performance ResultsActModernization Act(GPRAMA) and
Bo

In [6]:
from PyPDF2 import PdfReader
from pathlib import Path

def extract_pdf_info_filename_as_title(pdf_path):
    reader = PdfReader(pdf_path)

    # Extract all text from all pages
    all_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            all_text += text + " "

    # Clean and normalize whitespace
    body = " ".join(all_text.strip().split())

    # Title = filename without extension
    title = Path(pdf_path).stem

    return {
        "filename": Path(pdf_path).name,
        "title": title,
        "body": body
    }

# Example: process all PDFs in a folder
def load_pdfs_info(folder_path):
    pdf_paths = Path(folder_path).glob("*.pdf")
    return [extract_pdf_info_filename_as_title(str(path)) for path in pdf_paths]

# Usage
pdf_infos = load_pdfs_info("./data/v1/docs")

# Preview
for info in pdf_infos:
    print(f"Title: {info['title']}")
    print(f"Body: {info['body'][:200]}...\n")


Title: 2014-annual-performance-report
Body: United States Office of Personnel Management Annual Performance Report Fiscal Year 2014 United States Office of Personnel Management February 2015 The United States Office of Personnel Management Fisc...

Title: HHS2022
Body: Page1Strategic PlanFY2022–2026 Everyfouryears,HHSupdatesitsStrategic Plan,whichdescribes itsworktoaddresscomplex, multifaceted, andevolving healthandhumanservicesissues.Anagencystrategicplanisoneofthr...

Title: 2018-annual-performance-report
Body: United StateS Office Of PerSOnnel M anageMent Annual Performance Report Fiscal Year 2018 OPM .GOV MARCH 2019 Message from the Acting Director . . . . . . . . 3 OPM Overview . . . . . . . . . . . . . ....

Title: DOL2022
Body: Page1U.S.DEPARTMENT OFLABORFY2022–2026Strategic Plan ThisStrategic Planmeetstherequirements oftheGovernment Performance ResultsActModernization Act(GPRAMA) and isthefoundation fortheDepartment ofLabor...

Title: DOC2022
Body: Page1Strategic Plan|2022–2026

In [7]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer("sentence-transformers/all-MiniLM-L6-v2")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [8]:
type(pdf_infos)

list

In [12]:
from sentence_transformers import SentenceTransformer, models as st_models
from qdrant_client import QdrantClient, models


client = QdrantClient(":memory:")

# use st_models for transformer models
word_embedding_model = st_models.Transformer("sentence-transformers/all-MiniLM-L6-v2")
pooling_model = st_models.Pooling(word_embedding_model.get_word_embedding_dimension())
encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# use models (Qdrant) here:
client.create_collection(
    collection_name="reports",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

client.upload_points(
    collection_name="reports",
    points=[
        models.PointStruct(
            id=idx, vector=encoder.encode(doc["body"]).tolist(), payload=doc
        )
        for idx, doc in enumerate(pdf_infos)
    ],
)


In [13]:
hits = client.query_points(
    collection_name="reports",
    query=encoder.encode("finance").tolist(),
    limit=3,
).points

for hit in hits:
    print(hit.payload, "score:", hit.score) # 

{'filename': '2016-annual-performance-report.pdf', 'title': '2016-annual-performance-report', 'body': "United StateS Office Of PerSOnnel M anageMent Annual Performance Report Fiscal Year 2016 OPM .GOV MAY 2017 Contents Message from the Acting Director . . . . . . . . . . . . . . 5 OPM Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 History . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 Profile . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 Purpose and Scope . . . . . . . . . . . . . . . . . . . . . . . . . . . 12 Results OPM: OPM’s Performance Management Approach . . . . . . . . . . . . . . . . . . . . . . 13 OPM’s Strategic Framework . . . . . . . . . . . . . . . . . . . 15 Organizational Structure . . . . . . . . . . . . . . . . . . . . . 17 Major Management Priorities and Challenges . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22 1 . Strategic Human Capital . . . . . 