# Notice-of-Privacy-Practice.pdf

In [11]:
import fitz  # PyMuPDF
import math

def extract_chunks_with_metadata(pdf_path, chunk_size=100, chunk_overlap=15):
    doc = fitz.open(pdf_path)
    all_chunks = []

    for page_num in range(len(doc)):
        text = doc[page_num].get_text()
        words = text.split()
        i = 0
        chunk_id = 0

        while i < len(words):
            chunk_words = words[i:i + chunk_size]
            chunk_text = ' '.join(chunk_words)

            metadata = {
                "source": pdf_path.split("/")[-1],
                "page": page_num + 1,
                "chunk_id": chunk_id
            }

            all_chunks.append({
                "content": chunk_text,
                "metadata": metadata
            })

            i += chunk_size - chunk_overlap
            chunk_id += 1

    return all_chunks

# Example usage
chunks = extract_chunks_with_metadata("/Users/sanket/Documents/Python Projects/New Project LLM MultiDB/pdf/pdf-pharmacy-solutions-UNC_HDHP_PreventiveMedications_2024.pdf")
print(f"Extracted {len(chunks)} chunks.")
print(chunks[0])  # Preview first chunk


Extracted 15 chunks.
{'content': '1 2024 Preventive Medications UNC Health Pharmacy Solutions Certain preventative medications can be obtained at zero or low cost before the deductible is satisfied depending on your specific plan documents. Medications must be covered by your plan to be eligible for the preventive medication program– not all strengths and dosages will be covered. To check the cost of a medication, refer to your benefit handbook or contact customer service at the toll-free member phone number on your benefit plan ID card. Quality Driven and Cost-Conscious Medications As the pharmacy benefit manager for your benefit plan, UNC Health Pharmacy Solutions strives', 'metadata': {'source': 'pdf-pharmacy-solutions-UNC_HDHP_PreventiveMedications_2024.pdf', 'page': 1, 'chunk_id': 0}}


### 1. Preview All Chunks (Sample View)

In [2]:
for i in range(3):
    print(f"\n--- Chunk {i} ---")
    print(chunks[i]['content'])

for i in range(-3, 0):
    print(f"\n--- Chunk {len(chunks) + i} ---")
    print(chunks[i]['content'])



--- Chunk 0 ---
Page 1 of 8 Version 7 JOINT NOTICE OF PRIVACY PRACTICES OF THE UNIVERSITY OF NORTH CAROLINA HEALTH CARE SYSTEM ORGANIZED HEALTH CARE ARRANGEMENT THIS NOTICE DESCRIBES HOW MEDICAL INFORMATION ABOUT YOU MAY BE USED AND DISCLOSED AND HOW YOU CAN GET ACCESS TO THIS INFORMATION. PLEASE REVIEW IT CAREFULLY. Effective date: May 1, 2019 WHO FOLLOWS THIS NOTICE This Joint Notice of Privacy Practices (“Notice”) applies to entities that are owned or managed by the University of North Carolina Health Care System (“UNCHCS”), including UNCHCS facilities, practices, departments, and other sites of service (“UNCHCS entities”); our employees, contractors, trainees,

--- Chunk 1 ---
UNCHCS facilities, practices, departments, and other sites of service (“UNCHCS entities”); our employees, contractors, trainees, and volunteers; members of our medical staffs and their approved personnel when providing services to you at a UNCHCS location; and any other members of the UNCHCS workforce who ar

### 2. Check for Empty or Very Short Chunks

In [3]:
for chunk in chunks:
    if len(chunk['content'].strip()) < 50:
        print("⚠️ Short chunk found:", chunk['metadata'])


⚠️ Short chunk found: {'source': 'Notice-of-Privacy-Practice.pdf', 'page': 1, 'chunk_id': 5}


### 3. Check Page Coverage

In [4]:
from collections import Counter

pages = [chunk['metadata']['page'] for chunk in chunks]
counts = Counter(pages)
print("Chunks per page:", dict(counts))


Chunks per page: {1: 6, 2: 6, 3: 6, 4: 6, 5: 7, 6: 8, 7: 7, 8: 4}


### 4. Total Word Count Comparison

In [5]:
total_words = sum(len(chunk['content'].split()) for chunk in chunks)
print("Estimated total words from chunks:", total_words)


Estimated total words from chunks: 4486


# Creating EMbeddings


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Make sure your `chunks` list is already available
# Each chunk: {"content": str, "metadata": dict}

# Step 1: Convert to LangChain Documents
documents = [
    Document(page_content=chunk["content"], metadata=chunk["metadata"])
    for chunk in chunks
]

# Step 2: Load open-source embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 3: Create FAISS vector store
vectorstore = FAISS.from_documents(documents, embedding_model)

# Step 4: Save FAISS index locally
vectorstore.save_local("faiss_index_notice_privacy")
print("✅ FAISS index saved to: faiss_index_notice_privacy")


  from .autonotebook import tqdm as notebook_tqdm


✅ FAISS index saved to: faiss_index_notice_privacy


In [15]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Define FAISS storage path
faiss_path = "/Users/sanket/Documents/Python Projects/New Project LLM MultiDB/faiss_index_notice_privacy"

# Convert chunks to LangChain Document format
documents = [
    Document(page_content=chunk["content"], metadata=chunk["metadata"])
    for chunk in chunks
]

# Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Check if FAISS index already exists
if os.path.exists(f"{faiss_path}/index.faiss") and os.path.exists(f"{faiss_path}/index.pkl"):
    print("📦 Existing FAISS index found. Loading and appending...")
    vectorstore = FAISS.load_local(faiss_path, embedding_model, allow_dangerous_deserialization=True)
    vectorstore.add_documents(documents)
else:
    print("🆕 No FAISS index found. Creating new one...")
    vectorstore = FAISS.from_documents(documents, embedding_model)

# Save the updated index
vectorstore.save_local(faiss_path)
print(f"✅ FAISS index updated and saved at: {faiss_path}")

chunks = extract_chunks_with_metadata("/Users/sanket/Documents/Python Projects/New Project LLM MultiDB/pdf/User Guide.pdf")


📦 Existing FAISS index found. Loading and appending...
✅ FAISS index updated and saved at: /Users/sanket/Documents/Python Projects/New Project LLM MultiDB/faiss_index_notice_privacy


In [17]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Step 1: Load the same embedding model used during index creation
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Load the FAISS index
vectorstore = FAISS.load_local(
    "faiss_index_notice_privacy",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True  # Required if you're using pickle (index.pkl)
)

# Step 3: Access internal FAISS index
index = vectorstore.index

# Step 4: Get number of vectors and dimensions
num_vectors = index.ntotal
embedding_dim = index.d

print(f"📐 FAISS index shape: {num_vectors} vectors × {embedding_dim} dimensions")


📐 FAISS index shape: 120 vectors × 384 dimensions
