# Pdf Reader

In [1]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Configuration
DOCS_DIR = "basic/"
PERSIST_DIR = "stores/insurance_metadata_v3"
EMBEDDING_MODEL = "BAAI/bge-large-en"

def extract_plan_type(filename):
    """Extract plan type from filename (case-insensitive)"""
    filename_lower = filename.lower()
    if "basic" in filename_lower: return "basic"
    if "standard" in filename_lower: return "standard"
    if "enhanced" in filename_lower: return "enhanced"
    return "other"

# Initialize components
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False}
)

# Process each PDF
all_splits = []

for pdf_file in os.listdir(DOCS_DIR):
    if not pdf_file.endswith(".pdf"):
        continue
        
    print(f"Processing: {pdf_file}")
    file_path = os.path.join(DOCS_DIR, pdf_file)
    
    # Load PDF
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    
    # Split and add metadata
    plan_type = extract_plan_type(pdf_file)
    for page in pages:
        splits = text_splitter.split_documents([page])
        for split in splits:
            split.metadata.update({
                "plan_type": plan_type,
                "source_file": pdf_file,
                "file_type": "pdf"
            })
        all_splits.extend(splits)

# Create vector store
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory=PERSIST_DIR
)

vectorstore._collection.modify(
    metadata={"allow_filtering": True}  # Enable filtering by doc_type
)

print(f"\nIngestion complete! Stored {len(all_splits)} chunks from {len(os.listdir(DOCS_DIR))} PDFs.")

  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Processing: phi-standard.pdf
Processing: phi-enhanced.pdf
Processing: phi-basic.pdf

Ingestion complete! Stored 248 chunks from 7 PDFs.


# Extract data from Table

In [3]:
import os
import pdfplumber
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings


def extract_structured_docs_from_pdf(pdf_path):
    docs = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for table in tables:
                if table and len(table) > 1:  # Skip empty or header-only tables
                    for row in table[1:]:  # Skip header row
                        try:
                            benefit, reimbursement, maximum = row
                            sentence = f"{benefit} coverage reimburses {reimbursement} with a maximum of {maximum}."
                            metadata = {
                                "source": os.path.basename(pdf_path),
                                "page": i + 1,
                                "benefit": benefit.strip()
                            }
                            docs.append(Document(page_content=sentence, metadata=metadata))
                        except ValueError:
                            continue  # skip malformed rows
    return docs

def process_pdf_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            docs = extract_structured_docs_from_pdf(pdf_path)
            all_docs.extend(docs)
    return all_docs

# Extract documents
docs = process_pdf_folder("basic/")
print(docs)

# Add new documents
vectorstore.add_documents(docs)

# Save updates
vectorstore.persist()

print(f"✅ {len(docs)} documents embedded and saved to vector store.")


[Document(metadata={'source': 'phi-standard.pdf', 'page': 5, 'benefit': 'Drug'}, page_content='Drug coverage reimburses 70% on first $7,000\n100% on next $93,000 with a maximum of $100,000 in a calendar year.'), Document(metadata={'source': 'phi-standard.pdf', 'page': 5, 'benefit': 'Extended health'}, page_content='Extended health coverage reimburses 100% with a maximum of Described in the Extended health\nprovision section.'), Document(metadata={'source': 'phi-standard.pdf', 'page': 5, 'benefit': 'Vision'}, page_content='Vision coverage reimburses 100% with a maximum of $250 every two calendar years\nAn insured person becomes\neligible for vision coverage 1\nyear after the effective date of\nthis policy,.'), Document(metadata={'source': 'phi-standard.pdf', 'page': 5, 'benefit': 'Emergency travel medical\ncoverage'}, page_content='Emergency travel medical\ncoverage coverage reimburses 100% with a maximum of 60 days per trip\n$1,000,000 lifetime.'), Document(metadata={'source': 'phi-sta

  vectorstore.persist()


# WebPageLoader

In [5]:
URLS = [
    "https://www.ontario.ca/page/what-ohip-covers",
    "https://www.ontario.ca/page/ohip-coverage-while-outside-canada",
    "https://www.ontario.ca/page/documents-needed-get-health-card",
    "https://www.ontario.ca/page/military-families-services-and-support",
    "https://www.ontario.ca/page/apply-ohip-and-get-health-card",
    "https://uhip.ca/help-faq/",
    "https://www.ontario.ca/page/learn-about-ohip-plus"
]


In [None]:
from bs4 import BeautifulSoup
import re
import requests
import time
from typing import List
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import WebBaseLoader

# 1. Define the custom text splitter
class GovernmentTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self):
        super().__init__(
            chunk_size=512,
            chunk_overlap=64,
            separators=[
                "\n\n", 
                "\n", 
                r"(?<=\. )",  # Split after periods
                " ", 
                ""
            ],
            keep_separator=True
        )

# 2. Custom web loader with cleaning
class GovernmentWebLoader(WebBaseLoader):
    def __init__(self, urls):
        super().__init__(urls)
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        })
    
    def load(self) -> List[Document]:
        docs = super().load()
        return self.clean_documents(docs)
    
    def clean_documents(self, docs: List[Document]) -> List[Document]:
        cleaned_docs = []
        for doc in docs:
            try:
                soup = BeautifulSoup(doc.page_content, 'html.parser')
                
                # Remove unwanted elements
                for element in soup(['script', 'style', 'nav', 'footer']):
                    element.decompose()
                
                # Get clean text
                text = soup.get_text('\n', strip=True)
                text = re.sub(r'\n{3,}', '\n\n', text).strip()
                
                # Preserve important metadata
                metadata = doc.metadata.copy()
                metadata.update({
                    "doc_type": "OHIP" if "ontario.ca" in doc.metadata["source"] else "UHIP",
                    "cleaned": True
                })
                
                cleaned_docs.append(Document(
                    page_content=text,
                    metadata=metadata
                ))
            except Exception as e:
                print(f"Error cleaning document: {str(e)}")
                cleaned_docs.append(doc)
        return cleaned_docs

# 3. Vectorization pipeline
def vectorize_webpages():
    
    print("🕸️ Loading webpages...")
    loader = GovernmentWebLoader(URLS)
    docs = loader.load()
    
    print("✂️ Splitting documents...")
    splitter = GovernmentTextSplitter()
    chunks = splitter.split_documents(docs)
    
    return chunks

# Run the pipeline
chunks_web_based = vectorize_webpages()
vectorstore.add_documents(chunks_web_based)

# Save updates
vectorstore.persist()

🕸️ Loading webpages...
✂️ Splitting documents...


In [9]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from langchain_core.documents import Document
from urllib.parse import urljoin
from typing import List

# List of official OHIP/UHIP URLs
OHIP_URLS = [
    "https://www.ontario.ca/page/what-ohip-covers",
    "https://www.ontario.ca/page/ohip-coverage-while-outside-canada",
    "https://www.ontario.ca/page/documents-needed-get-health-card",
    "https://www.ontario.ca/page/apply-ohip-and-get-health-card"
]

UHIP_URLS = [
    "https://uhip.ca/help-faq/",
    "https://uhip.ca/coverage-details/"
]

def fetch_with_retry(url: str, max_retries: int = 3) -> requests.Response:
    """Handle request failures with retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
                timeout=10
            )
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff

def extract_tables_from_url(url: str) -> List[Document]:
    """Extract all tables from a single URL"""
    try:
        response = fetch_with_retry(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        documents = []
        
        for i, table in enumerate(soup.find_all('table'), 1):
            try:
                # Extract table data
                df = pd.read_html(str(table))[0]
                
                # Create document with context
                table_text = f"HEALTH TABLE FROM {url}:\n{df.to_markdown(index=False)}"
                
                documents.append(Document(
                    page_content=table_text,
                    metadata={
                        "source": url + f"#table_{i}",
                        "doc_type": "OHIP" if "ontario.ca" in url else "UHIP",
                        "columns": str(list(df.columns)),
                        "row_count": len(df),
                        "last_updated": response.headers.get('Last-Modified', '')
                    }
                ))
            except Exception as e:
                print(f"Skipped table {i} at {url}: {str(e)}")
                continue
                
        return documents
    except Exception as e:
        print(f"Failed to process {url}: {str(e)}")
        return []

def scrape_all_tables(url_list: List[str]) -> List[Document]:
    """Process multiple URLs in sequence"""
    all_tables = []
    for url in url_list:
        print(f"Processing {url}...")
        all_tables.extend(extract_tables_from_url(url))
        time.sleep(1)  # Respectful delay between requests
    return all_tables

# Usage
ohip_tables = scrape_all_tables(OHIP_URLS)
uhip_tables = scrape_all_tables(UHIP_URLS)
all_tables = ohip_tables + uhip_tables

vectorstore.add_documents(all_tables)

# 5. Save updates
vectorstore.persist()



Processing https://www.ontario.ca/page/what-ohip-covers...
Processing https://www.ontario.ca/page/ohip-coverage-while-outside-canada...
Processing https://www.ontario.ca/page/documents-needed-get-health-card...
Processing https://www.ontario.ca/page/apply-ohip-and-get-health-card...
Processing https://uhip.ca/help-faq/...


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Processing https://uhip.ca/coverage-details/...


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


# OCR Based

In [10]:
from PIL import Image
import pytesseract
from io import BytesIO
import requests

class OCRWebLoader(GovernmentWebLoader):
    def __init__(self, urls):
        super().__init__(urls)
        pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update path as needed

    def _extract_text_from_image(self, img_url: str) -> str:
        try:
            response = requests.get(img_url, stream=True)
            img = Image.open(BytesIO(response.content))
            return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"OCR failed for {img_url}: {str(e)}")
            return ""

    def load(self) -> List[Document]:
        docs = super().load()
        
        for doc in docs:
            soup = BeautifulSoup(doc.page_content, 'html.parser')
            for img in soup.find_all('img', src=True):
                if any(keyword in img['src'] for keyword in ['coverage', 'eligibility', 'table']):
                    ocr_text = self._extract_text_from_image(img['src'])
                    if ocr_text:
                        doc.page_content += f"\n[IMAGE TEXT]: {ocr_text}"
                        doc.metadata['ocr_extracted'] = True
        
        return docs

In [12]:
import pdfplumber

def extract_pdf_tables(pdf_url: str) -> List[Document]:
    try:
        response = requests.get(pdf_url)
        docs = []
        
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # Extract text
                text = page.extract_text()
                if text:
                    docs.append(Document(
                        page_content=text,
                        metadata={
                            "source": pdf_url,
                            "page": page.page_number,
                            "type": "pdf_text"
                        }
                    ))
                
                # Extract tables
                for table in page.extract_tables():
                    docs.append(Document(
                        page_content=str(table),
                        metadata={
                            "source": pdf_url,
                            "page": page.page_number,
                            "type": "pdf_table"
                        }
                    ))
        
        return docs
    except Exception as e:
        print(f"PDF extraction failed: {str(e)}")
        return []

In [13]:
def enhanced_vectorization():
    # Standard text content
    text_urls = URLS
    
    # PDF resources
    pdf_urls = [
        
    ]
    
    print("📄 Processing text content...")
    text_docs = OCRWebLoader(text_urls).load()
    
    print("📑 Processing PDF content...")
    pdf_docs = []
    for pdf_url in pdf_urls:
        pdf_docs.extend(extract_pdf_tables(pdf_url))
    
    print("✂️ Chunking documents...")
    all_docs = text_docs + pdf_docs
    chunks = GovernmentTextSplitter().split_documents(all_docs)
    
    return chunks

chucks_ocr = vectorize_webpages()


🕸️ Loading webpages...
✂️ Splitting documents...


In [15]:
vectorstore.add_documents(chucks_ocr)

# 5. Save updates
vectorstore.persist()