In [4]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# target a different interpreter). Pinned to the release this notebook ran with.
%pip install -q PyMuPDF==1.26.0

Collecting PyMuPDF
  Downloading pymupdf-1.26.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-macosx_11_0_arm64.whl (22.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.4/22.4 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# target a different interpreter). Pinned to the release this notebook ran with.
%pip install -q spacy==3.8.7

Collecting spacy
  Downloading spacy-3.8.7-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Down

In [34]:
import re
import json
from typing import List, Tuple, Dict, Optional
from dataclasses import dataclass, asdict
import fitz
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

# Make sure the NLTK sentence-tokenizer data is available.
# Older NLTK releases read the "punkt" resource in sent_tokenize, newer ones
# read "punkt_tab"; checking only "punkt" can therefore succeed while
# sent_tokenize still fails, so both are checked and fetched on demand.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        nltk.download(_nltk_resource, quiet=True)

# Download the spaCy English model if it is not already installed
# (spacy.load raises OSError when the model package is missing).
try:
    spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    from spacy.cli import download
    download("en_core_web_sm")

@dataclass
class FundInfo:
    """Container for the values extracted from a fund prospectus.

    Every scalar field is a string that defaults to ``""``.
    ``assumptions_made`` holds free-text notes recorded during extraction.
    """
    fund_name: str = ""
    sponsor: str = ""
    inception_year: str = ""
    annualized_distribution_rate: str = ""
    total_investments_fair_value: str = ""
    management_fee: str = ""
    asset_allocation: str = ""
    suitability_requirements: str = ""
    liquidity: str = ""
    # None is only a sentinel: __post_init__ swaps it for a fresh list, so
    # instances never share one mutable default.
    assumptions_made: Optional[List[str]] = None

    def __post_init__(self):
        """Replace a missing ``assumptions_made`` with a new empty list."""
        if self.assumptions_made is None:
            self.assumptions_made = []

class EnhancedFundParser:
    def __init__(self, pdf_path: str):
        """Load the PDF, the NLP models and the per-field search configuration.

        The steps run in a fixed order because later ones read attributes set
        by earlier ones: pages are extracted first, spaCy is loaded before the
        sentence split that uses it, and the BERT tokenizer/model are loaded
        last and moved to the selected device.

        Args:
            pdf_path: Path of the prospectus PDF; stored as ``self.pdf_path``
                and opened by ``_extract_pages_with_structure``.
        """
        self.pdf_path = pdf_path
        
        # Set up device: prefer Apple MPS, then CUDA, otherwise CPU
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        
        print(f"Using device: {self.device}")
        
        # Extract structured content (one dict per page) and the full text
        self.pages_content = self._extract_pages_with_structure()
        self.text = "\n".join([page["text"] for page in self.pages_content])
        
        # Initialize spaCy; max_length is raised to 2,000,000 characters
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.max_length = 2000000
        
        # Extract sentences with context (needs self.nlp and self.pages_content)
        self.sentences_with_context = self._extract_sentences_with_context()

        # Initialize BERT model in eval mode on the selected device
        print("Loading BERT model...")
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.model = AutoModel.from_pretrained('bert-base-uncased')
        self.model.to(self.device)
        self.model.eval()

        # Per-field search configuration, keyed by the field names that
        # extract() iterates over. Within each entry:
        #   context_keywords  - lower-case substrings; also joined into the
        #                       semantic-search query
        #   section_hints     - substrings looked for in the page's headers
        #   entity_patterns   - regexes applied to each sentence (IGNORECASE)
        #   negative_keywords - substrings that lower a sentence's score
        #   expected_format   - label only; not read by any method in this class
        self.field_contexts = {
            "fund_name": {
                "context_keywords": [
                    "fund name", "private credit fund", "blackrock", "investment company", 
                    "prospectus", "fund", "blackrock private credit", "series", "class",
                    "institutional shares", "investor shares", "portfolio", "the fund",
                    "closed-end fund", "interval fund", "credit fund", "name of fund",
                    "registrant", "fund title"
                ],
                "section_hints": [
                    "cover", "summary", "prospectus summary", "fund overview", "title",
                    "front matter", "header", "fund information", "general information",
                    "investment objective", "fund details"
                ],
                "entity_patterns": [
                    r"BlackRock\s+Private\s+Credit\s+Fund",
                    r"BlackRock\s+[A-Za-z\s&]*(?:Private\s+)?Credit\s+Fund",
                    r"(?:The\s+)?BlackRock\s+Private\s+Credit\s+Fund",
                    r"Fund\s+Name[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                    r"(BlackRock[A-Za-z\s&]*Fund)(?:\s+\(|,|\.|$)",
                    r"Registrant[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                    r"Investment\s+Company[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                    r"Series[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                    r"(?:^|\n)([A-Za-z\s]*BlackRock[A-Za-z\s]*Private[A-Za-z\s]*Credit[A-Za-z\s]*Fund)",
                    r"([A-Za-z\s]*Private\s+Credit\s+Fund)",
                    r"BlackRock\s+([A-Za-z\s&]+(?:Fund|Trust|Income|Credit))"
                ],
                "negative_keywords": ["fee", "expense", "manager", "advisor", "client", "employee"],
                "expected_format": "company_name"
            },
            "sponsor": {
                "context_keywords": [
                    "investment adviser", "sponsor", "adviser", "advisor", "fund manager", 
                    "managed by", "blackrock", "investment advisor", "portfolio manager",
                    "fund adviser", "general partner", "management company", "sub-adviser",
                    "investment management", "asset manager", "fund sponsor", "administers",
                    "investment advisory", "serves as", "manager", "advises", "principal",
                    "blackrock advisors", "blackrock inc", "blackrock fund advisors"
                ],
                "section_hints": [
                    "management", "sponsor", "advisor", "investment adviser", "fund management",
                    "portfolio management", "advisory services", "fund advisor", "management company",
                    "investment advisory", "administrator", "general information"
                ],
                "entity_patterns": [
                    r"Investment\s+Adviser?[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Advisors?|Management)?)",
                    r"Sponsor[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC)?)",
                    r"(BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Advisors?|Management|Fund\s+Advisors?))",
                    r"managed\s+by\s+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Advisors?))",
                    r"Investment\s+Manager[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*)",
                    r"Portfolio\s+Manager[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*)",
                    r"Fund\s+Adviser?[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*)",
                    r"serves\s+as.*?(?:adviser?|manager)[:\s]*([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*)",
                    r"(BlackRock)(?:\s+Inc\.?|\s+LLC|\s+Advisors?)?(?:\s+serves\s+as|\s+acts\s+as|\s+is\s+the)",
                    r"(?:^|\n|\s)(BlackRock)(?:\s+Inc\.?|\s+Fund\s+Advisors?|\s+Advisors?)?(?=\s+is\s+the|\s+serves|\s+acts)",
                    r"Investment\s+Adviser?[:\s]+([A-Za-z\s,&.]+(?:Inc|LLC|LP|Advisers?))"
                ],
                "negative_keywords": ["client", "investor", "shareholder", "distribution", "employee", "board"],
                "expected_format": "company_name"
            },
            "inception_year": {
                "context_keywords": ["inception", "commenced operations", "fund inception", "launched", "formed", "established"],
                "section_hints": ["fund history", "inception", "operations", "summary", "overview"],
                "entity_patterns": [
                    r"(?:inception|commenced|launched|formed|established).{0,50}(20\d{2})",
                    r"(20\d{2}).{0,50}(?:inception|commenced|launched)",
                    r"Fund\s+inception[:\s]+(20\d{2})",
                    r"since\s+(20\d{2})"
                ],
                "negative_keywords": ["expiration", "maturity", "termination"],
                "expected_format": "year"
            },
            "distribution_rate": {
                "context_keywords": ["distribution rate", "target distribution", "annualized", "yield", "effective yield", "monthly distribution"],
                "section_hints": ["distributions", "dividend policy", "yield", "performance", "summary"],
                "entity_patterns": [
                    r"(\d+\.\d+%)\s*(?:annualized|effective|target|distribution)",
                    r"(?:annualized|effective|target)\s+(?:distribution\s+rate|yield)[:\s]*(\d+\.\d+%)",
                    r"\$\d+\.\d+\s*(?:per\s+share|\/share).*?(?:monthly|quarterly)",
                    r"target.*?(\d+\.\d+%)"
                ],
                "negative_keywords": ["expense", "fee", "cost", "management"],
                "expected_format": "percentage_or_dollar"
            },
            "total_investments": {
                "context_keywords": ["total investments", "fair value", "portfolio value", "net assets", "total assets"],
                "section_hints": ["portfolio", "investments", "financial statements", "balance sheet", "assets"],
                "entity_patterns": [
                    r"[Tt]otal\s+investments[:\s]*\$([0-9,]+(?:\.[0-9]+)?)\s*(?:million|billion)?",
                    r"\$([0-9,]+(?:\.[0-9]+)?)\s*(?:million|billion)?.{0,30}(?:total\s+investments|fair\s+value)",
                    r"(?:Total|Net)\s+(?:investments|assets)[:\s]*\$([0-9,]+(?:\.[0-9]+)?)"
                ],
                "negative_keywords": ["fee", "expense", "liability", "distribution"],
                "expected_format": "dollar_amount"
            },
            "management_fee": {
                "context_keywords": ["management fee", "base fee", "advisory fee", "expense ratio", "fees"],
                "section_hints": ["fees", "expenses", "management", "compensation", "costs"],
                "entity_patterns": [
                    r"(?:base|management|advisory)\s+fee[:\s]*(\d+\.\d+%)",
                    r"(\d+\.\d+%)\s*(?:of\s+net\s+assets|annually|management)",
                    r"expense\s+ratio[:\s]*(\d+\.\d+%)"
                ],
                "negative_keywords": ["distribution", "dividend", "yield", "return"],
                "expected_format": "percentage_structure"
            },
            "asset_allocation": {
                "context_keywords": ["level 1", "level 2", "level 3", "fair value hierarchy", "asset allocation", "valuation"],
                "section_hints": ["portfolio", "fair value", "investments", "allocation", "valuation"],
                "entity_patterns": [
                    r"Level\s+([123])[:\s,]*([0-9]+\.[0-9]+)%",
                    r"([0-9]+\.[0-9]+)%.*?Level\s+([123])",
                    r"Level\s+([123]).*?([0-9]+\.[0-9]+%).*?portfolio"
                ],
                "negative_keywords": ["fee", "expense", "distribution"],
                "expected_format": "level_percentages"
            },
            "suitability": {
                "context_keywords": ["suitability", "minimum income", "net worth", "investor requirements", "eligibility", "qualified"],
                "section_hints": ["suitability", "investor", "eligibility", "requirements", "qualification"],
                "entity_patterns": [
                    r"\$([0-9,]+)\s+(?:annual\s+)?income.*?\$([0-9,]+)\s+net\s+worth",
                    r"\$([0-9,]+)\s+net\s+worth.*?\$([0-9,]+)\s+(?:annual\s+)?income",
                    r"(\$[0-9,]+)\s+(?:and|or)\s+(\$[0-9,]+)"
                ],
                "negative_keywords": ["fee", "expense", "distribution", "management"],
                "expected_format": "income_networth"
            },
            "liquidity": {
                "context_keywords": ["liquidity", "repurchase", "quarterly repurchase", "not publicly traded", "share repurchase"],
                "section_hints": ["liquidity", "repurchase", "trading", "shares", "redemption"],
                "entity_patterns": [
                    r"not\s+publicly\s+traded",
                    r"quarterly\s+repurchase",
                    r"limited\s+liquidity",
                    r"repurchase.*?program",
                    r"(\d+)%.*?repurchase"
                ],
                "negative_keywords": ["fee", "distribution", "yield"],
                "expected_format": "description"
            }
        }

    def _extract_pages_with_structure(self) -> List[Dict]:
        # Extract text with page numbers and basic structure detection
        doc = fitz.open(self.pdf_path)
        pages_content = []
        
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            
            # Detect potential section headers 
            headers = re.findall(r'^[A-Z][A-Z\s]{10,}$', text, re.MULTILINE)
            
            # Look for table-like structures
            tables = self._detect_tables(text)
            
            pages_content.append({
                "page_num": page_num + 1,
                "text": text,
                "headers": headers,
                "tables": tables,
                "has_financial_data": any(pattern in text.lower() 
                                        for pattern in ["$", "million", "billion", "%", "rate", "fee"])
            })
        
        doc.close()
        return pages_content

    def _detect_tables(self, text: str) -> List[str]:
        #Simple table detection based on patterns
        lines = text.split('\n')
        tables = []
        
        for i, line in enumerate(lines):
            # Look for lines with multiple dollar amounts or percentages
            if len(re.findall(r'\$[\d,]+|\d+\.\d+%', line)) >= 2:
                # Get surrounding context
                start = max(0, i-2)
                end = min(len(lines), i+3)
                table_context = '\n'.join(lines[start:end])
                tables.append(table_context)
        
        return tables

    def _extract_sentences_with_context(self) -> List[Dict]:
        #Extract sentences with page and section context
        sentences_with_context = []
        
        for page_info in self.pages_content:
            page_text = page_info["text"]
            page_num = page_info["page_num"]
            
            # Split into sentences
            try:
                # Process text in smaller chunks to avoid memory issues
                chunk_size = 1000000
                page_sentences = []
                
                if len(page_text) > chunk_size:
                    for i in range(0, len(page_text), chunk_size):
                        chunk = page_text[i:i+chunk_size]
                        try:
                            doc = self.nlp(chunk)
                            chunk_sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
                            page_sentences.extend(chunk_sentences)
                        except:
                            # Fallback to NLTK if spaCy fails
                            chunk_sentences = sent_tokenize(chunk)
                            page_sentences.extend([s.strip() for s in chunk_sentences if s.strip()])
                else:
                    try:
                        doc = self.nlp(page_text)
                        page_sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
                    except:
                        page_sentences = sent_tokenize(page_text)
                        page_sentences = [s.strip() for s in page_sentences if s.strip()]
                        
            except Exception as e:
                print(f"Warning: Error processing page {page_num}: {e}")
                page_sentences = sent_tokenize(page_text)
                page_sentences = [s.strip() for s in page_sentences if s.strip()]
            
            for sentence in page_sentences:
                if len(sentence) > 10:  # Filter out very short sentences
                    sentences_with_context.append({
                        "text": sentence,
                        "page_num": page_num,
                        "has_financial_data": page_info["has_financial_data"],
                        "near_headers": page_info["headers"],
                        "in_table": any(sentence in table for table in page_info["tables"])
                    })
        
        print(f"Extracted {len(sentences_with_context)} sentences with context")
        return sentences_with_context

    def _get_bert_embeddings(self, texts: List[str]) -> np.ndarray:
        #Get BERT embeddings for a list of texts
        embeddings = []
        
        for text in texts:
            inputs = self.tokenizer(text[:512], return_tensors='pt', truncation=True, padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = self.model(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(embedding[0])
        
        return np.array(embeddings)

    def _enhanced_pattern_search(self, field_name: str, config: Dict) -> List[Tuple[str, float, Dict]]:
        # Enhanced pattern-based search with context scoring
        candidates = []
        
        for sent_info in self.sentences_with_context:
            sentence = sent_info["text"]
            base_score = 0
            
            # Pattern matching with higher weights
            for pattern in config["entity_patterns"]:
                matches = re.finditer(pattern, sentence, re.IGNORECASE)
                for match in matches:
                    base_score += 5
                    
            # Context keyword matching
            sentence_lower = sentence.lower()
            for keyword in config["context_keywords"]:
                if keyword in sentence_lower:
                    base_score += 2
            
            # Section relevance bonus
            for hint in config["section_hints"]:
                for header in sent_info["near_headers"]:
                    if hint.lower() in header.lower():
                        base_score += 3
            
            # Financial data context bonus
            if sent_info["has_financial_data"] and field_name in ["total_investments", "management_fee", "distribution_rate"]:
                base_score += 2
            
            # Table context bonus for structured data
            if sent_info["in_table"] and field_name in ["asset_allocation", "total_investments"]:
                base_score += 3
            
            # Negative keyword penalty
            for neg_kw in config["negative_keywords"]:
                if neg_kw in sentence_lower:
                    base_score -= 1
            
            # Special scoring boosts for fund_name and sponsor
            if field_name == "fund_name":
                # Boost for exact fund name matches
                if "blackrock private credit fund" in sentence_lower:
                    base_score += 10
                # Boost for early pages (likely to contain fund name)
                if sent_info["page_num"] <= 5:
                    base_score += 3
                    
            if field_name == "sponsor":
                # Boost for BlackRock mentions in advisory context
                if "blackrock" in sentence_lower and any(word in sentence_lower for word in ["adviser", "advisor", "manager", "management"]):
                    base_score += 8
                # Boost for early pages
                if sent_info["page_num"] <= 10:
                    base_score += 2
            
            if base_score > 0:
                candidates.append((sentence, base_score, sent_info))
        
        # Sort by score
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[:20]  # Top 20 candidates

    def _semantic_search_enhanced(self, context_keywords: List[str], candidates: List[Tuple]) -> List[Tuple[str, float, Dict]]:
        # Enhanced semantic search on pre-filtered candidates
        if not candidates:
            return []
        
        query = " ".join(context_keywords)
        query_embedding = self._get_bert_embeddings([query])
        
        candidate_texts = [cand[0] for cand in candidates]
        candidate_embeddings = self._get_bert_embeddings(candidate_texts)
        
        similarities = cosine_similarity(query_embedding, candidate_embeddings)[0]
        
        # Combine pattern score with semantic similarity
        enhanced_candidates = []
        for i, (sentence, pattern_score, context) in enumerate(candidates):
            combined_score = pattern_score * 0.7 + similarities[i] * 10 * 0.3
            enhanced_candidates.append((sentence, combined_score, context))
        
        enhanced_candidates.sort(key=lambda x: x[1], reverse=True)
        return enhanced_candidates[:10]

    def _extract_specific_value(self, sentence: str, field_name: str, config: Dict) -> Optional[str]:
        # Extract specific values using targeted patterns
        
        if field_name == "fund_name":
            # Enhanced fund name patterns with priority order
            patterns = [
                # Exact match patterns (highest priority)
                r"(BlackRock\s+Private\s+Credit\s+Fund)(?!\s+Advisors)",
                r"(?:The\s+)?(BlackRock\s+Private\s+Credit\s+Fund)",
                
                # Partial match patterns
                r"Fund\s+Name[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                r"Registrant[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                r"Investment\s+Company[:\s]+([A-Za-z\s,&]*BlackRock[A-Za-z\s,&]*Fund)",
                
                # Generic BlackRock fund patterns
                r"(BlackRock\s+[A-Za-z\s&]*Credit\s+Fund)",
                r"BlackRock\s+([A-Za-z\s&]+(?:Fund|Trust|Income|Credit))",
                r"([A-Za-z\s&]*Private\s+Credit\s+Fund)"
            ]
            
            for pattern in patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    name = match.group(1).strip()
                    name = re.sub(r'\s+', ' ', name)
                    name = re.sub(r'^(the\s+)', '', name, flags=re.IGNORECASE)
                    # Prefer exact matches
                    if "BlackRock Private Credit Fund" in name:
                        return "BlackRock Private Credit Fund"
                    return name
        
        elif field_name == "sponsor":
            patterns = [
                r"Investment\s+Adviser?[:\s]+(BlackRock)(?:\s+Inc\.?|\s+Fund\s+Advisors?|\s+Advisors?|$|\s+serves|\s+acts)",
                r"Sponsor[:\s]+(BlackRock)(?:\s+Inc\.?|$|\s+serves|\s+acts)",
                r"(BlackRock)(?:\s+Inc\.?)?\s+(?:serves\s+as|acts\s+as|is\s+the).*?(?:adviser?|advisor|manager)",
                
                r"Investment\s+Adviser?[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Advisors?|Management|Fund\s+Advisors?))",
                r"Sponsor[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Fund\s+Advisors?))",
                r"managed\s+by\s+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*(?:Inc|LLC|Advisors?))",
                r"Fund\s+Adviser?[:\s]+([A-Za-z\s,&.]*BlackRock[A-Za-z\s,&.]*)",
                
                r"Investment\s+Adviser?[:\s]+([A-Za-z\s,&.]+(?:Inc|LLC|LP|Advisers?))"
            ]
            
            for pattern in patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    sponsor = match.group(1).strip()
                    # Clean up extra whitespace and trailing punctuation
                    sponsor = re.sub(r'\s+', ' ', sponsor)
                    sponsor = re.sub(r'[.,;]+$', '', sponsor)
                    
                    # Prefer simple "BlackRock" if found
                    if sponsor.lower() == "blackrock" or sponsor.lower() == "blackrock inc":
                        return "BlackRock"
                    elif "blackrock" in sponsor.lower():
                        return sponsor
                    else:
                        return sponsor
        
        elif field_name == "inception_year":
            # Look for years near inception keywords
            patterns = [
                r"(?:inception|commenced|launched|formed).{0,50}(20\d{2})",
                r"(20\d{2}).{0,50}(?:inception|commenced|launched)",
                r"Fund\s+inception[:\s]+(20\d{2})",
                r"since\s+(20\d{2})",
                r"established\s+in\s+(20\d{2})"
            ]
            for pattern in patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    return match.group(1)
        
        elif field_name == "total_investments":
            # Look for dollar amounts with investment context
            patterns = [
                r"[Tt]otal\s+investments[:\s]*\$([0-9,]+(?:\.[0-9]+)?)\s*(?:million|billion)?",
                r"\$([0-9,]+(?:\.[0-9]+)?)\s*(?:million|billion)?.{0,30}(?:total\s+investments|fair\s+value)",
                r"(?:Total|Net)\s+(?:investments|assets)[:\s]*\$([0-9,]+(?:\.[0-9]+)?)\s*(?:million|billion)?"
            ]
            for pattern in patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    amount = match.group(1)
                    # Check for million/billion modifier
                    if re.search(r'million', sentence, re.IGNORECASE):
                        return f"${amount} million"
                    elif re.search(r'billion', sentence, re.IGNORECASE):
                        return f"${amount} billion"
                    return f"${amount}"
        
        elif field_name == "management_fee":
            # Look for fee structure 
            fee_context = sentence.lower()
            if any(word in fee_context for word in ['base fee', 'management fee', 'advisory fee']):
                # Look for clean percentage patterns
                fees = re.findall(r"(\d+\.\d+%)", sentence)
                # Filter out obviously wrong percentages (like >50%)
                clean_fees = [fee for fee in fees if float(fee.replace('%', '')) <= 10.0]
                if clean_fees:
                    return " + ".join(clean_fees) if len(clean_fees) > 1 else clean_fees[0]
        
        elif field_name == "distribution_rate":
            # Look for distribution rates or monthly amounts
            patterns = [
                r"(\d+\.\d+%)\s*(?:annualized|effective|target|distribution)",
                r"(?:annualized|effective|target)\s+(?:distribution\s+rate|yield)[:\s]*(\d+\.\d+%)",
                r"\$(\d+\.\d+)\s*(?:per\s+share|\/share).*?(?:monthly|quarterly|annual)",
                r"target.*?(\d+\.\d+%)"
            ]
            for pattern in patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    value = match.group(1)
                    if '$' in value:
                        return f"${value}/share"
                    return value
        
        elif field_name == "asset_allocation":
            # Look for level percentages - extract all levels from sentence
            levels_found = {}
            patterns = [
                r"Level\s+([123])[:\s,]*([0-9]+\.[0-9]+)%",
                r"([0-9]+\.[0-9]+)%.*?Level\s+([123])"
            ]
            
            for pattern in patterns:
                matches = re.findall(pattern, sentence, re.IGNORECASE)
                for match in matches:
                    if len(match) == 2:
                        if pattern.startswith("Level"):
                            level, pct = match
                        else:
                            pct, level = match
                        levels_found[level] = f"{pct}%"
            
            if levels_found:
                result = []
                for level in ['1', '2', '3']:
                    if level in levels_found:
                        result.append(f"Level {level}: {levels_found[level]}")
                return "; ".join(result)
        
        elif field_name == "suitability":
            # Look for income/net worth requirements
            amounts = re.findall(r"\$([0-9,]+)", sentence)
            if len(amounts) >= 2:
                return f"${amounts[0]} income and ${amounts[1]} net worth"
            elif len(amounts) == 1:
                return f"${amounts[0]}"
        
        return None

    def _extract_field_enhanced(self, field_name: str) -> Tuple[str, List[str]]:
        """Enhanced field extraction combining multiple techniques"""
        assumptions = []
        config = self.field_contexts[field_name]
        
        # Step 1: Pattern-based candidate filtering
        pattern_candidates = self._enhanced_pattern_search(field_name, config)
        
        if not pattern_candidates:
            assumptions.append(f"{field_name} - no pattern matches found")
            return "", assumptions
        
        # Step 2: Semantic refinement
        semantic_candidates = self._semantic_search_enhanced(config["context_keywords"], pattern_candidates)
        
        # Step 3: Extract specific values
        for sentence, score, context in semantic_candidates:
            extracted_value = self._extract_specific_value(sentence, field_name, config)
            
            if extracted_value:
                confidence = "high" if score > 8 else "medium" if score > 5 else "low"
                if confidence != "high":
                    assumptions.append(f"{field_name} extracted with {confidence} confidence (score: {score:.1f})")
                
                return extracted_value, assumptions
        
        # Step 4: Fallback to best context
        if semantic_candidates:
            best_sentence, best_score, best_context = semantic_candidates[0]
            assumptions.append(f"{field_name} - using best context match (score: {best_score:.1f})")
            return best_sentence[:200] + "..." if len(best_sentence) > 200 else best_sentence, assumptions
        
        assumptions.append(f"{field_name} - no suitable candidates found")
        return "", assumptions

    def extract(self) -> FundInfo:
        """Extract fund information using enhanced hybrid approach"""
        info = FundInfo()
        all_assumptions = []
        
        field_mapping = {
            "total_investments": "total_investments_fair_value",
            "suitability": "suitability_requirements", 
            "distribution_rate": "annualized_distribution_rate"
        }
        
        for field_name in tqdm(self.field_contexts.keys(), desc="Extracting fields"):
            field_attr = field_mapping.get(field_name, field_name)
            
            value, assumptions = self._extract_field_enhanced(field_name)
            setattr(info, field_attr, value)
            all_assumptions.extend(assumptions)
        
        info.assumptions_made = all_assumptions
        return info

    def to_json(self, info: FundInfo) -> str:
        return json.dumps(asdict(info), indent=2, ensure_ascii=False)

# Usage
if __name__ == "__main__":
    import os
    import traceback
    from pathlib import Path

    # Location of the prospectus PDF. Set FUND_PROSPECTUS_PDF to point at
    # another file; the default is the current user's Downloads folder rather
    # than a path tied to one machine.
    pdf_path = Path(
        os.environ.get(
            "FUND_PROSPECTUS_PDF",
            Path.home() / "Downloads" / "Blackrock Prospectus.pdf",
        )
    ).expanduser()

    try:
        if not pdf_path.is_file():
            raise FileNotFoundError(f"Prospectus PDF not found: {pdf_path}")

        parser = EnhancedFundParser(str(pdf_path))
        fund_info = parser.extract()

        print("\n" + "="*60)
        print("ENHANCED EXTRACTION RESULTS")
        print("="*60)
        print(parser.to_json(fund_info))

    except Exception as e:
        # Keep the notebook cell from failing hard, but show the full trace.
        print(f"Error: {e}")
        traceback.print_exc()

Using device: mps
Extracted 4948 sentences with context
Loading BERT model...


Extracting fields: 100%|██████████████████████████| 9/9 [00:07<00:00,  1.17it/s]


ENHANCED EXTRACTION RESULTS
{
  "fund_name": "BlackRock Private Credit Fund",
  "sponsor": "and BlackRock Advisors, LLC",
  "inception_year": "2022",
  "annualized_distribution_rate": "5.0%",
  "total_investments_fair_value": "$400.9 million",
  "management_fee": "2.0% + 2.0% + 2.0% + 1.25% + 1.25% + 1.25% + 0.85% + 0.25% + 0.00% + 9.62% + 9.62% + 9.62% + 0.80% + 0.80% + 0.80% + 3.5% + 1.5%",
  "asset_allocation": "Level 1: 36.2%; Level 2: 36.2%; Level 3: 63.8%",
  "suitability_requirements": "$398,929,289 income and $210,903,951, net worth",
  "liquidity": "Share Repurchase Program\nAt the discretion of the Fund’s Board of Trustees, the Fund is conducting a share repurchase program in which the Fund is repurchasing, in each quarter, up to 5% of the Fund’s...",
  "assumptions_made": [
    "suitability extracted with medium confidence (score: 7.3)",
    "liquidity - using best context match (score: 12.2)"
  ]
}



