# In [2]:  (Jupyter notebook cell marker)
import os
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pdfplumber
import PyPDF2
import torch
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Input PDFs handed to FinancialReportProcessor; paths are relative to the
# working directory of the notebook/script.
pdf_paths = [
    r"tt-files/BVH_Baocaotaichinh_Q3_2024_Hopnhat.pdf",
    r"tt-files/BVH_Baocaotaichinh_Q4_2024_Hopnhat.pdf",
]

class FinancialReportProcessor:
    def __init__(self, pdf_paths):
        self.pdf_paths = pdf_paths
        self.output_dir = os.path.join(os.path.dirname(pdf_paths[0]), "processed_chunks")
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_text_with_pdfplumber(self, pdf_path: str) -> str:
        """Extract text from PDF using pdfplumber to better maintain layout."""
        text = ""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    page_text = page.extract_text(x_tolerance=3, y_tolerance=3)
                    if page_text:
                        text += f"\n\n--- Page {page_num + 1} ---\n\n" + page_text
        except Exception as e:
            print(f"Error extracting text from {pdf_path} with pdfplumber: {e}")
            # Fallback to PyPDF2
            text = self.extract_text_with_pypdf2(pdf_path)

        return text

    def extract_text_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback extractor using PyPDF2."""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                num_pages = len(reader.pages)

                for page_num in range(num_pages):
                    page = reader.pages[page_num]
                    page_text = page.extract_text()
                    if page_text:
                        text += f"\n\n--- Page {page_num + 1} ---\n\n" + page_text
        except Exception as e:
            print(f"Error extracting text from {pdf_path} with PyPDF2: {e}")

        return text

    def clean_financial_text(self, text: str) -> str:
        """Clean OCR errors common in financial documents."""
        # Replace common OCR errors in financial reports
        replacements = {
            r'\(VND\)': ' VND',
            r'\b,\b': '.',  # Replace isolated commas with decimal points
            r'[\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a]+': ' ',  # Replace various space chars
            r'[^\S\r\n]+': ' ',  # Multiple spaces to single space
            r'\n{3,}': '\n\n',  # Multiple newlines to double newline
        }

        for pattern, replacement in replacements.items():
            text = re.sub(pattern, replacement, text)

        return text

    def detect_document_structure(self, text: str) -> Dict[str, List[Tuple[str, int, int]]]:
        """
        Detect the document structure by identifying section headers and their positions.
        Returns a dictionary of sections with their start and end positions.
        """
        # Common section headers in financial reports
        section_patterns = [
            r"(THÔNG TIN CHUNG)",
            r"(BẢNG CÂN ĐỐI KẾ TOÁN HỢP NHẤT[^\n]*)",
            r"(BÁO CÁO KẾT QUẢ HOẠT ĐỘNG KINH DOANH HỢP NHẤT[^\n]*)",
            r"(BÁO CÁO LƯU CHUYỂN TIỀN TỆ HỢP NHẤT[^\n]*)",
            r"(THUYẾT MINH BÁO CÁO TÀI CHÍNH HỢP NHẤT[^\n]*)",
            r"(MỤC LỤC)",
            r"(\d+\.\s+[A-Z\s][A-ZĐÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ\s]+)"
        ]

        sections = {}
        for pattern in section_patterns:
            for match in re.finditer(pattern, text, re.MULTILINE):
                section_title = match.group(1).strip()
                sections[section_title] = (section_title, match.start(), match.end())

        # Sort sections by position
        sorted_sections = sorted(sections.values(), key=lambda x: x[1])

        # Determine section bounds (start of one section to start of next)
        structured_sections = []
        for i in range(len(sorted_sections)):
            section_title, start_pos, _ = sorted_sections[i]
            end_pos = sorted_sections[i+1][1] if i < len(sorted_sections) - 1 else len(text)
            structured_sections.append((section_title, start_pos, end_pos))

        return structured_sections

    def extract_metadata(self, text: str, file_path: str) -> Dict[str, Any]:
        """Extract metadata from the document text."""
        metadata = {
            "source": os.path.basename(file_path),
            "file_path": file_path
        }

        # Extract quarter and year
        quarter_match = re.search(r'Q(\d+)_(\d{4})', file_path)
        if quarter_match:
            metadata["quarter"] = quarter_match.group(1)
            metadata["year"] = quarter_match.group(2)

        # Extract company info
        if re.search(r'Tập đoàn Bảo Việt', text, re.IGNORECASE):
            metadata["company"] = "Tập đoàn Bảo Việt"
            metadata["ticker"] = "BVH"

        return metadata

    def classify_section_type(self, section_title: str, section_text: str) -> str:
        """Classify the type of financial report section."""
        if re.search(r'bảng cân đối', section_title, re.IGNORECASE):
            return "balance_sheet"
        elif re.search(r'kết quả.*kinh doanh', section_title, re.IGNORECASE):
            return "income_statement"
        elif re.search(r'lưu chuyển tiền tệ', section_title, re.IGNORECASE):
            return "cash_flow"
        elif re.search(r'thuyết minh', section_title, re.IGNORECASE):
            return "notes"
        elif re.search(r'thông tin chung', section_title, re.IGNORECASE):
            return "general_info"
        elif re.search(r'^\d+\.', section_title.strip()):
            return "note_item"
        else:
            return "other"

    def is_tabular_section(self, section_text: str) -> bool:
        """Determine if a section contains financial tables."""
        # Look for patterns indicating tables (lots of numbers, aligned data)
        if re.search(r'(\d{1,3}(\.|\,)\d{3}){2,}', section_text):
            return True
        # Count number of lines with aligned data patterns
        lines = section_text.split('\n')
        aligned_lines = 0
        for line in lines:
            if re.search(r'\d+\s+[\d\.,]+\s+[\d\.,]+', line):
                aligned_lines += 1
        return aligned_lines > 5

    def chunk_financial_report(self, text: str, file_path: str) -> List[Dict[str, Any]]:
        """
        Split a financial report into semantic chunks based on document structure,
        optimized for RAG with special handling for tabular and text content.
        Returns a list of dictionaries with text content and metadata.
        """
        chunks = []
        base_metadata = self.extract_metadata(text, file_path)

        # Clean text
        text = self.clean_financial_text(text)

        # Detect sections
        structured_sections = self.detect_document_structure(text)

        # Process each section
        for section_title, start_pos, end_pos in structured_sections:
            section_text = text[start_pos:end_pos].strip()
            section_type = self.classify_section_type(section_title, section_text)

            if section_type in ["balance_sheet", "income_statement", "cash_flow"]:
                # Financial statements: keep tables intact
                # Split into logical sub-tables if needed
                # For balance sheet: assets, liabilities, equity
                if self.is_tabular_section(section_text):
                    # Special handling for tables - prevent chunking within tables
                    tables = self.extract_logical_tables(section_text)
                    for i, table_text in enumerate(tables):
                        metadata = base_metadata.copy()
                        metadata.update({
                            "section": section_title,
                            "section_type": section_type,
                            "chunk_type": "financial_statement_table",
                            "table_index": i
                        })
                        chunks.append({
                            "content": table_text.strip(),
                            "metadata": metadata
                        })
                else:
                    # If no tables detected, keep as is
                    metadata = base_metadata.copy()
                    metadata.update({
                        "section": section_title,
                        "section_type": section_type,
                        "chunk_type": "financial_statement"
                    })
                    chunks.append({
                        "content": section_text.strip(),
                        "metadata": metadata
                    })

            elif section_type == "notes" or section_type == "note_item":
                # Notes: chunk by note number
                note_chunks = self.chunk_notes_section(section_text, section_title)
                for note_title, note_content in note_chunks:
                    metadata = base_metadata.copy()
                    metadata.update({
                        "section": section_title if section_type == "notes" else "THUYẾT MINH BÁO CÁO TÀI CHÍNH HỢP NHẤT",
                        "note": note_title,
                        "section_type": section_type,
                        "chunk_type": "financial_note"
                    })
                    chunks.append({
                        "content": f"{note_title}\n\n{note_content}".strip(),
                        "metadata": metadata
                    })

            else:
                # General information and other sections: semantic chunking
                semantic_chunks = self.chunk_by_semantic_paragraphs(section_text, section_title)
                for chunk_title, chunk_content in semantic_chunks:
                    metadata = base_metadata.copy()
                    metadata.update({
                        "section": section_title,
                        "section_type": section_type,
                        "chunk_type": "general_info",
                        "subsection": chunk_title if chunk_title != section_title else ""
                    })
                    chunks.append({
                        "content": chunk_content.strip(),
                        "metadata": metadata
                    })

        return chunks

    def extract_logical_tables(self, section_text: str) -> List[str]:
        """Split financial tables into logical subcomponents."""
        # Look for common dividing patterns in tables
        table_divisions = [
            r"TÀI SẢN",
            r"NGUỒN VỐN",
            r"CHỈ TIÊU",
            r"Lưu chuyển tiền từ hoạt động",
        ]

        # Find table divisions
        divisions = [(0, "Start")]
        for pattern in table_divisions:
            for match in re.finditer(pattern, section_text):
                divisions.append((match.start(), pattern))
        divisions.append((len(section_text), "End"))

        # Sort divisions by position
        divisions.sort()

        # Extract tables between divisions
        tables = []
        for i in range(len(divisions) - 1):
            start_pos = divisions[i][0]
            end_pos = divisions[i+1][0]
            table_text = section_text[start_pos:end_pos].strip()
            if table_text and len(table_text.split()) > 10:  # Skip very small chunks
                tables.append(table_text)

        # If no divisions found, return the entire section
        if len(tables) == 0:
            return [section_text]

        return tables

    def chunk_notes_section(self, section_text: str, section_title: str) -> List[Tuple[str, str]]:
        """
        Chunk notes section into individual notes.
        Returns a list of (note_title, note_content) tuples.
        """
        # Pattern to identify note headers
        note_pattern = r"(\d+\.?\s+[A-ZĐÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ\s]+)"

        # Split by note headers
        note_sections = re.split(note_pattern, section_text)
        chunks = []

        # If section_title is a note title itself
        if re.search(r"^\d+\.", section_title.strip()):
            chunks.append((section_title, section_text))
            return chunks

        # Otherwise, process all notes in the section
        if len(note_sections) > 1:
            for i in range(1, len(note_sections), 2):
                if i+1 < len(note_sections):
                    note_title = note_sections[i].strip()
                    note_content = note_sections[i+1].strip()

                    # Handle long notes by splitting them into context-preserving chunks
                    if len(note_content.split()) > 800:
                        # Find logical divisions within the note
                        subsections = self.find_note_subsections(note_content)
                        if subsections:
                            for subsection in subsections:
                                chunks.append((note_title, subsection))
                        else:
                            # If no subsections, chunk by paragraphs with overlap
                            chunks.append((note_title, note_content))
                    else:
                        chunks.append((note_title, note_content))
        else:
            # If no notes found, return the whole section
            chunks.append((section_title, section_text))

        return chunks

    def find_note_subsections(self, note_content: str) -> List[str]:
        """Find logical subsections within a note."""
        # Look for subsection patterns like a), b), c) or i), ii), iii)
        subsection_pattern = r'(?:^|\n)(?:[a-z]\)|\([a-z]\)|\(?\d+\)|\([ivx]+\))'

        # Find subsection starts
        subsection_matches = list(re.finditer(subsection_pattern, note_content, re.MULTILINE))

        if len(subsection_matches) <= 1:
            return []

        # Extract subsections
        subsections = []
        for i in range(len(subsection_matches)):
            start_pos = subsection_matches[i].start()
            end_pos = subsection_matches[i+1].start() if i < len(subsection_matches) - 1 else len(note_content)
            subsection_text = note_content[start_pos:end_pos].strip()
            if subsection_text and len(subsection_text.split()) > 15:  # Skip very small chunks
                subsections.append(subsection_text)

        return subsections

    def chunk_by_semantic_paragraphs(self, section_text: str, section_title: str) -> List[Tuple[str, str]]:
        """
        Chunk text by semantic paragraphs, respecting context.
        Returns a list of (subsection_title, subsection_content) tuples.
        """
        # Look for subsection headers (all caps or with specific formats)
        subsection_pattern = r'(?:^|\n)([A-ZĐÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ][A-ZĐÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ\s]{2,}(?:\n|\:))'

        # Find subsection headers
        subsection_matches = list(re.finditer(subsection_pattern, section_text))

        if not subsection_matches:
            # No subsections found, check if text is long enough to split
            if len(section_text.split()) > 800:
                # Split by paragraphs with contextual overlap
                paragraphs = re.split(r'\n\s*\n', section_text)
                chunks = []
                current_chunk = ""

                for para in paragraphs:
                    if not para.strip():
                        continue

                    if len(current_chunk.split()) + len(para.split()) > 800:
                        chunks.append((section_title, current_chunk.strip()))
                        # Start new chunk with 20% overlap for context
                        current_chunk = para
                    else:
                        if current_chunk:
                            current_chunk += "\n\n"
                        current_chunk += para

                # Add final chunk
                if current_chunk:
                    chunks.append((section_title, current_chunk.strip()))

                return chunks
            else:
                # Text is short enough to keep as a single chunk
                return [(section_title, section_text)]

        # Extract subsections
        chunks = []
        for i in range(len(subsection_matches)):
            start_pos = subsection_matches[i].start()
            end_pos = subsection_matches[i+1].start() if i < len(subsection_matches) - 1 else len(section_text)
            subsection_title = subsection_matches[i].group(1).strip()
            subsection_text = section_text[start_pos:end_pos].strip()

            if subsection_text and len(subsection_text.split()) > 15:  # Skip very small chunks
                chunks.append((subsection_title, subsection_text))

        # Include text before the first subsection
        if subsection_matches and subsection_matches[0].start() > 0:
            prefix_text = section_text[:subsection_matches[0].start()].strip()
            if prefix_text and len(prefix_text.split()) > 15:
                chunks.insert(0, (section_title, prefix_text))

        # If no valid chunks, return the whole section
        if not chunks:
            return [(section_title, section_text)]

        return chunks

    def process_all_reports(self):
        """Process all financial reports and save the chunks."""
        all_chunks = []

        for pdf_path in tqdm(self.pdf_paths, desc="Processing PDF files"):
            print(f"\nProcessing {os.path.basename(pdf_path)}...")

            # Extract text from PDF
            doc_text = self.extract_text_with_pdfplumber(pdf_path)

            # If extraction failed, skip this file
            if not doc_text:
                print(f"Failed to extract text from {pdf_path}, skipping...")
                continue

            # Chunk the document
            doc_chunks = self.chunk_financial_report(doc_text, pdf_path)
            print(f"Generated {len(doc_chunks)} chunks from this document")

            # Add numeric IDs to chunks for this document
            for i, chunk in enumerate(doc_chunks):
                chunk["metadata"]["chunk_id"] = f"{os.path.basename(pdf_path)}_chunk_{i}"

            all_chunks.extend(doc_chunks)
            print(f"Completed processing {os.path.basename(pdf_path)}")

        # Save chunks
        self.save_chunks_for_rag(all_chunks)

        # Print statistics
        self.print_chunk_statistics(all_chunks)

        # Return chunks for further processing
        return all_chunks

    def save_chunks_for_rag(self, chunks: List[Dict[str, Any]]):
        """Save chunks to files for use in a RAG system."""
        # Save the chunks to a CSV file
        df = pd.DataFrame([{
            "chunk_id": chunk["metadata"]["chunk_id"],
            "content": chunk["content"],
            "source": chunk["metadata"]["source"],
            "section": chunk["metadata"].get("section", ""),
            "note": chunk["metadata"].get("note", ""),
            "section_type": chunk["metadata"].get("section_type", ""),
            "chunk_type": chunk["metadata"].get("chunk_type", ""),
            "quarter": chunk["metadata"].get("quarter", ""),
            "year": chunk["metadata"].get("year", ""),
        } for chunk in chunks])

        csv_path = os.path.join(self.output_dir, "financial_report_chunks.csv")
        df.to_csv(csv_path, index=False)
        print(f"Saved {len(chunks)} chunks to {csv_path}")

        # Save sample chunks for inspection
        sample_dir = os.path.join(self.output_dir, "sample_chunks")
        os.makedirs(sample_dir, exist_ok=True)

        for i, chunk in enumerate(chunks[:10]):  # Save first 10 chunks as samples
            sample_path = os.path.join(sample_dir, f"chunk_{i}.txt")
            with open(sample_path, "w", encoding="utf-8") as f:
                f.write(f"Metadata: {chunk['metadata']}\n\n")
                f.write(chunk["content"])

    def print_chunk_statistics(self, chunks: List[Dict[str, Any]]):
        """Print statistics about the chunks."""
        print(f"\nTotal number of chunks: {len(chunks)}")

        df_stats = pd.DataFrame([{
            "file": chunk["metadata"]["source"],
            "chunk_type": chunk["metadata"].get("chunk_type", "unknown"),
            "section_type": chunk["metadata"].get("section_type", "unknown"),
            "word_count": len(chunk["content"].split())
        } for chunk in chunks])

        print("\nChunking statistics:")
        print(df_stats.groupby(["file", "chunk_type"]).agg({
            "word_count": ["count", "mean", "min", "max"]
        }).round(1))

        print("\nSample chunks:")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1}:")
            print(f"Metadata: {chunk['metadata']}")
            print(f"Content preview: {chunk['content'][:200]}...")
            print(f"Word count: {len(chunk['content'].split())}")
            print("-" * 80)

    # Add these methods to the FinancialReportProcessor class in Untitled4.ipynb

def get_vietnamese_embeddings(self, texts: list) -> np.ndarray:
    """
    Encode ``texts`` with the 'keepitreal/vietnamese-sbert' sentence model.

    The model is instantiated on every call and runs on CUDA when torch
    reports it as available, otherwise on CPU. A progress bar is shown.
    The original note says this is the same model used in
    data_ingestion.ipynb (not verifiable from this file).

    Returns the encoder output converted to a NumPy array.
    """
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encoder = SentenceTransformer('keepitreal/vietnamese-sbert')
    return encoder.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True,
        device=target_device,
    )

def upload_to_qdrant(self, chunks: List[Dict[str, Any]], embeddings: np.ndarray, batch_size=32,
                     collection_name='legal_docs', id_offset=0):
    """
    Upload chunks and their embeddings to a Qdrant collection in batches.

    Args:
        chunks: Chunk dicts with "content" and "metadata" (the metadata must
            contain "source" and "chunk_id").
        embeddings: 2-D array with one row per chunk, in the same order.
        batch_size: Number of points sent per upsert call; must be positive.
        collection_name: Target collection; created with cosine distance if
            it does not exist. Defaults to 'legal_docs', which the original
            comment says is shared with data_ingestion.ipynb.
        id_offset: Added to every point ID. IDs are ``id_offset + index + 1``.
            NOTE(review): with the default 0 the IDs start at 1, and an
            upsert replaces any existing point with the same ID — confirm
            this cannot clobber points written by other ingestion jobs.

    Returns:
        True when every batch was uploaded; False on invalid arguments,
        connection failure, or the first failed batch (earlier batches stay
        uploaded).

    Connection settings are read from QDRANT_URL / QDRANT_API_KEY after
    loading '.env.local'.
    """
    # Validate inputs before touching the network so that mismatches fail
    # fast instead of raising IndexError or ZeroDivisionError mid-upload.
    if batch_size <= 0:
        print(f"Invalid batch_size {batch_size}; it must be a positive integer")
        return False
    if getattr(embeddings, "ndim", None) != 2 or len(embeddings) != len(chunks):
        print(
            f"Embeddings do not match chunks: expected a 2-D array with {len(chunks)} rows, "
            f"got shape {getattr(embeddings, 'shape', None)}"
        )
        return False

    # Load environment variables
    load_dotenv('.env.local')  # Adjust path if needed

    qdrant_url = os.getenv('QDRANT_URL')
    qdrant_api_key = os.getenv('QDRANT_API_KEY')

    print(f"Connecting to Qdrant at {qdrant_url}")

    client = QdrantClient(
        url=qdrant_url,
        api_key=qdrant_api_key,
        prefer_grpc=False,
        timeout=60
    )

    # Test connection
    try:
        client.get_collections()
        print("Successfully connected to Qdrant")
    except Exception as e:
        print(f"Connection failed: {str(e)}")
        print("Please check your Qdrant URL and API key")
        return False

    # Create collection if it doesn't exist
    embedding_size = embeddings.shape[1]
    if not client.collection_exists(collection_name):
        print(f"Creating collection '{collection_name}'...")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=embedding_size,
                distance=Distance.COSINE
            )
        )
        print("Collection created successfully")
    else:
        print(f"Collection '{collection_name}' already exists")

    # Metadata keys copied into the payload only when present on the chunk.
    optional_keys = ("quarter", "year", "company", "ticker", "note")

    total_batches = (len(chunks) + batch_size - 1) // batch_size
    start_time = time.time()

    for batch_idx in range(total_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(chunks))

        points = []
        for i in range(start_idx, end_idx):
            chunk_meta = chunks[i]["metadata"]

            payload_meta = {
                "source": chunk_meta["source"],
                "section": chunk_meta.get("section", ""),
                "section_type": chunk_meta.get("section_type", ""),
                "chunk_type": chunk_meta.get("chunk_type", ""),
                "chunk_id": chunk_meta["chunk_id"]
            }
            for key in optional_keys:
                if key in chunk_meta:
                    payload_meta[key] = chunk_meta[key]

            points.append({
                "id": id_offset + i + 1,
                "vector": embeddings[i].tolist(),
                "payload": {
                    "text": chunks[i]["content"],
                    "metadata": payload_meta
                }
            })

        # Upload batch; stop at the first failure.
        try:
            client.upsert(
                collection_name=collection_name,
                points=points
            )
            print(f"Batch {batch_idx + 1}/{total_batches} uploaded ({start_idx + 1}-{end_idx} of {len(chunks)})")
        except Exception as e:
            print(f"Failed to upload batch {batch_idx + 1}: {str(e)}")
            return False

    total_time = time.time() - start_time
    print("\nUpload completed successfully!")
    print(f"Total documents: {len(chunks)}")
    print(f"Total time: {total_time:.2f} seconds")

    return True

def process_and_upload_to_qdrant(self):
    """
    Run the full pipeline: chunk the reports, embed the chunks, upload them.

    Calls ``self.process_all_reports()``, then
    ``self.get_vietnamese_embeddings`` on the chunk contents, then
    ``self.upload_to_qdrant``. Stops early (with a message) when no chunks
    were produced. Progress and the final outcome are printed; nothing is
    returned.
    """
    print("Starting processing and uploading to Qdrant...")

    report_chunks = self.process_all_reports()

    # Covers both None and an empty list.
    if not report_chunks:
        print("No chunks were generated. Check your PDF paths.")
        return

    contents = []
    for item in report_chunks:
        contents.append(item["content"])

    print(f"Generating embeddings for {len(contents)} chunks...")
    vectors = self.get_vietnamese_embeddings(contents)
    print(f"Embeddings generated with shape: {vectors.shape}")

    print("Uploading to Qdrant database...")
    if self.upload_to_qdrant(report_chunks, vectors):
        print("Financial reports successfully uploaded to Qdrant")
    else:
        print("Failed to upload financial reports to Qdrant")


# Script entry point: wire the standalone Qdrant helpers into the class and
# run the whole pipeline.
if __name__ == "__main__":
    print("Starting financial report processing...")

    # The helpers above are plain module-level functions that take ``self``.
    # They must be attached to the CLASS: functions stored on a class are
    # bound to the instance on attribute access, whereas functions stored on
    # an instance are returned as-is, so calling them would fail with
    # "missing 1 required positional argument: 'self'".
    FinancialReportProcessor.get_vietnamese_embeddings = get_vietnamese_embeddings
    FinancialReportProcessor.upload_to_qdrant = upload_to_qdrant
    FinancialReportProcessor.process_and_upload_to_qdrant = process_and_upload_to_qdrant

    # Create processor with all PDF paths
    processor = FinancialReportProcessor(pdf_paths)

    # Process all reports, generate embeddings, and upload to Qdrant
    processor.process_and_upload_to_qdrant()

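# Cell output (end of traceback): ModuleNotFoundError: No module named 'frontend'
# NOTE(review): this error typically appears when `import fitz` resolves to the
# unrelated `fitz` PyPI package instead of PyMuPDF — confirm which package is
# installed in the notebook environment.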