In [7]:
import sys
import os
# Put the parent of the current working directory on the import path so the
# project modules imported in the following cell can be resolved.
# NOTE(review): '..' is resolved against the working directory at run time,
# not against this file's location.
sys.path.append(os.path.abspath('..'))

In [10]:
from llm_client import AzureOpenAIClient
from time import time
from KG_visualizer import KnowledgeGraphVisualizer
from company_identifier import CompanyIdentifier
from dotenv import load_dotenv
from pathlib import Path
from ontology.loader import PEKGOntology
from utils.pdf_utils import PDFProcessor
from utils.kg_utils import (
    merge_knowledge_graphs, merge_multiple_knowledge_graphs, 
    clean_knowledge_graph, normalize_entity_ids
)

# Load environment variables before any configuration is read from the
# environment (the notebook later looks up an AZURE_DEPLOYMENT_NAME_* value via
# os.getenv). NOTE(review): presumably these come from a local .env file —
# confirm where that file is expected to live.
load_dotenv()

class FinancialKGBuilder:
    """
    A unified class to build financial knowledge graphs from PDF documents using Azure OpenAI.

    It supports:
    - Text-only extraction: Processes text content from PDF documents
    - Multimodal extraction: Processes both text and visual elements (tables, charts, etc.)

    Construction modes:
    - "iterative": Processes the PDF page by page, using previous pages' graphs as context
    - "onego": Processes all content at once or independently and then merges results
    - "parallel": Processes pages independently in parallel using multiple LLM instances

    The class provides functionality to extract, merge, consolidate, and visualize
    knowledge graphs based on a predefined ontology.
    """

    # Accepted values for the ``construction_mode`` / ``extraction_mode`` arguments.
    CONSTRUCTION_MODES = ("iterative", "onego", "parallel")
    EXTRACTION_MODES = ("text", "multimodal")

    # Document processed when the caller does not pass ``pdf_path``.
    DEFAULT_PDF_PATH = "C:/PE/infomemo/systran/Project System - Information Memorandum - March 2023 - ChapsVision.pdf"

    def __init__(
        self,
        model_name,
        deployment_name,
        project_name,
        construction_mode="iterative",
        extraction_mode="text",
        max_workers=4,  # Number of parallel workers for parallel mode
        pdf_path=None,
        #ontology_path: str = Path(__file__).resolve().parent / "ontology" / "pekg_ontology.yaml"
    ):
        """
        Validate the configuration, set up the collaborators and identify the companies.

        The mode arguments are checked first so that an invalid value fails
        immediately, before the LLM client or PDF processor are created.
        Company identification then runs as part of construction and prints
        its progress and results.

        Args:
            model_name (str): The name of the model to be used for extraction.
            deployment_name (str): The name of the deployment in Azure OpenAI
                (stored on the instance).
            project_name (str): The name of the project; passed to the company
                identification step.
            construction_mode (str): "iterative", "onego", or "parallel" for the KG construction approach.
            extraction_mode (str): Either "text" or "multimodal" for the extraction method.
            max_workers (int): Maximum number of parallel workers (for parallel mode only).
            pdf_path: Path of the PDF to process. Defaults to ``DEFAULT_PDF_PATH``
                when ``None``.

        Raises:
            ValueError: If ``construction_mode`` or ``extraction_mode`` is not
                one of the supported values.
        """
        # Fail fast: reject bad modes before doing any expensive setup.
        if construction_mode not in self.CONSTRUCTION_MODES:
            raise ValueError("construction_mode must be one of: 'iterative', 'onego', 'parallel'")
        if extraction_mode not in self.EXTRACTION_MODES:
            raise ValueError("extraction_mode must be either 'text' or 'multimodal'")

        self.model_name = model_name
        self.project_name = project_name
        self.deployment_name = deployment_name
        self.construction_mode = construction_mode
        self.extraction_mode = extraction_mode
        self.max_workers = max_workers
        self.page_dpi = 300  # For image rendering in multimodal mode
        self.pdf_path = self.DEFAULT_PDF_PATH if pdf_path is None else pdf_path

        # Ontology loading is currently disabled.
        #self.ontology = PEKGOntology(ontology_path)

        self.llm = AzureOpenAIClient(model_name=model_name)
        # Attribute spelling ("vizualizer") is kept as-is: code outside this
        # class may already reference it.
        self.vizualizer = KnowledgeGraphVisualizer()
        self.pdf_processor = PDFProcessor(self.pdf_path)

        # Identify the target company and advisory firms
        start_time = time()
        print(f"Identifying target company and advisory firms for project '{project_name}'...")
        self.company_identifier = CompanyIdentifier(self.llm, self.pdf_processor)
        self.companies_info = self.company_identifier.identify_companies(project_name)
        end_time = time()
        print(f"Company identification completed in {end_time - start_time:.2f} seconds.")
        # The result is read as a mapping with 'target_company' (having a
        # 'name'), 'advisory_firms' (items having a 'name') and 'project_codename'.
        print(f"Identified target company: {self.companies_info['target_company']['name']}")
        print(f"Identified advisory firms: {[firm['name'] for firm in self.companies_info['advisory_firms']]}")
        print(f"Project codename: {self.companies_info['project_codename']}")

In [11]:
# The Azure deployment name is read from an environment variable whose name is
# suffixed with the model name. Building the key from a variable keeps the
# model name in one place and avoids nesting double quotes inside a
# double-quoted f-string, which is a SyntaxError before Python 3.12.
model_name = "gpt-4.1"

# NOTE(review): "builer" looks like a typo for "builder"; the name is kept
# because later cells may reference it.
builer = FinancialKGBuilder(
    model_name=model_name,
    deployment_name=os.getenv(f"AZURE_DEPLOYMENT_NAME_{model_name}"),
    project_name="System",
    construction_mode="iterative",
    extraction_mode="text"
)

Identifying target company and advisory firms for project 'System'...
Company identification completed in 1.50 seconds.
Identified target company: Systran SAS
Identified advisory firms: ['Rothschild & Co', 'Samsung Securities']
Project codename: System
