In [1]:
from llm_client import VertexAIClient, AzureOpenAIClient
from company_identifier import CompanyIdentifier
from utils.pdf_utils import PDFProcessor

from dotenv import load_dotenv
from pathlib import Path
import os

# Load settings from a local .env file (python-dotenv) before any client is created.
# NOTE(review): presumably supplies the credentials for the LLM clients imported above — confirm.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [10]:
import json

# Node colour per prefixed entity type (palette for the streamlined ontology v1.2).
_TYPE_TO_COLOR = {
    # Core Business & Structure (Blues)
    "pekg:Company": "#1f77b4",      # Muted Blue
    "pekg:LegalEntity": "#aec7e8",  # Light Blue
    "pekg:Client": "#9edae5",       # Pale Cyan/Blue
    "pekg:GovernmentBody": "#10ac1a",  # Bright Green (for government entities)

    # People & Roles (Greens)
    "pekg:Person": "#2ca02c",       # Muted Green
    "pekg:Position": "#98df8a",     # Light Green
    "pekg:Shareholder": "#d62728",  # Muted Red (distinct for ownership)

    # Financials & Metrics (Oranges/Yellows)
    "pekg:FinancialValue": "#ffbf7f",   # Light Orange
    "pekg:FinancialMetric": "#ff7f0e",  # Orange
    "pekg:OperationalKPI": "#ffbb78",   # Lighter Orange
    "pekg:Headcount": "#fdd0a2",        # Very Light Orange/Peach
    "pekg:RevenueStream": "#ffd700",    # Gold/Yellow

    # Products, Market, Technology (Purples/Pinks/Browns)
    "pekg:ProductOrService": "#9467bd",  # Muted Purple
    "pekg:Technology": "#8c564b",        # Brown
    "pekg:MarketContext": "#e377c2",     # Pink
    "pekg:MarketMetric": "#f7b6d2",      # Lighter Pink
    "pekg:UseCaseOrIndustry": "#ce6dbd",  # Medium Purple/Pink

    # Transactions & Events (Reds/Magentas)
    "pekg:TransactionContext": "#d62728",  # Muted Red (same as Shareholder)
    "pekg:HistoricalEvent": "#e7969c",     # Desaturated Red/Pink

    # Supporting & Contextual (Greys/Other)
    "pekg:Advisor": "#7f7f7f",      # Medium Grey
    "pekg:Location": "#c7c7c7",     # Light Grey

    "default": "#cccccc",  # Default for any unmapped types
}

# Attribute holding the display name for each entity type; any type not listed
# here (e.g. pekg:Company, pekg:GovernmentBody) uses the generic "name" field.
_LABEL_FIELD_BY_TYPE = {
    "pekg:FinancialMetric": "metricName",
    "pekg:Headcount": "headcountName",
    "pekg:MarketContext": "segmentName",
    "pekg:TransactionContext": "contextName",
    "pekg:OperationalKPI": "kpiName",
    "pekg:ProductOrService": "productName",
    "pekg:Person": "fullName",
    "pekg:Position": "titleName",
    "pekg:Location": "locationName",
    "pekg:Shareholder": "shareholderName",
    "pekg:HistoricalEvent": "eventName",
    "pekg:Historicalevent": "eventName",  # tolerate the lower-case spelling seen in data
}

# Value-like attributes tried, in this order, when an entity has no usable name.
_VALUE_LABEL_FIELDS = ("valueString", "kpiValueString", "metricValue", "headcountValue")

# Labels longer than this are cut so that nodes stay readable.
_MAX_LABEL_LENGTH = 50

# vis.js options handed to PyVis verbatim.
_VIS_OPTIONS = """
var options = {
    "nodes": {
    "shape": "dot",
    "size": 18,
    "font": {"size": 14, "face": "Tahoma"}
    },
    "edges": {
        "arrows": {"to": {"enabled": true, "scaleFactor": 0.7}},
        "color": {"inherit": "from"},
        "smooth": {"type": "continuous", "roundness": 0.2},
        "font": {"size": 10, "align": "middle"}
    },
    "physics": {
        "enabled": true,
        "barnesHut": {
            "gravitationalConstant": -30000,
            "centralGravity": 0.3,
            "springLength": 250,
            "springConstant": 0.04,
            "damping": 0.09
        },
        "minVelocity": 0.75
    },
    "interaction": {
        "hover": true,
        "tooltipDelay": 200,
        "multiselect": true
    },
    "layout": {
        "hierarchical": false
    }
}
"""


def _entity_label(entity: dict) -> str:
    """Build the (truncated) display label for one entity.

    Lookup order: the type-specific name field, the generic ``name``, the first
    populated value field, and finally the entity ``id``. Financial metrics get
    their fiscal period appended in parentheses.
    """
    entity_type = entity.get("type")
    name_field = _LABEL_FIELD_BY_TYPE.get(entity_type, "name")
    label = entity.get(name_field) or entity.get("name")

    if not label:
        for field in _VALUE_LABEL_FIELDS:
            value = entity.get(field)
            # `0` is a legitimate metric value, so only None / "" count as missing.
            if value is not None and value != "":
                label = value
                break
        else:
            label = entity.get("id")

    # Convert before concatenating: the label may be None or numeric at this point.
    label = str(label)
    if entity_type == "pekg:FinancialMetric" and entity.get("fiscalPeriod"):
        label += f" ({entity['fiscalPeriod']})"
    return label[:_MAX_LABEL_LENGTH]


def export_interactive_html(kg_data: dict, output_path: str = "blabla.html"):
    """
    Export the knowledge graph to an interactive HTML file using PyVis.

    Entities without an ``id`` or ``type`` are reported and skipped, and so are
    relationships that are incomplete or that reference a node which was not
    added (PyVis refuses edges to unknown nodes). A failure while writing the
    file is reported on stdout and not raised.

    Args:
        kg_data (dict): The knowledge graph data in JSON format, with optional
            "entities" and "relationships" lists.
        output_path (str): The path to save the HTML file.
    """
    from pyvis.network import Network

    kg_data = kg_data or {}
    net = Network(height="800px", width="100%", directed=True, notebook=False, cdn_resources='remote')

    added_node_ids = set()
    for entity in kg_data.get("entities") or []:
        entity_id = entity.get("id")
        entity_type = entity.get("type")  # Prefixed type such as "pekg:Company"

        if not entity_id or not entity_type:
            print(f"Skipping entity due to missing id or type: {entity}")
            continue

        # Tooltip lists every attribute; id and type come first.
        tooltip_parts = [f"ID: {entity_id}", f"Type: {entity_type}"]
        tooltip_parts.extend(
            f"{key}: {value}" for key, value in entity.items() if key not in {"id", "type"}
        )

        net.add_node(
            entity_id,
            label=_entity_label(entity),
            title="<br>".join(tooltip_parts),
            color=_TYPE_TO_COLOR.get(entity_type, _TYPE_TO_COLOR["default"]),
            shape="dot",  # Default shape, can be customized per type
            size=15,  # Default size
        )
        added_node_ids.add(entity_id)

    for rel in kg_data.get("relationships") or []:
        source_id = rel.get("source")
        target_id = rel.get("target")
        rel_type_full = rel.get("type")

        if not source_id or not target_id or not rel_type_full:
            print(f"Skipping relationship due to missing source, target, or type: {rel}")
            continue

        # Network.add_edge asserts that both endpoints exist, so a single
        # dangling reference would otherwise abort the whole export.
        unknown_ids = [node_id for node_id in (source_id, target_id) if node_id not in added_node_ids]
        if unknown_ids:
            print(f"Skipping relationship due to unknown node(s) {unknown_ids}: {rel}")
            continue

        relation_label = str(rel_type_full).split(":")[-1]  # Show unprefixed relation type
        net.add_edge(source_id, target_id, label=relation_label, title=rel_type_full)

    net.set_options(_VIS_OPTIONS)

    try:
        # str() lets callers pass a pathlib.Path as well as a plain string.
        net.write_html(str(output_path))
    except Exception as e:
        print(f"Error writing HTML file for graph: {e}")

def save_knowledge_graph(data: dict, base_filename: str = "blabla"):
    """
    Save the knowledge graph as JSON and, when it has entities, as interactive HTML.

    Both files are written next to each other as ``<base_filename>.json`` and
    ``<base_filename>.html``. Failures are reported on stdout and not raised, so
    a problem with one output does not prevent the other.

    Args:
        data (dict): Knowledge graph with "entities" and/or "relationships".
            Nothing is written when it is empty or has neither list populated.
        base_filename (str): Output path without extension.
    """
    if not data or (not data.get("entities") and not data.get("relationships")):
        print("Skipping save of final knowledge graph as it is empty or invalid.")
        return

    json_output_file = f"{base_filename}.json"
    try:
        with open(json_output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"Final knowledge graph saved to {json_output_file}")
    except Exception as e:
        print(f"Error saving final JSON knowledge graph: {e}")

    # A graph without entities has no nodes to draw, so only the JSON is kept.
    if not data.get("entities"):
        return

    html_output_file = f"{base_filename}.html"
    try:
        # Pass the path explicitly so the message below names the file actually
        # written instead of relying on the exporter's default.
        export_interactive_html(data, output_path=html_output_file)
        print(f"Final knowledge graph visualization saved to {html_output_file}")
    except Exception as e:
        print(f"Could not save final HTML visualization: {e}")

In [11]:
# Load a previously extracted knowledge graph. The encoding is set explicitly
# because open() otherwise falls back to the platform locale (a legacy code
# page on Windows), which mis-decodes non-ASCII text in a UTF-8 JSON file.
with open("C:\\PE\\outputs\\System\\multimodal_kg_System_gemini-2.5-flash-preview-05-20_vertexai_iterative.json", encoding="utf-8") as f:
    data = json.load(f)

In [12]:
# Render the loaded graph; with no output_path given, the function's default file name is used.
export_interactive_html(data)

OTHER

In [2]:
# Knowledge-graph JSON to transform, and the ontology file passed along with it.
input_file = Path(
    "C:/PE", "outputs", "System",
    "multimodal_kg_System_gemini-2.5-pro-preview-05-06_vertexai_parallel_12w.json",
)
ontology_file = Path(
    "C:/PE", "REPOS", "llm_kg_extraction", "llm_kg_extraction", "ontology",
    "pekg_ontology3.yaml",
)

In [3]:
from transform_json import transform_kg

In [5]:
# Transform the extracted knowledge graph using the ontology file.
# The recorded output of this cell reports three files created in output_dir:
# *_meta.json, *_nodes.json and *_links.json.
# NOTE(review): the output file names appear to be derived from extraction_mode,
# model_name, llm_provider and construction_mode — confirm in transform_json.
transform_kg(
    input_file_path=input_file,
    ontology_file=ontology_file, 
    output_dir=Path("C:/PE/outputs/System/"),
    request_id_to_use="124",
    meta_title_to_use="Systran",
    extraction_mode="multimodal",
    model_name="gemini-2.5-pro-preview-05-06",
    llm_provider="vertexai",
    construction_mode="parallel"
)

Successfully created transformed file: C:\PE\outputs\System\multimodal_gemini-2.5-pro-preview-05-06_vertexai_parallel_meta.json
Successfully created transformed file: C:\PE\outputs\System\multimodal_gemini-2.5-pro-preview-05-06_vertexai_parallel_nodes.json
Successfully created transformed file: C:\PE\outputs\System\multimodal_gemini-2.5-pro-preview-05-06_vertexai_parallel_links.json


In [1]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    The result is a float in [0, 1]: 1.0 for identical sequences and 0.0 when
    they have nothing in common. No junk filter is applied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    score = matcher.ratio()
    return score