# KB Enrichment - Manual Extraction

This notebook allows you to manually control and refine the knowledge base extraction process.

## Setup
Run this cell first: it defines the JSON load/save helpers and loads the three source data files from `DATA_DIR`.

In [None]:
import json
from pathlib import Path
from typing import List, Dict, Any

# If running in Colab, mount drive or upload files
# from google.colab import drive
# drive.mount('/content/drive')

# Helper functions
def load_json(file_path):
    """Read a JSON document from disk and return the parsed Python object.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and a leading byte-order mark (as written by some Windows
    editors) is tolerated instead of making the parser fail.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the JSON file.

    Returns:
        The deserialized JSON value (typically a dict or list).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 and also strips an optional BOM,
    # which json.load would otherwise reject.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as 2-space-indented JSON and report the count.

    Args:
        data: JSON-serializable object that also supports ``len()``.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w') as out_file:
        json.dump(data, fp=out_file, indent=2)
    # The count is taken after the write, so the file exists even if len() fails
    item_count = len(data)
    print(f"✓ Saved {item_count} items to {file_path}")

# Source files are read from DATA_DIR; adjust the path to match your checkout
DATA_DIR = Path("src/data")

# Each file is parsed once and kept in its own module-level variable
discovery = load_json(DATA_DIR.joinpath("AI_discovery.json"))
archetypes = load_json(DATA_DIR.joinpath("AI_archetypes.json"))
prerequisites = load_json(DATA_DIR.joinpath("AI_prerequisites.json"))

print("✓ Data loaded successfully")

## Explore Data Structure
Understand what's in your source files.

In [None]:
# Show the top-level keys, then a 500-character preview of two discovery sections
print("Discovery keys:", discovery.keys())

print("\nBusiness Function structure:")
business_function_json = json.dumps(discovery["Business_Function"], indent=2)
print(business_function_json[:500])

print("\n" + "=" * 50)

print("\nBusiness Sector structure:")
business_sector_json = json.dumps(discovery["Business_Sector"], indent=2)
print(business_sector_json[:500])

In [None]:
# Preview the first 800 characters of the pretty-printed archetype data
print("Archetypes structure:")
archetypes_json = json.dumps(archetypes, indent=2)
print(archetypes_json[:800])

In [None]:
# Preview the first 800 characters of the pretty-printed prerequisite data
print("Prerequisites structure:")
prerequisites_json = json.dumps(prerequisites, indent=2)
print(prerequisites_json[:800])

## Phase 0: Extract Business Context Nodes
Manually define which nodes (business functions, sectors, and tools) to extract from the discovery data.

In [None]:
# CUSTOMIZE THIS: Extract Business Functions
# Emits one BUSINESS_FUNCTION node for every dict found under
# discovery["Business_Function"]["categories"][*]["functions"].
function_nodes = []
bf_data = discovery["Business_Function"]
categories = bf_data.get("categories", [])

func_id = 1  # next sequential ID; advanced only when a node is actually emitted
for category in categories:
    # Skip malformed (non-dict) categories instead of failing on .get(),
    # the same way the sector extraction treats its categories.
    if not isinstance(category, dict):
        continue

    cat_name = category.get("category", "Unknown")
    functions = category.get("functions", [])

    for func in functions:
        if not isinstance(func, dict):
            continue
        function_nodes.append({
            "id": f"FUNCTION_{func_id:03d}",
            "name": func.get("name", "Unknown"),
            "node_type": "BUSINESS_FUNCTION",
            "category": cat_name,
            # CUSTOMIZE: What attributes do you want?
            # "tools": func.get("tools_and_processes", []),
            # "description": func.get("description", "")
        })
        func_id += 1

print(f"Extracted {len(function_nodes)} functions")
print("\nSample:")
print(json.dumps(function_nodes[:2], indent=2))

In [None]:
# CUSTOMIZE THIS: Extract Business Sectors
# Pick one granularity: a node per category (Option 1) or a node per sector (Option 2).
sector_nodes = []
bs_data = discovery["Business_Sector"]
categories = bs_data.get("categories", [])

sector_id = 1  # next sequential ID; advanced only when a node is emitted
for category in categories:
    # Entries that are not dicts carry no usable fields, so ignore them
    if not isinstance(category, dict):
        continue

    # Option 1: Just categories
    # sector_nodes.append({
    #     "id": f"SECTOR_{sector_id:03d}",
    #     "name": category.get("category", "Unknown"),
    #     "node_type": "BUSINESS_SECTOR"
    # })
    # sector_id += 1

    # Option 2: Individual sectors
    category_name = category.get("category", "Unknown")
    for sector in category.get("sectors", []):
        if not isinstance(sector, dict):
            continue
        sector_nodes.append({
            "id": f"SECTOR_{sector_id:03d}",
            "name": sector.get("name", "Unknown"),
            "node_type": "BUSINESS_SECTOR",
            "category": category_name
        })
        sector_id += 1

print(f"Extracted {len(sector_nodes)} sectors")
print("\nSample:")
print(json.dumps(sector_nodes[:2], indent=2))

In [None]:
# CUSTOMIZE THIS: Extract Tools
# Should tools be extracted at all? Or kept as attributes?
tool_set = set()

# Collect every distinct string listed under a function's "tools_and_processes"
for category in discovery["Business_Function"].get("categories", []):
    # Skip malformed (non-dict) categories instead of failing on .get()
    if not isinstance(category, dict):
        continue
    for func in category.get("functions", []):
        if not isinstance(func, dict):
            continue
        tool_set.update(
            tool
            for tool in func.get("tools_and_processes", [])
            if isinstance(tool, str)
        )

# Names are sorted before numbering so the same input always yields the same IDs
tool_nodes = [
    {
        "id": f"TOOL_{idx:03d}",
        "name": tool_name,
        "node_type": "BUSINESS_TOOL"
    }
    for idx, tool_name in enumerate(sorted(tool_set), 1)
]

print(f"Extracted {len(tool_nodes)} tools")
print("\nSample:")
print(json.dumps(tool_nodes[:3], indent=2))

## Phase 1: Extract AI Archetype Nodes

In [None]:
# CUSTOMIZE THIS: Extract Archetypes
# Builds three node lists from archetypes["AI_Use_Case_Archetypes"]:
#   - archetype_nodes: one node per dict entry
#   - model_nodes:     one node per distinct value in "common_models"
#   - output_nodes:    one node per distinct value in "example_outputs"
archetype_nodes = []
model_nodes = []
output_nodes = []

# Names already emitted, tracked in sets so the duplicate check is a constant-time
# lookup instead of rebuilding a list of names for every candidate.
# NOTE(review): assumes model/output entries are hashable (strings) — confirm against the data.
seen_models = set()
seen_outputs = set()

archetype_list = archetypes.get("AI_Use_Case_Archetypes", [])

for archetype in archetype_list:
    if not isinstance(archetype, dict):
        continue

    archetype_nodes.append({
        # Number by emitted nodes (not list position) so skipped non-dict
        # entries do not leave gaps in the ID sequence.
        "id": f"ARCHETYPE_{len(archetype_nodes) + 1:03d}",
        "name": archetype.get("archetype", "Unknown"),
        "node_type": "AI_ARCHETYPE",
        "description": archetype.get("core_task", ""),
        "analytical_purpose": archetype.get("analytical_purpose", []),
        "technical_family": archetype.get("technical_family", "")
    })

    # Extract models (first occurrence wins; IDs follow first-seen order)
    for model in archetype.get("common_models", []):
        if model in seen_models:
            continue
        seen_models.add(model)
        model_nodes.append({
            "id": f"MODEL_{len(model_nodes) + 1:03d}",
            "name": model,
            "node_type": "COMMON_MODEL"
        })

    # Extract outputs (first occurrence wins; IDs follow first-seen order)
    for output in archetype.get("example_outputs", []):
        if output in seen_outputs:
            continue
        seen_outputs.add(output)
        output_nodes.append({
            "id": f"OUTPUT_{len(output_nodes) + 1:03d}",
            "name": output,
            "node_type": "AI_OUTPUT"
        })

print(f"Extracted {len(archetype_nodes)} archetypes")
print(f"Extracted {len(model_nodes)} models")
print(f"Extracted {len(output_nodes)} outputs")

In [None]:
# CUSTOMIZE THIS: Extract Prerequisites
# Walk the top-level mapping and keep only string entries of list-valued keys,
# paired with the key they were found under.
prereq_pairs = (
    (category, prereq)
    for category, prereqs in prerequisites.items()
    if isinstance(prereqs, list)
    for prereq in prereqs
    if isinstance(prereq, str)
)

# Number the surviving entries consecutively from 1
prerequisite_nodes = [
    {
        "id": f"PREREQ_{position:03d}",
        "name": prereq,
        "node_type": "AI_PREREQUISITE",
        "category": category
    }
    for position, (category, prereq) in enumerate(prereq_pairs, 1)
]

print(f"Extracted {len(prerequisite_nodes)} prerequisites")
print("\nSample:")
print(json.dumps(prerequisite_nodes[:3], indent=2))

## Save Extracted Nodes

In [None]:
# Create output directory (parents=True so a customized nested path also works)
OUTPUT_DIR = Path("manual_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# (file name, node list) pairs in write order. Building the list up front means
# a skipped extraction cell raises NameError before any file is written, rather
# than leaving a partial set of output files behind.
node_files = [
    # Phase 0 nodes
    ("FUNCTION_NODES.json", function_nodes),
    ("SECTOR_NODES.json", sector_nodes),
    ("TOOL_NODES.json", tool_nodes),
    # Phase 1 nodes
    ("AI_ARCHETYPE_NODES.json", archetype_nodes),
    ("COMMON_MODEL_NODES.json", model_nodes),
    ("AI_OUTPUT_NODES.json", output_nodes),
    ("AI_PREREQUISITE_NODES.json", prerequisite_nodes),
]

for file_name, nodes in node_files:
    save_json(nodes, OUTPUT_DIR / file_name)

print("\n✓ All nodes saved to", OUTPUT_DIR)

## Review Statistics

In [None]:
# Per-phase node counts followed by the grand total
banner = "=" * 50
print(banner)
print("EXTRACTION SUMMARY")
print(banner)

print("\nPhase 0 (Business Context):")
print(f"  Functions: {len(function_nodes)}")
print(f"  Sectors: {len(sector_nodes)}")
print(f"  Tools: {len(tool_nodes)}")

print("\nPhase 1 (AI Components):")
print(f"  Archetypes: {len(archetype_nodes)}")
print(f"  Models: {len(model_nodes)}")
print(f"  Outputs: {len(output_nodes)}")
print(f"  Prerequisites: {len(prerequisite_nodes)}")

total_nodes = sum(
    len(nodes)
    for nodes in (
        function_nodes,
        sector_nodes,
        tool_nodes,
        archetype_nodes,
        model_nodes,
        output_nodes,
        prerequisite_nodes,
    )
)
print(f"\nTotal Nodes: {total_nodes}")