# グラフ抽出 (Graph Extraction)

収集したテキストデータから手続きの依存関係グラフを抽出する。
LLMを使用してテキストを解析し、JSON形式のグラフデータに変換する。

In [None]:
import os
import json
import glob
import asyncio
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel, Field

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
import config

# Settings
# Directory holding the collected raw text files; main() looks here for
# "<city_id>_homepage*.txt" and "<city_id>_digital*.txt".
DATA_DIR = "data/raw_text"
# Directory that receives one "<city_id>.json" graph file per processed city.
OUTPUT_DIR = "data/processed_graph"
# Prompt template file read by create_chain().
PROMPT_FILE = "prompt.md"
# Create the output directory up front so later writes cannot fail on a
# missing folder; exist_ok=True makes re-running harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# LLM settings
# Alternative model, currently disabled:
# MODEL_NAME = "gemma3:12b"
MODEL_NAME = "qwen3-vl:8b"
# Base URL of the Ollama server handed to ChatOllama.
BASE_URL = "http://localhost:11434"

In [None]:
# Closed set of labels allowed in the `type` field of GraphEdge.
# The `str` mix-in makes every member a plain string, and each member's value
# is identical to its name.
# Member names are grouped by prefix: Physical_*, External_*, Digital_*, plus
# the stand-alone Wait_Process and No_Action.
class ActionType(str, Enum):
    Physical_Go = "Physical_Go"
    Physical_Mail = "Physical_Mail"
    Physical_Copy = "Physical_Copy"
    Physical_Print = "Physical_Print"
    Physical_Fill = "Physical_Fill"
    Physical_Attach = "Physical_Attach"
    External_Acquire = "External_Acquire"
    Digital_Input = "Digital_Input"
    Digital_Auth = "Digital_Auth"
    Digital_Upload = "Digital_Upload"
    Digital_Capture = "Digital_Capture"
    Digital_Submit = "Digital_Submit"
    Wait_Process = "Wait_Process"
    No_Action = "No_Action"

# Closed set of labels allowed in the `category` field of GraphEdge.
# String-valued (via the `str` mix-in); each value equals the member name.
class ActionCategory(str, Enum):
    Work = "Work"
    Move = "Move"
    Wait = "Wait"

# One directed edge of a procedure dependency graph: `source` -> `target`,
# labelled with the action connecting them and its type/category labels.
# The Field descriptions are attached to the model's field metadata
# (presumably surfaced to the LLM via with_structured_output -- confirm).
class GraphEdge(BaseModel):
    source: str = Field(description="The starting point or prerequisite of the action")
    target: str = Field(description="The result or next step of the action")
    action: str = Field(description="The specific action taken")
    type: ActionType = Field(description="The type of cost/action")
    category: ActionCategory = Field(description="The subject/category of the action")

# Top-level extraction result: two independent edge lists, one for the analog
# application route and one for the digital route.
# This is the schema given to llm.with_structured_output() and the object that
# main() serialises with model_dump().
class GraphData(BaseModel):
    analog: List[GraphEdge] = Field(description="Dependency graph for analog application")
    digital: List[GraphEdge] = Field(description="Dependency graph for digital application")

In [None]:
# Initialize the chat model served by Ollama, using the MODEL_NAME and
# BASE_URL settings defined earlier in this file.
llm = ChatOllama(
    model=MODEL_NAME,
    base_url=BASE_URL,
    temperature=0.1  # low value; presumably chosen to keep extraction output stable -- confirm
)

# Structured output: wrap the model so that its answers are produced against
# the GraphData schema. create_chain() pipes the prompt into this wrapper.
structured_llm = llm.with_structured_output(GraphData)

def create_chain(prompt_path: str):
    """Build the extraction chain from a prompt template file.

    The file at ``prompt_path`` is read as UTF-8 and turned into a
    ``ChatPromptTemplate``; the template is expected to contain the
    ``{input_homepage}`` and ``{input_digital}`` placeholders. The prompt is
    then piped into the module-level ``structured_llm``.

    Args:
        prompt_path: Path of the prompt template file.

    Returns:
        The composed ``prompt | structured_llm`` runnable.
    """
    with open(prompt_path, "r", encoding="utf-8") as prompt_file:
        template_source = prompt_file.read()

    # Template -> structured-output model, composed in a single expression.
    return ChatPromptTemplate.from_template(template_source) | structured_llm

async def extract_graph(chain, homepage_text, digital_text):
    """Run the extraction chain on one pair of texts.

    Args:
        chain: Object exposing an awaitable ``ainvoke(dict)`` method.
        homepage_text: Text bound to the ``input_homepage`` prompt variable.
        digital_text: Text bound to the ``input_digital`` prompt variable.

    Returns:
        Whatever ``chain.ainvoke`` returns, or ``None`` when it raises an
        ``Exception`` (the error is printed, not re-raised).
    """
    prompt_inputs = {
        "input_homepage": homepage_text,
        "input_digital": digital_text,
    }
    try:
        extracted = await chain.ainvoke(prompt_inputs)
    except Exception as exc:
        # Best effort: report the problem and let the caller move on.
        print(f"Error during extraction: {exc}")
        return None
    else:
        return extracted

In [None]:
async def main():
    """Extract and save a dependency graph for every configured city.

    For each entry of ``config.TARGET_CITIES`` (a mapping with ``"id"`` and
    ``"name"`` keys) this:

    1. looks in ``DATA_DIR`` for ``<id>_homepage*.txt`` and
       ``<id>_digital*.txt``; the city is skipped when either is missing,
    2. reads the first match of each pattern (in sorted order) as UTF-8,
    3. runs the extraction chain on the two texts,
    4. writes the result to ``OUTPUT_DIR/<id>.json``, or reports the failure.

    Cities are processed sequentially; progress is reported with ``print``.
    """
    chain = create_chain(PROMPT_FILE)

    for city in config.TARGET_CITIES:
        city_id = city["id"]
        city_name = city["name"]
        print(f"Processing {city_name} ({city_id})...")

        # Locate the input files. glob returns matches in arbitrary order, so
        # sort them: when several files match, the same one is picked on
        # every run and on every platform.
        homepage_files = sorted(
            glob.glob(os.path.join(DATA_DIR, f"{city_id}_homepage*.txt"))
        )
        digital_files = sorted(
            glob.glob(os.path.join(DATA_DIR, f"{city_id}_digital*.txt"))
        )

        if not homepage_files or not digital_files:
            print(f"  -> Missing files for {city_name}. Skipping.")
            continue

        with open(homepage_files[0], "r", encoding="utf-8") as f:
            homepage_text = f.read()
        with open(digital_files[0], "r", encoding="utf-8") as f:
            digital_text = f.read()

        # Graph extraction; extract_graph returns None when the chain fails.
        graph_data = await extract_graph(chain, homepage_text, digital_text)

        if not graph_data:
            print(f"  -> Failed to extract graph for {city_name}")
            continue

        output_path = os.path.join(OUTPUT_DIR, f"{city_id}.json")
        with open(output_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(graph_data.model_dump(), f, indent=2, ensure_ascii=False)
        print(f"  -> Saved to {output_path}")

# Entry point. NOTE: this is a top-level `await`, which is only valid where
# the host already runs an event loop and permits it (e.g. an IPython/Jupyter
# cell). Run as a plain script, this line would have to be
# `asyncio.run(main())` instead.
if __name__ == "__main__":
    await main()

Processing 文京区 (13105)...


CancelledError: 