# 主要思路
目标文件内容庞大，直接将全文转化为知识图谱效率太低，且会丢失文件原有的结构信息，因此采用如下方案：
1. 根据源文件生成结构树，保留文件结构，标题信息
2. 对每个叶节点分别进行知识图谱关系的抽取；其余（非叶）节点通过融合其子节点的图谱来生成自身的图谱
3. 对应每一层级都存在相应的知识图谱关系


# 步骤一 结构化文档生成
以原目标文件第一个大标题下的所有内容为例 

In [6]:
# 树节点构建
import os

class Node:
    """A node of the document-structure tree.

    Attributes:
        name: Title of the node; also the key under which its parent stores it.
        children: Dict mapping child name -> child ``Node``.
        parent: The node this one was attached to via ``add_child``, else None.
        text: Body text accumulated for this node.
        graph: Knowledge graph attached to the node later on (None until set).
        is_leaf: Leaf flag; False until something else sets it.

    Equality and hashing are based on ``name`` only, so two nodes carrying the
    same title compare equal even if they sit in different branches.
    """

    def __init__(self, name):
        self.name = name
        self.children = {}
        self.parent = None
        self.text = ""
        self.graph = None
        self.is_leaf = False

    def add_child(self, child):
        """Attach ``child`` under this node.

        Raises:
            ValueError: if a child with the same name is already attached.
        """
        if child.name in self.children:
            raise ValueError(f"子节点名称已存在: {child.name}")
        self.children[child.name] = child
        child.parent = self

    def get_child(self, name):
        """Return the child called ``name``, or None if there is none."""
        return self.children.get(name)

    def get_children(self):
        """Return the children as a new list."""
        return list(self.children.values())

    def __hash__(self):
        # Must stay consistent with __eq__, which compares names only.
        return hash(self.name)

    def __eq__(self, other):
        if not isinstance(other, Node):
            # Let Python try the reflected comparison; the final result for
            # unrelated types is still False.
            return NotImplemented
        return self.name == other.name

    def get_parent(self):
        """Return the parent node, or None for an unattached node."""
        return self.parent

    def get_name(self):
        """Return the node name."""
        return self.name

    def get_text(self):
        """Return the text stored on this node."""
        return self.text

    def __str__(self):
        return self.name

    def __repr__(self):
        return f"<Node: {self.name}>"


In [7]:
# 递归遍历树结构
class TextProcessor:
    """Build a tree of ``Node`` objects from a text file with ``#`` headings.

    A line starting with ``#`` opens a heading whose level is the number of
    leading ``#`` characters; every other line is appended to the text of the
    most recently opened heading (or of the root before any heading). After
    parsing, only nodes without children keep their text and are flagged as
    leaves.
    """

    def __init__(self):
        self.root = Node("ROOT")  # virtual root above all headings
        self.stack = [self.root]  # path of currently open nodes, initially just the root

    def parse_file(self, file_path):
        """Read ``file_path`` and add its headings and text to the tree.

        The file is decoded as UTF-8. ``utf-8-sig`` is used so that a leading
        byte-order mark is dropped; otherwise a heading on the very first line
        would not start with ``#`` and would be treated as plain text.
        """
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                self._process_line(line.strip('\n'))
        # Second pass: blank the text of non-leaf nodes and set the leaf flags.
        self._post_process(self.root)

    @staticmethod
    def _parse_heading(line):
        """Split a heading line into ``(level, name)``.

        Only the unbroken run of ``#`` at the start of the line counts towards
        the level, so a title that itself begins with ``#`` (``"# #tag"``) is
        not mistaken for a deeper heading.
        """
        title = line.lstrip('#')
        return len(line) - len(title), title.strip()

    def _process_line(self, line):
        """Dispatch one line: open a heading or append text to the current node."""
        if line.startswith('#'):
            level, name = self._parse_heading(line)
            self._update_hierarchy(level, name)
        elif self.stack:
            self.stack[-1].text += line + '\n'

    def _update_hierarchy(self, level, name):
        """Open the heading ``name`` at ``level`` and make it the current node.

        Raises:
            ValueError: if ``level`` is outside 1..6, or if the heading is
                deeper than the currently open path allows (a skipped level).
        """
        if level < 1 or level > 6:
            raise ValueError(f"无效的标题级别: {level}")

        parent_level = level - 1
        if parent_level > len(self.stack):
            raise ValueError(f"标题层级跳级: Lv{level} 父节点未定义")

        # Keep only the ancestors of the new heading; an empty path means the
        # heading hangs directly off the virtual root.
        self.stack = self.stack[:parent_level]
        parent_node = self.stack[-1] if self.stack else self.root

        # Reuse an existing child with the same title, otherwise create one.
        current_node = parent_node.get_child(name)
        if not current_node:
            current_node = Node(name)
            parent_node.add_child(current_node)

        self.stack.append(current_node)

    def _post_process(self, node):
        """Set ``is_leaf`` on every node and clear the text of non-leaf nodes."""
        # Assign the flag in both directions so a node that was a leaf after an
        # earlier parse and has since gained children is no longer marked leaf.
        node.is_leaf = not node.children
        if node.is_leaf:
            return
        node.text = ""
        for child in node.get_children():
            self._post_process(child)


In [8]:
# if __name__ == "__main__":
#     processor = TextProcessor()
#     processor.parse_file("文献/1.txt")  
    
#     # 打印树结构验证
#     def print_tree(node, indent=0):
#         prefix = '  ' * indent
#         print(f"{prefix}└─ {node.name} [Leaf: {node.is_leaf}]")
#         if node.text.strip():
#             print(f"{prefix}    Text: {node.text.strip()[:30]}...")
#         for child in node.get_children():
#             print_tree(child, indent + 1)
    

#     print_tree(processor.root)
#     print(processor.root.children)



# 步骤二 递归构建节点知识图谱

In [9]:
from kg_gen import KGGen


# SECURITY: the API key used to be hard-coded in this cell. It is now read from
# the environment so that it never ends up in version control. Any key that was
# committed earlier should be treated as leaked and revoked.
_deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not _deepseek_api_key:
    raise RuntimeError("DEEPSEEK_API_KEY environment variable is not set")

kg = KGGen(
  model="deepseek/deepseek-chat",
  temperature=0.0,
  api_key=_deepseek_api_key
)

# Alternative model, kept for reference; the token must also come from the
# environment rather than being written into the source.
# kg2 = KGGen(
#   model="github/llama3-8b-8192",
#   temperature=0.0,
#   api_key=os.environ["GITHUB_TOKEN"]
# )

In [10]:
def clean_text(text):
    """Replace punctuation that may disturb relation extraction with spaces.

    Each matched character is replaced by exactly one space; all other
    characters are returned unchanged.
    """
    import re

    noisy_punctuation = re.compile(r'[【】、，；：（）“”‘’…]')
    return noisy_punctuation.sub(' ', text)

def build_graph(node, kg):
    """Generate a knowledge graph for ``node`` and store it on ``node.graph``.

    The node text is passed through ``clean_text`` and handed to
    ``kg.generate`` together with the node name as context. Nothing is
    returned; the result is only written to the node.
    """
    cleaned = clean_text(node.get_text())
    node.graph = kg.generate(input_data=cleaned, context=node.get_name())

# 基于子图的融合

def merge_tree(node,kg):
    """Set ``node.graph`` to the aggregate of its direct children's graphs.

    The children's ``graph`` attributes are collected in child order and
    passed as one list to ``kg.aggregate``. A node without children is left
    untouched and a notice is printed instead.
    """
    if node.children:
        child_graphs = [child.graph for child in node.get_children()]
        node.graph = kg.aggregate(child_graphs)
        return
    print("该节点不含有子节点")

In [11]:
# 生成叶节点的图
def create_map(node,kg):
    """Walk the tree below ``node`` and build a graph for every childless node.

    Nodes that have children are only traversed; ``build_graph`` is called on
    the nodes that have none.
    """
    if not node.children:
        build_graph(node,kg)
        return
    for child in node.get_children():
        create_map(child,kg)
# 生成其他节点的图
def merge_map(node,kg):
    """Fill in the graphs of non-leaf nodes bottom-up, starting at ``node``.

    Children not flagged as leaves are processed first, then ``merge_tree`` is
    called on ``node`` itself. ``merge_tree`` runs on every call, including
    when ``node`` has no children at all.
    """
    for child in node.get_children():
        if child.is_leaf is True:
            continue
        merge_map(child,kg)
    # Runs unconditionally once the loop is done.
    merge_tree(node,kg)


In [64]:
# create_map(processor.root,kg)
# merge_map(processor.root,kg)
# print(processor.root.graph)

# 步骤三 绘制

In [65]:
# import networkx as nx
# from pyvis.network import Network


# graph=processor.root.children['一、掌握历史主动，在新时代更好坚持和发展中国特色社会主义'].children['关于《中共中央关于党的百年奋斗重大成就和历史经验的决议》的说明“'].graph
# print(graph)
# entities,edges,relations=graph.entities,graph.edges,graph.relations



In [66]:


# # 创建有向图
# G = nx.DiGraph()
# # 添加节点（实体）
# for entity in entities:
#     G.add_node(entity, title=entity, color='#97C2FC', size=20)

# # 添加边（关系）
# for (source, edge, target) in relations:
#     G.add_edge(source, target, title=edge, label=edge, color='gray', arrows='to')

# # 生成交互式可视化
# net = Network(height='800px', width='100%', notebook=False, directed=True)
# net.from_nx(G)

# # 配置可视化参数
# net.set_options("""
# {
#   "nodes": {
#     "font": {
#       "size": 14,
#       "color": "#333"
#     }
#   },
#   "edges": {
#     "color": {
#       "inherit": true
#     },
#     "smooth": false
#   },
#   "physics": {
#     "stabilization": true,
#     "barnesHut": {
#       "gravitationalConstant":-5000,
#       "springLength": 100,
#       "springConstant": 0.02
#     }
#   }
# }
# """)

# # 保存并显示
# net.show('knowledge_graph.html', notebook=False)

In [12]:
import json
from jinja2 import Template

class EnhancedTextProcessor(TextProcessor):
    """TextProcessor that can export the tree and its graphs as one HTML page."""

    @staticmethod
    def _to_script_json(data):
        """Serialize ``data`` to JSON that is safe to inline in a ``<script>``.

        Heading titles and graph entities come from the parsed document, so a
        value containing ``</script>`` (or ``<!--``) would otherwise end or
        corrupt the script element. In ``json.dumps`` output ``<`` can only
        occur inside string values, so replacing it with its ``\\u003c`` escape
        leaves the decoded data unchanged.
        """
        return json.dumps(data).replace('<', '\\u003c')

    def collect_data(self):
        """Collect the graph of every node that has one.

        Returns:
            Dict mapping node name -> ``{'nodes': [...], 'edges': [...]}``,
            where nodes carry ``id``/``label`` and edges carry
            ``from``/``to``/``label``.

        NOTE: entries are keyed by node name only, so when two nodes share a
        name the one visited later overwrites the earlier entry.
        """
        graph_data = {}

        def _traverse(node):
            if node.graph:
                # Convert entities/relations into the node/edge dicts that the
                # page hands to vis.DataSet.
                graph_data[node.name] = {
                    'nodes': [{'id': e, 'label': e} for e in node.graph.entities],
                    'edges': [{'from': s, 'to': t, 'label': r} for (s, r, t) in node.graph.relations]
                }
            for child in node.get_children():
                _traverse(child)

        _traverse(self.root)
        return graph_data

    def generate_html(self, output_path):
        """Write a single HTML page with the heading tree and per-node graphs.

        The page shows the tree on the left; selecting a tree item renders the
        graph stored for that node name on the right. The file is written as
        UTF-8 to ``output_path``.
        """
        # Nested dicts in the id/text/children shape used for the tree widget.
        # NOTE(review): node names double as item ids here; presumably the
        # widget expects unique ids, so repeated titles may misbehave - confirm.
        def build_tree(node):
            return {
                'id': node.name,
                'text': node.name,
                'children': [build_tree(c) for c in node.get_children()]
            }

        # Both payloads are inlined into a <script> element, hence the escaping.
        context = {
            'tree_json': self._to_script_json(build_tree(self.root)),
            'graph_data': self._to_script_json(self.collect_data())
        }

        # HTML template
        html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>集成可视化</title>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/jstree/3.2.1/themes/default/style.min.css">
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/jstree/3.2.1/jstree.min.js"></script>
    <script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
    <style>
        .container { display: flex; height: 100vh; }
        #tree { width: 300px; border-right: 1px solid #ccc; padding: 10px; }
        #graph { flex: 1; padding: 10px; }
    </style>
</head>
<body>
    <div class="container">
        <div id="tree"></div>
        <div id="graph"></div>
    </div>

    <script>
        // 初始化树形结构
        $('#tree').jstree({
            'core': {
                'data': {{ tree_json|safe }},
                'themes': { 'dots': false }
            }
        }).on('select_node.jstree', function(e, data) {
            // 加载对应图谱
            const graphData = {{ graph_data|safe }}[data.node.id];
            if (!graphData) return;

            // 销毁旧实例
            if (window.network) window.network.destroy();
            
            // 创建新实例
            const container = document.getElementById('graph');
            const options = {
                nodes: { 
                    shape: 'box',
                    color: '#97C2FC',
                    margin: 10,
                    font: { size: 14 }
                },
                edges: {
                    arrows: 'to',
                    color: '#666',
                    font: { background: 'white' }
                },
                physics: {
                    stabilization: {
                        enabled: true,
                        iterations: 100
                    }
                }
            };
            
            window.network = new vis.Network(
                container,
                {
                    nodes: new vis.DataSet(graphData.nodes),
                    edges: new vis.DataSet(graphData.edges)
                },
                options
            );
        });

        // 默认选中根节点
        $(document).ready(() => $('#tree').jstree('select_node', 'ROOT'));
    </script>
</body>
</html>
        """

        # Render the template and write the final file.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(Template(html_template).render(context))

In [14]:
# Create the enhanced processor (tree parsing plus HTML export).
processor_2 = EnhancedTextProcessor()

# Parse the Markdown file into the heading tree, then build the graphs:
# create_map generates a graph for each childless node, and merge_map
# aggregates child graphs upwards into the remaining nodes.
processor_2.parse_file("3.txt")
create_map(processor_2.root,kg)
merge_map(processor_2.root,kg)


# Write the combined tree + graph visualization page.
processor_2.generate_html("integrated_view_3.html")

