2 changes: 2 additions & 0 deletions README.md
@@ -56,6 +56,8 @@ Here is post-training result which **over 50% SFT data** comes from GraphGen and
 It begins by constructing a fine-grained knowledge graph from the source text, then identifies knowledge gaps in LLMs using the expected calibration error metric, prioritizing the generation of QA pairs that target high-value, long-tail knowledge.
 Furthermore, GraphGen incorporates multi-hop neighborhood sampling to capture complex relational information and employs style-controlled generation to diversify the resulting QA data.
 
+After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) and [xtuner](https://github.com/InternLM/xtuner) to fine-tune your LLMs.
+
 ## 📌 Latest Updates
 
 - **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data.
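The excerpt above leans on the expected calibration error (ECE) metric to find knowledge the trainee model has not mastered. As a rough, repository-independent illustration of the idea (bin predictions by confidence, then average the per-bin gap between accuracy and confidence), consider this sketch; it is not GraphGen's implementation:

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Illustrative ECE: bin predictions by confidence, then average the
    |accuracy - confidence| gap per bin, weighted by bin occupancy."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Last bin is closed on the right so confidence 1.0 is counted.
        mask = (confidences >= lo) & ((confidences < hi) | (hi == 1.0))
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
    return ece

# A model whose stated confidence tracks its accuracy scores near zero.
print(expected_calibration_error([0.9, 0.8, 0.6, 0.3], [1, 1, 1, 0]))
```

A high ECE on a knowledge point signals a gap worth targeting with synthesized QA pairs.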
2 changes: 2 additions & 0 deletions README_ZH.md
@@ -57,6 +57,8 @@ GraphGen is a synthetic data generation framework guided by knowledge graphs. Please see
 GraphGen first constructs a fine-grained knowledge graph from the source text, then uses the expected calibration error metric to identify knowledge gaps in large language models, prioritizing the generation of QA pairs that target high-value, long-tail knowledge.
 In addition, GraphGen employs multi-hop neighborhood sampling to capture complex relational information and uses style-controlled generation to diversify the QA data.
 
+After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) and [xtuner](https://github.com/InternLM/xtuner) to fine-tune your large language models.
+
 ## 📌 Latest Updates
 
 - **2025.08.14**: Added support for partitioning the knowledge graph with the Leiden community detection algorithm to synthesize CoT data.
1 change: 1 addition & 0 deletions graphgen/configs/README.md
@@ -0,0 +1 @@
+# Configs for GraphGen
39 changes: 21 additions & 18 deletions graphgen/configs/aggregated_config.yaml
@@ -1,18 +1,21 @@
-input_data_type: raw
-input_file: resources/input_examples/raw_demo.jsonl
-output_data_type: aggregated
-tokenizer: cl100k_base
-quiz_samples: 2
-traverse_strategy:
-  bidirectional: true
-  edge_sampling: max_loss
-  expand_method: max_width
-  isolated_node_strategy: ignore
-  max_depth: 5
-  max_extra_edges: 20
-  max_tokens: 256
-  loss_strategy: only_edge
-search:
-  enabled: false
-  search_types: ["google"]
-re_judge: false
+input_data_type: raw # raw, chunked
+input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+output_data_type: aggregated # atomic, aggregated, multi_hop, cot
+output_data_format: ChatML # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
+  bidirectional: true # whether to traverse the graph in both directions
+  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+  expand_method: max_width # expand method, support: max_width, max_depth
+  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+  max_depth: 5 # maximum depth for graph traversal
+  max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
+  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
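The restructuring above moves `quiz_samples` and `re_judge` under a new optional `quiz_and_judge_strategy` block and adds `output_data_format`. A minimal sketch of reading the new layout with defensive defaults; only the key names come from the YAML above, the loading code itself is illustrative:

```python
import yaml

with open("graphgen/configs/aggregated_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# quiz_and_judge_strategy is nested and optional, so read it defensively.
qj = config.get("quiz_and_judge_strategy", {})
if qj.get("enabled", False):
    print("quiz samples:", qj.get("quiz_samples", 2),
          "re_judge:", qj.get("re_judge", False))
else:
    # Without quiz/judge there is no comprehension loss to rank edges by,
    # so loss-based edge sampling degrades to random (see generate.py below).
    config["traverse_strategy"]["edge_sampling"] = "random"

print("output format:", config["output_data_format"])
```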
39 changes: 21 additions & 18 deletions graphgen/configs/atomic_config.yaml
@@ -1,18 +1,21 @@
-input_data_type: raw
-input_file: resources/input_examples/raw_demo.jsonl
-output_data_type: atomic
-tokenizer: cl100k_base
-quiz_samples: 2
-traverse_strategy:
-  bidirectional: true
-  edge_sampling: max_loss
-  expand_method: max_width
-  isolated_node_strategy: ignore
-  max_depth: 3
-  max_extra_edges: 5
-  max_tokens: 256
-  loss_strategy: only_edge
-search:
-  enabled: false
-  search_types: ["google"]
-re_judge: false
+input_data_type: raw # raw, chunked
+input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+output_data_type: atomic # atomic, aggregated, multi_hop, cot
+output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
+  bidirectional: true # whether to traverse the graph in both directions
+  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+  expand_method: max_width # expand method, support: max_width, max_depth
+  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+  max_depth: 3 # maximum depth for graph traversal
+  max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
+  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
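Relative to the aggregated config, only the data type, the traversal budget (`max_depth: 3`, `max_extra_edges: 5`), and `output_data_format: Alpaca` change. For orientation, here is a sketch of what the three supported format names conventionally denote; the project's exact field names and serialization may differ:

```python
def to_alpaca(q: str, a: str) -> dict:
    # Alpaca: flat instruction/input/output records
    return {"instruction": q, "input": "", "output": a}

def to_sharegpt(q: str, a: str) -> dict:
    # ShareGPT: a "conversations" list of from/value turns
    return {"conversations": [{"from": "human", "value": q},
                              {"from": "gpt", "value": a}]}

def to_chatml(q: str, a: str) -> dict:
    # ChatML: role/content messages in OpenAI chat style
    return {"messages": [{"role": "user", "content": q},
                         {"role": "assistant", "content": a}]}

pair = ("What does GraphGen build first?", "A fine-grained knowledge graph.")
for convert in (to_alpaca, to_sharegpt, to_chatml):
    print(convert(*pair))
```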
15 changes: 8 additions & 7 deletions graphgen/configs/cot_config.yaml
@@ -1,10 +1,11 @@
-input_data_type: raw
-input_file: resources/input_examples/raw_demo.jsonl
-output_data_type: cot
-tokenizer: cl100k_base
-search:
-  enabled: false
-  search_types: []
+input_data_type: raw # raw, chunked
+input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+output_data_type: cot # atomic, aggregated, multi_hop, cot
+output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
 method_params:
   method: leiden
   max_size: 20 # Maximum size of communities
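The `method: leiden` block feeds the CoT path in `generate.py` (`generate_reasoning(method_params=...)`). As a self-contained sketch of Leiden community detection, here is one way to do it with the `igraph` and `leidenalg` packages; treating `max_size` as a community-size cap is an assumption based on the comment above, and the repository may use a different library:

```python
import igraph as ig
import leidenalg  # pip install igraph leidenalg

# A toy graph standing in for the extracted knowledge graph.
graph = ig.Graph.Famous("Zachary")

# Partition nodes into communities; max_comm_size caps community size,
# analogous to the max_size parameter in the config above.
partition = leidenalg.find_partition(
    graph, leidenalg.ModularityVertexPartition, max_comm_size=20
)
for i, community in enumerate(partition):
    print(f"community {i}: {len(community)} nodes")
```

Each community then becomes the context from which a Chain-of-Thought sample can be synthesized.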
39 changes: 21 additions & 18 deletions graphgen/configs/multi_hop_config.yaml
@@ -1,18 +1,21 @@
-input_data_type: raw
-input_file: resources/input_examples/raw_demo.jsonl
-output_data_type: multi_hop
-tokenizer: cl100k_base
-quiz_samples: 2
-traverse_strategy:
-  bidirectional: true
-  edge_sampling: max_loss
-  expand_method: max_width
-  isolated_node_strategy: ignore
-  max_depth: 1
-  max_extra_edges: 2
-  max_tokens: 256
-  loss_strategy: only_edge
-search:
-  enabled: false
-  search_types: ["google"]
-re_judge: false
+input_data_type: raw # raw, chunked
+input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
+output_data_format: ChatML # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
+  bidirectional: true # whether to traverse the graph in both directions
+  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+  expand_method: max_width # expand method, support: max_width, max_depth
+  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+  max_depth: 1 # maximum depth for graph traversal
+  max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
+  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
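Here only the traversal budget differs: `max_depth: 1` and `max_extra_edges: 2` keep each multi-hop sample spanning just a few linked facts. To make the `max_width` expansion semantics concrete, here is a hedged sketch using `networkx`; the behavior is inferred from the comments above, not taken from the repository:

```python
import networkx as nx

def expand_max_width(graph, seed, max_depth=1, max_extra_edges=2):
    """Breadth-first expansion: take at most max_extra_edges edges per
    frontier node and stop after max_depth hops from the seed."""
    visited, frontier, edges = {seed}, [seed], []
    for _ in range(max_depth):
        next_frontier = []
        for node in frontier:
            for neighbor in list(graph.neighbors(node))[:max_extra_edges]:
                edges.append((node, neighbor))
                if neighbor not in visited:
                    visited.add(neighbor)
                    next_frontier.append(neighbor)
        frontier = next_frontier
    return edges

g = nx.karate_club_graph()
print(expand_max_width(g, seed=0, max_depth=1, max_extra_edges=2))
```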
48 changes: 15 additions & 33 deletions graphgen/generate.py
@@ -7,8 +7,7 @@
 from dotenv import load_dotenv
 
 from .graphgen import GraphGen
-from .models import OpenAIModel, Tokenizer, TraverseStrategy
-from .utils import logger, read_file, set_logger
+from .utils import logger, set_logger
 
 sys_path = os.path.abspath(os.path.dirname(__file__))
 
@@ -53,10 +52,8 @@ def main():
 
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
-    input_file = config["input_file"]
-    data = read_file(input_file)
-    output_data_type = config["output_data_type"]
 
+    output_data_type = config["output_data_type"]
     unique_id = int(time.time())
     set_logger(
         os.path.join(
@@ -72,41 +69,26 @@ def main():
         ),
     )
 
-    tokenizer_instance = Tokenizer(model_name=config["tokenizer"])
-    synthesizer_llm_client = OpenAIModel(
-        model_name=os.getenv("SYNTHESIZER_MODEL"),
-        api_key=os.getenv("SYNTHESIZER_API_KEY"),
-        base_url=os.getenv("SYNTHESIZER_BASE_URL"),
-        tokenizer_instance=tokenizer_instance,
-    )
-    trainee_llm_client = OpenAIModel(
-        model_name=os.getenv("TRAINEE_MODEL"),
-        api_key=os.getenv("TRAINEE_API_KEY"),
-        base_url=os.getenv("TRAINEE_BASE_URL"),
-        tokenizer_instance=tokenizer_instance,
-    )
-
-    graph_gen = GraphGen(
-        working_dir=working_dir,
-        unique_id=unique_id,
-        synthesizer_llm_client=synthesizer_llm_client,
-        trainee_llm_client=trainee_llm_client,
-        search_config=config["search"],
-        tokenizer_instance=tokenizer_instance,
-    )
+    graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
 
-    graph_gen.insert(data, config["input_data_type"])
+    graph_gen.insert()
 
     if config["search"]["enabled"]:
         graph_gen.search()
 
     # Use pipeline according to the output data type
     if output_data_type in ["atomic", "aggregated", "multi_hop"]:
-        graph_gen.quiz(max_samples=config["quiz_samples"])
-        graph_gen.judge(re_judge=config["re_judge"])
-        traverse_strategy = TraverseStrategy(**config["traverse_strategy"])
-        traverse_strategy.qa_form = output_data_type
-        graph_gen.traverse(traverse_strategy=traverse_strategy)
+        if "quiz_and_judge_strategy" in config and config[
+            "quiz_and_judge_strategy"
+        ].get("enabled", False):
+            graph_gen.quiz()
+            graph_gen.judge()
+        else:
+            logger.warning(
+                "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
+            )
+            graph_gen.traverse_strategy.edge_sampling = "random"
+        graph_gen.traverse()
     elif output_data_type == "cot":
         graph_gen.generate_reasoning(method_params=config["method_params"])
     else:
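Because client construction now happens inside `GraphGen`, the synthesizer and trainee endpoints come from environment variables rather than from `generate.py`. The variable names below are taken from the removed code; the pre-flight check itself is an illustrative sketch, and the `--config_file` flag is an assumption mirroring `args.config_file`:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # pick up a .env file, as generate.py's load_dotenv import suggests

for prefix in ("SYNTHESIZER", "TRAINEE"):
    for suffix in ("MODEL", "API_KEY", "BASE_URL"):
        key = f"{prefix}_{suffix}"
        if not os.getenv(key):
            # Fail fast with a clear message if an endpoint variable is missing.
            raise SystemExit(f"missing environment variable: {key}")

print("environment complete; run e.g. "
      "python -m graphgen.generate --config_file graphgen/configs/aggregated_config.yaml")
```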