Intern-Science · ChenZiHong-Gavin · Sep 30, 2025 · Sep 30, 2025 · Sep 30, 2025 · Sep 30, 2025
diff --git a/.env.example b/.env.example
@@ -1,3 +1,4 @@
+TOKENIZER_MODEL=
 SYNTHESIZER_MODEL=
 SYNTHESIZER_BASE_URL=
 SYNTHESIZER_API_KEY=

diff --git a/graphgen/configs/aggregated_config.yaml b/graphgen/configs/aggregated_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: aggregated # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 5 # maximum depth for graph traversal
-  max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 5 # maximum depth for graph traversal
+    max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: aggregated # atomic, aggregated, multi_hop, cot
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/atomic_config.yaml b/graphgen/configs/atomic_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: atomic # atomic, aggregated, multi_hop, cot
-output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 3 # maximum depth for graph traversal
-  max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 3 # maximum depth for graph traversal
+    max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: atomic # atomic, aggregated, multi_hop, cot
+  data_format: Alpaca # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/cot_config.yaml b/graphgen/configs/cot_config.yaml
@@ -6,11 +6,14 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: cot # atomic, aggregated, multi_hop, cot
-output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-method_params:
-  method: leiden
-  max_size: 20 # Maximum size of communities
-  use_lcc: false
-  random_seed: 42
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: false
+partition: # graph partition configuration
+  method: leiden # leiden is a community detection algorithm
+  method_params:
+    max_size: 20 # Maximum size of communities
+    use_lcc: false
+    random_seed: 42
+generate:
+  mode: cot # atomic, aggregated, multi_hop, cot
+  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/multi_hop_config.yaml b/graphgen/configs/multi_hop_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: false
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 1 # maximum depth for graph traversal
-  max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 1 # maximum depth for graph traversal
+    max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: multi_hop # generation mode (multi-hop QA pairs), support: atomic, aggregated, multi_hop, cot
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/generate.py b/graphgen/generate.py
@@ -6,8 +6,8 @@
 import yaml
 from dotenv import load_dotenv
 
-from .graphgen import GraphGen
-from .utils import logger, set_logger
+from graphgen.graphgen import GraphGen
+from graphgen.utils import logger, set_logger
 
 sys_path = os.path.abspath(os.path.dirname(__file__))
 
@@ -50,50 +50,53 @@ def main():
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
-    output_data_type = config["output_data_type"]
+    mode = config["generate"]["mode"]
     unique_id = int(time.time())
 
-    output_path = os.path.join(
-        working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
-    )
+    output_path = os.path.join(working_dir, "data", "graphgen", f"{unique_id}")
     set_working_dir(output_path)
 
     set_logger(
-        os.path.join(output_path, f"{unique_id}.log"),
+        os.path.join(output_path, f"{unique_id}_{mode}.log"),
         if_stream=True,
     )
     logger.info(
         "GraphGen with unique ID %s logging to %s",
         unique_id,
-        os.path.join(
-            working_dir, "logs", f"{unique_id}_graphgen_{output_data_type}.log"
-        ),
+        os.path.join(output_path, f"{unique_id}_{mode}.log"),
     )
 
-    graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
+    graph_gen = GraphGen(unique_id=unique_id, working_dir=working_dir)
 
-    graph_gen.insert()
+    graph_gen.insert(read_config=config["read"], split_config=config["split"])
 
-    if config["search"]["enabled"]:
-        graph_gen.search()
+    graph_gen.search(search_config=config["search"])
 
     # Use pipeline according to the output data type
-    if output_data_type in ["atomic", "aggregated", "multi_hop"]:
-        if "quiz_and_judge_strategy" in config and config[
-            "quiz_and_judge_strategy"
-        ].get("enabled", False):
-            graph_gen.quiz()
-            graph_gen.judge()
+    if mode in ["atomic", "aggregated", "multi_hop"]:
+        logger.info("Generation mode set to '%s'. Start generation.", mode)
+        if config.get("quiz_and_judge", {}).get("enabled", False):
+            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
         else:
             logger.warning(
                 "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
             )
-            graph_gen.traverse_strategy.edge_sampling = "random"
-        graph_gen.traverse()
-    elif output_data_type == "cot":
-        graph_gen.generate_reasoning(method_params=config["method_params"])
+            # The random fallback is written into partition.method_params,
+            # so it can only be applied when the ECE partitioner is configured.
+            assert (
+                config["partition"]["method"] == "ece"
+                and "method_params" in config["partition"]
+            ), "Only ECE partition with edge sampling is supported."
+            config["partition"]["method_params"]["edge_sampling"] = "random"
+    elif mode == "cot":
+        logger.info("Generation mode set to 'cot'. Start generation.")
     else:
-        raise ValueError(f"Unsupported output data type: {output_data_type}")
+        raise ValueError(f"Unsupported output data type: {mode}")
+
+    graph_gen.generate(
+        partition_config=config["partition"],
+        generate_config=config["generate"],
+    )
 
     save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)