Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
TOKENIZER_MODEL=
SYNTHESIZER_MODEL=
SYNTHESIZER_BASE_URL=
SYNTHESIZER_API_KEY=
Expand Down
28 changes: 15 additions & 13 deletions graphgen/configs/aggregated_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,21 @@ split:
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
output_data_type: aggregated # atomic, aggregated, multi_hop, cot
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 5 # maximum depth for graph traversal
max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 5 # maximum depth for graph traversal
max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
generate:
mode: aggregated # atomic, aggregated, multi_hop, cot
data_format: ChatML # Alpaca, Sharegpt, ChatML
28 changes: 15 additions & 13 deletions graphgen/configs/atomic_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,21 @@ split:
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
output_data_type: atomic # atomic, aggregated, multi_hop, cot
output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: true
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 3 # maximum depth for graph traversal
max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 3 # maximum depth for graph traversal
max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
generate:
mode: atomic # atomic, aggregated, multi_hop, cot
data_format: Alpaca # Alpaca, Sharegpt, ChatML
19 changes: 11 additions & 8 deletions graphgen/configs/cot_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@ split:
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
output_data_type: cot # atomic, aggregated, multi_hop, cot
output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
method_params:
method: leiden
max_size: 20 # Maximum size of communities
use_lcc: false
random_seed: 42
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
partition: # graph partition configuration
method: leiden # leiden is a community detection algorithm
method_params:
max_size: 20 # Maximum size of communities
use_lcc: false
random_seed: 42
generate:
mode: cot # atomic, aggregated, multi_hop, cot
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
28 changes: 15 additions & 13 deletions graphgen/configs/multi_hop_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,21 @@ split:
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
enabled: false
quiz_samples: 2 # number of quiz samples to generate
re_judge: false # whether to re-judge the existing quiz samples
traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 1 # maximum depth for graph traversal
max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
partition: # graph partition configuration
method: ece # ece is a custom partition method based on comprehension loss
method_params:
bidirectional: true # whether to traverse the graph in both directions
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
expand_method: max_width # expand method, support: max_width, max_tokens
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
max_depth: 1 # maximum depth for graph traversal
max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
generate:
mode: multi_hop # strategy for generating multi-hop QA pairs
data_format: ChatML # Alpaca, Sharegpt, ChatML
51 changes: 26 additions & 25 deletions graphgen/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import yaml
from dotenv import load_dotenv

from .graphgen import GraphGen
from .utils import logger, set_logger
from graphgen.graphgen import GraphGen
from graphgen.utils import logger, set_logger

sys_path = os.path.abspath(os.path.dirname(__file__))

Expand Down Expand Up @@ -50,50 +50,51 @@ def main():
with open(args.config_file, "r", encoding="utf-8") as f:
config = yaml.load(f, Loader=yaml.FullLoader)

output_data_type = config["output_data_type"]
mode = config["generate"]["mode"]
unique_id = int(time.time())

output_path = os.path.join(
working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
)
output_path = os.path.join(working_dir, "data", "graphgen", f"{unique_id}")
set_working_dir(output_path)

set_logger(
os.path.join(output_path, f"{unique_id}.log"),
os.path.join(output_path, f"{unique_id}_{mode}.log"),
if_stream=True,
)
logger.info(
"GraphGen with unique ID %s logging to %s",
unique_id,
os.path.join(
working_dir, "logs", f"{unique_id}_graphgen_{output_data_type}.log"
),
os.path.join(working_dir, f"{unique_id}_{mode}.log"),
)

graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
graph_gen = GraphGen(unique_id=unique_id, working_dir=working_dir)

graph_gen.insert()
graph_gen.insert(read_config=config["read"], split_config=config["split"])

if config["search"]["enabled"]:
graph_gen.search()
graph_gen.search(search_config=config["search"])

# Use pipeline according to the output data type
if output_data_type in ["atomic", "aggregated", "multi_hop"]:
if "quiz_and_judge_strategy" in config and config[
"quiz_and_judge_strategy"
].get("enabled", False):
graph_gen.quiz()
graph_gen.judge()
if mode in ["atomic", "aggregated", "multi_hop"]:
logger.info("Generation mode set to '%s'. Start generation.", mode)
if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]:
graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
else:
logger.warning(
"Quiz and Judge strategy is disabled. Edge sampling falls back to random."
)
graph_gen.traverse_strategy.edge_sampling = "random"
graph_gen.traverse()
elif output_data_type == "cot":
graph_gen.generate_reasoning(method_params=config["method_params"])
assert (
config["partition"]["method"] == "ece"
and "ece_params" in config["partition"]
), "Only ECE partition with edge sampling is supported."
config["partition"]["method_params"]["edge_sampling"] = "random"
elif mode == "cot":
logger.info("Generation mode set to 'cot'. Start generation.")
else:
raise ValueError(f"Unsupported output data type: {output_data_type}")
raise ValueError(f"Unsupported output data type: {mode}")

graph_gen.generate(
partition_config=config["partition"],
generate_config=config["generate"],
)

save_config(os.path.join(output_path, "config.yaml"), config)
logger.info("GraphGen completed successfully. Data saved to %s", output_path)
Expand Down
Loading