diff --git a/graphgen/generate.py b/graphgen/generate.py
index eec168d6..d0ba3baa 100644
--- a/graphgen/generate.py
+++ b/graphgen/generate.py
@@ -16,8 +16,6 @@
 
 def set_working_dir(folder):
     os.makedirs(folder, exist_ok=True)
-    os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
-    os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
 
 
 def save_config(config_path, global_config):
@@ -48,17 +46,21 @@ def main():
     args = parser.parse_args()
 
     working_dir = args.output_dir
-    set_working_dir(working_dir)
 
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
     output_data_type = config["output_data_type"]
     unique_id = int(time.time())
+
+    # Each run gets its own folder for the generated data, the log and the config.
+    output_path = os.path.join(
+        working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
+    )
+    set_working_dir(output_path)
+
     set_logger(
-        os.path.join(
-            working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
-        ),
+        os.path.join(output_path, f"{unique_id}.log"),
         if_stream=True,
     )
     logger.info(
@@ -94,8 +96,7 @@ def main():
     else:
         raise ValueError(f"Unsupported output data type: {output_data_type}")
 
-    output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
-    save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
+    save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
 
 
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
index 44d530bc..ab2be0df 100644
--- a/graphgen/graphgen.py
+++ b/graphgen/graphgen.py
@@ -102,8 +102,14 @@ def __post_init__(self):
             self.working_dir, namespace="rephrase"
         )
         self.qa_storage: JsonListStorage = JsonListStorage(
-            os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
-            namespace=f"qa-{self.unique_id}",
+            # Same per-run folder that generate.py builds for the log and config.
+            os.path.join(
+                self.working_dir,
+                "data",
+                "graphgen",
+                f"{self.unique_id}_{self.config['output_data_type']}",
+            ),
+            namespace="qa",
         )
 
     async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
diff --git a/tests/e2e_tests/__init__.py b/tests/e2e_tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/e2e_tests/test_generate_aggregated.py b/tests/e2e_tests/test_generate_aggregated.py
new file mode 100644
index 00000000..7d1023a8
--- /dev/null
+++ b/tests/e2e_tests/test_generate_aggregated.py
@@ -0,0 +1,52 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_aggregated(tmp_path: Path):
+    """Run graphgen.generate with the aggregated config and check its outputs."""
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "aggregated_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run the CLI from the repo root without changing this process's cwd.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_atomic.py b/tests/e2e_tests/test_generate_atomic.py
new file mode 100644
index 00000000..a48341dd
--- /dev/null
+++ b/tests/e2e_tests/test_generate_atomic.py
@@ -0,0 +1,52 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_atomic(tmp_path: Path):
+    """Run graphgen.generate with the atomic config and check its outputs."""
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "atomic_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run the CLI from the repo root without changing this process's cwd.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_cot.py b/tests/e2e_tests/test_generate_cot.py
new file mode 100644
index 00000000..c7617a81
--- /dev/null
+++ b/tests/e2e_tests/test_generate_cot.py
@@ -0,0 +1,52 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_cot(tmp_path: Path):
+    """Run graphgen.generate with the CoT config and check its outputs."""
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "cot_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run the CLI from the repo root without changing this process's cwd.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_multi_hop.py b/tests/e2e_tests/test_generate_multi_hop.py
new file mode 100644
index 00000000..13e76ce7
--- /dev/null
+++ b/tests/e2e_tests/test_generate_multi_hop.py
@@ -0,0 +1,52 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_multi_hop(tmp_path: Path):
+    """Run graphgen.generate with the multi-hop config and check its outputs."""
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "multi_hop_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run the CLI from the repo root without changing this process's cwd.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists() and data_root.is_dir(), f"{data_root} does not exist or is not a directory"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"