Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions graphgen/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

def set_working_dir(folder):
os.makedirs(folder, exist_ok=True)
os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
os.makedirs(os.path.join(folder, "logs"), exist_ok=True)


def save_config(config_path, global_config):
Expand Down Expand Up @@ -48,17 +46,20 @@ def main():
args = parser.parse_args()

working_dir = args.output_dir
set_working_dir(working_dir)

with open(args.config_file, "r", encoding="utf-8") as f:
config = yaml.load(f, Loader=yaml.FullLoader)

output_data_type = config["output_data_type"]
unique_id = int(time.time())

output_path = os.path.join(
working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
)
set_working_dir(output_path)

set_logger(
os.path.join(
working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
),
os.path.join(output_path, f"{unique_id}.log"),
if_stream=True,
)
logger.info(
Expand Down Expand Up @@ -94,8 +95,7 @@ def main():
else:
raise ValueError(f"Unsupported output data type: {output_data_type}")

output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
save_config(os.path.join(output_path, "config.yaml"), config)
logger.info("GraphGen completed successfully. Data saved to %s", output_path)


Expand Down
9 changes: 7 additions & 2 deletions graphgen/graphgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,13 @@ def __post_init__(self):
self.working_dir, namespace="rephrase"
)
self.qa_storage: JsonListStorage = JsonListStorage(
os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
namespace=f"qa-{self.unique_id}",
os.path.join(
self.working_dir,
"data",
"graphgen",
f"{self.unique_id}_{self.config['output_data_type']}",
),
namespace="qa",
)

async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
Expand Down
Empty file added tests/e2e_tests/__init__.py
Empty file.
50 changes: 50 additions & 0 deletions tests/e2e_tests/test_generate_aggregated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import json
import os
import subprocess
from pathlib import Path


def test_generate_aggregated(tmp_path: Path):
    """End-to-end check of ``graphgen.generate`` with the aggregated config.

    Runs ``python -m graphgen.generate`` as a subprocess against
    ``graphgen/configs/aggregated_config.yaml`` and verifies that the run
    folder under ``<output_dir>/data/graphgen`` whose name sorts last holds a
    ``config.yaml``, at least one ``*.log`` file, and a ``*.json`` file whose
    content is a non-empty list.

    Args:
        tmp_path: pytest-provided temporary directory; the generator output
            is written to ``tmp_path / "output"``.
    """
    # Imported locally so this function is self-contained with respect to
    # the interpreter lookup below.
    import sys

    repo_root = Path(__file__).resolve().parents[2]
    config_path = repo_root / "graphgen" / "configs" / "aggregated_config.yaml"
    output_dir = tmp_path / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    result = subprocess.run(
        [
            # Use the interpreter that is running pytest instead of whatever
            # "python" happens to resolve to on PATH (if anything).
            sys.executable,
            "-m",
            "graphgen.generate",
            "--config_file",
            str(config_path),
            "--output_dir",
            str(output_dir),
        ],
        # Run the child from the repo root without changing the working
        # directory of the pytest process itself.
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=False,  # the return code is asserted below with stderr attached
    )
    assert result.returncode == 0, f"Script failed with error: {result.stderr}"

    data_root = output_dir / "data" / "graphgen"
    assert data_root.exists(), f"{data_root} does not exist"
    # Sort by name, descending, and inspect the first entry.
    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
    assert run_folders, f"No run folders found in {data_root}"
    run_folder = run_folders[0]

    config_saved = run_folder / "config.yaml"
    assert config_saved.exists(), f"{config_saved} not found"

    json_files = list(run_folder.glob("*.json"))
    assert json_files, f"No JSON output found in {run_folder}"

    log_files = list(run_folder.glob("*.log"))
    assert log_files, "No log file generated"

    with open(json_files[0], "r", encoding="utf-8") as f:
        data = json.load(f)
    assert (
        isinstance(data, list) and len(data) > 0
    ), "JSON output is empty or not a list"
50 changes: 50 additions & 0 deletions tests/e2e_tests/test_generate_atomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import json
import os
import subprocess
from pathlib import Path


def test_generate_atomic(tmp_path: Path):
    """End-to-end check of ``graphgen.generate`` with the atomic config.

    Runs ``python -m graphgen.generate`` as a subprocess against
    ``graphgen/configs/atomic_config.yaml`` and verifies that the run folder
    under ``<output_dir>/data/graphgen`` whose name sorts last holds a
    ``config.yaml``, at least one ``*.log`` file, and a ``*.json`` file whose
    content is a non-empty list.

    Args:
        tmp_path: pytest-provided temporary directory; the generator output
            is written to ``tmp_path / "output"``.
    """
    # Imported locally so this function is self-contained with respect to
    # the interpreter lookup below.
    import sys

    repo_root = Path(__file__).resolve().parents[2]
    config_path = repo_root / "graphgen" / "configs" / "atomic_config.yaml"
    output_dir = tmp_path / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    result = subprocess.run(
        [
            # Use the interpreter that is running pytest instead of whatever
            # "python" happens to resolve to on PATH (if anything).
            sys.executable,
            "-m",
            "graphgen.generate",
            "--config_file",
            str(config_path),
            "--output_dir",
            str(output_dir),
        ],
        # Run the child from the repo root without changing the working
        # directory of the pytest process itself.
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=False,  # the return code is asserted below with stderr attached
    )
    assert result.returncode == 0, f"Script failed with error: {result.stderr}"

    data_root = output_dir / "data" / "graphgen"
    assert data_root.exists(), f"{data_root} does not exist"
    # Sort by name, descending, and inspect the first entry.
    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
    assert run_folders, f"No run folders found in {data_root}"
    run_folder = run_folders[0]

    config_saved = run_folder / "config.yaml"
    assert config_saved.exists(), f"{config_saved} not found"

    json_files = list(run_folder.glob("*.json"))
    assert json_files, f"No JSON output found in {run_folder}"

    log_files = list(run_folder.glob("*.log"))
    assert log_files, "No log file generated"

    with open(json_files[0], "r", encoding="utf-8") as f:
        data = json.load(f)
    assert (
        isinstance(data, list) and len(data) > 0
    ), "JSON output is empty or not a list"
50 changes: 50 additions & 0 deletions tests/e2e_tests/test_generate_cot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import json
import os
import subprocess
from pathlib import Path


def test_generate_aggregated(tmp_path: Path):
    """End-to-end check of ``graphgen.generate`` with the CoT config.

    Runs ``python -m graphgen.generate`` as a subprocess against
    ``graphgen/configs/cot_config.yaml`` and verifies that the run folder
    under ``<output_dir>/data/graphgen`` whose name sorts last holds a
    ``config.yaml``, at least one ``*.log`` file, and a ``*.json`` file whose
    content is a non-empty list.

    NOTE(review): the function name says "aggregated" although this test
    drives ``cot_config.yaml``; the name is kept so the existing test ID does
    not change. Consider renaming it to ``test_generate_cot``.

    Args:
        tmp_path: pytest-provided temporary directory; the generator output
            is written to ``tmp_path / "output"``.
    """
    # Imported locally so this function is self-contained with respect to
    # the interpreter lookup below.
    import sys

    repo_root = Path(__file__).resolve().parents[2]
    config_path = repo_root / "graphgen" / "configs" / "cot_config.yaml"
    output_dir = tmp_path / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    result = subprocess.run(
        [
            # Use the interpreter that is running pytest instead of whatever
            # "python" happens to resolve to on PATH (if anything).
            sys.executable,
            "-m",
            "graphgen.generate",
            "--config_file",
            str(config_path),
            "--output_dir",
            str(output_dir),
        ],
        # Run the child from the repo root without changing the working
        # directory of the pytest process itself.
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=False,  # the return code is asserted below with stderr attached
    )
    assert result.returncode == 0, f"Script failed with error: {result.stderr}"

    data_root = output_dir / "data" / "graphgen"
    assert data_root.exists(), f"{data_root} does not exist"
    # Sort by name, descending, and inspect the first entry.
    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
    assert run_folders, f"No run folders found in {data_root}"
    run_folder = run_folders[0]

    config_saved = run_folder / "config.yaml"
    assert config_saved.exists(), f"{config_saved} not found"

    json_files = list(run_folder.glob("*.json"))
    assert json_files, f"No JSON output found in {run_folder}"

    log_files = list(run_folder.glob("*.log"))
    assert log_files, "No log file generated"

    with open(json_files[0], "r", encoding="utf-8") as f:
        data = json.load(f)
    assert (
        isinstance(data, list) and len(data) > 0
    ), "JSON output is empty or not a list"
50 changes: 50 additions & 0 deletions tests/e2e_tests/test_generate_multi_hop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import json
import os
import subprocess
from pathlib import Path


def test_generate_aggregated(tmp_path: Path):
    """End-to-end check of ``graphgen.generate`` with the multi-hop config.

    Runs ``python -m graphgen.generate`` as a subprocess against
    ``graphgen/configs/multi_hop_config.yaml`` and verifies that the run
    folder under ``<output_dir>/data/graphgen`` whose name sorts last holds a
    ``config.yaml``, at least one ``*.log`` file, and a ``*.json`` file whose
    content is a non-empty list.

    NOTE(review): the function name says "aggregated" although this test
    drives ``multi_hop_config.yaml``; the name is kept so the existing test
    ID does not change. Consider renaming it to ``test_generate_multi_hop``.

    Args:
        tmp_path: pytest-provided temporary directory; the generator output
            is written to ``tmp_path / "output"``.
    """
    # Imported locally so this function is self-contained with respect to
    # the interpreter lookup below.
    import sys

    repo_root = Path(__file__).resolve().parents[2]
    config_path = repo_root / "graphgen" / "configs" / "multi_hop_config.yaml"
    output_dir = tmp_path / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    result = subprocess.run(
        [
            # Use the interpreter that is running pytest instead of whatever
            # "python" happens to resolve to on PATH (if anything).
            sys.executable,
            "-m",
            "graphgen.generate",
            "--config_file",
            str(config_path),
            "--output_dir",
            str(output_dir),
        ],
        # Run the child from the repo root without changing the working
        # directory of the pytest process itself.
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=False,  # the return code is asserted below with stderr attached
    )
    assert result.returncode == 0, f"Script failed with error: {result.stderr}"

    data_root = output_dir / "data" / "graphgen"
    assert (
        data_root.exists() and data_root.is_dir()
    ), f"{data_root} does not exist or is not a directory"
    # sorted() already materialises the iterator; sort by name, descending,
    # and inspect the first entry.
    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
    assert run_folders, f"No run folders found in {data_root}"
    run_folder = run_folders[0]

    config_saved = run_folder / "config.yaml"
    assert config_saved.exists(), f"{config_saved} not found"

    json_files = list(run_folder.glob("*.json"))
    assert json_files, f"No JSON output found in {run_folder}"

    log_files = list(run_folder.glob("*.log"))
    assert log_files, "No log file generated"

    with open(json_files[0], "r", encoding="utf-8") as f:
        data = json.load(f)
    assert (
        isinstance(data, list) and len(data) > 0
    ), "JSON output is empty or not a list"