Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/scripts.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ Export every point in a collection to a `.qql` script file. The generated file i
**CLI usage:**
```bash
qql dump <collection_name> <output.qql>

# Override the default of 50 points per INSERT BULK batch
qql dump <collection_name> <output.qql> --batch-size 200
```

**In-shell usage (inside the QQL REPL):**
Expand Down
13 changes: 11 additions & 2 deletions src/qql/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,14 @@ def execute(file: str, stop_on_error: bool) -> None:
@main.command()
@click.argument("collection")
@click.argument("output", type=click.Path())
def dump(collection: str, output: str) -> None:
@click.option(
"--batch-size",
type=click.IntRange(min=1),
default=50,
show_default=True,
help="Points per INSERT BULK batch in the generated script.",
)
def dump(collection: str, output: str, batch_size: int) -> None:
"""Dump a collection to a .qql script file.

OUTPUT is the path for the generated .qql file.
Expand Down Expand Up @@ -230,7 +237,9 @@ def dump(collection: str, output: str) -> None:
console.print(
f"[bold cyan]Dumping:[/bold cyan] '{collection}' -> {output}\n"
)
written, skipped = dump_collection(collection, output, client, console, err_console)
written, skipped = dump_collection(
collection, output, client, console, err_console, batch_size=batch_size
)

if written == 0 and skipped == 0:
# collection not found — error already printed by dump_collection
Expand Down
17 changes: 11 additions & 6 deletions src/qql/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
The generated file contains:
1. A header comment with metadata
2. CREATE COLLECTION <name> [HYBRID]
3. One INSERT BULK statement per batch of _DUMP_BATCH_SIZE points
3. One INSERT BULK statement per batch of *batch_size* points
(default _DEFAULT_DUMP_BATCH_SIZE = 50, overridable via the CLI flag)
4. A footer comment with totals

The file is valid QQL and can be re-executed with ``qql execute <file>``.
Expand All @@ -20,7 +21,7 @@
from qdrant_client import QdrantClient
from rich.console import Console

_DUMP_BATCH_SIZE = 50
_DEFAULT_DUMP_BATCH_SIZE = 50


# ── Value serializer ──────────────────────────────────────────────────────────
Expand Down Expand Up @@ -81,12 +82,16 @@ def dump_collection(
client: QdrantClient,
console: Console,
err_console: Console,
batch_size: int = _DEFAULT_DUMP_BATCH_SIZE,
) -> tuple[int, int]:
"""Export every point in *collection* to a .qql script at *output_path*.

Returns ``(points_written, points_skipped)`` counts.
Points without a ``'text'`` key are skipped and counted in *points_skipped*.
"""
if batch_size <= 0:
raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

if not client.collection_exists(collection):
err_console.print(
f"[bold red]Error:[/bold red] Collection '{collection}' does not exist."
Expand All @@ -100,13 +105,13 @@ def dump_collection(
# ── First pass: count total points for the header ─────────────────────
count_info = client.count(collection_name=collection, exact=True)
total_points = count_info.count
total_batches = max(1, math.ceil(total_points / _DUMP_BATCH_SIZE))
total_batches = max(1, math.ceil(total_points / batch_size))

console.print(
f" Collection type : [cyan]{col_type}[/cyan]\n"
f" Points : [cyan]{total_points}[/cyan]\n"
f" Batches : [cyan]{total_batches}[/cyan] "
f"([dim]{_DUMP_BATCH_SIZE} points/batch[/dim])\n"
f"([dim]{batch_size} points/batch[/dim])\n"
)

out = Path(output_path)
Expand Down Expand Up @@ -140,7 +145,7 @@ def dump_collection(
while True:
records, next_offset = client.scroll(
collection_name=collection,
limit=_DUMP_BATCH_SIZE,
limit=batch_size,
offset=offset,
with_payload=True,
with_vectors=False,
Expand All @@ -150,7 +155,7 @@ def dump_collection(
break

batch_num += 1
batch_start = (batch_num - 1) * _DUMP_BATCH_SIZE + 1
batch_start = (batch_num - 1) * batch_size + 1
batch_end = batch_start + len(records) - 1

# Filter points that have a 'text' field
Expand Down
44 changes: 39 additions & 5 deletions tests/test_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from rich.console import Console

from qql.dumper import (
_DUMP_BATCH_SIZE,
_DEFAULT_DUMP_BATCH_SIZE,
_is_hybrid,
_serialize_dict,
_serialize_value,
Expand All @@ -32,7 +32,7 @@ def _make_client(mocker, *, exists=True, hybrid=False, points=None, total=None):
"""Build a mock QdrantClient for dump tests.

*points* is a list of payload dicts. scroll() returns them all in one
batch when len(points) <= _DUMP_BATCH_SIZE, else two batches.
batch when len(points) <= _DEFAULT_DUMP_BATCH_SIZE, else two batches.
"""
points = points or []
client = mocker.MagicMock()
Expand Down Expand Up @@ -202,10 +202,10 @@ def test_batches_multiple_scroll_pages(self, tmp_path, mocker):
client.collection_exists.return_value = True
client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
cnt = mocker.MagicMock()
cnt.count = _DUMP_BATCH_SIZE + 1
cnt.count = _DEFAULT_DUMP_BATCH_SIZE + 1
client.count.return_value = cnt

batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(_DUMP_BATCH_SIZE)]
batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(_DEFAULT_DUMP_BATCH_SIZE)]
batch2 = [_make_record(mocker, {"text": "last doc"}, "id-last")]
# First scroll call returns batch1 with a non-None offset; second returns batch2 + None
client.scroll.side_effect = [
Expand All @@ -215,7 +215,7 @@ def test_batches_multiple_scroll_pages(self, tmp_path, mocker):

written, skipped = dump_collection("col", out, client, null_console(), null_console())
content = (tmp_path / "dump.qql").read_text()
assert written == _DUMP_BATCH_SIZE + 1
assert written == _DEFAULT_DUMP_BATCH_SIZE + 1
assert content.count("INSERT BULK") == 2

def test_header_contains_collection_name(self, tmp_path, mocker):
Expand All @@ -230,3 +230,37 @@ def test_output_file_created_in_nested_directory(self, tmp_path, mocker):
client = _make_client(mocker, points=[{"text": "x"}])
dump_collection("col", out, client, null_console(), null_console())
assert (tmp_path / "sub" / "dir" / "dump.qql").exists()

def test_custom_batch_size_splits_pages(self, tmp_path, mocker):
    """batch_size=2 over 3 points must yield two INSERT BULK blocks."""
    out_path = str(tmp_path / "dump.qql")

    # Hand-rolled mock client: 3 points total, served as a page of 2 then a page of 1.
    client = mocker.MagicMock()
    client.collection_exists.return_value = True
    client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
    count_result = mocker.MagicMock()
    count_result.count = 3
    client.count.return_value = count_result

    first_page = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(2)]
    second_page = [_make_record(mocker, {"text": "last"}, "id-last")]
    client.scroll.side_effect = [
        (first_page, "offset-1"),
        (second_page, None),
    ]

    written, _ = dump_collection(
        "col", out_path, client, null_console(), null_console(), batch_size=2
    )

    script = (tmp_path / "dump.qql").read_text()
    assert written == 3
    assert script.count("INSERT BULK") == 2
    # The custom batch size must be forwarded as scroll()'s page limit.
    assert client.scroll.call_args_list[0].kwargs["limit"] == 2

def test_invalid_batch_size_raises(self, tmp_path, mocker):
    """A non-positive batch_size is rejected with ValueError up front."""
    out_path = str(tmp_path / "dump.qql")
    mock_client = _make_client(mocker, points=[{"text": "x"}])
    # batch_size=0 violates the documented positive-integer contract.
    with pytest.raises(ValueError):
        dump_collection(
            "col", out_path, mock_client, null_console(), null_console(), batch_size=0
        )