Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/scripts.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ Export every point in a collection to a `.qql` script file. The generated file i
**CLI usage:**
```bash
qql dump <collection_name> <output.qql>

# Override the default of 50 points per INSERT BULK batch
qql dump <collection_name> <output.qql> --batch-size 200
```

**In-shell usage (inside the QQL REPL):**
Expand Down
13 changes: 11 additions & 2 deletions src/qql/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,14 @@ def execute(file: str, stop_on_error: bool) -> None:
@main.command()
@click.argument("collection")
@click.argument("output", type=click.Path())
def dump(collection: str, output: str) -> None:
@click.option(
"--batch-size",
type=click.IntRange(min=1),
default=50,
show_default=True,
help="Points per INSERT BULK batch in the generated script.",
)
def dump(collection: str, output: str, batch_size: int) -> None:
"""Dump a collection to a .qql script file.

OUTPUT is the path for the generated .qql file.
Expand Down Expand Up @@ -230,7 +237,9 @@ def dump(collection: str, output: str) -> None:
console.print(
f"[bold cyan]Dumping:[/bold cyan] '{collection}' -> {output}\n"
)
written, skipped = dump_collection(collection, output, client, console, err_console)
written, skipped = dump_collection(
collection, output, client, console, err_console, batch_size=batch_size
)

if written == 0 and skipped == 0:
# collection not found — error already printed by dump_collection
Expand Down
17 changes: 11 additions & 6 deletions src/qql/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
The generated file contains:
1. A header comment with metadata
2. CREATE COLLECTION <name> [HYBRID]
3. One INSERT BULK statement per batch of _DUMP_BATCH_SIZE points
3. One INSERT BULK statement per batch of *batch_size* points
(default _DEFAULT_DUMP_BATCH_SIZE = 50, overridable via the CLI flag)
4. A footer comment with totals

The file is valid QQL and can be re-executed with ``qql execute <file>``.
Expand All @@ -20,7 +21,7 @@
from qdrant_client import QdrantClient
from rich.console import Console

_DUMP_BATCH_SIZE = 50
_DEFAULT_DUMP_BATCH_SIZE = 50


# ── Value serializer ──────────────────────────────────────────────────────────
Expand Down Expand Up @@ -81,12 +82,16 @@ def dump_collection(
client: QdrantClient,
console: Console,
err_console: Console,
batch_size: int = _DEFAULT_DUMP_BATCH_SIZE,
) -> tuple[int, int]:
"""Export every point in *collection* to a .qql script at *output_path*.

Returns ``(points_written, points_skipped)`` counts.
Points without a ``'text'`` key are skipped and counted in *points_skipped*.
"""
if batch_size <= 0:
raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

if not client.collection_exists(collection):
err_console.print(
f"[bold red]Error:[/bold red] Collection '{collection}' does not exist."
Expand All @@ -100,13 +105,13 @@ def dump_collection(
# ── First pass: count total points for the header ─────────────────────
count_info = client.count(collection_name=collection, exact=True)
total_points = count_info.count
total_batches = max(1, math.ceil(total_points / _DUMP_BATCH_SIZE))
total_batches = max(1, math.ceil(total_points / batch_size))

console.print(
f" Collection type : [cyan]{col_type}[/cyan]\n"
f" Points : [cyan]{total_points}[/cyan]\n"
f" Batches : [cyan]{total_batches}[/cyan] "
f"([dim]{_DUMP_BATCH_SIZE} points/batch[/dim])\n"
f"([dim]{batch_size} points/batch[/dim])\n"
)

out = Path(output_path)
Expand Down Expand Up @@ -140,7 +145,7 @@ def dump_collection(
while True:
records, next_offset = client.scroll(
collection_name=collection,
limit=_DUMP_BATCH_SIZE,
limit=batch_size,
offset=offset,
with_payload=True,
with_vectors=False,
Expand All @@ -150,7 +155,7 @@ def dump_collection(
break

batch_num += 1
batch_start = (batch_num - 1) * _DUMP_BATCH_SIZE + 1
batch_start = (batch_num - 1) * batch_size + 1
batch_end = batch_start + len(records) - 1

# Filter points that have a 'text' field
Expand Down
44 changes: 39 additions & 5 deletions tests/test_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from rich.console import Console

from qql.dumper import (
_DUMP_BATCH_SIZE,
_DEFAULT_DUMP_BATCH_SIZE,
_is_hybrid,
_serialize_dict,
_serialize_value,
Expand All @@ -32,7 +32,7 @@ def _make_client(mocker, *, exists=True, hybrid=False, points=None, total=None):
"""Build a mock QdrantClient for dump tests.

*points* is a list of payload dicts. scroll() returns them all in one
batch when len(points) <= _DUMP_BATCH_SIZE, else two batches.
batch when len(points) <= _DEFAULT_DUMP_BATCH_SIZE, else two batches.
"""
points = points or []
client = mocker.MagicMock()
Expand Down Expand Up @@ -202,10 +202,10 @@ def test_batches_multiple_scroll_pages(self, tmp_path, mocker):
client.collection_exists.return_value = True
client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
cnt = mocker.MagicMock()
cnt.count = _DUMP_BATCH_SIZE + 1
cnt.count = _DEFAULT_DUMP_BATCH_SIZE + 1
client.count.return_value = cnt

batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(_DUMP_BATCH_SIZE)]
batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(_DEFAULT_DUMP_BATCH_SIZE)]
batch2 = [_make_record(mocker, {"text": "last doc"}, "id-last")]
# First scroll call returns batch1 with a non-None offset; second returns batch2 + None
client.scroll.side_effect = [
Expand All @@ -215,7 +215,7 @@ def test_batches_multiple_scroll_pages(self, tmp_path, mocker):

written, skipped = dump_collection("col", out, client, null_console(), null_console())
content = (tmp_path / "dump.qql").read_text()
assert written == _DUMP_BATCH_SIZE + 1
assert written == _DEFAULT_DUMP_BATCH_SIZE + 1
assert content.count("INSERT BULK") == 2

def test_header_contains_collection_name(self, tmp_path, mocker):
Expand All @@ -230,3 +230,37 @@ def test_output_file_created_in_nested_directory(self, tmp_path, mocker):
client = _make_client(mocker, points=[{"text": "x"}])
dump_collection("col", out, client, null_console(), null_console())
assert (tmp_path / "sub" / "dir" / "dump.qql").exists()

def test_custom_batch_size_splits_pages(self, tmp_path, mocker):
    """batch_size=2 over 3 points must yield two INSERT BULK blocks."""
    out_path = str(tmp_path / "dump.qql")

    # Hand-rolled mock client: 3 points total, served as a page of 2 then a page of 1.
    client = mocker.MagicMock()
    client.collection_exists.return_value = True
    client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
    count_result = mocker.MagicMock()
    count_result.count = 3
    client.count.return_value = count_result

    first_page = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(2)]
    second_page = [_make_record(mocker, {"text": "last"}, "id-last")]
    client.scroll.side_effect = [
        (first_page, "offset-1"),
        (second_page, None),
    ]

    written, _ = dump_collection(
        "col", out_path, client, null_console(), null_console(), batch_size=2
    )

    script = (tmp_path / "dump.qql").read_text()
    assert written == 3
    assert script.count("INSERT BULK") == 2
    # The custom batch size must be forwarded as scroll()'s page limit.
    assert client.scroll.call_args_list[0].kwargs["limit"] == 2

def test_invalid_batch_size_raises(self, tmp_path, mocker):
    """A non-positive batch_size is rejected with ValueError up front."""
    out_path = str(tmp_path / "dump.qql")
    mock_client = _make_client(mocker, points=[{"text": "x"}])
    # batch_size=0 violates the documented positive-integer contract.
    with pytest.raises(ValueError):
        dump_collection(
            "col", out_path, mock_client, null_console(), null_console(), batch_size=0
        )