Skip to content

Commit

Permalink
Enhancement: add --id-range for document_retagger (#4080)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Trenton H <797416+stumpylog@users.noreply.github.com>
  • Loading branch information
kamilkosek and stumpylog committed Sep 8, 2023
1 parent a8e13df commit b238ba0
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
8 changes: 7 additions & 1 deletion docs/administration.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,14 +351,15 @@ currently-imported docs. This problem is common enough that there are
tools for it.

```
document_retagger [-h] [-c] [-T] [-t] [-i] [--use-first] [-f]
document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f]
optional arguments:
-c, --correspondent
-T, --tags
-t, --document_type
-s, --storage_path
-i, --inbox-only
--id-range
--use-first
-f, --overwrite
```
Expand All @@ -375,6 +376,11 @@ Specify `-i` to have the document retagger work on documents tagged with
inbox tags only. This is useful when you don't want to mess with your
already processed documents.

Specify `--id-range 1 100` to have the document retagger work only on
documents whose IDs fall within the given range (both ends inclusive).
This can be useful if you have a lot of documents and want to test the
matching rules on only a subset of them.

When multiple document types or correspondents match a single document,
the retagger won't assign these to the document. Specify `--use-first`
to override this behavior and just use the first correspondent or type
Expand Down
12 changes: 12 additions & 0 deletions src/documents/management/commands/document_retagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def add_arguments(self, parser):
"--base-url",
help="The base URL to use to build the link to the documents.",
)
parser.add_argument(
"--id-range",
help="A range of document ids on which the retagging should be applied.",
nargs=2,
type=int,
)

def handle(self, *args, **options):
# Detect if we support color
Expand All @@ -72,6 +78,12 @@ def handle(self, *args, **options):
queryset = Document.objects.filter(tags__is_inbox_tag=True)
else:
queryset = Document.objects.all()

if options["id_range"]:
queryset = queryset.filter(
id__range=(options["id_range"][0], options["id_range"][1]),
)

documents = queryset.distinct()

classifier = load_classifier()
Expand Down
36 changes: 36 additions & 0 deletions src/documents/tests/test_management_retagger.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import TestCase

from documents.models import Correspondent
Expand Down Expand Up @@ -258,3 +259,38 @@ def test_overwrite_storage_path(self):
self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp2)

def test_id_range_parameter(self):
    """
    Exercise the retagger's --id-range option.

    Covers four cases: a narrow range that must leave the newly created
    document untouched, a missing value pair, non-integer values, and a
    wide range that also picks up the new document.
    """
    Document.objects.create(
        checksum="E",
        title="E",
        content="NOT the first document",
    )

    call_command("document_retagger", "--tags", "--id-range", "1", "2")
    # The retagger shouldn't apply the 'first' tag to our new document
    self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 1)

    # Argument parsing failures must surface as CommandError. Inspect the
    # exception text itself: call_command returns nothing when it raises,
    # so checking its return value would always pass trivially.
    with self.assertRaises(CommandError) as missing_values:
        call_command("document_retagger", "--tags", "--id-range")
    self.assertIn(
        "argument --id-range: expected 2 arguments",
        str(missing_values.exception),
    )

    with self.assertRaises(CommandError) as non_integer_values:
        call_command(
            "document_retagger",
            "--tags",
            "--id-range",
            "a",
            "b",
        )
    self.assertIn(
        "argument --id-range: invalid int value:",
        str(non_integer_values.exception),
    )

    call_command("document_retagger", "--tags", "--id-range", "1", "9999")
    # Now we should have 2 documents
    self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 2)

0 comments on commit b238ba0

Please sign in to comment.