From 24bfd2a26b0a74a0c16a681dfb5345f61a360d6c Mon Sep 17 00:00:00 2001 From: James Baster Date: Tue, 22 Jan 2019 12:41:09 +0000 Subject: [PATCH 1/2] New command check-collections https://github.com/open-contracting/kingfisher-process/issues/18 --- docs/cli-check-collections.rst | 27 +++++++++++++++++++ docs/cli.rst | 1 + ocdskingfisherprocess/checks.py | 18 ++++++++++++- .../cli/commands/check_collections.py | 22 +++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 docs/cli-check-collections.rst create mode 100644 ocdskingfisherprocess/cli/commands/check_collections.py diff --git a/docs/cli-check-collections.rst b/docs/cli-check-collections.rst new file mode 100644 index 00000000..6f31b507 --- /dev/null +++ b/docs/cli-check-collections.rst @@ -0,0 +1,27 @@ +Command line tool - check-collections option +============================================ + +This command checks all data so far in all collections. + +It can be run multiple times, and data already checked will not be rechecked. + +You should only run one of these at once, as if two are run at once they may try and do the same work. + +.. code-block:: shell-session + + python ocdskingfisher-process-cli check-collections + +Running from cron +----------------- + +You can also pass a maximum number of seconds that the process should run for. Soon after that number of seconds has passed, the command will exit. + +.. code-block:: shell-session + + python ocdskingfisher-process-cli check-collections --runforseconds 60 + +You can use this option with a cron entry; set a cron entry for this command to run every hour and pass runforseconds as 3540 (60 seconds/minute * 59 minutes). + +Then when new data appears in the system, there is no need for someone to run :doc:`cli-check-collection` by hand - the process run by cron will pick up the new data itself eventually. + +The runforseconds option will make sure that only one of these cron processes runs at once. 
diff --git a/docs/cli.rst b/docs/cli.rst index da19907e..c842b58d 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -18,6 +18,7 @@ You can pass the `verbose` flag to all sub commands, to get more output printed cli-list-collections.rst cli-local-load.rst cli-check-collection.rst + cli-check-collections.rst cli-new-transform-compile-releases.rst cli-new-transform-upgrade-1-0-to-1-1.rst cli-transform-collection.rst diff --git a/ocdskingfisherprocess/checks.py b/ocdskingfisherprocess/checks.py index 70e117b5..7af1432b 100644 --- a/ocdskingfisherprocess/checks.py +++ b/ocdskingfisherprocess/checks.py @@ -1,21 +1,31 @@ from libcoveocds.api import ocds_json_output, APIException import tempfile import shutil +import datetime class Checks: - def __init__(self, database, collection): + def __init__(self, database, collection, run_until_timestamp=None): self.database = database self.collection = collection + self.run_until_timestamp = run_until_timestamp def process_all_files(self): + if not self.collection.check_data and not self.collection.check_older_data_with_schema_version_1_1: + # nothing to do here, so ... 
+ return + for file_model in self.database.get_all_files_in_collection(self.collection.database_id): self.process_file(file_model=file_model) + if self.run_until_timestamp and self.run_until_timestamp < datetime.datetime.utcnow().timestamp(): + return def process_file(self, file_model): for file_item_model in self.database.get_all_files_items_in_file(file_model): self.process_file_item(file_item_model=file_item_model) + if self.run_until_timestamp and self.run_until_timestamp < datetime.datetime.utcnow().timestamp(): + return def process_file_item(self, file_item_model): with self.database.get_engine().begin() as connection: @@ -32,6 +42,9 @@ def process_file_item(self, file_item_model): and self.is_schema_version_less_than_1_1(release_row['package_data_id']) \ and not self.database.is_release_check_done(release_row['id'], override_schema_version="1.1"): self.check_release_row(release_row, override_schema_version="1.1") + # Early return? + if self.run_until_timestamp and self.run_until_timestamp < datetime.datetime.utcnow().timestamp(): + return del release_rows @@ -49,6 +62,9 @@ def process_file_item(self, file_item_model): and self.is_schema_version_less_than_1_1(record_row['package_data_id']) \ and not self.database.is_record_check_done(record_row['id'], override_schema_version="1.1"): self.check_record_row(record_row, override_schema_version="1.1") + # Early return? 
+ if self.run_until_timestamp and self.run_until_timestamp < datetime.datetime.utcnow().timestamp(): + return def handle_package(self, package): cove_temp_folder = tempfile.mkdtemp(prefix='ocdskingfisher-cove-', dir=tempfile.gettempdir()) diff --git a/ocdskingfisherprocess/cli/commands/check_collections.py b/ocdskingfisherprocess/cli/commands/check_collections.py new file mode 100644 index 00000000..8bd2827e --- /dev/null +++ b/ocdskingfisherprocess/cli/commands/check_collections.py @@ -0,0 +1,22 @@ +import ocdskingfisherprocess.database +import ocdskingfisherprocess.cli.commands.base +from ocdskingfisherprocess.checks import Checks +import datetime + + +class CheckCollectionsCLICommand(ocdskingfisherprocess.cli.commands.base.CLICommand): + command = 'check-collections' + + def configure_subparser(self, subparser): + subparser.add_argument("--runforseconds", + help="Run for this many seconds only.") + + def run_command(self, args): + run_until_timestamp = None + run_for_seconds = int(args.runforseconds) if args.runforseconds else 0 + if run_for_seconds > 0: + run_until_timestamp = datetime.datetime.utcnow().timestamp() + run_for_seconds + + for collection in self.database.get_all_collections(): + checks = Checks(self.database, collection, run_until_timestamp=run_until_timestamp) + checks.process_all_files() From 0ac5d20c91b13be91f77ef820e11dade2e09951c Mon Sep 17 00:00:00 2001 From: James Baster Date: Thu, 24 Jan 2019 11:11:46 +0000 Subject: [PATCH 2/2] check_collections - clarify docs and add a safeguard --- docs/cli-check-collections.rst | 5 ++++- ocdskingfisherprocess/cli/commands/check_collections.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/cli-check-collections.rst b/docs/cli-check-collections.rst index 6f31b507..90f3f35d 100644 --- a/docs/cli-check-collections.rst +++ b/docs/cli-check-collections.rst @@ -14,12 +14,15 @@ You should only run one of these at once, as if two are run at once they may try Running from cron 
----------------- -You can also pass a maximum number of seconds that the process should run for. Soon after that number of seconds has passed, the command will exit. +You can also pass a maximum number of seconds that the process should run for. .. code-block:: shell-session python ocdskingfisher-process-cli check-collections --runforseconds 60 +Soon after that number of seconds has passed, the command will exit. +(The command will finish the check it's currently doing before stopping, so it may run slightly longer than specified. Allow a minute extra to be safe.) + You can use this option with a cron entry; set a cron entry for this command to run every hour and pass runforseconds as 3540 (60 seconds/minute * 59 minutes). Then when new data appears in the system, there is no need for someone to run :doc:`cli-check-collection` by hand - the process run by cron will pick up the new data itself eventually. diff --git a/ocdskingfisherprocess/cli/commands/check_collections.py b/ocdskingfisherprocess/cli/commands/check_collections.py index 8bd2827e..d2640eb9 100644 --- a/ocdskingfisherprocess/cli/commands/check_collections.py +++ b/ocdskingfisherprocess/cli/commands/check_collections.py @@ -2,6 +2,8 @@ import ocdskingfisherprocess.cli.commands.base from ocdskingfisherprocess.checks import Checks import datetime +from threading import Timer +import os class CheckCollectionsCLICommand(ocdskingfisherprocess.cli.commands.base.CLICommand): @@ -17,6 +19,12 @@ def run_command(self, args): if run_for_seconds > 0: run_until_timestamp = datetime.datetime.utcnow().timestamp() + run_for_seconds + # This is a safeguard - the process should stop itself but this will kill it if it does not. + def exitfunc(): + os._exit(0) + + Timer(run_for_seconds + 60, exitfunc).start() + for collection in self.database.get_all_collections(): checks = Checks(self.database, collection, run_until_timestamp=run_until_timestamp) checks.process_all_files()