diff --git a/docs/cli-check-collections.rst b/docs/cli-check-collections.rst new file mode 100644 index 00000000..90f3f35d --- /dev/null +++ b/docs/cli-check-collections.rst @@ -0,0 +1,30 @@ +Command line tool - check-collections option +============================================ + +This command checks all data so far in all collections. + +It can be run multiple times, and data already checked will not be rechecked. + +Only run one instance of this command at a time; if two run at once, they may try to do the same work. + +.. code-block:: shell-session + + python ocdskingfisher-process-cli check-collections + +Running from cron +----------------- + +You can also pass a maximum number of seconds that the process should run for. + +.. code-block:: shell-session + + python ocdskingfisher-process-cli check-collections --runforseconds 60 + +Soon after that number of seconds has passed, the command will exit. +(The command will finish the check it's currently doing before stopping, so it may run slightly longer than specified. Allow a minute extra to be safe.) + +You can use this option with a cron entry; set a cron entry for this command to run every hour and pass ``--runforseconds 3540`` (60 seconds/minute * 59 minutes). + +Then when new data appears in the system, there is no need for someone to run :doc:`cli-check-collection` by hand - the process run by cron will pick up the new data itself eventually. + +Because each run stops itself (3540 seconds plus the extra minute) before the next hourly run starts, the runforseconds option makes sure that only one of these cron processes runs at a time. 
class Checks:
    """Walk the files of one collection and run the checks on each file item.

    A run can be given a deadline (``run_until_timestamp``). The deadline is
    tested before any work starts and again after every file and file item,
    so an instance created after the deadline has passed does nothing.
    """

    def __init__(self, database, collection, run_until_timestamp=None):
        """
        :param database: database access object; it is asked for the files of
            the collection and for the items of each file.
        :param collection: collection model; ``database_id``, ``check_data``
            and ``check_older_data_with_schema_version_1_1`` are read from it.
        :param run_until_timestamp: optional deadline as a number comparable
            with ``datetime.datetime.utcnow().timestamp()``. ``None`` (or any
            falsy value) means "no deadline".
        """
        self.database = database
        self.collection = collection
        self.run_until_timestamp = run_until_timestamp

    def _is_out_of_time(self):
        """Return True when a deadline was given and it has already passed."""
        if not self.run_until_timestamp:
            return False
        # Keep this clock expression as it is: the deadline handed to the
        # constructor is compared against exactly this value, so both sides
        # must be produced the same way.
        return self.run_until_timestamp < datetime.datetime.utcnow().timestamp()

    def process_all_files(self):
        """Process every file of the collection until done or out of time."""
        if not self.collection.check_data and not self.collection.check_older_data_with_schema_version_1_1:
            # nothing to do here, so ...
            return

        # Do not even query the database when the deadline has already
        # passed; otherwise every collection visited after the deadline
        # would still get one file item processed.
        if self._is_out_of_time():
            return

        for file_model in self.database.get_all_files_in_collection(self.collection.database_id):
            self.process_file(file_model=file_model)
            if self._is_out_of_time():
                return

    def process_file(self, file_model):
        """Process every item of ``file_model`` until done or out of time.

        Each item is handed to ``self.process_file_item``.
        """
        if self._is_out_of_time():
            return

        for file_item_model in self.database.get_all_files_items_in_file(file_model):
            self.process_file_item(file_item_model=file_item_model)
            if self._is_out_of_time():
                return
class CheckCollectionsCLICommand(ocdskingfisherprocess.cli.commands.base.CLICommand):
    """CLI sub-command ``check-collections``: run the checks on every collection."""

    command = 'check-collections'

    def configure_subparser(self, subparser):
        """Register the optional ``--runforseconds`` argument."""
        # type=int lets argparse reject non-numeric values with a usage error
        # instead of a ValueError traceback later on.
        subparser.add_argument("--runforseconds",
                               type=int,
                               help="Run for this many seconds only.")

    def run_command(self, args):
        """Check all collections, optionally stopping after a time budget.

        :param args: parsed arguments; ``args.runforseconds`` is the budget in
            seconds. A missing, zero or negative value means "no limit".
        """
        run_until_timestamp = None
        watchdog = None
        run_for_seconds = int(args.runforseconds) if args.runforseconds else 0
        if run_for_seconds > 0:
            # Checks compares this deadline against the same
            # datetime.datetime.utcnow().timestamp() expression, so the two
            # must stay in step.
            run_until_timestamp = datetime.datetime.utcnow().timestamp() + run_for_seconds

            # This is a safeguard - the process should stop itself but this will kill it if it does not.
            def exitfunc():
                os._exit(0)

            watchdog = Timer(run_for_seconds + 60, exitfunc)
            # A non-daemon timer would keep the interpreter alive until it
            # fires, even when all the work finished early.
            watchdog.daemon = True
            watchdog.start()

        try:
            for collection in self.database.get_all_collections():
                # Once the budget is used up, do not start on the remaining
                # collections at all.
                if run_until_timestamp and run_until_timestamp < datetime.datetime.utcnow().timestamp():
                    break
                checks = Checks(self.database, collection, run_until_timestamp=run_until_timestamp)
                checks.process_all_files()
        finally:
            # The work is over (or failed); the safeguard is no longer needed.
            if watchdog is not None:
                watchdog.cancel()