From 96c35598fe7ac60bedd0513cb0a6df4527d9e1e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kopeck=C3=BD?=
Date: Mon, 7 Dec 2020 11:29:17 +0100
Subject: [PATCH 1/2] Added feature to synchronize particular records

---
Review notes (text between the "---" marker and the diff is ignored by `git am`):
- The line structure and indentation of this series were reconstructed from a
  whitespace-collapsed copy. Verify the indentation of context lines against
  the target tree before applying.
- cli.py: invalid option combinations now raise click.UsageError instead of
  relying on `assert` (asserts are stripped when Python runs with -O); the
  "star_id" typo in the help text and the error message is fixed.
- ext.py: the new public method is documented.
- synchronization.py: the success status is written in one place; a wrong
  `oai_id` type raises TypeError (a subclass of Exception, so existing
  handlers keep working) and tuples are accepted as well as lists.
- tests: `mock.patch` is used as a context manager so the patch is always
  undone, even when an assertion fails.
- The blob ids on the `index` lines come from the original submission and
  were not recomputed for the revised content.

 oarepo_oai_pmh_harvester/cli.py             | 32 +++++++++++--------
 oarepo_oai_pmh_harvester/ext.py             | 16 +++++++++-
 oarepo_oai_pmh_harvester/synchronization.py | 20 ++++++++----
 tests/test_cli.py                           |  9 ++++++
 tests/test_synchronization.py               | 14 ++++++++
 5 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/oarepo_oai_pmh_harvester/cli.py b/oarepo_oai_pmh_harvester/cli.py
index 699b808..328a762 100644
--- a/oarepo_oai_pmh_harvester/cli.py
+++ b/oarepo_oai_pmh_harvester/cli.py
@@ -26,21 +26,27 @@ def oai():
 @click.option("-i", "--start_id", default=0, type=int,
               help="The serial number from which the synchronization starts. This is useful if "
                    "for some reason the previous synchronization was interrupted at some point.")
+@click.option("-a", "--oai", default=None, type=str, multiple=True,
+              help="OAI identifier that will be fetched and synchronized. The field is "
+                   "repeatable. If this option is used, the provider and synchronizer must be "
+                   "specified and start_id or start_oai must not be used")
 @cli.with_appcontext
-def run(provider, synchronizer, break_on_error, start_oai, start_id):
+def run(provider, synchronizer, break_on_error, start_oai, start_id, oai):
     """
     Starts harvesting the resources set in invenio.cfg through the OAREPO_OAI_PROVIDERS
     environment variable.
     """
-    if not provider:
-        provider = None
-    else:
-        provider = list(provider)
-    if not synchronizer:
-        synchronizer = None
-    else:
-        synchronizer = list(synchronizer)
-    current_oai_client.run(providers_codes=provider, synchronizers_codes=synchronizer,
-                           break_on_error=break_on_error, start_oai=start_oai, start_id=start_id)
-
-# TODO: použít minter/nepoužít minter
+    if oai:
+        # Single-record mode is only defined for exactly one provider/synchronizer pair
+        # and cannot be combined with the "resume" options (start_oai / start_id).
+        if len(provider) != 1 or len(synchronizer) != 1 or start_oai or start_id:
+            raise click.UsageError(
+                "If OAI option is used, exactly one provider and one synchronizer must be "
+                "specified and start_id or start_oai must not be used")
+        current_oai_client.run_synchronizer_by_ids(list(oai), provider[0], synchronizer[0],
+                                                   break_on_error=break_on_error)
+        return
+    current_oai_client.run(providers_codes=list(provider) or None,
+                           synchronizers_codes=list(synchronizer) or None,
+                           break_on_error=break_on_error, start_oai=start_oai,
+                           start_id=start_id)
diff --git a/oarepo_oai_pmh_harvester/ext.py b/oarepo_oai_pmh_harvester/ext.py
index 01500ca..216a2d8 100644
--- a/oarepo_oai_pmh_harvester/ext.py
+++ b/oarepo_oai_pmh_harvester/ext.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import List
+from typing import List, Union
 
 from pkg_resources import iter_entry_points
 
@@ -216,6 +216,20 @@ def _run_synchronizer(self, provider: str, synchronizer: str, start_oai: str = N
         synchronizer = provider.synchronizers[synchronizer]
         synchronizer.run(start_oai=start_oai, start_id=start_id, break_on_error=break_on_error)
 
+    def run_synchronizer_by_ids(self, oai_id: Union[str, List[str]], provider: str,
+                                synchronizer: str, break_on_error: bool = True):
+        """
+        Synchronizes only the given OAI identifier(s) with one synchronizer of one provider.
+
+        :param oai_id: one OAI identifier or a list of them
+        :param provider: code of the provider (key of ``self.providers``)
+        :param synchronizer: code of the synchronizer registered for that provider
+        :param break_on_error: forwarded unchanged to the synchronizer's ``run``
+        """
+        provider = self.providers[provider]
+        synchronizer = provider.synchronizers[synchronizer]
+        synchronizer.run(break_on_error=break_on_error, oai_id=oai_id)
+
 
 class OArepoOAIClient:
 
diff --git a/oarepo_oai_pmh_harvester/synchronization.py b/oarepo_oai_pmh_harvester/synchronization.py
index 68cccf2..db2bb5d 100644
--- a/oarepo_oai_pmh_harvester/synchronization.py
+++ b/oarepo_oai_pmh_harvester/synchronization.py
@@ -1,8 +1,7 @@
-import logging
 import traceback
 import uuid
 from itertools import islice
-from typing import Callable, List
+from typing import Callable, List, Union
 
 import arrow
 from arrow import Arrow
@@ -101,7 +100,8 @@ def from_(self, value):
         else:
             self._from = None
 
-    def run(self, start_oai: str = None, start_id: int = 0, break_on_error: bool = True):
+    def run(self, start_oai: str = None, start_id: int = 0, break_on_error: bool = True,
+            oai_id: Union[str, List[str]] = None):
         """
 
         :return:
@@ -116,8 +116,20 @@ def run(self, start_oai: str = None, start_id: int = 0, break_on_error: bool = T
             db.session.add(self.oai_sync)
         db.session.commit()
         try:
-            self.synchronize(start_oai=start_oai, start_id=start_id, break_on_error=break_on_error)
+            if oai_id:
+                if isinstance(oai_id, str):
+                    oai_ids = [oai_id]
+                elif isinstance(oai_id, (list, tuple)):
+                    oai_ids = list(oai_id)
+                else:
+                    raise TypeError("OAI identifier must be string or list of strings")
+                identifiers = self._get_oai_identifiers(identifiers_list=oai_ids)
+                for idx, identifier in enumerate(identifiers, start=start_id):
+                    self.record_handling(idx, start_oai, break_on_error, identifier)
+            else:
+                self.synchronize(start_oai=start_oai, start_id=start_id,
+                                 break_on_error=break_on_error)
             self.update_oai_sync("ok")
         except:
             self.update_oai_sync("failed")
             raise
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 5fdf3f1..37a1e01 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -11,3 +11,12 @@ def test_run(load_entry_points, app, db):
     result = runner.invoke(run, ["-p", "uk"])
     assert result.exit_code == 0
     patch.stop()
+
+
+def test_run_2(load_entry_points, app, db):
+    # The context manager guarantees the patch is undone even when an assertion fails.
+    with mock.patch('sickle.app.Sickle.harvest', mock_harvest):
+        runner = app.test_cli_runner()
+        result = runner.invoke(run, ["-p", "uk", "-s", "xoai", "-a",
+                                     "oai:test.example.com:1996652"])
+    assert result.exit_code == 0
diff --git a/tests/test_synchronization.py b/tests/test_synchronization.py
index e30909c..292e6b4 100644
--- a/tests/test_synchronization.py
+++ b/tests/test_synchronization.py
@@ -334,3 +334,17 @@ def test_run_2(self, load_entry_points, app, db, record_xml):
         assert oai_rec.pid == "1"
         record = Record.get_record(id_=oai_rec.id)
         assert record["title"] == "Testovací záznam"
+
+    def test_run_by_id(self, load_entry_points, app, db, record_xml):
+        synchronizer = current_oai_client.providers["uk"].synchronizers["xoai"]
+        synchronizer.bulk = False
+        with mock.patch('sickle.app.Sickle.harvest', mock_harvest):
+            synchronizer.run(oai_id=["oai:test.example.com:1996652"])
+
+        oai_sync = OAISync.query.get(1)
+        assert oai_sync.status == "ok"
+        assert oai_sync.records_created == 1
+        oai_rec = OAIRecord.query.all()[-1]
+        assert oai_rec.pid == "1"
+        record = Record.get_record(id_=oai_rec.id)
+        assert record["title"] == "Testovací záznam"
From b107f9ccf566dd6a8aff04ef5d8027d36633aeb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kopeck=C3=BD?=
Date: Mon, 7 Dec 2020 11:31:12 +0100
Subject: [PATCH 2/2] Upgraded version

---
 oarepo_oai_pmh_harvester/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oarepo_oai_pmh_harvester/version.py b/oarepo_oai_pmh_harvester/version.py
index 5b02c24..60513fa 100644
--- a/oarepo_oai_pmh_harvester/version.py
+++ b/oarepo_oai_pmh_harvester/version.py
@@ -13,4 +13,4 @@
 
 from __future__ import absolute_import, print_function
 
-__version__ = '2.0.0a15'
+__version__ = '2.0.0a16'