From d7441107045e76c774d77e3e04a6ff5d6e16b09b Mon Sep 17 00:00:00 2001 From: "Shawn M. Jones" Date: Mon, 13 Jul 2020 20:44:17 -0600 Subject: [PATCH] the number of clusters for time-slicing is now configurable --- docs/source/conf.py | 2 +- hypercane/actions/cluster.py | 9 +++++++-- hypercane/cluster/time_slice.py | 7 ++++--- hypercane/version.py | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index fbd7fce..4906c32 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.2020.07.13.224110' +release = u'0.2020.07.14.023052' # -- General configuration --------------------------------------------------- diff --git a/hypercane/actions/cluster.py b/hypercane/actions/cluster.py index 5218b8a..49f9d7e 100644 --- a/hypercane/actions/cluster.py +++ b/hypercane/actions/cluster.py @@ -174,6 +174,11 @@ def time_slice(args): prog="hc cluster time-slice" ) + parser.add_argument('-k', dest='k', + default=None, type=int, + help='The number of clusters to create.' + ) + args = process_input_args(args, parser) output_type = 'mementos' @@ -187,7 +192,6 @@ def time_slice(args): session = get_web_session(cache_storage=args.cache_storage) - urimdata = discover_resource_data_by_input_type( args.input_type, output_type, args.input_arguments, args.crawl_depth, session, discover_mementos_by_input_type @@ -195,7 +199,8 @@ def time_slice(args): logger.info("There were {} mementos discovered in the input".format(len(urimdata))) - urimdata_with_slices = execute_time_slice(urimdata, args.cache_storage) + urimdata_with_slices = execute_time_slice( + urimdata, args.cache_storage, number_of_slices=args.k) # we use urimdata and urimdata_with_slices because they should match, if they don't we will detect an error save_resource_data(args.output_filename, urimdata_with_slices, 'mementos', list(urimdata.keys())) diff --git a/hypercane/cluster/time_slice.py b/hypercane/cluster/time_slice.py index 5023787..0519110 100644 --- a/hypercane/cluster/time_slice.py +++ b/hypercane/cluster/time_slice.py @@ -3,7 +3,7 @@ module_logger = logging.getLogger('hypercane.cluster.time_slice') -def execute_time_slice(urimdata, cache_storage): +def execute_time_slice(urimdata, cache_storage, number_of_slices=None): import concurrent.futures import math @@ -30,8 +30,9 @@ def execute_time_slice(urimdata, cache_storage): module_logger.exception('URI-M [{}] generated an exception: [{}], skipping...'.format(urim, exc)) hypercane.errors.errorstore.add(urim, traceback.format_exc()) - # calculate the number of slices 28 + math.log(len(mementos)) - number_of_slices = math.ceil(28 + math.log(len(mementos))) + if number_of_slices is None: + # calculate the number of slices 28 + math.log(len(mementos)) + number_of_slices = math.ceil(28 + math.log(len(mementos))) module_logger.info("The collection will be divided into {} slices".format(number_of_slices)) diff --git a/hypercane/version.py b/hypercane/version.py index 47f4819..c602928 100644 --- a/hypercane/version.py +++ b/hypercane/version.py @@ -1,3 +1,3 @@ __appname__ = "hypercane" -__appversion__ = '0.2020.07.13.224110' +__appversion__ = '0.2020.07.14.023052' __useragent__ = "{}/{}".format(__appname__, __appversion__)