Skip to content

Commit

Permalink
Merge pull request #19 from oduwsdl/configurable-time-slice
Browse files Browse the repository at this point in the history
The number of clusters for time-slicing is now configurable.
  • Loading branch information
shawnmjones committed Jul 14, 2020
2 parents 17cd8a3 + d744110 commit de95ace
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/source/conf.py
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'0.2020.07.13.224110'
release = u'0.2020.07.14.023052'

# -- General configuration ---------------------------------------------------

Expand Down
9 changes: 7 additions & 2 deletions hypercane/actions/cluster.py
Expand Up @@ -174,6 +174,11 @@ def time_slice(args):
prog="hc cluster time-slice"
)

parser.add_argument('-k', dest='k',
default=None, type=int,
help='The number of clusters to create.'
)

args = process_input_args(args, parser)
output_type = 'mementos'

Expand All @@ -187,15 +192,15 @@ def time_slice(args):

session = get_web_session(cache_storage=args.cache_storage)


urimdata = discover_resource_data_by_input_type(
args.input_type, output_type, args.input_arguments, args.crawl_depth,
session, discover_mementos_by_input_type
)

logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

urimdata_with_slices = execute_time_slice(urimdata, args.cache_storage)
urimdata_with_slices = execute_time_slice(
urimdata, args.cache_storage, number_of_slices=args.k)

# We pass both urimdata_with_slices and the keys of urimdata because the two should match; if they do not, we will detect an error.
save_resource_data(args.output_filename, urimdata_with_slices, 'mementos', list(urimdata.keys()))
Expand Down
7 changes: 4 additions & 3 deletions hypercane/cluster/time_slice.py
Expand Up @@ -3,7 +3,7 @@

module_logger = logging.getLogger('hypercane.cluster.time_slice')

def execute_time_slice(urimdata, cache_storage):
def execute_time_slice(urimdata, cache_storage, number_of_slices=None):

import concurrent.futures
import math
Expand All @@ -30,8 +30,9 @@ def execute_time_slice(urimdata, cache_storage):
module_logger.exception('URI-M [{}] generated an exception: [{}], skipping...'.format(urim, exc))
hypercane.errors.errorstore.add(urim, traceback.format_exc())

# calculate the number of slices 28 + math.log(len(mementos))
number_of_slices = math.ceil(28 + math.log(len(mementos)))
if number_of_slices is None:
# no slice count was supplied, so calculate it as ceil(28 + math.log(len(mementos)))
number_of_slices = math.ceil(28 + math.log(len(mementos)))

module_logger.info("The collection will be divided into {} slices".format(number_of_slices))

Expand Down
2 changes: 1 addition & 1 deletion hypercane/version.py
@@ -1,3 +1,3 @@
__appname__ = "hypercane"
__appversion__ = '0.2020.07.13.224110'
__appversion__ = '0.2020.07.14.023052'
__useragent__ = "{}/{}".format(__appname__, __appversion__)

0 comments on commit de95ace

Please sign in to comment.