
work for sample CLI that should be available to sample GUI script #41
shawnmjones committed Sep 9, 2021
1 parent 358da9f commit e912717
Showing 2 changed files with 138 additions and 292 deletions.
346 changes: 95 additions & 251 deletions hypercane/actions/sample.py
@@ -289,303 +289,147 @@ def sample_with_systematic(args):

module_logger.info("Done sampling.")

-# def sample_with_stratified_random(args):
+def sample_with_stratified_random(args):

-# from hypercane.sample.probability import select_random_per_cluster
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample stratified-random"
-# )

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--j', required=True, help="the number of items to randomly sample from each cluster", dest='j')

-# parser = add_default_args(parser)

-# args = parser.parse_args(args)

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_random_per_cluster
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random sampling of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing stratified random sample to select {} items each from {} clusters of {} URI-Ms".format(
-# int(args.j), len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing stratified random sample to select {} items each from {} clusters of {} URI-Ms".format(
+        int(args.j), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_random_per_cluster(memento_clusters, int(args.j))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_stratified_systematic(args):

-# from hypercane.sample.probability import select_systematic_per_cluster
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample stratified-systematic"
-# )
+    sampled_urims = select_random_per_cluster(memento_clusters, int(args.j))

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--j', required=True, help="the iteration of the item to sample from each cluster, e.g., --j 5 for every 5th item from each cluster", dest='iteration')
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_stratified_systematic(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_systematic_per_cluster
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting stratified systematic sampling of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing stratified systematic sample to select each {} item each from {} clusters of {} URI-Ms".format(
-# int(args.iteration), len(memento_clusters), len(urimdata.keys())))
module_logger.info("Executing stratified systematic sample to select each {} item each from {} clusters of {} URI-Ms".format(
int(args.iteration), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_systematic_per_cluster(memento_clusters, int(args.iteration))
+    sampled_urims = select_systematic_per_cluster(memento_clusters, int(args.iteration))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_random_cluster(args):

-# from hypercane.sample.probability import select_random_clusters
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-cluster"
-# )

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--cluster-count', required=True, help="the number of clusters to randomly sample, e.g., --cluster-count 5 for every 5th item from each cluster", dest='cluster_count')
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_random_cluster(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_random_clusters
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting sampling of random clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random cluster selection to sample {} clusters from {} clusters of {} URI-Ms".format(
-# int(args.cluster_count), len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random cluster selection to sample {} clusters from {} clusters of {} URI-Ms".format(
+        int(args.cluster_count), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_random_clusters(memento_clusters, int(args.cluster_count))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_random_oversample(args):

-# from hypercane.sample.probability import select_by_random_oversampling
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args
+    sampled_urims = select_random_clusters(memento_clusters, int(args.cluster_count))

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-oversample"
-# )

-# parser = add_input_args(parser)
-# parser = add_default_args(parser)
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# args = parser.parse_args(args)
+    module_logger.info("Done sampling.")

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )
+def sample_with_random_oversample(args):

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_by_random_oversampling
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random oversampling of clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random oversample from {} clusters of {} URI-Ms".format(
-# len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random oversample from {} clusters of {} URI-Ms".format(
+        len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_by_random_oversampling(memento_clusters)

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")
+    sampled_urims = select_by_random_oversampling(memento_clusters)

-# def sample_with_random_undersample(args):

-# from hypercane.sample.probability import select_by_random_undersamping
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-undersample"
-# )
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_input_args(parser)
-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_random_undersample(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_by_random_undersamping
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random undersampling of clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random undersample from {} clusters of {} URI-Ms".format(
-# len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random undersample from {} clusters of {} URI-Ms".format(
+        len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_by_random_undersamping(memento_clusters)
+    sampled_urims = select_by_random_undersamping(memento_clusters)

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def print_usage():

-# print("""hc sample is used execute different algorithms for selecting mementos from a web archive collection, document collection, a list of TimeMaps, or a directory containing WARCs

-# Supported commands:
-# * true-random - randomly chooses n URI-Ms from the input
-# * dsa1 - select URI-Ms using the DSA1 (AlNoamany's) Algorithm
-# * alnoamany - alias for dsa1
-# * filtered-random - filters off-topic mementos, filters near-duplicates, and then samples k of the remainder, randomly
-# * systematic - returns every jth memento from the input
-# * stratified-random - returns j items randomly chosen from each cluster, requries that the input be clustered with the cluster action
-# * stratified-systematic - returns every jth URI-M from each cluster, requries that the input be clustered witht he cluster action
-# * random-cluster - return j randomly selected clusters from the sample, requires that the input be clustered with the cluster action
-# * random-oversample - randomly duplicates URI-Ms in the smaller clusters until they match the size of the largest cluster, requires input be clustered with the cluster action
-# * random-undersample - randomly chooses URI-Ms from the larger clusters until they match the size of the smallest cluster, requires input be clustered with the cluster action

-# Examples:

-# hc sample true-random -i archiveit -a 8788 -o seed-output-file.txt -k 10 -cs mongodb://localhost/cache

-# hc sample dsa1 -i timemaps -a timemaps.tsv -o dsa1-sample.tsv -cs mongodb://localhost/cache

-# """)
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# supported_commands = {
-# "true-random": sample_with_true_random,
-# "dsa1": sample_with_dsa1,
-# "alnoamany": sample_with_dsa1,
-# "filtered-random": sample_with_filtered_random,
-# "systematic": sample_with_systematic,
-# "stratified-random": sample_with_stratified_random,
-# "stratified-systematic": sample_with_stratified_systematic,
-# "random-cluster": sample_with_random_cluster,
-# "random-oversample": sample_with_random_oversample,
-# "random-undersample": sample_with_random_undersample
-# }
+    module_logger.info("Done sampling.")
