
work for sample CLI that should be available to sample GUI script #41
shawnmjones committed Sep 9, 2021
1 parent 358da9f commit e912717
Showing 2 changed files with 138 additions and 292 deletions.
346 changes: 95 additions & 251 deletions hypercane/actions/sample.py
@@ -289,303 +289,147 @@ def sample_with_systematic(args):

module_logger.info("Done sampling.")

-# def sample_with_stratified_random(args):
+def sample_with_stratified_random(args):

-# from hypercane.sample.probability import select_random_per_cluster
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample stratified-random"
-# )

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--j', required=True, help="the number of items to randomly sample from each cluster", dest='j')

-# parser = add_default_args(parser)

-# args = parser.parse_args(args)

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_random_per_cluster
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random sampling of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing stratified random sample to select {} items each from {} clusters of {} URI-Ms".format(
-# int(args.j), len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing stratified random sample to select {} items each from {} clusters of {} URI-Ms".format(
+        int(args.j), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_random_per_cluster(memento_clusters, int(args.j))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_stratified_systematic(args):

-# from hypercane.sample.probability import select_systematic_per_cluster
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample stratified-systematic"
-# )
+    sampled_urims = select_random_per_cluster(memento_clusters, int(args.j))

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--j', required=True, help="the iteration of the item to sample from each cluster, e.g., --j 5 for every 5th item from each cluster", dest='iteration')
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_stratified_systematic(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_systematic_per_cluster
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting stratified systematic sampling of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing stratified systematic sample to select each {} item each from {} clusters of {} URI-Ms".format(
-# int(args.iteration), len(memento_clusters), len(urimdata.keys())))
module_logger.info("Executing stratified systematic sample to select each {} item each from {} clusters of {} URI-Ms".format(
int(args.iteration), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_systematic_per_cluster(memento_clusters, int(args.iteration))
+    sampled_urims = select_systematic_per_cluster(memento_clusters, int(args.iteration))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_random_cluster(args):

-# from hypercane.sample.probability import select_random_clusters
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-cluster"
-# )

-# parser = add_input_args(parser)

-# parser.add_argument('-j', '--cluster-count', required=True, help="the number of clusters to randomly sample, e.g., --cluster-count 5 for every 5th item from each cluster", dest='cluster_count')
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_random_cluster(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_random_clusters
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting sampling of random clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random cluster selection to sample {} clusters from {} clusters of {} URI-Ms".format(
-# int(args.cluster_count), len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random cluster selection to sample {} clusters from {} clusters of {} URI-Ms".format(
+        int(args.cluster_count), len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_random_clusters(memento_clusters, int(args.cluster_count))

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def sample_with_random_oversample(args):

-# from hypercane.sample.probability import select_by_random_oversampling
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args
+    sampled_urims = select_random_clusters(memento_clusters, int(args.cluster_count))

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-oversample"
-# )

-# parser = add_input_args(parser)
-# parser = add_default_args(parser)
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# args = parser.parse_args(args)
+    module_logger.info("Done sampling.")

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )
+def sample_with_random_oversample(args):

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_by_random_oversampling
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random oversampling of clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random oversample from {} clusters of {} URI-Ms".format(
-# len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random oversample from {} clusters of {} URI-Ms".format(
+        len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_by_random_oversampling(memento_clusters)

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")
+    sampled_urims = select_by_random_oversampling(memento_clusters)

-# def sample_with_random_undersample(args):

-# from hypercane.sample.probability import select_by_random_undersamping
-# from hypercane.actions import get_logger, calculate_loglevel
-# from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
-# from hypercane.identify import discover_resource_data_by_input_type, \
-# discover_mementos_by_input_type
-# import argparse
-# from hypercane.actions import add_input_args, add_default_args

-# parser = argparse.ArgumentParser(
-# description="Sample random URLs from a web archive collection.",
-# prog="hc sample random-undersample"
-# )
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# parser = add_input_args(parser)
-# parser = add_default_args(parser)
+    module_logger.info("Done sampling.")

-# args = parser.parse_args(args)
+def sample_with_random_undersample(args):

-# logger = get_logger(
-# __name__,
-# calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
-# args.logfile
-# )

-# if args.errorfilename is not None:
-# hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(args.errorfilename)
+    from hypercane.sample.probability import select_by_random_undersamping
+    from hypercane.utils import get_web_session, save_resource_data, organize_mementos_by_cluster
+    from hypercane.identify import discover_resource_data_by_input_type, \
+        discover_mementos_by_input_type

-# session = get_web_session(cache_storage=args.cache_storage)
-# output_type = 'mementos'
+    session = get_web_session(cache_storage=args.cache_storage)
+    output_type = 'mementos'

-# logger.info("Starting random sampling of URI-Ms.")
+    module_logger.info("Starting random undersampling of clusters of URI-Ms.")

-# urimdata = discover_resource_data_by_input_type(
-# args.input_type, output_type, args.input_arguments, args.crawl_depth,
-# session, discover_mementos_by_input_type
-# )
+    urimdata = discover_resource_data_by_input_type(
+        args.input_type, output_type, args.input_arguments, args.crawl_depth,
+        session, discover_mementos_by_input_type
+    )

-# memento_clusters = organize_mementos_by_cluster(urimdata)
+    memento_clusters = organize_mementos_by_cluster(urimdata)

-# logger.info("Executing random undersample from {} clusters of {} URI-Ms".format(
-# len(memento_clusters), len(urimdata.keys())))
+    module_logger.info("Executing random undersample from {} clusters of {} URI-Ms".format(
+        len(memento_clusters), len(urimdata.keys())))

-# sampled_urims = select_by_random_undersamping(memento_clusters)
+    sampled_urims = select_by_random_undersamping(memento_clusters)

-# logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
-# save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# logger.info("Done sampling.")

-# def print_usage():

-# print("""hc sample is used execute different algorithms for selecting mementos from a web archive collection, document collection, a list of TimeMaps, or a directory containing WARCs

-# Supported commands:
-# * true-random - randomly chooses n URI-Ms from the input
-# * dsa1 - select URI-Ms using the DSA1 (AlNoamany's) Algorithm
-# * alnoamany - alias for dsa1
-# * filtered-random - filters off-topic mementos, filters near-duplicates, and then samples k of the remainder, randomly
-# * systematic - returns every jth memento from the input
-# * stratified-random - returns j items randomly chosen from each cluster, requries that the input be clustered with the cluster action
-# * stratified-systematic - returns every jth URI-M from each cluster, requries that the input be clustered witht he cluster action
-# * random-cluster - return j randomly selected clusters from the sample, requires that the input be clustered with the cluster action
-# * random-oversample - randomly duplicates URI-Ms in the smaller clusters until they match the size of the largest cluster, requires input be clustered with the cluster action
-# * random-undersample - randomly chooses URI-Ms from the larger clusters until they match the size of the smallest cluster, requires input be clustered with the cluster action

-# Examples:

-# hc sample true-random -i archiveit -a 8788 -o seed-output-file.txt -k 10 -cs mongodb://localhost/cache

-# hc sample dsa1 -i timemaps -a timemaps.tsv -o dsa1-sample.tsv -cs mongodb://localhost/cache

-# """)
+    module_logger.info("Writing {} sampled URI-Ms out to {}".format(len(sampled_urims), args.output_filename))
+    save_resource_data(args.output_filename, urimdata, 'mementos', sampled_urims)

-# supported_commands = {
-# "true-random": sample_with_true_random,
-# "dsa1": sample_with_dsa1,
-# "alnoamany": sample_with_dsa1,
-# "filtered-random": sample_with_filtered_random,
-# "systematic": sample_with_systematic,
-# "stratified-random": sample_with_stratified_random,
-# "stratified-systematic": sample_with_stratified_systematic,
-# "random-cluster": sample_with_random_cluster,
-# "random-oversample": sample_with_random_oversample,
-# "random-undersample": sample_with_random_undersample
-# }
+    module_logger.info("Done sampling.")
