Skip to content

Commit

Permalink
filtering by score
Browse files: browse the repository at this point in the history
  • Loading branch information
shawnmjones committed May 20, 2021
1 parent 375939f commit c9a4fcc
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 74 deletions.
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'0.2021.05.20.141223'
release = u'0.2021.05.20.144300'

# -- General configuration ---------------------------------------------------

Expand Down
97 changes: 27 additions & 70 deletions hypercane/actions/hfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def extract_rank_key_from_input(urimdata):

return rankkey

def include_rank(args):
def include_score_range(args):

import argparse
from hypercane.actions import process_input_args, get_logger, \
Expand All @@ -217,66 +217,12 @@ def include_rank(args):
)

parser.add_argument('--criteria', default=1, dest='criteria',
help="The numeric criteria to use when selecting which values to keep."
)

args = process_input_args(args, parser)
output_type = 'mementos'

logger = get_logger(
__name__,
calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
args.logfile
)

logger.info("Starting detection of documents meeting the criteria for score ...")

session = get_web_session(cache_storage=args.cache_storage)

# TODO: add a note about no crawling for this filter
urimdata = discover_resource_data_by_input_type(
args.input_type, output_type, args.input_arguments, 1,
session, discover_mementos_by_input_type
)

rankkey = extract_rank_key_from_input(urimdata)

filtered_urims = []

for urim in urimdata:
if eval("{}{}".format(
urimdata[urim][rankkey], args.criteria
)):
filtered_urims.append(urim)

logger.info("Saving {} filtered URI-Ms to {}".format(
len(filtered_urims), args.output_filename))

save_resource_data(
args.output_filename, urimdata, 'mementos', filtered_urims)

logger.info("Done filtering mementos by score, output is saved to {}".format(
args.output_filename
))


def exclude_rank(args):

import argparse
from hypercane.actions import process_input_args, get_logger, \
calculate_loglevel
from hypercane.identify import discover_resource_data_by_input_type, \
discover_mementos_by_input_type
from hypercane.utils import get_web_session
from hypercane.utils import save_resource_data

parser = argparse.ArgumentParser(
description="Include only mementos containing a score meeting the given criteria.",
prog="hc filter include-only score"
help="The numeric criteria to use when selecting which values to keep.",
required=True
)

parser.add_argument('--criteria', default=1, dest='criteria',
help="The numeric criteria to use when selecting which values to keep."
parser.add_argument('--scoring-field', help="Specify the scoring field to sort by, default is first encountered",
default=None, dest='scoring_field'
)

args = process_input_args(args, parser)
Expand All @@ -292,19 +238,29 @@ def exclude_rank(args):

session = get_web_session(cache_storage=args.cache_storage)

# TODO: add a note about no crawling for this filter
urimdata = discover_resource_data_by_input_type(
args.input_type, output_type, args.input_arguments, 1,
session, discover_mementos_by_input_type
)
if args.crawl_depth > 1:
logger.warning("Refusing to crawl when only analyzing prior score data")

rankkey = extract_rank_key_from_input(urimdata)
if args.input_type == 'mementos':
urimdata = discover_resource_data_by_input_type(
args.input_type, output_type, args.input_arguments, 1,
session, discover_mementos_by_input_type
)
else:
# TODO: derive URI-Ms from input type
raise NotImplementedError("Input type of {} not yet supported for filtering by score, score information must come from a prior execution of the score command".format(args.input_type))

if args.scoring_field is None:
scoring_fields = list(urimdata[list(urimdata.keys())[0]].keys())
scoring_field = scoring_fields[0]
else:
scoring_field = args.scoring_field

filtered_urims = []

for urim in urimdata:
if not eval("{}{}".format(
urimdata[urim][rankkey], args.criteria
if eval("{}{}".format(
urimdata[urim][scoring_field], args.criteria
)):
filtered_urims.append(urim)

Expand All @@ -314,7 +270,7 @@ def exclude_rank(args):
save_resource_data(
args.output_filename, urimdata, 'mementos', filtered_urims)

logger.info("Done filtering mementos by scor, output is saved to {}".format(
logger.info("Done filtering mementos by score, output is saved to {}".format(
args.output_filename
))

Expand Down Expand Up @@ -743,10 +699,11 @@ def print_exclude_usage():
""")

include_criteria = {
"language": include_languages,
"languages": include_languages,
"non-duplicates": include_nonduplicates,
"on-topic": include_ontopic,
# "rank": include_rank,
"score": include_score_range,
"highest-score-per-cluster": include_highest_score_per_cluster,
"containing-pattern": include_containing_pattern,
"near-datetime": include_near_datetime,
Expand All @@ -755,10 +712,10 @@ def print_exclude_usage():
}

exclude_criteria = {
"language": exclude_languages,
"languages": exclude_languages,
"near-duplicates": exclude_nearduplicates,
"off-topic": exclude_offtopic,
# "rank": exclude_rank,
"containing-pattern": exclude_containing_pattern
}

Expand Down
2 changes: 0 additions & 2 deletions hypercane/actions/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,6 @@ def score_sort(args):
# TODO: derive URI-Ms from input type
raise NotImplementedError("Input type of {} not yet supported for ordering".format(args.input_type))

from pprint import PrettyPrinter

if args.scoring_field is None:
scoring_fields = list(urimdata[list(urimdata.keys())[0]].keys())
scoring_field = scoring_fields[0]
Expand Down
2 changes: 1 addition & 1 deletion hypercane/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__appname__ = "hypercane"
__appversion__ = '0.2021.05.20.141223'
__appversion__ = '0.2021.05.20.144300'
__useragent__ = "{}/{}".format(__appname__, __appversion__)

0 comments on commit c9a4fcc

Please sign in to comment.