Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-42324: Fix min/max rows logic for Deduplication. #11

Merged
merged 2 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/lsst/daf/relation/_operations/_deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ def __str__(self) -> str:

def applied_min_rows(self, target: Relation) -> int:
# Docstring inherited.
return 1
return 1 if target.min_rows >= 1 else 0

def applied_max_rows(self, target: Relation) -> int | None:
# Docstring inherited.
if not target.columns:
return 1 if target.max_rows is None or target.max_rows >= 1 else 0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity why can this ever be '1' instead of '0'? Doesn't no columns also imply no rows?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite; if you think of it as a list of dict rows it's the difference between [] and [{}]. But where it really matters is actually in what happens when you join a relation like this to a regular one with nonzero numbers of rows and columns: [{}] is the "join identity" - a join between it and any other relation is the other relation, while [] is the "join zero" relation - a join between it and any other relation yields a relation with no rows (but the other relation's columns).

return target.max_rows

def commute(self, current: UnaryOperationRelation) -> UnaryCommutator:
# Docstring inherited.
Expand Down
16 changes: 7 additions & 9 deletions python/lsst/daf/relation/_relation.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,15 +665,13 @@ class BaseRelation:
"""

def __init_subclass__(cls) -> None:
assert (
cls.__name__
in {
"LeafRelation",
"UnaryOperationRelation",
"BinaryOperationRelation",
"MarkerRelation",
}
or cls.__base__.__name__ != "Relation"
assert cls.__name__ in {
"LeafRelation",
"UnaryOperationRelation",
"BinaryOperationRelation",
"MarkerRelation",
} or (
cls.__base__ is not None and cls.__base__.__name__ != "Relation"
), "Relation inheritance is closed to predefined types in daf_relation and MarkerRelation subclasses."

@property
Expand Down
7 changes: 2 additions & 5 deletions python/lsst/daf/relation/iteration/_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,7 @@ def convert_column_expression(
function = self.get_function(name)
if function is not None:
arg_callables = [self.convert_column_expression(arg) for arg in args]
# MyPy doesn't see 'function' as not-None for some reason.
return lambda row: function(*[c(row) for c in arg_callables]) # type: ignore
return lambda row: function(*[c(row) for c in arg_callables])
first, *rest = (self.convert_column_expression(arg) for arg in args)
return lambda row: getattr(first(row), name)(*[r(row) for r in rest])
raise AssertionError("matches should be exhaustive and all branches should return")
Expand Down Expand Up @@ -336,9 +335,7 @@ def convert_predicate(self, predicate: Predicate) -> Callable[[Mapping[ColumnTag
function = self.get_function(name)
if function is not None:
arg_callables = [self.convert_column_expression(arg) for arg in args]
# MyPy doesn't see 'function' as not-None in the capture
# for some reason.
return lambda row: function(*[c(row) for c in arg_callables]) # type: ignore
return lambda row: function(*[c(row) for c in arg_callables])
first, *rest = (self.convert_column_expression(arg) for arg in args)
return lambda row: getattr(first(row), name)(*[r(row) for r in rest])
case LogicalAnd(operands=operands):
Expand Down
10 changes: 10 additions & 0 deletions tests/test_deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ def test_attributes(self) -> None:
self.assertFalse(operation.is_order_dependent)
self.assertFalse(operation.is_count_dependent)

def test_min_max_rows(self) -> None:
"""Test min and max rows for edge-case deduplications."""
self.assertEqual(self.leaf.with_only_columns(set()).without_duplicates().min_rows, 1)
self.assertEqual(self.leaf.with_only_columns(set()).without_duplicates().max_rows, 1)
leaf0 = self.engine.make_leaf({self.a}, payload=iteration.RowSequence([]), name="leaf")
self.assertEqual(leaf0.without_duplicates().min_rows, 0)
self.assertEqual(leaf0.without_duplicates().max_rows, 0)
self.assertEqual(leaf0.with_only_columns([]).without_duplicates().min_rows, 0)
self.assertEqual(leaf0.with_only_columns([]).without_duplicates().max_rows, 0)

def test_backtracking_apply(self) -> None:
"""Test apply logic that involves reordering operations in the existing
tree to perform the new operation in a preferred engine.
Expand Down