Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #3105 - ColumnValuesToMatchRegex & other fixes #3149

Merged
merged 6 commits into from
Mar 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"$id": "https://open-metadata.org/schema/tests/column/columnValuesToMatchRegex.json",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "columnValuesToBeUnique",
"title": "columnValuesToMatchRegex",
"description": "This schema defines the test ColumnValuesToMatchRegex. Test the values in a column to match a given regular expression. ",
"type": "object",
"javaType": "org.openmetadata.catalog.tests.column.ColumnValuesToMatchRegex",
Expand Down
2 changes: 1 addition & 1 deletion ingestion-core/src/metadata/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@

from incremental import Version

__version__ = Version("metadata", 0, 9, 0, dev=19)
__version__ = Version("metadata", 0, 9, 0, dev=20)
__all__ = ["__version__"]
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"""
ILIKE Count Metric definition
"""
from sqlalchemy import func
from sqlalchemy import case, func

from metadata.orm_profiler.metrics.core import StaticMetric, _label

Expand All @@ -38,4 +38,4 @@ def fn(self):
raise AttributeError(
"ILike Count requires an expression to be set: add_props(expression=...)(Metrics.ILIKE_COUNT)"
)
return func.count(self.col.ilike(self.expression))
return func.sum(case([(self.col.ilike(self.expression), 1)], else_=0))
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"""
Like Count Metric definition
"""
from sqlalchemy import func
from sqlalchemy import case, func

from metadata.orm_profiler.metrics.core import StaticMetric, _label

Expand All @@ -38,4 +38,4 @@ def fn(self):
raise AttributeError(
"Like Count requires an expression to be set: add_props(expression=...)(Metrics.LIKE_COUNT)"
)
return func.count(self.col.like(self.expression))
return func.sum(case([(self.col.like(self.expression), 1)], else_=0))
20 changes: 17 additions & 3 deletions ingestion/src/metadata/orm_profiler/profiles/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def sql_col_run(self, col: Column):

This should only execute column metrics.
"""
logger.debug("Running SQL Profiler...")
logger.debug(f"Running SQL Profiler for {col.name}")

col_metrics = self.get_col_metrics(self.static_metrics)

Expand All @@ -208,8 +208,15 @@ def sql_table_run(self):
metric for metric in self.static_metrics if not metric.is_col_metric()
]

for metric in table_metrics:
row = self.session.query(metric().fn()).select_from(self.table).first()
if not table_metrics:
return

query = self.session.query(
*[metric().fn() for metric in table_metrics]
).select_from(self.table)

row = query.first()
if row:
self._table_results.update(dict(row))

def sql_col_query_run(self, col: Column) -> None:
Expand All @@ -218,6 +225,7 @@ def sql_col_query_run(self, col: Column) -> None:
"""

for metric in self.get_col_metrics(self.query_metrics):
logger.debug(f"Running query metric {metric.name()} for {col.name}")
try:
metric_query = metric(col).query(session=self.session)

Expand Down Expand Up @@ -259,6 +267,7 @@ def post_col_run(self, col: Column):

for metric in self.get_col_metrics(self.composed_metrics):
# Composed metrics require the results as an argument
logger.debug(f"Running composed metric {metric.name()} for {col.name}")

self._column_results[col.name][metric.name()] = metric(col).fn(
current_col_results
Expand Down Expand Up @@ -290,12 +299,17 @@ def execute(self) -> "Profiler":
Run the whole profiling
"""

logger.debug(f"Running profiler for {self.table.__tablename__}")

self.execute_table()

for col in self.columns:

# Skip not supported types
if col.type.__class__ in NOT_COMPUTE:
logger.debug(
f"Skipping profile computation for {col.name}. Unsupported type {col.type.__class__}"
)
continue

# Init column results dict
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
ColumnValuesToMatchRegex validation implementation
"""

from datetime import datetime
from typing import Optional

from sqlalchemy import inspect
from sqlalchemy.orm import DeclarativeMeta, Session

from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesToMatchRegex import (
ColumnValuesToMatchRegex,
)
from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiles.core import Profiler
from metadata.orm_profiler.utils import logger

logger = logger()


def column_values_to_match_regex(
    test_case: ColumnValuesToMatchRegex,
    col_profile: ColumnProfile,
    execution_date: datetime,
    session: Optional[Session] = None,
    table: Optional[DeclarativeMeta] = None,
) -> TestCaseResult:
    """
    Validate that the values of a column match the configured pattern.

    The test case `regex` is handed to the LIKE_COUNT metric as its expression.
    The metric is computed through a Profiler restricted to the target column
    and the outcome is compared against `col_profile.valuesCount`.

    :param test_case: ColumnValuesToMatchRegex
    :param col_profile: should contain the column name and the valuesCount metric
    :param execution_date: Datetime when the tests ran
    :param session: SQLAlchemy Session, for tests that need to compute new metrics
    :param table: SQLAlchemy Table, for tests that need to compute new metrics
    :return: TestCaseResult with status and results. The status is Aborted when
        `valuesCount` is falsy or when the metric cannot be computed.
    """

    like_count = add_props(expression=test_case.regex)(Metrics.LIKE_COUNT.value)

    # Without a reference count there is nothing to compare the metric against.
    if not col_profile.valuesCount:
        msg = "We expect `valuesCount` to be informed for ColumnValuesToMatchRegex."
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    try:
        # Pick the ORM column whose name matches the profiled column.
        col = next(
            (
                column
                for column in inspect(table).c
                if column.name == col_profile.name
            ),
            None,
        )

        if col is None:
            raise ValueError(
                f"Cannot find the configured column {col_profile.name} for ColumnValuesToMatchRegex"
            )

        res = (
            Profiler(like_count, session=session, table=table, use_cols=[col])
            .execute()
            .column_results
        )
        like_count_res = res.get(col.name)[Metrics.LIKE_COUNT.name]

    except Exception as err:  # pylint: disable=broad-except
        # `session` is optional: only roll back when one was provided, so the
        # original error is reported instead of an AttributeError on None.
        if session is not None:
            session.rollback()
        msg = f"Error computing ColumnValuesToMatchRegex for {col_profile.name} - {err}"
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    # The test passes only when every counted value matched the pattern.
    status = (
        TestCaseStatus.Success
        if col_profile.valuesCount == like_count_res
        else TestCaseStatus.Failed
    )
    result = f"Found likeCount={like_count_res} & valuesCount={col_profile.valuesCount}. They should be equal."

    return TestCaseResult(
        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
    )
4 changes: 4 additions & 0 deletions ingestion/src/metadata/orm_profiler/validations/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
from metadata.orm_profiler.validations.column.column_values_to_be_unique import (
column_values_to_be_unique,
)
from metadata.orm_profiler.validations.column.column_values_to_match_regex import (
column_values_to_match_regex,
)
from metadata.orm_profiler.validations.table.table_column_count_to_equal import (
table_column_count_to_equal,
)
Expand Down Expand Up @@ -73,3 +76,4 @@ def validate(test_case, **kwargs) -> TestCaseResult:

# Column Session Tests
validate.register(column_values_not_in_set)
validate.register(column_values_to_match_regex)
28 changes: 23 additions & 5 deletions ingestion/tests/unit/profiler/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,21 @@ def test_like_count(self):
# that the metrics runs correctly rather than the implementation logic.
like = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
res = (
Profiler(like, session=self.session, table=User, use_cols=[User.age])
Profiler(like, session=self.session, table=User, use_cols=[User.name])
.execute()
._column_results
)

assert res.get(User.age.name)[Metrics.LIKE_COUNT.name] == 2
assert res.get(User.name.name)[Metrics.LIKE_COUNT.name] == 2

like = add_props(expression="Jo%")(Metrics.LIKE_COUNT.value)
res = (
Profiler(like, session=self.session, table=User, use_cols=[User.name])
.execute()
._column_results
)

assert res.get(User.name.name)[Metrics.LIKE_COUNT.name] == 1

# Running safely
# with pytest.raises(AttributeError):
Expand All @@ -249,14 +258,23 @@ def test_ilike_count(self):
"""
Check ILIKE count: case-insensitive LIKE
"""
ilike = add_props(expression="J%")(Metrics.ILIKE_COUNT.value)
ilike = add_props(expression="j%")(Metrics.ILIKE_COUNT.value)
res = (
Profiler(ilike, session=self.session, table=User, use_cols=[User.name])
.execute()
._column_results
)

assert res.get(User.name.name)[Metrics.ILIKE_COUNT.name] == 2

ilike = add_props(expression="ja%")(Metrics.ILIKE_COUNT.value)
res = (
Profiler(ilike, session=self.session, table=User, use_cols=[User.age])
Profiler(ilike, session=self.session, table=User, use_cols=[User.name])
.execute()
._column_results
)

assert res.get(User.age.name)[Metrics.ILIKE_COUNT.name] == 2
assert res.get(User.name.name)[Metrics.ILIKE_COUNT.name] == 1

# Running safely
# with pytest.raises(AttributeError):
Expand Down
53 changes: 53 additions & 0 deletions ingestion/tests/unit/profiler/test_session_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
ColumnValuesToBeNotInSet,
)
from metadata.generated.schema.tests.column.columnValuesToMatchRegex import (
ColumnValuesToMatchRegex,
)
from metadata.orm_profiler.engines import create_and_bind_session
from metadata.orm_profiler.validations.core import validate

Expand Down Expand Up @@ -124,3 +127,53 @@ def test_column_values_not_in_set(self):
+ " the configured column random for ColumnValuesToBeNotInSet"
),
)

def test_column_values_to_match_regex(self):
    """
    Run the match-regex validation on the `name` column and verify the
    Success, Failed and Aborted results it reports.
    """
    # Arguments that are identical for every validate() call below
    shared_kwargs = {
        "execution_date": EXECUTION_DATE,
        "session": self.session,
        "table": User,
    }
    informed_profile = ColumnProfile(name="name", valuesCount=2)  # column name

    # "J%": the like count equals valuesCount
    outcome_success = validate(
        ColumnValuesToMatchRegex(regex="J%"),
        col_profile=informed_profile,
        **shared_kwargs,
    )
    expected_success = TestCaseResult(
        executionTime=EXECUTION_DATE.timestamp(),
        testCaseStatus=TestCaseStatus.Success,
        result="Found likeCount=2 & valuesCount=2.0. They should be equal.",
    )
    assert outcome_success == expected_success

    # "Jo%": the like count is lower than valuesCount
    outcome_failed = validate(
        ColumnValuesToMatchRegex(regex="Jo%"),
        col_profile=informed_profile,
        **shared_kwargs,
    )
    expected_failed = TestCaseResult(
        executionTime=EXECUTION_DATE.timestamp(),
        testCaseStatus=TestCaseStatus.Failed,
        result="Found likeCount=1 & valuesCount=2.0. They should be equal.",
    )
    assert outcome_failed == expected_failed

    # Profile without valuesCount: the validation aborts
    outcome_aborted = validate(
        ColumnValuesToMatchRegex(regex="J%"),
        col_profile=ColumnProfile(name="name"),
        **shared_kwargs,
    )
    expected_aborted = TestCaseResult(
        executionTime=EXECUTION_DATE.timestamp(),
        testCaseStatus=TestCaseStatus.Aborted,
        result="We expect `valuesCount` to be informed for ColumnValuesToMatchRegex.",
    )
    assert outcome_aborted == expected_aborted