Skip to content

Commit

Permalink
Fix #2981 - Update Profile to match TableProfile (#2982)
Browse files Browse the repository at this point in the history
  • Loading branch information
pmbrull committed Feb 25, 2022
1 parent 6dadbc1 commit 9906085
Show file tree
Hide file tree
Showing 36 changed files with 782 additions and 529 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,18 @@
},
"min": {
"description": "Minimum value in a column.",
"type": "number"
"type": ["number", "integer", "string"]
},
"max": {
"description": "Maximum value in a column.",
"type": ["number", "integer", "string"]
},
"minLength": {
"description": "Minimum string length in a column.",
"type": "number"
},
"maxLength": {
"description": "Maximum string length in a column.",
"type": "number"
},
"mean": {
Expand Down Expand Up @@ -625,4 +633,4 @@
},
"required": ["id", "name", "columns"],
"additionalProperties": false
}
}
2 changes: 1 addition & 1 deletion ingestion-core/src/metadata/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@

from incremental import Version

__version__ = Version("metadata", 0, 9, 0, dev=14)
__version__ = Version("metadata", 0, 9, 0, dev=15)
__all__ = ["__version__"]
16 changes: 3 additions & 13 deletions ingestion/src/metadata/orm_profiler/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from pydantic import BaseModel

from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.data.table import Table, TableProfile
from metadata.orm_profiler.profiles.core import Profiler
from metadata.orm_profiler.profiles.models import ProfilerDef
from metadata.orm_profiler.validations.models import TestDef
Expand All @@ -30,17 +30,6 @@ class Config:
arbitrary_types_allowed = True


class ColumnProfiler(WorkflowResult):
column: str
profiler: Profiler


class ProfilerResult(WorkflowResult):
table: Table # Table Entity
table_profiler: Profiler # Profiler with table results
column_profilers: List[ColumnProfiler] # Profiler with col results


class ProfilerProcessorConfig(BaseModel):
"""
Defines how we read the processor information
Expand All @@ -59,5 +48,6 @@ class ProfileAndTests(BaseModel):
the ran tests, if any.
"""

profile: ProfilerResult
table: Table
profile: TableProfile
tests: Optional[TestDef] = None
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from metadata.orm_profiler.metrics.core import ComposedMetric
from metadata.orm_profiler.metrics.static.count import Count
from metadata.orm_profiler.metrics.static.distinct import Distinct
from metadata.orm_profiler.metrics.static.distinct_count import DistinctCount


class DuplicateCount(ComposedMetric):
Expand All @@ -25,8 +25,13 @@ class DuplicateCount(ComposedMetric):
compute the number of rows that are duplicates
"""

def required_metrics(self) -> Tuple[str, ...]:
return Count.name(), Distinct.name()
@classmethod
def name(cls):
return "duplicateCount"

@classmethod
def required_metrics(cls) -> Tuple[str, ...]:
return Count.name(), DistinctCount.name()

@property
def metric_type(self):
Expand All @@ -38,7 +43,7 @@ def fn(self, res: Dict[str, Any]) -> Optional[float]:
results of other Metrics
"""
count = res.get(Count.name())
distinct_count = res.get(Distinct.name())
distinct_count = res.get(DistinctCount.name())

if count is not None and distinct_count is not None:
return count - distinct_count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@

class ILikeRatio(ComposedMetric):
"""
Given the total count and LIKE count,
compute the LIKE ratio
Given the total count and ILIKE count,
compute the ILIKE ratio
"""

def required_metrics(self) -> Tuple[str, ...]:
@classmethod
def name(cls):
return "iLikeRatio"

@classmethod
def required_metrics(cls) -> Tuple[str, ...]:
return Count.name(), ILikeCount.name()

@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ class LikeRatio(ComposedMetric):
compute the LIKE ratio
"""

def required_metrics(self) -> Tuple[str, ...]:
@classmethod
def name(cls):
return "likeRatio"

@classmethod
def required_metrics(cls) -> Tuple[str, ...]:
return Count.name(), LikeCount.name()

@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ class NullRatio(ComposedMetric):
compute the null ratio
"""

def required_metrics(self) -> Tuple[str, ...]:
@classmethod
def name(cls):
return "nullProportion"

@classmethod
def required_metrics(cls) -> Tuple[str, ...]:
return Count.name(), NullCount.name()

@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ class UniqueRatio(ComposedMetric):
compute the unique ratio
"""

def required_metrics(self) -> Tuple[str, ...]:
@classmethod
def name(cls):
return "uniqueProportion"

@classmethod
def required_metrics(cls) -> Tuple[str, ...]:
return Count.name(), UniqueCount.name()

@property
Expand Down
100 changes: 52 additions & 48 deletions ingestion/src/metadata/orm_profiler/metrics/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from functools import wraps
from typing import Any, Dict, Optional, Tuple, TypeVar

from sqlalchemy import Column
from sqlalchemy.orm import Session
from sqlalchemy.orm.attributes import InstrumentedAttribute

# When creating complex metrics, use inherit_cache = CACHE
CACHE = True
Expand Down Expand Up @@ -51,6 +51,41 @@ def inner(self, *args, **kwargs):
return inner


def add_props(**kwargs):
    """
    Class decorator factory that bakes extra attributes into a Metric.

    Sometimes we might need to add some flavour dynamically to our
    Metrics definition. For example, when passing the `bins` for the
    HISTOGRAM or `expression` for LIKE & ILIKE.

    This function is a class decorator that we can run as:

        new_hist = add_props(bins=5)(Metrics.HISTOGRAM.value)

    new_hist will still be a class, so we can safely pass it
    to the profiler to be initialized for all the columns.

    :param kwargs: attribute name/value pairs that are set on every
        instance of the returned class *before* the decorated class'
        own `__init__` runs. Anything that `__init__` assigns under
        the same name therefore takes precedence.
    :return: a decorator that receives a class and returns a new
        class. The class passed in is never modified.
    """

    def inner(cls):
        # Keep a handle on the initializer of the decorated class so the
        # wrapper below always delegates to the original construction logic.
        _orig = cls.__init__

        def _new_init(self, *args, **kw):
            for key, value in kwargs.items():
                setattr(self, key, value)
            _orig(self, *args, **kw)

        # Build a thin subclass instead of cloning `cls.__dict__` onto the
        # same bases. Cloning breaks any copied method that relies on
        # zero-argument `super()` (instances of the clone are not instances
        # of `cls`) and drags along the `__dict__` / `__weakref__`
        # descriptors of the original class. A subclass leaves `cls`
        # untouched while keeping `super()`, `isinstance` checks and
        # introspection working.
        namespace = {
            "__init__": _new_init,
            "__module__": cls.__module__,
            "__qualname__": cls.__qualname__,
            "__doc__": cls.__doc__,
        }

        # Reuse the original name so that logs, reprs and any name-based
        # lookups keep pointing to the metric being specialised.
        return type(cls.__name__, (cls,), namespace)

    return inner


class Metric(ABC):
"""
Parent class metric
Expand All @@ -64,19 +99,29 @@ class Metric(ABC):
If not specified, it is a Table metric.
"""

def __init__(self, col: Optional[InstrumentedAttribute] = None, **kwargs):
def __init__(self, col: Optional[Column] = None, **kwargs):
self.col = col

# We allow to pass any metric specific kwarg
for key, value in kwargs.items():
self.__setattr__(key, value)

@classmethod
@abstractmethod
def name(cls) -> str:
"""
Metric name
Metric name. Follow JSON Schema specifications
"""
return cls.__name__.upper()

@classmethod
def is_col_metric(cls) -> bool:
"""
Marks the metric as table or column metric.
By default, assume that a metric is a column
metric. Table metrics should override this.
"""
return True

@property
def metric_type(self):
Expand All @@ -91,7 +136,7 @@ def metric_type(self):
We can override this for things like
variance, where it will be a float
"""
return self.col.type.python_type
return self.col.type.python_type if self.col else None


MetricType = TypeVar("MetricType", bound=Metric)
Expand Down Expand Up @@ -126,25 +171,6 @@ def query(self, session: Optional[Session] = None):
"""


class TimeMetric(Metric, ABC):
"""
Time Metric definition
"""

@property
@abstractmethod
def window(self):
"""
Window time to run the validation
"""

@abstractmethod
def fn(self):
"""
SQLAlchemy function to be executed in Query
"""


class CustomMetric(Metric, ABC):
"""
Custom metric definition
Expand All @@ -167,8 +193,9 @@ class ComposedMetric(Metric, ABC):
directly in the profiler.
"""

@classmethod
@abstractmethod
def required_metrics(self) -> Tuple[str, ...]:
def required_metrics(cls) -> Tuple[str, ...]:
"""
Return a tuple of the required metrics' names
necessary to compute the composed metric.
Expand All @@ -184,26 +211,3 @@ def fn(self, res: Dict[str, Any]):
This metric computes its value based on
the results already present in the Profiler
"""


class RuleMetric(Metric, ABC):
"""
Useful when we need to take into consideration the
state of more than one column at a time.
E.g., the validation would be:
if `state` is `delivered`, `payment` should be informed.
This Metric is based on a target column, the one we will
use to inform the results, and the filters, which will
define the domain.
TODO: Figure out the filters signature. We might need
to come back here after defining the validations.
"""

def __init__(
self, target_col: InstrumentedAttribute, *filters: InstrumentedAttribute
):
super().__init__(col=target_col)
self._filters = filters
12 changes: 6 additions & 6 deletions ingestion/src/metadata/orm_profiler/metrics/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@
from metadata.orm_profiler.metrics.composed.like_ratio import LikeRatio
from metadata.orm_profiler.metrics.composed.null_ratio import NullRatio
from metadata.orm_profiler.metrics.composed.unique_ratio import UniqueRatio
from metadata.orm_profiler.metrics.static.avg import Avg
from metadata.orm_profiler.metrics.static.count import Count
from metadata.orm_profiler.metrics.static.distinct import Distinct
from metadata.orm_profiler.metrics.static.distinct_count import DistinctCount
from metadata.orm_profiler.metrics.static.histogram import Histogram
from metadata.orm_profiler.metrics.static.ilike_count import ILikeCount
from metadata.orm_profiler.metrics.static.like_count import LikeCount
from metadata.orm_profiler.metrics.static.max import Max
from metadata.orm_profiler.metrics.static.max_length import MaxLength
from metadata.orm_profiler.metrics.static.mean import Mean
from metadata.orm_profiler.metrics.static.min import Min
from metadata.orm_profiler.metrics.static.min_length import MinLength
from metadata.orm_profiler.metrics.static.null_count import NullCount
from metadata.orm_profiler.metrics.static.row_number import RowNumber
from metadata.orm_profiler.metrics.static.row_count import RowCount
from metadata.orm_profiler.metrics.static.stddev import StdDev
from metadata.orm_profiler.metrics.static.sum import Sum
from metadata.orm_profiler.metrics.static.unique_count import UniqueCount
Expand All @@ -49,9 +49,9 @@ class Metrics(MetricRegistry):
"""

# Static Metrics
AVG = Avg
MEAN = Mean
COUNT = Count
DISTINCT = Distinct
DISTINCT_COUNT = DistinctCount
HISTOGRAM = Histogram
ILIKE_COUNT = ILikeCount
LIKE_COUNT = LikeCount
Expand All @@ -60,7 +60,7 @@ class Metrics(MetricRegistry):
MIN = Min
MIN_LENGTH = MinLength
NULL_COUNT = NullCount
ROW_NUMBER = RowNumber
ROW_COUNT = RowCount
STDDEV = StdDev
SUM = Sum
UNIQUE_COUNT = UniqueCount
Expand Down
4 changes: 4 additions & 0 deletions ingestion/src/metadata/orm_profiler/metrics/static/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ class Count(StaticMetric):
Given a column, return the count. Ignores NULL values
"""

@classmethod
def name(cls):
return "valuesCount"

def metric_type(self):
return int

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@
from metadata.orm_profiler.metrics.core import StaticMetric, _label


class Distinct(StaticMetric):
class DistinctCount(StaticMetric):
"""
Distinct COUNT Metric
Given a column, return the Distinct count. Ignores NULL values
"""

@classmethod
def name(cls):
return "distinctCount"

def metric_type(self):
return int

Expand Down

0 comments on commit 9906085

Please sign in to comment.