Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature Store] Add MinMaxLenValidator and RegexValidator #1702

Merged
merged 10 commits into from Mar 21, 2022
102 changes: 102 additions & 0 deletions mlrun/features.py
@@ -1,3 +1,4 @@
import re
from typing import Dict, List, Optional

from .data_types import ValueType
Expand Down Expand Up @@ -146,7 +147,108 @@ def check(self, value):
return ok, args


class MinMaxLenValidator(Validator):
    """Validate min/max length value ranges"""

    kind = "minmaxlen"
    _dict_fields = Validator._dict_fields + ["min", "max"]

    def __init__(self, check_type=None, severity=None, min=None, max=None):
        """Validate min/max length value ranges

        example::

            from mlrun.features import MinMaxLenValidator

            # Add length validator to the feature 'ticker', where valid
            # minimal length is 1 and maximal length is 10
            quotes_set["ticker"].validator = MinMaxLenValidator(
                min=1,
                max=10,
                severity="info"
            )

        :param check_type: passed unchanged to the base ``Validator``
        :param severity:   severity name e.g. info, warning, etc.
        :param min:        minimal valid length (inclusive), None disables
                           the lower bound
        :param max:        maximal valid length (inclusive), None disables
                           the upper bound
        """
        super().__init__(check_type, severity)
        self.min = min
        self.max = max

    def check(self, value):
        """Check that ``len(value)`` lies within the configured bounds

        :param value: value to validate; it must support ``len()`` whenever
                      at least one bound is configured
        :return: ``(ok, args)`` tuple; the base class result is returned as is
                 when the base check fails or the length is within bounds,
                 otherwise ``(False, {...})`` with a message, the violated
                 bound and the measured length
        """
        ok, args = super().check(value)
        if not ok:
            return ok, args

        # Without any bound there is nothing to measure; skipping len() keeps
        # values that have no length from raising in that case.
        if self.min is None and self.max is None:
            return ok, args

        length = len(value)
        if self.min is not None and length < self.min:
            return (
                False,
                {
                    "message": "Length value is smaller than min",
                    "min": self.min,
                    "length value": length,
                },
            )
        if self.max is not None and length > self.max:
            return (
                False,
                {
                    "message": "Length value is greater than max",
                    "max": self.max,
                    "length value": length,
                },
            )
        return ok, args


class RegexValidator(Validator):
    """Validate value based on regular expression"""

    kind = "regex"
    _dict_fields = Validator._dict_fields + ["regex"]

    def __init__(self, check_type=None, severity=None, regex=None):
        """Validate value based on regular expression

        example::

            from mlrun.features import RegexValidator

            # Add regular expression validator to the feature 'name'
            quotes_set["name"].validator = RegexValidator(
                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
                severity="info"
            )

        :param check_type: passed unchanged to the base ``Validator``
        :param severity:   severity name e.g. info, warning, etc.
        :param regex:      regular expression for validation; None (the
                           default) means no pattern check is applied
        """
        super().__init__(check_type, severity)
        self.regex = regex
        # re.compile(None) raises TypeError, so compile only when a pattern
        # was actually supplied. An invalid pattern still fails here, at
        # construction time.
        self.regex_compile = re.compile(regex) if regex is not None else None

    def check(self, value):
        """Check that the whole ``value`` matches the configured pattern

        :param value: value to validate against ``regex`` (full match)
        :return: ``(ok, args)`` tuple; the base class result is returned as is
                 when the base check fails, no pattern is configured or the
                 value matches, otherwise ``(False, {...})`` with a message,
                 the pattern and the offending value
        """
        ok, args = super().check(value)
        if not ok or self.regex is None:
            return ok, args

        # ``regex`` may have been assigned or changed after construction
        # (only ``regex`` is listed in _dict_fields, not the compiled form),
        # so (re)compile when the cached pattern is missing or stale.
        compiled = self.regex_compile
        if compiled is None or compiled.pattern != self.regex:
            compiled = self.regex_compile = re.compile(self.regex)

        if not compiled.fullmatch(value):
            return (
                False,
                {
                    "message": "Value is not valid with regular expression",
                    "regexp": self.regex,
                    "value": value,
                },
            )
        return ok, args


# Registry mapping a validator ``kind`` string to its class. The "minmaxlen"
# and "regex" keys equal the ``kind`` attribute of the classes defined above.
# NOTE(review): presumably used to resolve the validator class from a
# serialized ``kind`` value - confirm against the callers.
validator_kinds = {
"": Validator,
"minmax": MinMaxValidator,
"minmaxlen": MinMaxLenValidator,
"regex": RegexValidator,
}
5 changes: 4 additions & 1 deletion tests/system/feature_store/test_feature_store.py
Expand Up @@ -39,7 +39,7 @@
from mlrun.feature_store.feature_set import aggregates_step
from mlrun.feature_store.feature_vector import FixedWindowType
from mlrun.feature_store.steps import FeaturesetValidator
from mlrun.features import MinMaxValidator
from mlrun.features import MinMaxLenValidator, MinMaxValidator
from tests.system.base import TestMLRunSystem

from .data_sample import quotes, stocks, trades
Expand Down Expand Up @@ -141,6 +141,9 @@ def _ingest_quotes_featureset(self):
self._logger.info(f"quotes spec: {quotes_set.spec.to_yaml()}")
assert df["zz"].mean() == 9, "map didnt set the zz column properly"
quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")
quotes_set["ticker"].validator = MinMaxLenValidator(
george0st marked this conversation as resolved.
Show resolved Hide resolved
min=1, max=10, severity="info"
)

quotes_set.plot(
str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
Expand Down