In [1]:
import whylogs as why
import pandas as pd
import time

from ludwig.automl.base_config import get_dataset_info
from ludwig.constants import AUDIO, BINARY, CATEGORY, DATE, IMAGE, NUMBER, TEXT
from ludwig.datasets import titanic

pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


torchtext>=0.13.0 is not installed, so the following tokenizers are not available: {'bert'}




In [None]:
# def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
#     """Perform type inference on field.

#     # Inputs
#     :param field: (FieldInfo) object describing field
#     :param missing_value_percent: (float) percent of missing values in the column
#     :param row_count: (int) total number of entries in original dataset

#     # Return
#     :return: (str) feature type
#     """
#     if field.dtype == DATE:
#         return DATE

#     num_distinct_values = field.num_distinct_values
#     if num_distinct_values == 0:
#         return CATEGORY
#     distinct_values = field.distinct_values
#     if num_distinct_values <= 2 and missing_value_percent == 0:
#         # Check that all distinct values are conventional bools.
#         if strings_utils.are_conventional_bools(distinct_values):
#             return BINARY

#     if field.image_values >= 3:
#         return IMAGE

#     if field.audio_values >= 3:
#         return AUDIO

#     # Use CATEGORY if:
#     # - The number of distinct values is significantly less than the total number of examples.
#     # - The distinct values are not all numbers.
#     # - The distinct values are all numbers but comprise of a perfectly sequential list of integers that suggests the
#     #   values represent categories.
#     if num_distinct_values < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF and (
#         (not strings_utils.are_all_numbers(distinct_values)) or strings_utils.are_sequential_integers(distinct_values)
#     ):
#         return CATEGORY

#     # Use NUMBER if all of the distinct values are numbers.
#     if strings_utils.are_all_numbers(distinct_values):
#         return NUMBER

#     # TODO (ASN): add other modalities (image, etc. )
#     # Fallback to TEXT.
#     return TEXT

In [15]:
# Custom resolver.

from whylogs.core.resolvers import Resolver
from whylogs.core.datatypes import DataType, Fractional, Integral
from typing import Dict, List
from whylogs.core.metrics import StandardMetric
from whylogs.core.metrics.metrics import Metric, OperationResult, MetricConfig
from whylogs.core.metrics.metric_components import FractionalComponent
from whylogs.core.preprocessing import PreprocessedColumn
from typing import Any, Optional, Union
from dataclasses import dataclass
from whylogs.core.configs import SummaryConfig
from whylogs.core.datatypes import DataType, Fractional, Integral, String

IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif")


def is_image(src_path: str, img_entry: Union[bytes, str], column: str) -> bool:
    if not isinstance(img_entry, str):
        return False
    try:
        import imghdr

        path = get_abs_path(src_path, img_entry)
        bytes_obj = get_bytes_obj_from_path(path)
        if isinstance(bytes_obj, bytes):
            return imghdr.what(None, bytes_obj) is not None
        return imghdr.what(bytes_obj) is not None
    except Exception:
        return False


def is_image_score(src_path, img_entry, column: str):
    """Used for AutoML For image inference, want to bias towards both readable images, but also account for
    unreadable (i.e. expired) urls with image extensions."""
    if is_image(src_path, img_entry, column):
        return 1
    elif isinstance(img_entry, str) and img_entry.lower().endswith(IMAGE_EXTENSIONS):
        return 0.5
    return 0


@dataclass(frozen=True)
class IsImageMetric(Metric):
    score: FractionalComponent
    name = "ludwig_metric"
    
    @property
    def namespace(self) -> str:
        return "is_image"

    def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
        successes = 0
        # if view.numpy.len > 0:
        #     if view.numpy.ints is not None:
        #         self.hll.value.update_np(view.numpy.ints)
        #         successes += len(view.numpy.ints)
        #     if view.numpy.floats is not None:
        #         self.hll.value.update_np(view.numpy.floats)
        #         successes += len(view.numpy.floats)
        if view.pandas.strings is not None:
            # self.hll.value.update_str_list(view.pandas.strings.to_list())
            # self.score.value.set(
            #     is_image_score(None, view.pandas.strings.to_list()[0], column=""))
            # self.score.value = is_image_score(None, view.pandas.strings.to_list()[0], column="")
            self.score.set(is_image_score(None, view.pandas.strings.to_list()[0], column=""))
            successes += len(view.pandas.strings)

        # update everything in the remaining lists
        # if view.list.ints:
        #     self.hll.value.update_int_list(view.list.ints)
        #     successes += len(view.list.ints)
        # if view.list.floats:
        #     self.hll.value.update_double_list(view.list.floats)
        #     successes += len(view.list.floats)
        if view.list.strings:
            # self.hll.value.update_str_list(view.list.strings)
            successes += len(view.list.strings)

        failures = 0
        if view.list.objs:
            failures = len(view.list.objs)
        return OperationResult(successes=successes, failures=failures)

    def to_summary_dict(self, cfg: SummaryConfig) -> Dict[str, Any]:
        return {
            "image_score": self.score.value
            # "est": self.hll.value.get_estimate(),
            # f"upper_{cfg.hll_stddev}": self.hll.value.get_upper_bound(cfg.hll_stddev),
            # f"lower_{cfg.hll_stddev}": self.hll.value.get_lower_bound(cfg.hll_stddev),
        }

    @classmethod
    def zero(cls, config: MetricConfig) -> "IsImageMetric":
        return IsImageMetric(score=FractionalComponent(0.0))


class LudwigWhyResolver(Resolver):
    """Resolver that keeps distribution metrics for Fractional and frequent items for Integral, and counters and types metrics for all data types."""

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        metrics: List[StandardMetric] = [StandardMetric.counts, StandardMetric.types]

        if isinstance(why_type, Integral):
            metrics.append(StandardMetric.distribution)
            metrics.append(StandardMetric.ints)
            metrics.append(StandardMetric.cardinality)
            metrics.append(StandardMetric.frequent_items)
        elif isinstance(why_type, Fractional):
            metrics.append(StandardMetric.cardinality)
            metrics.append(StandardMetric.distribution)
        elif isinstance(why_type, String):  # Catch all category as we map 'object' here
            # NOTE(justin):
            # Could implement image functions as custom metric under String.
            # Could implement image functions as custom type with custom metric.
            # Need to implement custom metric anyways.
            metrics.append(StandardMetric.cardinality)
            metrics.append(IsImageMetric)
            if column_schema.cfg.track_unicode_ranges:
                metrics.append(StandardMetric.unicode_range)
            metrics.append(StandardMetric.distribution)  # 'object' columns can contain Decimal
            metrics.append(StandardMetric.frequent_items)

        if column_schema.cfg.fi_disabled:
            metrics.remove(StandardMetric.frequent_items)

        result: Dict[str, Metric] = {}
        for m in metrics:
            result[m.name] = m.zero(column_schema.cfg)
        return result
    

    
class ImageType(DataType[int]):
    def __init__(self) -> None:
        super().__init__(str)

    @classmethod
    def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool:
        if maybe_type:
            dtype_or_type = maybe_type  # type: ignore

        if not isinstance(dtype_or_type, type):
            # Potentially handle numpy arrays?
            return False
        
        return True
    

class AudioType(DataType[int]):
    def __init__(self) -> None:
        super().__init__(str)

    @classmethod
    def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool:
        if maybe_type:
            dtype_or_type = maybe_type  # type: ignore

        if not isinstance(dtype_or_type, type):
            # Potentially handle numpy arrays?
            return False
        
        return True
    

from whylogs.core import DatasetSchema


class LudwigWhySchema(DatasetSchema):
    # resolvers = LudwigWhyResolver()
    types: Dict[str, Any] = {}
    default_configs: MetricConfig = MetricConfig()
    # type_mapper: TypeMapper = StandardTypeMapper(custom_types=[ImageType, AudioType])
    # type_mapper: TypeMapper = StandardTypeMapper()
    resolvers: Resolver = LudwigWhyResolver()
    cache_size: int = 1024
    schema_based_automerge: bool = False

In [36]:
training_set, test_set, _ = titanic.load(split=True)

# start = time.time()
# get_dataset_info(training_set)
# end = time.time()
# print(end - start)

# Log data with whylogs & create profile
results = why.log(pandas=training_set, schema=LudwigWhySchema())

# start = time.time()
profile = results.profile()
prof_view = profile.view()
# print(prof_view.to_pandas())

# Create dictionary of feature name -> ColumnView
column_profiles = {}
for feature_name in training_set.keys():
    # print("---------------------------------------------------------------------------------------------")
    # print(f"FEATURE NAME: {feature_name}")
    # https://github.com/whylabs/whylogs/blob/d22609ef684805bdbc7cf734eb9265bca0dcb5b9/python/whylogs/core/view/column_profile_view.py#L67-L72
    # print(prof_view.get_columns()[feature_name].to_protobuf())
    column_profiles[feature_name] = prof_view.get_columns()[feature_name]
    # dataset_profile.feature_profiles[feature_name].whylogs_metrics = prof_view.get_columns()[feature_name].to_protobuf()

# end = time.time()
# print(end - start)

# Create profile view dataframe
prof_view = profile.view()
# print(prof_view.to_pandas())
prof_view_pandas = prof_view.to_pandas()

In [94]:
# prof_view_pandas.loc[["PassengerId"]]
# prof_view_pandas.iloc[0]
# for x in prof_view_pandas.index.values:
#     print(x)

# prof_view_pandas.iloc[0]["counts/n"]
from pprint import pprint
print(prof_view_pandas.index.values)
pprint(column_profiles['PassengerId'].to_summary_dict())
pprint(column_profiles['Embarked'].to_summary_dict())
# pprint(column_profiles['Embarked'].get_metric("frequent_items").frequent_strings.value)
# pprint(column_profiles['Embarked'].to_summary_dict()["frequent_items/frequent_strings"])
max_occurence = column_profiles['Embarked'].to_summary_dict()["frequent_items/frequent_strings"][0].est
min_occurence = column_profiles['Embarked'].to_summary_dict()["frequent_items/frequent_strings"][-1].est
for frequent_item in column_profiles['Embarked'].to_summary_dict()["frequent_items/frequent_strings"]:
    print(frequent_item)
    

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
{'cardinality/est': 901.7294570137847,
 'cardinality/lower_1': 890.2134308483327,
 'cardinality/upper_1': 913.5326015182238,
 'counts/n': 891,
 'counts/null': 0,
 'distribution/max': 891.0,
 'distribution/mean': 446.0,
 'distribution/median': 449.0,
 'distribution/min': 1.0,
 'distribution/n': 891,
 'distribution/q_01': 14.0,
 'distribution/q_05': 50.0,
 'distribution/q_10': 94.0,
 'distribution/q_25': 226.0,
 'distribution/q_75': 669.0,
 'distribution/q_90': 802.0,
 'distribution/q_95': 847.0,
 'distribution/q_99': 883.0,
 'distribution/stddev': 257.3538420152301,
 'frequent_items/frequent_strings': [],
 'ints/max': 891,
 'ints/min': 1,
 'types/boolean': 0,
 'types/fractional': 0,
 'types/integral': 891,
 'types/object': 0,
 'types/string': 0}
{'cardinality/est': 3.000000014901161,
 'cardinality/lower_1': 3.0,
 'cardinality/upper_1': 3.0001498026537594,
 'counts/n': 891,
 'counts

In [17]:
def get_type_from_column_profile(column_profile):
    """Returns the Ludwig data type from a whylogs column profile."""
    column_profile_summary = column_profile.to_summary_dict()

    cardinality_est = int(column_profile_summary["cardinality/est"])
    counts_n = column_profile_summary["counts/n"]
    
    if not cardinality_est:
        return CATEGORY
    
    if column_profile_summary["types/boolean"] > counts_n * 0.5:
        return BOOLEAN
    
    if column_profile_summary["types/integral"] > counts_n * 0.5:
        if cardinality_est < counts_n * 0.5:
            return CATEGORY
        return NUMBER
    
    if column_profile_summary["types/fractional"] > counts_n * 0.5:
        return NUMBER
    
    if cardinality_est < counts_n * 0.5:
        return CATEGORY

    # Fallback to text.
    return TEXT

    # Lets say we're interested in defining a constraint on the number of "legs". From output above we see
    # that there are the following metrics on column "legs": [counts, types, distribution, ints, cardinality, frequent_items]
    # lets look at what the distribution metric contains:
    # distribution_values = profile_view.get_column("legs").get_metric("distribution").to_summary_dict()
    # distribution_values

print(training_set["Pclass"])
print(column_profiles["2"].to_summary_dict())

for feature_name, column_profile in column_profiles.items():
    print(f"{feature_name}: {get_type_from_column_profile(column_profile)}")

0      3
1      1
2      3
3      1
4      3
      ..
886    2
887    1
888    3
889    1
890    3
Name: Pclass, Length: 891, dtype: int64


KeyError: '2'

In [None]:
# input_features:
#     - name: Pclass
#       type: category
#     - name: Sex
#       type: category
#     - name: Age
#       type: number
#       preprocessing:
#           missing_value_strategy: fill_with_mean
#     - name: SibSp
#       type: number
#     - name: Parch
#       type: number
#     - name: Fare
#       type: number
#       preprocessing:
#           missing_value_strategy: fill_with_mean
#     - name: Embarked
#       type: category

# output_features:
#     - name: Survived
#       type: binary