In [2]:
import time
from functools import lru_cache
from urllib.parse import urlparse

import boto3
import datahub.emitter.mce_builder as builder
import pandas as pd


@lru_cache
def get_athena_table_dataset_urn(
    catalog: str,
    database: str,
    table: str,
    region: str,
    profile_name: str = "sandbox",
) -> str:
    """Build a DataHub dataset URN for an Athena table from its storage location.

    The table metadata is fetched from Athena and the path component of its
    ``location`` parameter becomes the dataset name on the ``hive`` platform,
    e.g. urn:li:dataset:(urn:li:dataPlatform:hive,/iceberg/yellow_rides_hourly_actuals,PROD)

    Results are memoized with ``lru_cache``, so repeating a call with the same
    arguments does not query Athena again.

    Args:
        catalog: Athena data catalog name (``CatalogName``).
        database: Athena database name (``DatabaseName``).
        table: Athena table name (``TableName``).
        region: AWS region used for the Athena client.
        profile_name: AWS profile used to create the boto3 session.

    Returns:
        The dataset URN built by ``builder.make_dataset_urn``.

    Raises:
        KeyError: If the table metadata carries no ``location`` parameter.
    """
    session = boto3.Session(profile_name=profile_name)
    athena_client = session.client("athena", region_name=region)
    table_metadata = athena_client.get_table_metadata(
        CatalogName=catalog,
        DatabaseName=database,
        TableName=table,
    )

    # The dataset also has a physical location, which could be added in a symlink facet.
    parameters = table_metadata["TableMetadata"].get("Parameters") or {}
    if "location" not in parameters:
        raise KeyError(
            f"Athena table {catalog}.{database}.{table} has no 'location' parameter; "
            "cannot derive a dataset URN"
        )

    # Only the path part of the location (bucket/scheme dropped) is used as the dataset name.
    parsed_path = urlparse(parameters["location"])

    return builder.make_dataset_urn(
        platform="hive",
        name=parsed_path.path,
    )

In [3]:
from datahub.emitter.serialization_helper import pre_json_transform


def make_assertion_urn(dataset_urn: str, assertion_name: str) -> str:
    """Derive an assertion URN from a dataset URN and an assertion name.

    The identifying key-value pairs are turned into a GUID via
    ``builder.datahub_guid`` and wrapped with ``builder.make_assertion_urn``.
    """
    # These key-val pairs are essentially hashed; choose pairs that make the
    # assertion unique (example: https://github.com/datahub-project/datahub/blob/d2d9d36987f20a9f7d6c973073d1404edf33e667/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py#L277-L289)
    identity = {
        "platform": "pattern-ds-dqv",
        # "dataset_urn" is a bad name since assertions and datasets have a
        # many-to-many relationship
        "dataset_urn": dataset_urn,
        "assertion_name": assertion_name,
    }
    guid = builder.datahub_guid(pre_json_transform(identity))
    return builder.make_assertion_urn(guid)

In [3]:
# Inlined from /metadata-ingestion/examples/library/upsert_custom_assertion.py

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Client for the DataHub server at http://localhost:8091; later cells use it
# to upsert assertions and report their results.
graph = DataHubGraph(
    config=DatahubClientConfig(server="http://localhost:8091"),
)

# Show which attributes of the client relate to assertions.
[attr_name for attr_name in graph.__dir__() if "assertion" in attr_name]

['_assertion_result_shared',
 '_run_assertion_result_shared',
 '_run_assertion_build_params',
 'run_assertion',
 'run_assertions',
 'run_assertions_for_asset',
 'upsert_custom_assertion',
 'report_assertion_result']

In [5]:
# URN of the dataset the assertion will be attached to, resolved from Athena.
entity_urn = get_athena_table_dataset_urn(
    catalog="AwsDataCatalog", database="nyc_taxi", table="yellow_rides_hourly_actuals", region="us-east-1"
)

# URN for a demo assertion on that dataset.
assertion_urn = make_assertion_urn(dataset_urn=entity_urn, assertion_name="test_assertion")

In [6]:
# Upsert the assertion on the dataset; passing the URN of an assertion that
# already exists targets that assertion.
res = graph.upsert_custom_assertion(
    entity_urn=entity_urn,
    urn=assertion_urn,
    # Category label for the assertion in DataHub.
    type="DQV",
    description="The description of my external assertion for my dataset",
    # Alternatively, pass platform_urn="urn:li:dataPlatform:great-expectations"
    # instead of a platform name.
    platform_name="metaflow",
    # Optional: associates the assertion with a specific field.
    field_path="field_foo",
    # Further optional arguments:
    #   external_url="https://my-monitoring-tool.com/result-for-this-assertion"  -> link to monitoring tool
    #   logic="SELECT * FROM X WHERE Y"  -> custom SQL for the assertion, rendered in the UI
)
print(res)

{'urn': 'urn:li:assertion:eb902459c44c5c939f980eee43d7b73c'}


In [26]:
# Report one evaluation result for the assertion created above.
res = graph.report_assertion_result(
    urn=assertion_urn,  # Replace with your actual assertion URN
    timestamp_millis=int(time.time() * 1000),  # Current Unix timestamp in milliseconds
    type="SUCCESS",  # Can be 'SUCCESS', 'FAILURE', 'ERROR', or 'INIT'
    properties=[
        {"key": "expected value", "value": "less than 20"},  # Example property, can be any key-value pair
        {"key": "actual value", "value": "10"},
    ],
    # external_url="https://my-great-expectations.com/results/1234",  # Optional: URL to the results in the external tool
    # Uncomment the following section and use if type is 'ERROR'
    # error_type="UNKNOWN_ERROR",  # Can be 'VALIDATION_ERROR', 'SYSTEM_ERROR', or 'OTHER' ## ENUM
    # error_message="<ERROR MESSAGE>",  ## does not show in UI
)

# Only announce success when the call itself reports it; previously the
# message was printed regardless of the returned value.
if res:
    print("Successfully reported Assertion Result!")
else:
    print("Failed to report Assertion Result.")

Successfully reported Assertion Result!


In [5]:
from ds_dqv_tool import dqv_check, recipes

In [6]:
# Small in-memory frame used as the dataset for the data-quality checks below.
foo = pd.DataFrame(
    data={
        "foo1": [1, 2, 3],
        "foo2": [4, 5, 6],
    }
)

In [7]:
# Run the checks on column "foo1" of the in-memory frame. Each rule is written
# as [condition, value, criticality].
dqv_results = dqv_check(
    dataset=foo,
    dataset_name="foo",
    dataset_type="pandas",
    checks={
        "foo1": dict(
            missing_percent=[["eq", 0, "fail"]],
            min=[["gt", 0, "fail"]],
            max=[["lt", 10, "fail"]],
            mean=[["gt", 4, "fail"]],
        ),
    },
)

In [8]:
# Display the raw check results; the dict has "passed" and "failed" lists.
dqv_results

{'passed': [{'dataset_name': 'foo',
   'dataset_owner': {},
   'dataset_type': 'pandas',
   'checks': {'foo1': {'missing_percent': [{'condition': 'eq',
       'value': 0,
       'criticality': 'fail',
       'calculated_value': np.float64(0.0)}],
     'min': [{'condition': 'gt',
       'value': 0,
       'criticality': 'fail',
       'calculated_value': np.int64(1)}],
     'max': [{'condition': 'lt',
       'value': 10,
       'criticality': 'fail',
       'calculated_value': np.int64(3)}]}}}],
 'failed': [{'dataset_name': 'foo',
   'dataset_owner': {},
   'dataset_type': 'pandas',
   'checks': {'foo1': {'mean': [{'condition': 'gt',
       'value': 4,
       'criticality': 'fail',
       'calculated_value': np.float64(2.0)}]}}}]}

In [None]:
def datahub_report_update(
    entity_urn: str,
    assertion_urn: str,
    status: str,
    checks_description: str,
    properties: list,
    assertion_type: str = "DQV",
    platform_name: str = "metaflow",
) -> None:
    """Upsert a custom assertion in DataHub and report a result for it.

    Uses the module-level ``graph`` client. The assertion is upserted first so
    that the result is reported against an assertion attached to the entity.

    Args:
        entity_urn: URN of the entity the assertion is attached to.
        assertion_urn: URN of the assertion to upsert and report on.
        status: Result type passed to ``report_assertion_result``
            (e.g. "SUCCESS" or "FAILURE").
        checks_description: Description stored on the assertion.
        properties: List of ``{"key": ..., "value": ...}`` dicts attached to
            the reported result.
        assertion_type: Category under which the assertion is registered in
            DataHub.
        platform_name: Name of the platform the assertion is registered under.
    """
    graph.upsert_custom_assertion(
        urn=assertion_urn,
        entity_urn=entity_urn,
        type=assertion_type,  # This categorizes your assertion in DataHub
        description=checks_description,
        # platform_urn="urn:li:dataPlatform:great-expectations", # OR you can provide 'platformName="My Custom Platform"'
        platform_name=platform_name,
        # external_url="https://my-monitoring-tool.com/result-for-this-assertion",  # Optional: link to monitoring tool
    )

    graph.report_assertion_result(
        urn=assertion_urn,
        timestamp_millis=int(time.time() * 1000),  # current Unix time in milliseconds
        type=status,
        properties=properties,
    )

In [68]:
# Maps the dataset names used in the DQV checks to their DataHub dataset URNs.
dataset_name_to_urn = {"foo": entity_urn}

In [84]:
from ds_dqv_tool.recipes import condition_description_map, metric_description_map


def _iter_dqv_conditions(dqv_results):
    """Yield (status, dataset_name, column, metric, condition_entry) for every evaluated condition."""
    for status in ("passed", "failed"):
        for result in dqv_results[status]:
            for column, metrics in result["checks"].items():
                for metric, conditions in metrics.items():
                    for condition_entry in conditions:
                        yield status, result["dataset_name"], column, metric, condition_entry


def log_dqv_report_datahub(dqv_results: dict, dataset_name_to_urn: dict) -> None:
    """Report every DQV check condition to DataHub as its own custom assertion.

    For each condition under ``dqv_results["passed"]`` and
    ``dqv_results["failed"]``, an assertion URN is derived from the dataset URN
    and ``"{column}_{metric}_{condition}_{value}"``, then the assertion is
    upserted and a "SUCCESS" (passed) or "FAILURE" (failed) result is reported.

    Args:
        dqv_results: Dict with "passed" and "failed" lists; each item has a
            "dataset_name" and a "checks" mapping of
            column -> metric -> list of dicts with "condition", "value" and
            "calculated_value" keys.
        dataset_name_to_urn: Mapping from dataset name to DataHub dataset URN.

    Raises:
        KeyError: If a result's dataset name is missing from
            ``dataset_name_to_urn``.
    """
    for status, dataset_name, column, metric, entry in _iter_dqv_conditions(dqv_results):
        # Looked up once and reused for both the entity and the assertion URN.
        dataset_urn = dataset_name_to_urn[dataset_name]

        condition = entry["condition"]
        value = entry["value"]
        actual = entry["calculated_value"]

        # Fall back to the raw identifiers when no human-readable text exists.
        metric_desc = metric_description_map.get(metric, metric)
        cond_desc = condition_description_map.get(condition, condition)

        # Result properties are sent as string key/value pairs, so numeric
        # (including numpy) values are stringified rather than passed raw.
        properties = [
            {"key": "column", "value": str(column)},
            {"key": "metric", "value": str(metric)},
            {"key": "condition", "value": str(condition)},
            {"key": "expected", "value": str(value)},
            {"key": "actual", "value": str(actual)},
        ]

        datahub_report_update(
            entity_urn=dataset_urn,
            assertion_urn=make_assertion_urn(
                dataset_urn=dataset_urn,
                assertion_name=f"{column}_{metric}_{condition}_{value}",
            ),
            status="SUCCESS" if status == "passed" else "FAILURE",
            checks_description=f"Column: {column} - {metric_desc} value {cond_desc} {value}",
            properties=properties,
        )

In [None]:
# Pass the name -> URN mapping defined above; the set literal {""} used here
# before cannot be indexed by dataset name.
log_dqv_report_datahub(dqv_results, dataset_name_to_urn=dataset_name_to_urn)