In [2]:
from functools import lru_cache
from urllib.parse import urlparse

import boto3
import datahub.emitter.mce_builder as builder


@lru_cache
def get_athena_table_dataset_urn(catalog: str, database: str, table: str, region: str) -> str:
    """
    Build a DataHub dataset URN for an Athena table from its storage location.

    The table's metadata is fetched from Athena in ``region``. The path component of
    the table's ``location`` parameter is used as the dataset name on the ``hive``
    platform. Results are memoized per argument combination by ``lru_cache``, so
    repeated calls with the same arguments do not query Athena again.

    Args:
        catalog: Athena data catalog name (passed as ``CatalogName``).
        database: Athena database name (passed as ``DatabaseName``).
        table: Athena table name (passed as ``TableName``).
        region: AWS region used to create the Athena client.

    Returns:
        The dataset URN, e.g.
        urn:li:dataset:(urn:li:dataPlatform:hive,/iceberg/yellow_rides_hourly_actuals,PROD)

    Raises:
        KeyError: if the Athena response has no ``location`` table parameter.
    """
    response = boto3.client("athena", region_name=region).get_table_metadata(
        CatalogName=catalog,
        DatabaseName=database,
        TableName=table,
    )

    # The dataset also has a physical location, which we could add in a symlink facet.
    # Only the path portion of that location is kept for the URN name.
    table_parameters = response["TableMetadata"]["Parameters"]
    location_path = urlparse(table_parameters["location"]).path

    return builder.make_dataset_urn(platform="hive", name=location_path)

In [3]:
# Sanity check: the URN resolved for the yellow_rides_hourly_actuals table must match the
# expected hive-platform URN. This performs a real Athena lookup via boto3.
expected_dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,/iceberg/yellow_rides_hourly_actuals,PROD)"
resolved_dataset_urn = get_athena_table_dataset_urn(
    catalog="AwsDataCatalog",
    database="nyc_taxi",
    table="yellow_rides_hourly_actuals",
    region="us-east-1",
)
assert resolved_dataset_urn == expected_dataset_urn

In [8]:
from datahub.emitter.serialization_helper import pre_json_transform


def make_assertion_urn(dataset_urn: str, assertion_name: str) -> str:
    """
    Build an assertion URN from a dataset URN and an assertion name.

    The identifying fields are passed through ``pre_json_transform`` and
    ``builder.datahub_guid``; the resulting GUID is wrapped by
    ``builder.make_assertion_urn``.

    Args:
        dataset_urn: URN of the dataset the assertion is about.
        assertion_name: Name distinguishing this assertion from others on the dataset.

    Returns:
        The assertion URN string.
    """
    # These key-value pairs are essentially hashed; we want to choose pairs that make
    # the assertions unique (example: https://github.com/datahub-project/datahub/blob/d2d9d36987f20a9f7d6c973073d1404edf33e667/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py#L277-L289)
    identifying_fields = {
        "platform": "pattern-ds-dqv",
        # "dataset_urn" is a bad key name, since assertions and datasets have a
        # many-to-many relationship.
        "dataset_urn": dataset_urn,
        "assertion_name": assertion_name,
    }
    assertion_guid = builder.datahub_guid(pre_json_transform(identifying_fields))
    return builder.make_assertion_urn(assertion_guid)

In [9]:
# Resolve the dataset URN of the table the assertion will be reported against.
# get_athena_table_dataset_urn looks the table up in Athena (cached per argument set).
yellow_rides_hourly_actuals__dataset_urn: str = get_athena_table_dataset_urn(
    catalog="AwsDataCatalog",
    database="nyc_taxi",
    table="yellow_rides_hourly_actuals",
    region="us-east-1",
)

# Build the URN of a placeholder assertion named "test_assertion" on that dataset.
# NOTE(review): the variable is spelled "arn" but it holds a DataHub URN (see the output
# below); the name is kept because later cells reference it.
dummy_assertion_arn: str = make_assertion_urn(
    dataset_urn=yellow_rides_hourly_actuals__dataset_urn,
    assertion_name="test_assertion",
)

# Bare last expression so the notebook displays the generated assertion URN.
dummy_assertion_arn

'urn:li:assertion:eb902459c44c5c939f980eee43d7b73c'

In [11]:
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.ingestion.graph.config import ClientMode

# REST emitter used to send metadata to the DataHub GMS server running locally on port 8091.
emitter = DatahubRestEmitter(
    gms_server="http://localhost:8091",
    # The commented-out keyword arguments below are left as a reminder of other options;
    # they reference `self.*` attributes that do not exist in this notebook, so they appear
    # to be carried over from class-based code (presumably the gx-plugin — confirm) and
    # would need concrete values before being enabled.
    # token=self.token,
    # read_timeout_sec=self.timeout_sec,
    # connect_timeout_sec=self.timeout_sec,
    # retry_status_codes=self.retry_status_codes,
    # retry_max_times=self.retry_max_times,
    # extra_headers=self.extra_headers,
    client_mode=ClientMode.INGESTION,
    # NOTE(review): "gx-plugin" is the component name reported here, while the assertion
    # GUID uses platform "pattern-ds-dqv" — confirm which identifier is intended.
    datahub_component="gx-plugin",
)

import time
from datetime import datetime, timezone

from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
    AssertionResult,
    AssertionResultType,
    AssertionRunEvent,
    AssertionRunStatus,
)

# Build the run event that reports one (hard-coded, illustrative) evaluation of the dummy
# assertion against the yellow_rides_hourly_actuals dataset.
assertion_result = AssertionRunEvent(
    # Event time as epoch milliseconds.
    timestampMillis=int(round(time.time() * 1000)),
    assertionUrn=dummy_assertion_arn,
    asserteeUrn=yellow_rides_hourly_actuals__dataset_urn,
    # The format string ends in a literal "Z" (the UTC designator), so the timestamp must be
    # taken in UTC; a naive datetime.now() would label local wall-clock time as UTC.
    runId=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    result=AssertionResult(
        # Hard-coded for this demo; real code would choose the result type from the
        # outcome of the check.
        type=AssertionResultType.FAILURE,
        rowCount=1000,  # presumably, the number of rows that were tested
        missingCount=5,  # presumably, the number of rows that were NULL
        unexpectedCount=10,  # presumably, the number of rows that failed the test
        # actualAggValue=actualAggValue,  # use this if the assertion COUNTs, MAXs, MINs, things. E.g. if you are counting the number of unique values for a check, or asserting that the STDEV is within a certain bound
        #      For expect_column_mean_to_be_between(min=10, max=20) → actualAggValue might be 15.7
        externalUrl="https://ericriddoch.info",  # a placeholder website; this could potentially be the Metaflow HTML that DS DQV generates!
        # nativeResults=nativeResults,
    ),
    # Batch spec is for when an assertion is run on a SUBSET of a potentially LARGE
    # dataset. E.g. if you get new data 1x/day... don't re-test your entire historical
    # dataset. Just test the new data. Here's an example:
    #
    # batchSpec = BatchSpec(
    #    nativeBatchId="unique-batch-identifier",
    #    query="SELECT * FROM table WHERE date = '2024-01-01'",  # SQL query if applicable
    #    limit=1000,  # Optional row limit
    #    customProperties={
    #        "data_asset_name": "my_table",
    #        "datasource_name": "my_database",
    #        "batch_timestamp": "2024-01-01T00:00:00Z"
    #    }
    # )
    # batchSpec=ds["batchSpec"],
    status=AssertionRunStatus.COMPLETE,
    runtimeContext={
        "some": "arbitrary params",
    },
)


In [15]:
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# Wrap the run event in a metadata change proposal. The proposal targets the assertion
# entity (entityUrn is the assertion's URN) and carries the run event as its aspect.
dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
    entityUrn=assertion_result.assertionUrn,
    aspect=assertion_result,
)

# Send the proposal to DataHub through the REST emitter configured above.
# TODO: an emit_mode argument (e.g. emit_mode="SYNC") could be passed here — where does
# this enum import from?
# NOTE(review): possibly `EmitMode` in `datahub.emitter.rest_emitter`; confirm the import
# path and valid member names against the installed DataHub version.
emitter.emit_mcp(dataset_assertionResult_mcp)