In [1]:
import mlrun
import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    FilesystemStoreBackendDefaults,
)

In [2]:
# Load the "great-expectations" MLRun project, creating it if it does not exist yet.
# context="./" points the project at the current directory.
project = mlrun.get_or_create_project("great-expectations", context="./")

> 2023-03-03 22:08:23,289 [info] loaded project great-expectations from MLRun DB


### Config

In [3]:
# Asset name attached to the batch request built later in the notebook.
data_asset_name = "iris_dataset"
# Remote CSV that is loaded into a DataFrame for validation.
data_path = "https://s3.wasabisys.com/iguazio/data/iris/iris.data.raw.csv"
# Name of the expectation suite that is created, populated, and saved below.
expectation_suite_name = "test_suite"
# Root directory handed to the Great Expectations filesystem store defaults;
# it is derived from the MLRun project name.
root_directory = f"/v3io/projects/{project.name}/great_expectations"

### Initialize Great Expectations Context

In [4]:
# Build the context step by step: filesystem store defaults rooted at
# root_directory -> project config -> data context.
store_backend_defaults = FilesystemStoreBackendDefaults(root_directory=root_directory)
context_config = DataContextConfig(store_backend_defaults=store_backend_defaults)
ge_context = BaseDataContext(project_config=context_config)

### Add Pandas Datasource

In [5]:
# Names under which the datasource and its data connector are registered;
# the batch request further down refers to the same two names.
datasource_name = "pandas_datasource"
data_connector_name = "default_runtime_data_connector_name"

In [6]:
# Datasource definition: a pandas execution engine plus a runtime data connector.
# The name variables are used directly; they are already strings, so wrapping
# them in f-strings added nothing.
datasource_config = {
    "name": datasource_name,
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        data_connector_name: {
            "class_name": "RuntimeDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            # Identifier keys that the batch request supplies in `batch_identifiers`.
            "batch_identifiers": ["default_identifier_name"],
        },
    },
}

In [7]:
# Register the datasource with the context; the config dict is unpacked into keyword arguments.
ge_context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x7f3deadfa850>

### Create Expectation Suite

In [8]:
# Create the (initially empty) suite. overwrite_existing=True is passed so the
# cell can be re-run when a suite with this name already exists.
ge_context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name, overwrite_existing=True
)

{
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.15.41"
  },
  "ge_cloud_id": null,
  "expectations": [],
  "expectation_suite_name": "test_suite"
}

### Get Data Batch

In [9]:
# Load the CSV at data_path into a DataFrame; it is passed to the batch request as in-memory data.
df = pd.read_csv(data_path)

In [10]:
# The in-memory DataFrame is handed over as the batch data.
runtime_parameters = {"batch_data": df}
# Must provide a value for each identifier key declared on the data connector.
batch_identifiers = {"default_identifier_name": "default_identifier"}

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name=data_connector_name,
    data_asset_name=data_asset_name,
    runtime_parameters=runtime_parameters,
    batch_identifiers=batch_identifiers,
)

### Get Validator

In [11]:
# Obtain a validator for the batch and the named expectation suite;
# the expectations in the next cell are added through it.
validator = ge_context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

### Add Expectations

In [12]:
# Neither sepal column may contain nulls.
validator.expect_column_values_to_not_be_null(column="sepal length (cm)")
validator.expect_column_values_to_not_be_null(column="sepal width (cm)")
# Value-range checks.
# NOTE(review): the validation run later in this notebook reports validated=False;
# presumably the max_value=5 bound on sepal length is intentionally tight so the
# data doc shows a failing expectation — confirm.
validator.expect_column_values_to_be_between(
    column="sepal length (cm)", min_value=0, max_value=5
)
# Last expression in the cell, so its result is what the notebook displays.
validator.expect_column_values_to_be_between(
    column="sepal width (cm)", min_value=2, max_value=4.4
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

### Save Expectation Suite

In [13]:
# Save the suite; discard_failed_expectations=False is passed so that expectations
# which did not pass on this batch are still kept in the saved suite.
validator.save_expectation_suite(discard_failed_expectations=False)

### Create and Build Validation Function

In [14]:
# Register the validation handler from validate_great_expectations.py as an
# MLRun job function in the project, then apply MLRun's auto-mount modifier.
fn = project.set_function(
    name="validate_expectations",
    func="validate_great_expectations.py",
    kind="job",
    handler="validate_expectations",
    requirements="requirements.txt",
    image="mlrun/mlrun",
    # NOTE(review): once the build step below has run, the built image
    # ".mlrun/func-great-expectations-validate-expectations:latest" could presumably
    # be set here instead of the base image to skip rebuilding — confirm.
).apply(mlrun.auto_mount())

In [15]:
# Export the function spec; the log output shows it is saved to function.yaml.
fn.export()

> 2023-03-03 22:09:01,338 [info] function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f3deada3350>

In [16]:
# Build the function's image; the build log shows the requirements being
# pip-installed on top of the base image.
project.build_function(fn)

> 2023-03-03 22:09:13,830 [info] Started building image: .mlrun/func-great-expectations-validate-expectations:latest
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:1.1.0  
[36mINFO[0m[0000] Retrieving image mlrun/mlrun:1.1.0 from registry index.docker.io 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:1.1.0  
[36mINFO[0m[0000] Returning cached image manifest              
[36mINFO[0m[0000] Executing 0 build triggers                   
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN python -m pip install great-expectations==0.15.41 requires it. 
[36mINFO[0m[0021] RUN python -m pip install great-expectations==0.15.41 
[36mINFO[0m[0021] Taking snapshot of full filesystem...        
[36mINFO[0m[0033] cmd: /bin/sh                                 
[36mINFO[0m[0033] args: [-c python -m pip install great-expectations==0.15.41] 
[36mINFO[0m[0033] Running: [/bin/sh -c python -m pip install gre

BuildStatus(ready=True, outputs={'image': '.mlrun/func-great-expectations-validate-expectations:latest'})

### Run Validation

In [17]:
# Run the validation job. The input and parameters reuse the variables from the
# config cell instead of repeating their literal values, so the job always
# validates the same data, suite, and asset that were set up above.
run = fn.run(
    inputs={"data": data_path},
    params={
        "expectation_suite_name": expectation_suite_name,
        "data_asset_name": data_asset_name,
    },
)

> 2023-03-03 22:10:40,839 [info] starting run validate-expectations-validate_expectations uid=436372d741034d678145c63fecfe4450 DB=http://mlrun-api:8080
> 2023-03-03 22:10:41,124 [info] Job is running in the background, pod: validate-expectations-validate-expectations-tx9xb
> 2023-03-03 22:10:55,088 [info] run executed, status=completed
Calculating Metrics: 100%|██████████| 19/19 [00:00<00:00, 323.87it/s]
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
great-expectations,...ecfe4450,0,Mar 03 22:10:51,completed,validate-expectations-validate_expectations,v3io_user=nickkind=jobowner=nickmlrun/client_version=1.1.0host=validate-expectations-validate-expectations-tx9xb,data,expectation_suite_name=test_suitedata_asset_name=iris_dataset,validated=False,validation_results





> 2023-03-03 22:11:01,178 [info] run executed, status=completed


### View Data Doc

In [18]:
import os
from IPython.display import IFrame

In [19]:
# Embed the run's "validation_results" output in an iframe. os.path.relpath turns
# the path into one relative to the current working directory.
# NOTE(review): presumably this output is the rendered HTML data doc — confirm.
IFrame(src=os.path.relpath(run.outputs["validation_results"]), width=1000, height=800)