In [1]:
import mlrun
import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    FilesystemStoreBackendDefaults,
)

In [2]:
# Load the "great-expectations" MLRun project, creating it if it does not exist yet;
# the current directory is used as the project context.
project = mlrun.get_or_create_project("great-expectations", context="./")

> 2022-03-17 17:07:26,901 [info] loaded project great-expectations from MLRun DB


### Config

In [3]:
# Name under which the data is referenced in batch requests.
data_asset_name = "iris_dataset"
# Remote CSV that is loaded into a DataFrame and validated below.
data_path = "https://s3.wasabisys.com/iguazio/data/iris/iris.data.raw.csv"
# Name of the expectation suite that is created, populated and saved in this notebook.
expectation_suite_name = "test_suite"
# Root directory handed to the Great Expectations filesystem store defaults, one per project.
# NOTE(review): the /v3io prefix presumably requires an Iguazio v3io mount — confirm for other environments.
root_directory = f"/v3io/projects/{project.name}/great_expectations"

### Initialize Great Expectations Context

In [4]:
# Build the context programmatically, step by step: filesystem-backed store
# defaults rooted at `root_directory` -> project config -> data context.
store_backend_defaults = FilesystemStoreBackendDefaults(root_directory=root_directory)
project_config = DataContextConfig(store_backend_defaults=store_backend_defaults)
ge_context = BaseDataContext(project_config=project_config)

### Add Pandas Datasource

In [5]:
# Names shared by the datasource configuration and the batch request further down;
# both must use the same values to refer to the same datasource / connector.
datasource_name = "pandas_datasource"
data_connector_name = "default_runtime_data_connector_name"

In [6]:
# Datasource definition that is expanded into keyword arguments for
# `ge_context.add_datasource`: a pandas execution engine plus a single runtime
# data connector, so batches are passed in as in-memory data at request time.
datasource_config = {
    # The name variables are already plain strings, so they are used directly
    # instead of being wrapped in f-strings.
    "name": datasource_name,
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        data_connector_name: {
            "class_name": "RuntimeDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            # Identifier key(s) that batch requests for this connector provide
            # (the RuntimeBatchRequest below uses the same key).
            "batch_identifiers": ["default_identifier_name"],
        },
    },
}

In [7]:
# Register the datasource on the context; the config dict is expanded into keyword arguments.
# As the last expression of the cell, the returned Datasource object is displayed.
ge_context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x7f233f9d7a50>

### Create Expectation Suite

In [8]:
# Create the (initially empty) suite that the expectations below are added to.
# overwrite_existing=True: presumably lets the cell be re-run when a suite with
# this name already exists — confirm against the Great Expectations docs.
ge_context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name, overwrite_existing=True
)

{
  "meta": {
    "great_expectations_version": "0.14.9"
  },
  "data_asset_type": null,
  "ge_cloud_id": null,
  "expectations": [],
  "expectation_suite_name": "test_suite"
}

### Get Data Batch

In [9]:
# Read the CSV at `data_path` (an HTTPS URL, so this needs network access) into a DataFrame.
df = pd.read_csv(data_path)

In [10]:
# The DataFrame itself is the batch data; nothing is read from disk by the connector.
runtime_parameters = {"batch_data": df}
# Key matches the "batch_identifiers" entry declared in the datasource config.
batch_identifiers = {"default_identifier_name": "default_identifier"}

# Tie the in-memory batch to the datasource / connector registered above.
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name=data_connector_name,
    data_asset_name=data_asset_name,
    runtime_parameters=runtime_parameters,
    batch_identifiers=batch_identifiers,
)

### Get Validator

In [11]:
# Obtain a validator for the batch request and the named expectation suite;
# the expectations in the next cell are declared through this object.
validator = ge_context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

### Add Expectations

In [12]:
# Null checks on both sepal columns.
validator.expect_column_values_to_not_be_null(column="sepal length (cm)")
validator.expect_column_values_to_not_be_null(column="sepal width (cm)")
# Range checks on both sepal columns.
# NOTE(review): an upper bound of 5 looks low for iris sepal length — presumably a
# deliberately failing expectation (the suite is later saved with
# discard_failed_expectations=False); confirm this is intended.
validator.expect_column_values_to_be_between(
    column="sepal length (cm)", min_value=0, max_value=5
)
# Only this last call's validation result is displayed as the cell output.
validator.expect_column_values_to_be_between(
    column="sepal width (cm)", min_value=2, max_value=4.4
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "success": true,
  "result": {
    "element_count": 150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Save Expectation Suite

In [13]:
# Persist the suite built on the validator. discard_failed_expectations=False asks
# for expectations to be kept in the saved suite even if they failed on this batch.
validator.save_expectation_suite(discard_failed_expectations=False)