In [1]:
pwd

'/home/naveen/mlops/tests'

In [None]:
import great_expectations as gx

# Create context
context = gx.get_context()
context = context.convert_to_file_context()

# Create a Datasource
datasource_name = "local_data"
path_to_folder_containing_csv_files = "../data/"
datasource = context.sources.add_pandas_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)

# Add Data Assets to the Datasource
asset_name = "projects"
batching_regex = "projects.csv"
datasource.add_csv_asset(name=asset_name, batching_regex=batching_regex)
asset_name = "tags"
batching_regex = "tags.csv"
datasource.add_csv_asset(name=asset_name, batching_regex=batching_regex)
asset_name = "labeled_projects"
batching_regex = "labeled_projects.csv"
datasource.add_csv_asset(name=asset_name, batching_regex=batching_regex)

# Create an ExpectationSuite - projects
suite = context.add_expectation_suite(expectation_suite_name="projects")
# Use an existing Data Asset to create a Batch Request
data_asset = context.get_datasource("local_data").get_asset("projects")
batch_request = data_asset.build_batch_request()
# Create a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="projects",
)
# Use the Validator to create and run an Expectation
## Expectations for projects.csv
### Table expectations:
# Presence of features
_ = validator.expect_table_columns_to_match_ordered_list(
    column_list=["id", "created_on", "title", "description"])
_ = validator.expect_compound_columns_to_be_unique(column_list=["title", "description"])  # data leak
## Column expectations:
# id
_ = validator.expect_column_values_to_be_unique(column="id")
# create_on
_ = validator.expect_column_values_to_not_be_null(column="created_on")
_ = validator.expect_column_values_to_match_strftime_format(
    column="created_on", strftime_format="%Y-%m-%d %H:%M:%S")
# title
_ = validator.expect_column_values_to_not_be_null(column="title")
validator.expect_column_values_to_be_of_type(column="title", type_="str")
# description
_ = validator.expect_column_values_to_not_be_null(column="description")
_ = validator.expect_column_values_to_be_of_type(column="description", type_="str")
# Save Expectations for future use
validator.save_expectation_suite(discard_failed_expectations=False)

# Create an ExpectationSuite - tags
suite = context.add_expectation_suite(expectation_suite_name="tags")
# Use an existing Data Asset to create a Batch Request
data_asset = context.get_datasource("local_data").get_asset("tags")
batch_request = data_asset.build_batch_request()
# Create a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="tags",
)
# Use the Validator to create and run an Expectation
## Expectations for tags.csv
### Table expectations:
# Presence of features
_ = validator.expect_table_columns_to_match_ordered_list(
    column_list=["id", "tag"])
## Column expectations:
# id
_ = validator.expect_column_values_to_be_unique(column="id")
# tag
_ = validator.expect_column_values_to_not_be_null(column="tag")
_ = validator.expect_column_values_to_be_of_type(column="tag", type_="str")
# Save Expectations for future use
validator.save_expectation_suite(discard_failed_expectations=False)

# Create an ExpectationSuite - labeled_projects
suite = context.add_expectation_suite(expectation_suite_name="labeled_projects")
# Use an existing Data Asset to create a Batch Request
data_asset = context.get_datasource("local_data").get_asset("labeled_projects")
batch_request = data_asset.build_batch_request()
# Create a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="labeled_projects",
)
# Use the Validator to create and run an Expectation
## Expectations for labeled_projects.csv
### Table expectations:
# Presence of features
_ = validator.expect_table_columns_to_match_ordered_list(
    column_list=["id", "created_on", "title", "description", "tag"])
_ = validator.expect_compound_columns_to_be_unique(column_list=["title", "description"])  # data leak
### Column expectations:
# id
_ = validator.expect_column_values_to_be_unique(column="id")
# create_on
_ = validator.expect_column_values_to_not_be_null(column="created_on")
_ = validator.expect_column_values_to_match_strftime_format(
    column="created_on", strftime_format="%Y-%m-%d %H:%M:%S")
# title
_ = validator.expect_column_values_to_not_be_null(column="title")
_ = validator.expect_column_values_to_be_of_type(column="title", type_="str")
# description
_ = validator.expect_column_values_to_not_be_null(column="description")
_ = validator.expect_column_values_to_be_of_type(column="description", type_="str")
# tag
_ = validator.expect_column_values_to_not_be_null(column="tag")
_ = validator.expect_column_values_to_be_of_type(column="tag", type_="str")
# Save Expectations for future use
validator.save_expectation_suite(discard_failed_expectations=False)

In [None]:
# Create "projects" Checkpoint
datasource_name = "local_data"
asset_name = "projects"
expectation_suite_name="projects"
checkpoint_name = "projects"
batch_request = context.get_datasource(datasource_name).get_asset(asset_name).build_batch_request()
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name=checkpoint_name,
    data_context=context,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name,
        },
    ],
)

context.add_checkpoint(checkpoint=checkpoint)

In [None]:
# Create "tags" Checkpoint
datasource_name = "local_data"
asset_name = "tags"
expectation_suite_name="tags"
checkpoint_name = "tags"
batch_request = context.get_datasource(datasource_name).get_asset(asset_name).build_batch_request()
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name=checkpoint_name,
    data_context=context,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name,
        },
    ],
)

context.add_checkpoint(checkpoint=checkpoint)

In [None]:
# Create "labeled_projects" Checkpoint
datasource_name = "local_data"
asset_name = "labeled_projects"
expectation_suite_name="labeled_projects"
checkpoint_name = "labeled_projects"
batch_request = context.get_datasource(datasource_name).get_asset(asset_name).build_batch_request()
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name=checkpoint_name,
    data_context=context,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name,
        },
    ],
)

context.add_checkpoint(checkpoint=checkpoint)

In [17]:
_ = context.get_checkpoint(name="projects").run()
_ = context.get_checkpoint(name="tags").run()
_ = context.get_checkpoint(name="labeled_projects").run()
context.build_data_docs()

Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/20 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/56 [00:00<?, ?it/s]

{'local_site': 'file:///home/naveen/mlops/tests/great_expectations/uncommitted/data_docs/local_site/index.html'}