In [8]:
import pandas as pd
import great_expectations as gx

In [13]:
# Location of the January 2023 yellow-taxi trip data, relative to the notebook.
# A forward slash is used as the separator: the previous backslash form relied
# on an invalid escape sequence and only resolved on Windows.
# NOTE: the variable keeps its historical name even though the file is Parquet.
csv_file_path = "data/yellow_tripdata_2023-01.parquet"

# Load the whole file into a pandas DataFrame and preview the first rows.
df = pd.read_parquet(csv_file_path)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [24]:
# Name of the Expectation Suite used throughout the notebook
expectation_suite_name = "suite_expectations"

# Create the Data Context
context = gx.get_context()

# Register (or refresh) the pandas Data Source
dataframe_datasource = context.sources.add_or_update_pandas(name="my_pandas_in_memory")

# Attach the loaded DataFrame to the Data Source as a Data Asset
dataframe_asset = dataframe_datasource.add_dataframe_asset(name="yellow_tripdata", dataframe=df)

# Batch Request built from the Data Asset
batch_request = dataframe_asset.build_batch_request()

# Create (or refresh) the suite, then get a Validator for the batch and suite
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows through the Validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [31]:
# Show the concrete class of each Great Expectations object created so far
for gx_object in (dataframe_datasource, dataframe_asset, batch_request, context, validator):
    print(type(gx_object))

<class 'great_expectations.datasource.fluent.pandas_datasource.PandasDatasource'>
<class 'great_expectations.datasource.fluent.pandas_datasource.DataFrameAsset'>
<class 'great_expectations.datasource.fluent.batch_request.BatchRequest'>
<class 'great_expectations.data_context.data_context.ephemeral_data_context.EphemeralDataContext'>
<class 'great_expectations.validator.validator.Validator'>


In [26]:
# EXPECTATIONS
# passenger_count must not contain nulls
validator.expect_column_values_to_not_be_null(column="passenger_count")

# congestion_surcharge must fall between 0 and 1000
validator.expect_column_values_to_be_between(
    column="congestion_surcharge",
    min_value=0,
    max_value=1000,
)

# Save the Expectation Suite to the Expectation Store without discarding
# expectations that failed
validator.save_expectation_suite(discard_failed_expectations=False)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [27]:
# Run the congestion_surcharge range check on its own to inspect its detailed result
validator.expect_column_values_to_be_between(
    column="congestion_surcharge",
    min_value=0,
    max_value=1000,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 3066766,
    "unexpected_count": 19718,
    "unexpected_percent": 0.6583588840553144,
    "partial_unexpected_list": [
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5,
      -2.5
    ],
    "missing_count": 71743,
    "missing_percent": 2.3393698769322473,
    "unexpected_percent_total": 0.6429574346396171,
    "unexpected_percent_nonmissing": 0.6583588840553144
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

You'll create and store a Checkpoint for your batch, which you can use to validate your data and to run post-validation actions.

Run the following command to create the Checkpoint configuration that uses your Data Context, passes in your Batch Request (your data) and your Expectation Suite (your tests):



In [28]:
# CHECKPOINTS - used to run a validation and its post-validation actions
my_checkpoint_name = "my_checkpoint"

checkpoint = gx.checkpoint.Checkpoint(
    data_context=context,
    name=my_checkpoint_name,
    run_name_template="%Y%m%d-%H%M%S-my-run-name-template",
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
    # Actions listed for the checkpoint: store the validation result,
    # then update the Data Docs
    action_list=[
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "update_data_docs",
            "action": {"class_name": "UpdateDataDocsAction"},
        },
    ],
)

# Save the checkpoint in the Data Context
context.add_or_update_checkpoint(checkpoint=checkpoint)

# Run the checkpoint and keep its result for later inspection
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
# Render the checkpoint's configuration as YAML and print it
checkpoint_yaml = checkpoint.get_config().to_yaml_str()
print(checkpoint_yaml)

name: my_checkpoint
config_version: 1.0
template_name:
module_name: great_expectations.checkpoint
class_name: Checkpoint
run_name_template: '%Y%m%d-%H%M%S-my-run-name-template'
expectation_suite_name: suite_expectations
batch_request:
  datasource_name: my_pandas_in_memory
  data_asset_name: yellow_tripdata
  options: {}
  batch_slice:
action_list:
  - name: store_validation_result
    action:
      class_name: StoreValidationResultAction
  - name: update_data_docs
    action:
      class_name: UpdateDataDocsAction
evaluation_parameters: {}
runtime_configuration: {}
validations: []
profilers: []
ge_cloud_id:
expectation_suite_ge_cloud_id:



# Build and view Data Docs

Your Checkpoint contained an UpdateDataDocsAction, so your Data Docs have already been built from the validation you ran and your Data Docs store contains a new rendered validation result.

In [None]:
# Build and view Data Docs
# The previous version opened a hard-coded "file:///" URL from an earlier
# session with open() (which expects a filesystem path, not a URL) and
# rendered it with displayHTML, which is not defined in this notebook.
# Instead, ask the Data Context for the page of the validation that was just run.

# Identifier of the validation result produced by the checkpoint run above
validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0]

# URL of the rendered validation page in the first configured Data Docs site
docs_sites = context.get_docs_sites_urls(resource_identifier=validation_result_identifier)
html = docs_sites[0]["site_url"]

# Open that page in the default web browser
context.open_data_docs(resource_identifier=validation_result_identifier)

# Show the URL as the cell output for reference
html

In [29]:
# Display the Data Context's configuration
context

{
  "anonymous_usage_statistics": {
    "data_context_id": "3685ce0c-9d66-4300-b0e7-39380cc69885",
    "enabled": true,
    "explicit_url": false,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "explicit_id": true
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "C:\\Users\\RosaLux\\AppData\\Local\\Temp\\tmpmp5njnpx"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {},
  "include_rendered_content": {
    "expectation_suite": false,
    "globally": false,
    "expectation

In [30]:
# Display the result returned by the checkpoint run
checkpoint_result

{
  "run_id": {
    "run_name": "20231024-224813-my-run-name-template",
    "run_time": "2023-10-24T19:48:13.615139-03:00"
  },
  "run_results": {
    "ValidationResultIdentifier::suite_expectations/20231024-224813-my-run-name-template/20231024T224813.615139Z/my_pandas_in_memory-yellow_tripdata": {
      "validation_result": {
        "success": false,
        "results": [
          {
            "success": false,
            "expectation_config": {
              "expectation_type": "expect_column_values_to_not_be_null",
              "kwargs": {
                "column": "passenger_count",
                "batch_id": "my_pandas_in_memory-yellow_tripdata"
              },
              "meta": {}
            },
            "result": {
              "element_count": 3066766,
              "unexpected_count": 71743,
              "unexpected_percent": 2.3393698769322473,
              "partial_unexpected_list": [
                null,
                null,
                null,
         