In [6]:
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import pandas as pd

In [7]:
context = FileDataContext.create(project_root_dir='./')

In [8]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'csv-cleaned-data'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'trip-january'
path_to_data = './data_cleaned.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [9]:
# Creat an expectation suite
expectation_suite_name = 'expectation-cleaned-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,retail_sales_people,product_id,category,sub-category,product_name,returned,sales,quantity,discount,profit
0,1,CA-2016-152156,2016-08-11,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Cassandra Brandow,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,Not,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-08-11,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Cassandra Brandow,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Not,731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-12-06,2016-12-06,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Anna Andreadi,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,Not,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-11-10,2015-11-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Cassandra Brandow,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,Not,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-11-10,2015-11-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Cassandra Brandow,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,Not,22.368,2,0.2,2.5164


In [10]:
# 1. Column Name Check - All column names should be lowercase nad has no white space
expected_ordered_columns = [col.strip().lower() for col in validator.columns()]
validator.expect_table_columns_to_match_ordered_list(expected_ordered_columns)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "row_id",
      "order_id",
      "order_date",
      "ship_date",
      "ship_mode",
      "customer_id",
      "customer_name",
      "segment",
      "country",
      "city",
      "state",
      "postal_code",
      "region",
      "retail_sales_people",
      "product_id",
      "category",
      "sub-category",
      "product_name",
      "returned",
      "sales",
      "quantity",
      "discount",
      "profit"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# 2. Missing Values Check - No column should have missing values
col_check = [col for col in validator.columns()]
validator.expect_column_values_to_not_be_null(col_check)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [
      "row_id",
      "order_id",
      "order_date",
      "ship_date",
      "ship_mode",
      "customer_id",
      "customer_name",
      "segment",
      "country",
      "city",
      "state",
      "postal_code",
      "region",
      "retail_sales_people",
      "product_id",
      "category",
      "sub-category",
      "product_name",
      "returned",
      "sales"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# 3. Unique row_id Check - Ensure that 'row_id' is unique
validator.expect_column_values_to_be_unique('row_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# 4. Quantity Check - Ensure 'quantity' is greater than or equal to 0
validator.expect_column_values_to_be_between(
    column='quantity', min_value=0
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# 5. Region Check - Ensure valid regions
valid_regions = ['central', 'east', 'west', 'north', 'south']
validator.expect_column_values_to_be_in_set("region", valid_regions)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 9994,
    "unexpected_count": 9994,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "South",
      "South",
      "West",
      "South",
      "South",
      "West",
      "West",
      "West",
      "West",
      "West",
      "West",
      "West",
      "South",
      "West",
      "Central",
      "Central",
      "Central",
      "West",
      "West",
      "West"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [17]:
# 6. Returned Check - Ensure 'returned' column has valid values ('yes', 'no')
validator.expect_column_values_to_be_in_set("returned", ["yes", "no"])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 9994,
    "unexpected_count": 9994,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Not",
      "Yes",
      "Yes"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# 7. Postal Code Check - Ensure 'postal_code' is numeric
validator.expect_column_values_to_be_of_type("postal_code", "int")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [25]:
# 8. Ship Date vs Order Date Check - Ensure 'ship_date' > 'order_date' per 'order_id'
validator.expect_column_pair_values_a_to_be_greater_than_b("ship_date", "order_date",or_equal=True)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
# 9. Postal Code Check - Ensure 'postal_code' is always a 4-5 digits number
validator.expect_column_value_lengths_to_be_between("postal_code", min_value=4,
    max_value=5)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9994,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [29]:
# Save the expectations to the suite
validator.save_expectation_suite(discard_failed_expectations=False)

In [30]:
# Create a checkpoint

checkpoint_1 = context.add_or_update_checkpoint(
    name = 'checkpoint_1',
    validator = validator,
)

In [31]:
# Run a checkpoint

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/61 [00:00<?, ?it/s]

An unexpected Exception occurred during data docs rendering.  Because of this error, certain parts of data docs will not be rendered properly and/or may not appear altogether.  Please use the trace, included in this message, to diagnose and repair the underlying issue.  Detailed information follows:
                TypeError: "unhashable type: 'list'".  Traceback: "Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.9/site-packages/great_expectations/render/renderer/site_builder.py", line 483, in build
    rendered_content = self.renderer_class.render(resource)
  File "/opt/miniconda3/lib/python3.9/site-packages/great_expectations/render/renderer/page_renderer.py", line 677, in render
    ) = expectations.get_grouped_and_ordered_expectations_by_column()
  File "/opt/miniconda3/lib/python3.9/site-packages/great_expectations/core/expectation_suite.py", line 986, in get_grouped_and_ordered_expectations_by_column
    if column not in expectations_by_column:
TypeError: un