In [1]:
# Install the library

!pip install -q "great-expectations==0.18.19"

In [2]:
# Create a data context

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

In [3]:
# Give uniques name to the Datasource
datasource_name = 'csv-electronics-inventory'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'electronics-inventory-data'
path_to_data = 'H:/Hacktiv8/p2-ftds042-rmt-m3-nathanaelzefanya/P2M3_nathanael_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [None]:
# Create an expectation suite
expectation_suite_name = 'expectation-electronics-inventory'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,product_id,product_name,product_category,product_description,price,stock_quantity,warranty_period,product_dimensions,manufacturing_date,expiration_date,sku,product_tags,color_size_variations,product_ratings
0,93TGNAY7,Laptop,Home Appliances,Product_XU5QX,253.17,3,2,16x15x15 cm,2023-01-01,2026-01-01,8NMFZ4,"VNU,NZ6",Green/Large,2
1,TYYZ5AV7,Smartphone,Clothing,Product_NRUMS,214.37,92,2,15x19x19 cm,2023-03-15,2025-01-01,7P5YCW,"ZJA,0D3",Red/Small,2
2,5C94FGTQ,Headphones,Clothing,Product_IT7HG,475.29,19,2,9x6x6 cm,2023-03-15,2026-01-01,YW5BME,"ZNG,MAP",Red/Small,1
3,XBHKYPQB,Monitor,Clothing,Product_8SBDO,403.33,40,1,7x13x5 cm,2023-01-01,2026-01-01,65MQC3,"RPP,M40",Green/Large,1
4,728GCZFU,Laptop,Home Appliances,Product_54FAF,229.81,32,2,20x20x19 cm,2023-07-30,2026-01-01,RLCBRW,"R8U,X46",Blue/Medium,4


We have already created an expectation suite named 'expectation-electronics-sales', a validator based on the expectation suite, and checked the validator's data.

In [7]:
# Expectation 1 : Column `product_id` have to be unique

validator.expect_column_values_to_be_unique('product_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

This expectation ensures `product_id` must be unique to identify certain items in the inventory dataset.

In [8]:
# Expectation 2 : Column `product_ratings` must between min_value and max_value

validator.expect_column_values_to_be_between(column='product_ratings',min_value=0,max_value=5.0)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

This expectation ensures the values in `product_ratings` are within valid range of 0 to 5.0 to determine customer satisfaction on products, which can lead to analyse each products performances.

In [None]:
# Expectation 3 : Column `product_name` must contain one of the following 4 things :
# 1. Laptop
# 2. Smartphones
# 3. Monitor
# 4. Headphones

validator.expect_column_values_to_be_in_set('product_name', ['Laptop', 'Smartphones', 'Monitor', 'Headphones'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 10000,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Laptop",
      "Smartphone",
      "Headphones",
      "Monitor",
      "Laptop",
      "Smartphone",
      "Headphones",
      "Monitor",
      "Monitor",
      "Smartphone",
      "Monitor",
      "Headphones",
      "Smartphone",
      "Monitor",
      "Laptop",
      "Smartphone",
      "Monitor",
      "Monitor",
      "Headphones",
      "Smartphone"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {}
}

This expectations ensure the data are specifically integrated to verify that product's names are limited to one of the four products.

In [10]:
# Expectation 4 : Column 'price' must in type list

validator.expect_column_values_to_be_in_type_list('price', ['float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {}
}

This expectations ensure the input for `price` are using float type for validating product cost and other financial analysis.

In [11]:
# Expectation 5 : Column 'sku' must match Regex Pattern

validator.expect_column_values_to_match_regex('sku', '^[A-Z0-9]{6}$')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

This expectations ensure that all values in the 'sku' column consist of 6 random-alphanumerics.

In [12]:
# Expectation 6 : Column 'expiration_date' must match strftime format

validator.expect_column_values_to_match_strftime_format('expiration_date', '%Y-%m-%d')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

This expectations ensure all values in the 'expiration_date' column conform to a specific date format.

In [13]:
# Expectation 7 : Column 'product_name' must be in set

validator.expect_column_most_common_value_to_be_in_set('product_name', ['Laptop', 'Smartphone', 'Monitor', 'Headphones'])

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      "Headphones"
    ]
  },
  "meta": {}
}

This expectations ensure the set of acceptable product to be in the list of inventory ('Laptop', 'Smartphone', 'Monitor', 'Headphones').

In [14]:
# Save into Expectation Suite

validator.save_expectation_suite(discard_failed_expectations=False)

Saves the \defined data expectations into an expectation suite.

In [15]:
# Create a checkpoint

checkpoint_1 = context.add_or_update_checkpoint(
    name = 'checkpoint_1',
    validator = validator,
)

Defines a checkpoint called 'checkpoint_1' within the data validation context, linking it to a given validator to automate and simplify the process of running data validation checks.

In [16]:
# Run a checkpoint

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/39 [00:00<?, ?it/s]

Runs a predefined series of data validation checks (called a checkpoint) on dataset to verify it meets certain standards to provide the desire output.

In [17]:
# Build data docs

context.build_data_docs()

{'local_site': 'file://h:\\Hacktiv8\\p2-ftds042-rmt-m3-nathanaelzefanya\\gx\\uncommitted/data_docs/local_site/index.html'}

Creates and updates the dataset’s documentation by producing a summary that highlights the results of data validation and profiling.