## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [2]:
# write your code from here

import pandas as pd
import great_expectations as ge
from great_expectations.data_context.types.base import DataContextConfig, FilesystemStoreBackendDefaults
import os
import traceback

# Step 1: Simulated JSON-like data (API + sales)
sales_data = [
    {"ProductID": 101, "Price": 29.99, "Status": "Active"},
    {"ProductID": 102, "Price": 39.99, "Status": "Inactive"},
    {"ProductID": None, "Price": 49.99, "Status": "Active"},
    {"ProductID": 104, "Price": None, "Status": "Pending"},  # invalid
    {"ProductID": 105, "Price": 19.99, "Status": "Active"},
]
df = pd.DataFrame(sales_data)

# Step 2: Setup Great Expectations Data Context
def setup_context():
    context_dir = "great_expectations"
    if not os.path.exists(context_dir):
        config = DataContextConfig(
            store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=context_dir)
        )
        context = ge.data_context.DataContext(project_config=config)
    else:
        context = ge.data_context.DataContext(context_dir)
    return context

# Step 3: Add in-memory Pandas datasource
def add_datasource(context):
    datasource_config = {
        "name": "pandas_datasource",
        "class_name": "Datasource",
        "execution_engine": {"class_name": "PandasExecutionEngine"},
        "data_connectors": {
            "runtime_data_connector": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["id"]
            }
        }
    }
    try:
        context.add_datasource(**datasource_config)
    except Exception:
        pass  # Already exists

# Step 4: Run validations
def run_validations(df, context):
    if df.empty:
        raise ValueError("DataFrame is empty.")

    batch_request = {
        "datasource_name": "pandas_datasource",
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "product_sales",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"id": "batch_1"}
    }

    suite_name = "product_sales_suite"
    context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

    # Profiling rules
    validator.expect_column_values_to_not_be_null("ProductID")
    validator.expect_column_values_to_not_be_null("Price")

    # API data validation: 'Status' must be one of predefined values
    validator.expect_column_values_to_be_in_set("Status", ["Active", "Inactive"])

    validator.save_expectation_suite()
    results = validator.validate()

    print("\n📊 Validation Summary:")
    for r in results["results"]:
        expectation = r["expectation_config"]["expectation_type"]
        success = r["success"]
        failed_values = r["result"].get("unexpected_values", [])
        print(f"- {expectation} => {'✅ Passed' if success else '❌ Failed'}")
        if not success:
            print(f"  Unexpected: {failed_values}")
    return results

# Step 5: Run everything
if __name__ == "__main__":
    try:
        context = setup_context()
        add_datasource(context)
        result = run_validations(df, context)
        print("\n✅ Validation complete.")
    except Exception as e:
        print("❌ Error during validation:")
        traceback.print_exc()

❌ Error during validation:


Traceback (most recent call last):
  File "/tmp/ipykernel_5560/519439083.py", line 89, in <module>
    context = setup_context()
  File "/tmp/ipykernel_5560/519439083.py", line 26, in setup_context
    context = ge.data_context.DataContext(project_config=config)
AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'. Did you mean: 'data_context'?


2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that 'Status' field contains only predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [None]:
# write your code from here