## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [5]:
import os
import pandas as pd
import great_expectations as ge
from great_expectations.data_context.types.base import DataContextConfig, FilesystemStoreBackendDefaults
import traceback

# -------------------- Setup Functions --------------------
def setup_context():
    """Initializes a Great Expectations context in local directory."""
    context_dir = "great_expectations"
    try:
        if not os.path.exists(context_dir):
            config = DataContextConfig(
                store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=context_dir)
            )
            return ge.data_context.DataContext(project_config=config)
        return ge.data_context.DataContext(context_dir)
    except Exception as e:
        print("❌ Error setting up data context:")
        traceback.print_exc()
        raise

def add_datasource(context):
    """Adds an in-memory pandas datasource."""
    config = {
        "name": "pandas_datasource",
        "class_name": "Datasource",
        "execution_engine": {"class_name": "PandasExecutionEngine"},
        "data_connectors": {
            "runtime_data_connector": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["id"]
            }
        }
    }
    try:
        context.add_datasource(**config)
    except Exception:
        pass  # Ignore if already added

# -------------------- Validation Function --------------------
def validate_sales_data(df, context, suite_name="product_sales_suite"):
    if df.empty:
        raise ValueError("DataFrame is empty. Cannot run validation.")

    batch_request = {
        "datasource_name": "pandas_datasource",
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "sales_data",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"id": "batch_1"}
    }

    # Create or overwrite suite
    context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

    # Profiling expectations
    validator.expect_column_values_to_not_be_null("ProductID")
    validator.expect_column_values_to_not_be_null("Price")

    # Validation rules for ingestion
    validator.expect_column_values_to_be_in_set("Status", ["Active", "Inactive"])

    validator.save_expectation_suite()
    result = validator.validate()

    # Output summary
    print("\n📋 Validation Results:")
    for res in result["results"]:
        print(f"- {res['expectation_config']['expectation_type']} => {'✅' if res['success'] else '❌'}")
        if not res['success']:
            print("  Unexpected:", res['result'].get("unexpected_values", []))
    return result

# -------------------- Unit Test Function --------------------
def test_sales_validation():
    """Simulated unit test for validating edge cases."""
    print("\n🔎 Running Unit Test...")

    # Simulated data with edge cases
    test_data = pd.DataFrame([
        {"ProductID": 1, "Price": 20.0, "Status": "Active"},
        {"ProductID": None, "Price": 25.0, "Status": "Active"},         # Null ProductID
        {"ProductID": 3, "Price": None, "Status": "Inactive"},          # Null Price
        {"ProductID": 4, "Price": 15.0, "Status": "Unknown"},           # Invalid Status
    ])

    context = setup_context()
    add_datasource(context)
    results = validate_sales_data(test_data, context)

    # Test assertions
    assert not results["success"], "Test should fail due to nulls and bad 'Status'"
    assert any("expect_column_values_to_be_in_set" in r['expectation_config']['expectation_type']
               and not r['success'] for r in results["results"]), "Invalid status should be caught."
    assert any("expect_column_values_to_not_be_null" in r['expectation_config']['expectation_type']
               and not r['success'] for r in results["results"]), "Null values should be caught."

    print("✅ Unit Test Passed.")

# -------------------- Entry Point --------------------
if __name__ == "__main__":
    try:
        test_sales_validation()
        print("\n✅ All checks passed.")
    except AssertionError as e:
        print("❌ Assertion failed:", e)
    except Exception as e:
        print("❌ Unexpected error:", e)


🔎 Running Unit Test...
❌ Error setting up data context:
❌ Unexpected error: module 'great_expectations.data_context' has no attribute 'DataContext'


Traceback (most recent call last):
  File "/tmp/ipykernel_5560/983198734.py", line 16, in setup_context
    return ge.data_context.DataContext(project_config=config)
AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'. Did you mean: 'data_context'?


2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that 'Status' field contains only predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [6]:
# write your code from here