## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations
### Profile a CSV dataset containing customer information to inspect distribution patterns of 'Age' and 'Income' columns.
- Load the dataset using Great Expectations and create a data context.
- Generate a data asset to inspect the summary statistics.
- View the generated expectation suite to analyze data distributions.

In [3]:
import os
import pandas as pd
import great_expectations as ge
from great_expectations.data_context.types.base import DataContextConfig, FilesystemStoreBackendDefaults
import traceback

def setup_ge_context(context_root="great_expectations"):
    try:
        if not os.path.exists(context_root):
            context_config = DataContextConfig(
                store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=context_root)
            )
            context = ge.data_context.DataContext(project_config=context_config)
        else:
            context = ge.data_context.DataContext(context_root)
        return context
    except Exception as e:
        print("Failed to initialize Great Expectations context.")
        traceback.print_exc()
        raise

def add_datasource(context):
    try:
        datasource_config = {
            "name": "pandas_datasource",
            "class_name": "Datasource",
            "execution_engine": {"class_name": "PandasExecutionEngine"},
            "data_connectors": {
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["id"]
                }
            }
        }
        context.add_datasource(**datasource_config)
    except Exception as e:
        print("Failed to add datasource.")
        traceback.print_exc()
        raise

def validate_date_column(df, context):
    if df.empty:
        raise ValueError("DataFrame is empty. Cannot perform validation.")

    batch_request = {
        "datasource_name": "pandas_datasource",
        "data_connector_name": "runtime_data_connector",
        "data_asset_name": "date_validation_data",
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"id": "batch_1"}
    }

    suite_name = "date_format_validation_suite"
    context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
    validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

    # Add regex validation for date format: YYYY-MM-DD
    validator.expect_column_values_to_match_regex("Date", r"^\d{4}-\d{2}-\d{2}$")
    validator.save_expectation_suite()

    # Run validation
    results = validator.validate()

    print("\nValidation Results:")
    for res in results["results"]:
        print(f"Expectation: {res['expectation_config']['expectation_type']}")
        print(f"Success: {res['success']}")
        print(f"Unexpected Values: {res['result'].get('unexpected_values')}\n")

    return results

# Testable function
def test_validation_result():
    # Create test data
    test_data = pd.DataFrame({
        "CustomerID": [1, 2, 3],
        "Date": ["2024-01-15", "2024-13-01", "15-07-2024"]
    })

    context = setup_ge_context()
    add_datasource(context)
    results = validate_date_column(test_data, context)

    # Assert that the validation detects failures
    assert not results["success"], "Validation should fail due to bad date formats."
    assert results["results"][0]["result"]["unexpected_count"] == 2

if __name__ == "__main__":
    try:
        test_validation_result()
        print("✅ Test passed successfully.")
    except AssertionError as e:
        print("❌ Test failed:", e)
    except Exception as e:
        print("❌ Unexpected error during test:", e)

Failed to initialize Great Expectations context.
❌ Unexpected error during test: module 'great_expectations.data_context' has no attribute 'DataContext'


Traceback (most recent call last):
  File "/tmp/ipykernel_3139/204418925.py", line 13, in setup_ge_context
    context = ge.data_context.DataContext(project_config=context_config)
AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'. Did you mean: 'data_context'?


2. Writing Validation Rules for Data Ingestion
### Write validation rules for a CSV file to ensure the 'Date' column follows a specific date format.
- Utilize expect_column_values_to_match_regex to enforce date format validation.
- Run the validation and interpret the output.

In [2]:
# write your code from here

import pandas as pd
import great_expectations as ge
from great_expectations.data_context.types.base import DataContextConfig, FilesystemStoreBackendDefaults
import os

# Step 1: Simulate a DataFrame with 'Date' column
data = {
    "CustomerID": [1, 2, 3, 4],
    "Date": ["2024-05-01", "2024-06-15", "2024-13-01", "15-07-2024"]  # includes incorrect formats
}
df = pd.DataFrame(data)

# Step 2: Set up a Great Expectations Data Context
context_root_dir = "great_expectations"
if not os.path.exists(context_root_dir):
    context_config = DataContextConfig(
        store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=context_root_dir)
    )
    context = ge.data_context.DataContext(project_config=context_config)
else:
    context = ge.data_context.DataContext(context_root_dir)

# Step 3: Add an in-memory Pandas datasource
datasource_config = {
    "name": "pandas_datasource",
    "class_name": "Datasource",
    "execution_engine": {"class_name": "PandasExecutionEngine"},
    "data_connectors": {
        "runtime_data_connector": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["id"]
        }
    }
}
context.add_datasource(**datasource_config)

# Step 4: Create a batch request from the DataFrame
batch_request = {
    "datasource_name": "pandas_datasource",
    "data_connector_name": "runtime_data_connector",
    "data_asset_name": "date_validation_data",
    "runtime_parameters": {"batch_data": df},
    "batch_identifiers": {"id": "batch_1"}
}

# Step 5: Create expectation suite and validator
suite_name = "date_format_validation_suite"
context.create_expectation_suite(expectation_suite_name=suite_name, overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

# Step 6: Define validation rule for date format (YYYY-MM-DD)
validator.expect_column_values_to_match_regex(
    "Date", r"^\d{4}-\d{2}-\d{2}$"
)

# Step 7: Save and run validation
validator.save_expectation_suite()
results = validator.validate()

# Step 8: Print result
print("\nValidation Results:")
for res in results["results"]:
    print(f"Expectation: {res['expectation_config']['expectation_type']}")
    print(f"Success: {res['success']}")
    print(f"Unexpected Values: {res['result'].get('unexpected_values')}\n")

AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'