## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [3]:
# write your code from here

import great_expectations as gx
import pandas as pd
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.exceptions import DataContextError

def setup_context(context_root_dir="great_expectations"):
    """
    Sets up the Great Expectations Data Context. Handles potential errors during context creation.

    The context is built through the ``gx.get_context`` factory when the
    installed package provides it; the ``gx.DataContext`` class is only used
    as a fallback for installations that lack the factory.
    NOTE(review): recent GX releases appear to no longer export
    ``DataContext`` at the package top level -- confirm against the
    installed version's changelog.

    Args:
        context_root_dir: Directory of the Great Expectations project.

    Returns:
        The created Data Context, or ``None`` when creation failed. The error
        is printed instead of raised, so callers must check for ``None``.
    """
    try:
        # Resolved inside the ``try`` so that a package exposing neither entry
        # point is reported the same way as any other setup failure.
        make_context = getattr(gx, "get_context", None) or gx.DataContext
        return make_context(context_root_dir=context_root_dir)
    except Exception as e:
        print(f"Error creating Data Context: {e}")
        return None

def add_datasource(context, datasource_name, csv_file_path):
    """
    Registers a pandas datasource on the given context.

    Args:
        context: Object exposing ``add_datasource(**kwargs)``.
        datasource_name: Name the datasource is registered under.
        csv_file_path: Passed as the ``glob`` of the "default" batch kwargs
            generator.

    Returns:
        True once ``context.add_datasource`` has completed.

    Raises:
        Exception: Whatever ``context.add_datasource`` raised; the error is
            printed first and then handed back to the caller unchanged.
    """
    # The generator looks in the current directory, on the assumption that
    # the CSV lives there.
    # NOTE(review): the "pandas" / "glob_reader" class names may not match the
    # class names the installed GX version expects -- confirm.
    generator_settings = {
        "class_name": "glob_reader",
        "base_directory": ".",
        "glob": csv_file_path,
    }
    datasource_options = {
        "name": datasource_name,
        "class_name": "pandas",
        "module_name": "great_expectations.datasource",
        "batch_kwargs_generators": {"default": generator_settings},
    }

    try:
        context.add_datasource(**datasource_options)
    except Exception as err:
        print(f"Error adding datasource: {err}")
        # Propagate the same exception so the caller decides how to react.
        raise err
    return True

def run_validation(context, datasource_name, data_asset_name, expectation_suite_name):
    """
    Fetches a batch and validates it against the named expectation suite.

    Args:
        context: Object exposing ``get_batch`` and ``run_validation``.
        datasource_name: Name of the datasource to read from.
        data_asset_name: Name of the data asset to validate.
        expectation_suite_name: Name of the suite to validate against.

    Returns:
        The ``"success"`` entry of the validation results. ``False`` is also
        returned when the batch's dataframe is empty or when any exception is
        raised along the way (the exception is printed, not propagated).
    """
    # NOTE(review): ``get_batch`` with batch kwargs and ``run_validation`` with
    # a batch request look like they belong to different GX API generations --
    # confirm both exist on the context type in use.
    try:
        batch = context.get_batch(
            {"datasource": datasource_name, "dataset_name": data_asset_name},
            expectation_suite_name,
        )

        # Nothing to validate: report failure without running the suite.
        if batch.dataframe.empty:
            print("DataFrame is empty. Skipping validation.")
            return False

        request = {
            "datasource_name": datasource_name,
            "data_asset_name": data_asset_name,
        }
        results = context.run_validation(
            batch_request=request,
            expectation_suite_name=expectation_suite_name,
            checkpoint_name="my_checkpoint",
        )

        outcome = results["success"]
        print("Validation successful." if outcome else "Validation failed.")
        return outcome
    except Exception as err:
        print(f"Error during validation: {err}")
        return False

def create_expectation_suite(context, expectation_suite_name, dataframe):
    """
    Creates or retrieves an expectation suite and adds expectations.

    Adds one ``expect_column_values_to_not_be_null`` expectation per column of
    ``dataframe`` plus an ``expect_column_values_to_be_in_set`` expectation on
    the ``Status`` column, then saves the suite through the context.

    Args:
        context: Data context used to load, create and save the suite.
        expectation_suite_name: Name of the suite to load or create.
        dataframe: Object whose ``columns`` are iterated to build the
            not-null expectations.

    Returns:
        The expectation suite after the expectations were added and saved.
    """
    try:
        expectation_suite = context.get_expectation_suite(expectation_suite_name)
    except DataContextError:
        expectation_suite = context.create_expectation_suite(
            expectation_suite_name, overwrite_existing=True
        )
        print(f"Created a new ExpectationSuite `{expectation_suite_name}`.")
    else:
        # The requested name is reported rather than an attribute of the
        # suite object, so the message does not depend on how the suite
        # class spells its name attribute.
        print(
            f"Loaded existing ExpectationSuite `{expectation_suite_name}` "
            f"containing {len(expectation_suite.expectations)} expectations."
        )

    # Expectation 1: Check for null values in any column.
    # A fresh configuration is built for every column: reusing one object and
    # mutating its kwargs would make every stored entry point at the last
    # column.
    for col in dataframe.columns:
        expectation_suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_not_be_null",
                kwargs={"column": col},
            )
        )

    # Expectation 2: Validate 'Status' field values (example)
    expectation_suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Status",
                "value_set": ["Active", "Inactive", "Pending"],
                "mostly": 1.0,  # Expect all values to be in the set
            },
        )
    )

    context.save_expectation_suite(expectation_suite)
    return expectation_suite


if __name__ == "__main__":
    # Demo driver: writes a small CSV, registers it as a datasource, builds
    # the expectation suite and runs the validation.

    # Configuration
    context_root_dir = "great_expectations"
    datasource_name = "my_pandas_datasource"
    data_asset_name = "my_data_asset"
    csv_file_path = "data.csv"  # Replace with your actual CSV file
    expectation_suite_name = "my_expectation_suite"

    # Create a dummy data.csv for demonstration. 'Value' contains one missing
    # entry so the not-null expectation has something to report.
    data = {
        'ID': [1, 2, 3, 4, 5],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Status': ['Active', 'Inactive', 'Active', 'Pending', 'Active'],
        'Value': [10, None, 30, 40, 50],
    }
    df = pd.DataFrame(data)
    df.to_csv("data.csv", index=False)

    # 1. Setup Data Context
    context = setup_context(context_root_dir)
    if context is None:
        # Exit with a non-zero status so the failure is visible to the
        # caller; the interactive ``exit()`` helper would report success.
        raise SystemExit(1)

    # 2. Add Datasource
    try:
        add_datasource(context, datasource_name, csv_file_path)
    except Exception:
        print("Failed to add datasource, exiting.")
        raise SystemExit(1) from None

    # Load dataframe
    dataframe = pd.read_csv(csv_file_path)

    # 3. Create/Load Expectation Suite
    create_expectation_suite(context, expectation_suite_name, dataframe)

    # 4. Run Validation
    validation_result = run_validation(
        context, datasource_name, data_asset_name, expectation_suite_name
    )

    if validation_result:
        print("Data validation was successful.")
    else:
        print("Data validation failed.")

ModuleNotFoundError: No module named 'great_expectations.core.expectation_configuration'

2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that the 'Status' field contains only predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [None]:
# write your code from here