### Using Great Expectations for Automated Data Checks
**Objective**: Use Great Expectations to perform data validation steps on a dataset.

**Task 1**: Validate Column Existence

**Steps**:
- Load your dataset using a Pandas DataFrame.
- Use Great Expectations to set up an expectation suite.
- Create an expectation to confirm that a specific column (e.g., `customer_id`) exists in your dataset.
- Run the expectation and observe the results.

In [2]:
import pandas as pd
import great_expectations as ge
import unittest

def create_ge_dataframe(df):
    """
    Convert a pandas DataFrame into its Great Expectations counterpart.

    The argument is type-checked first: anything that is not a
    ``pd.DataFrame`` is rejected with a ValueError. Otherwise the frame is
    handed to ``ge.from_pandas`` and the wrapped object is returned.
    """
    if isinstance(df, pd.DataFrame):
        return ge.from_pandas(df)
    raise ValueError("Input must be a pandas DataFrame.")

def validate_column_existence(ge_df, column_name):
    """
    Expect the specified column to exist.

    Args:
        ge_df: Object exposing ``expect_column_to_exist`` (for example the
            value returned by ``create_ge_dataframe``).
        column_name: Name of the column that is expected to be present.

    Returns:
        Whatever ``ge_df.expect_column_to_exist`` returns, unchanged.

    Raises:
        RuntimeError: If the expectation call raises. The original
            exception is attached as ``__cause__``.
    """
    try:
        return ge_df.expect_column_to_exist(column_name)
    except Exception as e:
        # Chain explicitly so the traceback reports the underlying failure
        # as the direct cause of this error.
        raise RuntimeError(f"Error in column existence expectation: {e}") from e

def validate_column_dtype(ge_df, column_name, expected_dtype):
    """
    Expect column values to be of the specified dtype.

    Args:
        ge_df: Object exposing ``expect_column_values_to_be_of_type`` (for
            example the value returned by ``create_ge_dataframe``).
        column_name: Name of the column whose values are checked.
        expected_dtype: Type name forwarded to the expectation as its
            second positional argument (e.g. ``"float64"``).

    Returns:
        Whatever ``ge_df.expect_column_values_to_be_of_type`` returns,
        unchanged.

    Raises:
        RuntimeError: If the expectation call raises. The original
            exception is attached as ``__cause__``.
    """
    try:
        return ge_df.expect_column_values_to_be_of_type(column_name, expected_dtype)
    except Exception as e:
        # Chain explicitly so the traceback reports the underlying failure
        # as the direct cause of this error.
        raise RuntimeError(f"Error in column dtype expectation: {e}") from e

def validate_column_value_range(ge_df, column_name, min_value, max_value):
    """
    Expect column values to be between min_value and max_value (inclusive).

    Args:
        ge_df: Object exposing ``expect_column_values_to_be_between`` (for
            example the value returned by ``create_ge_dataframe``).
        column_name: Name of the column whose values are checked.
        min_value: Lower bound, forwarded as the ``min_value`` keyword.
        max_value: Upper bound, forwarded as the ``max_value`` keyword.

    Returns:
        Whatever ``ge_df.expect_column_values_to_be_between`` returns,
        unchanged.

    Raises:
        RuntimeError: If the expectation call raises. The original
            exception is attached as ``__cause__``.
    """
    try:
        return ge_df.expect_column_values_to_be_between(
            column_name, min_value=min_value, max_value=max_value
        )
    except Exception as e:
        # Chain explicitly so the traceback reports the underlying failure
        # as the direct cause of this error.
        raise RuntimeError(f"Error in column value range expectation: {e}") from e

# Demonstration: run all three checks against a small in-memory dataset
if __name__ == "__main__":
    # Build the sample frame column by column
    data = dict(
        customer_id=[1, 2, 3, 4],
        purchase_amount=[23.5, 45.0, 12.99, 100.0],
        age=[25, 30, 45, 60],
    )
    df = pd.DataFrame(data)

    # Hand the frame to Great Expectations
    ge_df = create_ge_dataframe(df)

    # Evaluate every expectation before reporting anything
    col_exist_result = validate_column_existence(ge_df, "customer_id")
    dtype_result = validate_column_dtype(ge_df, "purchase_amount", "float64")
    range_result = validate_column_value_range(ge_df, "age", 18, 65)

    # Report each outcome next to its label, in the order the checks ran
    for label, outcome in (
        ("Column existence check:", col_exist_result),
        ("Column dtype check:", dtype_result),
        ("Column value range check:", range_result),
    ):
        print(label, outcome)


# Unit tests
class TestGreatExpectationsValidations(unittest.TestCase):
    """Exercise the three validation helpers against a small in-memory frame."""

    def setUp(self):
        """Build a fresh three-row frame and its wrapped counterpart per test."""
        self.df = pd.DataFrame({
            "customer_id": [1, 2, 3],
            "purchase_amount": [10.5, 20.0, 15.5],
            "age": [20, 40, 30]
        })
        self.ge_df = create_ge_dataframe(self.df)

    def test_column_existence(self):
        """A present column passes; an absent column fails."""
        res = validate_column_existence(self.ge_df, "customer_id")
        self.assertTrue(res["success"])
        res_missing = validate_column_existence(self.ge_df, "missing_col")
        self.assertFalse(res_missing["success"])

    def test_column_dtype(self):
        """A float column passes; the same column holding a string fails."""
        res = validate_column_dtype(self.ge_df, "purchase_amount", "float64")
        self.assertTrue(res["success"])
        # Introduce a string to test failure. Cast the column to object
        # first: writing a str into a float64 column through .loc relies on
        # silent upcasting, which pandas deprecated in 2.1 (PDEP-6).
        df2 = self.df.copy()
        df2["purchase_amount"] = df2["purchase_amount"].astype(object)
        df2.loc[0, "purchase_amount"] = "bad_data"
        ge_df2 = create_ge_dataframe(df2)
        res_fail = validate_column_dtype(ge_df2, "purchase_amount", "float64")
        self.assertFalse(res_fail["success"])

    def test_column_value_range(self):
        """Ages inside [18, 65] pass; a single out-of-range age fails."""
        res = validate_column_value_range(self.ge_df, "age", 18, 65)
        self.assertTrue(res["success"])
        # Set age outside range
        df2 = self.df.copy()
        df2.loc[0, "age"] = 70
        ge_df2 = create_ge_dataframe(df2)
        res_fail = validate_column_value_range(ge_df2, "age", 18, 65)
        self.assertFalse(res_fail["success"])

# Run the suite in-process: the placeholder argv keeps unittest from reading
# the host's command-line arguments, and exit=False stops it from calling
# sys.exit() once the tests finish.
if __name__ == "__main__":
    unittest.main(exit=False, argv=[""])

ModuleNotFoundError: No module named 'great_expectations'

In [1]:
# write your code from here

import pandas as pd
import great_expectations as ge
from great_expectations.core.batch import BatchRequest

# Build a small in-memory dataset that stands in for real data
data = dict(
    customer_id=[1, 2, 3, 4],
    purchase_amount=[23.5, 45.0, 12.99, 100.0],
    age=[25, 30, 45, 60],
    notes=["ok"] * 4,
)
df = pd.DataFrame(data)

# Hand the frame to Great Expectations so the expectation methods are available
ge_df = ge.from_pandas(df)

# ================ Task 1: Validate Column Existence ================

# The 'customer_id' column must be present
result_col_exist = ge_df.expect_column_to_exist("customer_id")

print("Task 1 - Column existence expectation result:")
print(result_col_exist)

# ================ Task 2: Validate Column Data Types ================

# Types are checked through 'expect_column_values_to_be_of_type';
# 'purchase_amount' is expected to be 'float64'.

# Show the dtype pandas actually assigned before asserting on it
actual_dtype = df["purchase_amount"].dtype
print(f"\nActual dtype of 'purchase_amount': {actual_dtype}")

# The values of 'purchase_amount' must be floats
result_dtype = ge_df.expect_column_values_to_be_of_type(
    "purchase_amount", "float64"
)

print("\nTask 2 - Column data type expectation result:")
print(result_dtype)

# ================ Task 3: Validate Range of Values ================

# Every 'age' must fall between 18 and 65
result_range = ge_df.expect_column_values_to_be_between(
    "age", min_value=18, max_value=65
)

print("\nTask 3 - Range expectation result:")
print(result_range)

# ================ Interpretation ================

# Summarize each task's outcome, numbering the tasks from 1
for task_num, res in enumerate((result_col_exist, result_dtype, result_range), start=1):
    if not res["success"]:
        print(f"\nTask {task_num} FAILED.")
        continue
    print(f"\nTask {task_num} PASSED.")


ModuleNotFoundError: No module named 'great_expectations'

**Task 2**: Validate Column Data Types

**Steps**:
- Using the same dataset setup, create an expectation to check that a numeric column
(e.g., `purchase_amount`) contains only float values.
- Identify a numeric column in your dataset.
- Use Great Expectations to create and validate an expectation that checks the column's data type is correct.
- Run your expectation and check if it passes for your data.

In [None]:
# write your code from here

**Task 3**: Validate Range of Values

**Steps**:
- Set an expectation using Great Expectations to ensure that the values of a column (e.g., `age`)
are between 18 and 65.
- Identify a column in your dataset where values fall within a specific range.
- Implement a range-based expectation to check this column and validate your dataset.
- Observe and interpret the result of your expectation.

In [None]:
# write your code from here