In [9]:
import pandas as pd
import numpy as np

In [10]:
df_raw = pd.read_csv('../data/01_raw/booking.csv')

In [3]:
def split_data(df, cutoff_date):
    df = df.copy()

    ref_data = df[df['date of reservation'] <= cutoff_date]
    ana_data = df[df['date of reservation'] > cutoff_date]

    return ref_data, ana_data

In [None]:
# ref_data, ana_data = split_data(df, "2018-09-19")

In [5]:
!pip install great_expectations




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Liza_N\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [11]:
import great_expectations as gx


context = gx.get_context(context_root_dir="../great_expectations")

First layer of expectations on row data: 

In [12]:
from great_expectations.core import ExpectationSuite, ExpectationConfiguration


def build_raw_data_expectation_suite(suite_name: str = "raw_data_suite") -> ExpectationSuite:
    """
    Builds an ExpectationSuite for raw hotel booking data using schema-driven and logical validations.

    Returns:
        ExpectationSuite: A suite of expectations for the raw data.
    """
    suite = ExpectationSuite(expectation_suite_name=suite_name)

    # Column type expectations
    expected_types = {
        "Booking_ID": "object",
        "number of adults": "int64",
        "number of children": "int64",
        "number of weekend nights": "int64",
        "number of week nights": "int64",
        "type of meal": "object",
        "car parking space": "int64",
        "room type": "object",
        "lead time": "int64",
        "market segment type": "object",
        "repeated": "int64",
        "P-C": "int64",
        "P-not-C": "int64",
        "average price": "float64",
        "special requests": "int64",
        "date of reservation": "object",
        "booking status": "object"
    }

    for column, dtype in expected_types.items():
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_of_type",
                kwargs={"column": column, "type_": dtype}
            )
        )

    # Non-null expectations
    for column in expected_types.keys():
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_not_be_null",
                kwargs={"column": column}
            )
        )

    # No negative values in numeric columns
    non_negative_cols = [
        "number of adults", "number of children", "number of weekend nights",
        "number of week nights", "car parking space", "lead time",
        "P-C", "P-not-C", "average price", "special requests"
    ]
    for column in non_negative_cols:
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_min_to_be_between",
                kwargs={"column": column, "min_value": 0, "strict_min": False}
            )
        )

    # At least one guest (adult or child)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_pair_values_A_plus_B_to_be_greater_than",
            kwargs={
                "column_A": "number of adults",
                "column_B": "number of children",
                "value": 0
            }
        )
    )
    # At least one night (weekend or week)

    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_pair_values_A_plus_B_to_be_greater_than",
            kwargs={
                "column_A": "number of weekend nights",
                "column_B": "number of week nights",
                "value": 0
            }
        )
    )
    # Format of date of reservation string can be potentially parsed as date (no validity check yet)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_match_regex",
            kwargs={
                "column": "date of reservation",
                "regex": r"^\d{4}-\d{2}-\d{2}$"
            }
        )
    )

    # Booking status should be either 'Canceled' or 'Not_Canceled'
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "booking status",
                "value_set": ["Canceled", "Not_Canceled"]
            }
        )
    )

    # Car parking space should be 0 or 1 (to be transformed into categorical)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "car parking space",
                "value_set": [0, 1]
            }
        )
    )

    # Repeated should be 0 or 1 (to be transformed into categorical)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "repeated",
                "value_set": [0, 1]
            }
        )
    )

    return suite

In [13]:
from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult
from great_expectations.core.batch import BatchRequest
from great_expectations.validator.validator import Validator
from great_expectations.dataset import PandasDataset

# Assume df_raw is your raw dataset (a pandas DataFrame)
df_raw_ge = gx.dataset.PandasDataset(df_raw)

# Build the suite
suite = build_raw_data_expectation_suite()

# Validate
results = df_raw_ge.validate(expectation_suite=suite)

# Print summary
print("Validation success:", results["success"])
for res in results["results"]:
    print(f"\nExpectation: {res['expectation_config']['expectation_type']}")
    print("✅ Success:" if res["success"] else "❌ Failed:", res["result"].get("unexpected_index_list", []))

InvalidExpectationConfigurationError: Could not add expectation; provided configuration is not valid: expect_column_pair_values_A_plus_B_to_be_greater_than not found