In [1]:
import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_csv('../data/01_raw/booking.csv')

## Data unit test for raw data

In [5]:
!pip install great_expectations




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Liza_N\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [3]:
import great_expectations as gx


context = gx.get_context(context_root_dir="../great_expectations")

First layer of expectations on raw data: 
- schema
- datatypes (not formatted)
- logical expectations (a booking should be for at least 1 person for at least 1 night)
- target values

In [4]:
# Create a custom expectations on sum of values in two columns
from great_expectations.expectations.expectation import ColumnPairMapExpectation


class ExpectColumnPairSumGreaterThanZero(ColumnPairMapExpectation):
    map_metric = "column_pair.sum_greater_than_zero"
    success_keys = ("column_A", "column_B")

    def validate_configuration(self, configuration):
        assert "column_A" in configuration.kwargs and "column_B" in configuration.kwargs, \
            "Must specify column_A and column_B"
        return super().validate_configuration(configuration)

    def _validate(self, configuration, metrics, runtime_configuration=None, execution_engine=None):
        df = execution_engine.get_domain_records(configuration.domain_kwargs)
        colA = configuration.kwargs["column_A"]
        colB = configuration.kwargs["column_B"]

        result_series = (df[colA] + df[colB]) > 0
        unexpected_indices = (~result_series).to_numpy().nonzero()[0].tolist()
        return {
            "success": result_series.all(),
            "result": {"unexpected_index_list": unexpected_indices}
        }

In [5]:
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
import re


def build_raw_data_expectation_suite(suite_name: str = "raw_data_suite") -> ExpectationSuite:
    """
    Builds an ExpectationSuite for raw hotel booking data using schema-driven and logical validations.

    Returns:
        ExpectationSuite: A suite of expectations for the raw data.
    """
    suite = ExpectationSuite(expectation_suite_name=suite_name)

    # Column type expectations
    expected_types = {
        "Booking_ID": "object",
        "number of adults": "int64",
        "number of children": "int64",
        "number of weekend nights": "int64",
        "number of week nights": "int64",
        "type of meal": "object",
        "car parking space": "int64",
        "room type": "object",
        "lead time": "int64",
        "market segment type": "object",
        "repeated": "int64",
        "P-C": "int64",
        "P-not-C": "int64",
        "average price": "float64",
        "special requests": "int64",
        "date of reservation": "object",
        "booking status": "object"
    }

    for column, dtype in expected_types.items():
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_of_type",
                kwargs={"column": column, "type_": dtype}
            )
        )

    # Non-null expectations
    for column in expected_types.keys():
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_not_be_null",
                kwargs={"column": column}
            )
        )

    # No negative values in numeric columns
    non_negative_cols = [
        "number of adults", "number of children", "number of weekend nights",
        "number of week nights", "car parking space", "lead time",
        "P-C", "P-not-C", "average price", "special requests"
    ]
    for column in non_negative_cols:
        suite.add_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_min_to_be_between",
                kwargs={"column": column, "min_value": 0, "strict_min": False}
            )
        )
    ## Add the new custom expectations:
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_pair_sum_greater_than_zero",
            kwargs={"column_A": "number of adults", "column_B": "number of children"}
        )
    )
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_pair_sum_greater_than_zero",
            kwargs={"column_A": "number of weekend nights", "column_B": "number of week nights"}
        )
    )

    # Format of date of reservation string can be potentially parsed as date (no validity check yet)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_match_regex",
            kwargs={
                "column": "date of reservation",
                "regex": r"^(\d{1,2}/\d{1,2}/\d{4}|\d{4}-\d{1,2}-\d{1,2})"
            }
        )
    )

    # Booking status should be either 'Canceled' or 'Not_Canceled'
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "booking status",
                "value_set": ["Canceled", "Not_Canceled"]
            }
        )
    )

    # Car parking space should be 0 or 1 (to be transformed into categorical)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "car parking space",
                "value_set": [0, 1]
            }
        )
    )

    # Repeated should be 0 or 1 (to be transformed into categorical)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "repeated",
                "value_set": [0, 1]
            }
        )
    )

    return suite

In [6]:
from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult
from great_expectations.core.batch import BatchRequest
from great_expectations.validator.validator import Validator
from great_expectations.dataset import PandasDataset

# Assume df_raw is your raw dataset (a pandas DataFrame)
df_raw_ge = gx.dataset.PandasDataset(df_raw)

# Build the suite
suite = build_raw_data_expectation_suite()

# Validate
results = df_raw_ge.validate(expectation_suite=suite)

# Print summary
print("Validation success:", results["success"])
for res in results["results"]:
    print(f"\nExpectation: {res['expectation_config']['expectation_type']}")
    print("✅ Success:" if res["success"] else "❌ Failed:", res["result"].get("unexpected_index_list", []))

Validation success: False

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_to_not_be_null
✅ Success: []

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_to_not_be_null
✅ Success: []

Expectation: expect_column_min_to_be_between
✅ Success: []

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_to_not_be_null
✅ Success: []

Expectation: expect_column_min_to_be_between
✅ Success: []

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_to_not_be_null
✅ Success: []

Expectation: expect_column_min_to_be_between
✅ Success: []

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_to_not_be_null
✅ Success: []

Expectation: expect_column_min_to_be_between
✅ Success: []

Expectation: expect_column_values_to_be_of_type
✅ Success: []

Expectation: expect_column_values_t

In [7]:
import great_expectations as gx
from great_expectations.core import ExpectationSuite

# Build your suite
suite = build_raw_data_expectation_suite()

# Wrap your DataFrame for validation
df_ge = gx.dataset.PandasDataset(df_raw)

# Validate df directly with the suite
results = df_ge.validate(expectation_suite=suite)

# Print summary with details on failures
print("Validation success:", results["success"])
for res in results["results"]:
    exp_type = res["expectation_config"]["expectation_type"]
    success = res["success"]
    unexpected = res["result"].get("unexpected_index_list", [])
    print(f"\nExpectation: {exp_type}")
    print("✅ Passed" if success else f"❌ Failed at rows: {unexpected}")

Validation success: False

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_min_to_be_between
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_min_to_be_between
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_min_to_be_between
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_min_to_be_between
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ Passed

Expectation: expect_column_values_to_not_be_null
✅ Passed

Expectation: expect_column_values_to_be_of_type
✅ P

In [8]:
# debugging custom expectations
import pprint

for res in results["results"]:
    print("\n--- Full result for debugging ---")
    pprint.pprint(res["result"])


--- Full result for debugging ---
{'observed_value': 'object_'}

--- Full result for debugging ---
{'element_count': 36285,
 'partial_unexpected_list': [],
 'unexpected_count': 0,
 'unexpected_percent': 0.0,
 'unexpected_percent_total': 0.0}

--- Full result for debugging ---
{'observed_value': 'int64'}

--- Full result for debugging ---
{'element_count': 36285,
 'partial_unexpected_list': [],
 'unexpected_count': 0,
 'unexpected_percent': 0.0,
 'unexpected_percent_total': 0.0}

--- Full result for debugging ---
{'element_count': 36285,
 'missing_count': None,
 'missing_percent': None,
 'observed_value': 0}

--- Full result for debugging ---
{'observed_value': 'int64'}

--- Full result for debugging ---
{'element_count': 36285,
 'partial_unexpected_list': [],
 'unexpected_count': 0,
 'unexpected_percent': 0.0,
 'unexpected_percent_total': 0.0}

--- Full result for debugging ---
{'element_count': 36285,
 'missing_count': None,
 'missing_percent': None,
 'observed_value': 0}

--- Full r

In [9]:
print((df_raw["number of week nights"] + df_raw["number of weekend nights"]).min())

0


In [10]:
filtered_df = df_raw[(df_raw['number of week nights'] ==0 ) & (df_raw['number of weekend nights'] ==0)]
filtered_df.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
209,INN00210,1,0,0,0,Meal Plan 1,0,Room_Type 1,4,Complementary,0,0,0,0.0,1,2/27/2018,Not_Canceled
1158,INN01159,2,0,0,0,Meal Plan 1,0,Room_Type 1,145,Online,0,0,0,0.0,1,7/5/2018,Not_Canceled
1403,INN01404,3,0,0,0,Meal Plan 1,0,Room_Type 4,57,Online,0,0,0,0.0,2,4/1/2018,Not_Canceled
1907,INN01908,2,0,0,0,Meal Plan 2,0,Room_Type 1,247,Online,0,0,0,0.0,1,6/6/2018,Not_Canceled
1986,INN01987,2,0,0,0,Meal Plan 1,0,Room_Type 1,43,Online,0,0,0,0.0,1,10/17/2017,Not_Canceled


In [11]:
filtered_df.shape

(78, 17)

78 bookings are for 0 week nights and 0 weekend nights - invalid bookings

## Data Split

In [12]:
def split_data(df, cutoff_date):
    df = df.copy()

    ref_data = df[df['date of reservation'] <= cutoff_date]
    ana_data = df[df['date of reservation'] > cutoff_date]

    return ref_data, ana_data

In [13]:
ref_data, ana_data = split_data(df_raw, "9/19/2018")

In [14]:
ref_data.shape

(33431, 17)

In [15]:
ana_data.shape

(2854, 17)

## Preprocessing Train

In [16]:
from typing import Any, Dict, Tuple
import pandas as pd

def clean_data(
    data: pd.DataFrame,
) -> Tuple[pd.DataFrame, Dict, Dict]:    

    df_transformed = data.copy()

    # snake case names of columns
    df_transformed.columns = df_transformed.columns.str.replace(' ', '_').str.lower()

    # convert date of reservation to Datetime 
    df_transformed['date_of_reservation'] = pd.to_datetime(df_transformed['date_of_reservation'], errors='coerce')

    # drop raws NaN values (including NaT with invalid dates)
    df_transformed = df_transformed.dropna()

    # cast repeated and car parking space as boolean
    df_transformed["car_parking_space"] = df_transformed["car_parking_space"].astype(bool)
    df_transformed["repeated"] = df_transformed["repeated"].astype(bool)

    # drop invalid bookings with 0 sum of week and weekend nights
    df_transformed = df_transformed[~((df_transformed['number_of_week_nights'] == 0) & (df_transformed['number_of_weekend_nights'] == 0))]

    # remove outliers
    for cols in ["lead_time"]:
        Q1 = df_transformed[cols].quantile(0.25)
        Q3 = df_transformed[cols].quantile(0.75)
        IQR = Q3 - Q1     

        filter = (df_transformed[cols] >= Q1 - 1.5 * IQR) & (df_transformed[cols] <= Q3 + 1.5 *IQR)
        df_transformed = df_transformed.loc[filter]

    describe_to_dict_verified = df_transformed.describe().to_dict()

    return df_transformed, describe_to_dict_verified 

## Test Preprocessing Train

In [18]:
def test_clean_data_type(df):
    #df = pd.read_csv("./tests/pipelines/sample/sample.csv") 
    df_transformed, describe_to_dict_verified  = clean_data(df)
    isinstance(describe_to_dict_verified, dict)

def test_clean_data_null(df): #e.g. if there are still null values after data cleaning
    #df = pd.read_csv("./tests/pipelines/sample/sample.csv") 
    df_transformed, describe_to_dict_verified = clean_data(df)
    assert [col for col in df_transformed.columns if df_transformed[col].isnull().any()] == []

def test_clean_data_valid_bookings(df):
    #df = pd.read_csv("./tests/pipelines/sample/sample.csv") 
    df_transformed, describe_to_dict_verified  = clean_data(df)
    assert not ((df_transformed["number_of_week_nights"] == 0) & (df_transformed["number_of_weekend_nights"] == 0)).any()

In [19]:
test_clean_data_type(ref_data)

In [20]:
test_clean_data_null(ref_data)

In [21]:
test_clean_data_valid_bookings(ref_data)

## Feature Engineering

We want to create a new feature: season. Based on the date of reservation and lead time, we can establish the first day of stay. Assume stay in June, July, August and December - high season, other months - low season

In [26]:
from sklearn.preprocessing import OneHotEncoder , LabelEncoder

def add_season(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    
    #new season feature
    df['season'] = pd.to_datetime(df['date_of_reservation']) + pd.to_timedelta(df['lead_time'], unit='d')
    df['season'] = df['season'].dt.month.apply(lambda m: 'high' if m in [6,7,8,12] else 'low')

    return df

def feature_engineer( data: pd.DataFrame, OH_encoder) -> pd.DataFrame:
    df = data.copy()


    numerical_features = df.select_dtypes(exclude=['object','string','category']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object','string','category']).columns.tolist()

    OH_cols= pd.DataFrame(OH_encoder.transform(df[categorical_features]))

    # Adding column names to the encoded data set.
    OH_cols.columns = OH_encoder.get_feature_names_out(categorical_features)

    # One-hot encoding removed index; put it back
    OH_cols.index = df.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_df = df.drop(categorical_features, axis=1)

    # Add one-hot encoded columns to numerical features
    df_final = pd.concat([num_df, OH_cols], axis=1)

    return df_final

In [27]:
df_transformed, describe_to_dict_verified  = clean_data(ref_data)

df_transformed = add_season(df_transformed)

categorical_cols = df_transformed.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_encoder.fit(df_transformed[categorical_cols])

df_final = feature_engineer(df_transformed, OH_encoder)
df_final.head()

MemoryError: Unable to allocate 7.67 GiB for an array with shape (32076, 32096) and data type float64

In [24]:
df_transformed, describe_to_dict_verified  = clean_data(ref_data)

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

df_final = feature_engineer(df_transformed, ohe)

df_final.head()

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Data Unit Tests After Preprocessing