In [4]:
import pandas as pd
import great_expectations as gx
import great_expectations.expectations as gxe

# eval $(poetry env activate)

In [5]:
# Import sample data into Pandas DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
)

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

df.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-15 03:36:12,2019-01-15 03:42:19,1,1.0,1,N,230,48,1,6.5,0.5,0.5,1.95,0.0,0.3,9.75,
1,1,2019-01-25 18:20:32,2019-01-25 18:26:55,1,0.8,1,N,112,112,1,6.0,1.0,0.5,1.55,0.0,0.3,9.35,0.0
2,1,2019-01-05 06:47:31,2019-01-05 06:52:19,1,1.1,1,N,107,4,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8,
3,1,2019-01-09 15:08:02,2019-01-09 15:20:17,1,2.5,1,N,143,158,1,11.0,0.0,0.5,3.0,0.0,0.3,14.8,
4,1,2019-01-25 18:49:51,2019-01-25 18:56:44,1,0.8,1,N,246,90,1,6.5,1.0,0.5,1.65,0.0,0.3,9.95,0.0


In [6]:
context = gx.get_context(mode="file", )

data_source = context.data_sources.add_or_update_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="Taxi Asset")

run_name_str = "daily_validation_" + pd.to_datetime("now").strftime("%Y%m%dT%H%M")
run_name = gx.RunIdentifier(run_name=run_name_str)

base_directory = "uncommitted/data_docs/local_site/"
site_config = {
    "class_name": "SiteBuilder",
    "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    "show_how_to_buttons": False,
    "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": base_directory,
    },
}

site_name = "my_data_docs_site"

context.add_data_docs_site(site_name=site_name, site_config=site_config)

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})


existing_suite = context.suites.get

suite = context.suites.add_or_update(
    gx.core.expectation_suite.ExpectationSuite(name="taxi_check",)
)

InvalidKeyError: Data Docs Site `my_data_docs_site` already exists in the Data Context.

In [7]:
# Column-level checks
suite.add_expectation(
    gxe.ExpectTableColumnsToMatchOrderedList(
        column_list=[
            'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
            'trip_distance', 'rate_code_id', 'store_and_fwd_flag',
            'pickup_location_id', 'dropoff_location_id', 'payment_type',
            'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
            'improvement_surcharge', 'total_amount', 'congestion_surcharge'
        ],
    )
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeOfType(
        column="vendor_id", type_="int")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeOfType(column="passenger_count", type_="int")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeOfType(column="payment_type", type_="int")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeOfType(column="total_amount", type_="float")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeOfType(column="store_and_fwd_flag", type_="object")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="vendor_id")
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeBetween(column="passenger_count", min_value=1,)
)

suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="store_and_fwd_flag", value_set=["Y", "N"])
)

# Table-level checks
suite.add_expectation(
    gxe.ExpectTableRowCountToBeBetween(min_value=100, max_value=10000)
)

NameError: name 'suite' is not defined

In [8]:
# Create Validation Definition.
validation_definition = context.validation_definitions.add_or_update(
    gx.core.validation_definition.ValidationDefinition(
        name="validation definition",
        data=batch_definition,
        suite=suite,
    )
)

actions = [
    gx.checkpoint.actions.UpdateDataDocsAction(
        name="update_my_site", site_names=["site_name"]
    ),
]

NameError: name 'batch_definition' is not defined

In [9]:
checkpoint = context.checkpoints.add_or_update(
    gx.checkpoint.checkpoint.Checkpoint(
        name="checkpoint",
        validation_definitions=[validation_definition],
        actions=actions,
        result_format="COMPLETE",
    )
)

checkpoint_result = checkpoint.run({"dataframe": df})
print(checkpoint_result.describe())

context.build_data_docs(site_names=site_name)

NameError: name 'validation_definition' is not defined

In [10]:
context.open_data_docs()

In [32]:
checkpoint.save()