# Great Expectations tutorials

In [1]:
import great_expectations as gx
import pandas as pd

from great_expectations.data_context import FileDataContext

# Create a file-backed GX data context rooted in the current directory.
context = FileDataContext.create(
    project_root_dir=".",
)

# Printing the context dumps the GX project configuration as JSON.
print(context)

{
  "anonymous_usage_statistics": {
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true,
    "explicit_id": true,
    "explicit_url": false,
    "data_context_id": "068e9a31-a78b-4fb5-a2e4-840492166001"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_variables_file_path": "uncommitted/config_variables.yml",
  "config_version": 3.0,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {
    "default_pandas_datasource": {
      "type": "pandas

## Load a folder of files as a data source

In [2]:
# So far we have worked with a single file. How do we work with a folder of
# files, or a database with several tables?
# Two new terms: a data source contains one or more data assets, and each
# asset is divided into batches.

# First, register a folder as a data source.
# `add_or_update_*` is used instead of `add_*` so this cell can be re-run:
# the context is file-backed, so a datasource named "my_ds_2" may already be
# registered from a previous run, and a plain add may then be rejected.
context.sources.add_or_update_pandas_filesystem(
    name="my_ds_2", base_directory="../data/yellow_tripdata/"
)

PandasFilesystemDatasource(type='pandas_filesystem', name='my_ds_2', id=None, assets=[], base_directory=PosixPath('../data/yellow_tripdata'), data_context_root_directory=None)

In [3]:
# Look up the datasource registered under the name "my_ds_2".
my_ds = context.datasources["my_ds_2"]

# Regex selecting the 2022 trip-data files. Raw string with the extension dot
# escaped: an unescaped "." would match ANY character, so names such as
# "yellow_tripdata_2022-01_parquet" would also be accepted.
my_batching_regex = r"yellow_tripdata_2022-.*\.parquet"

# Create the data asset (one or more files from our data source)
my_asset = my_ds.add_parquet_asset(
    name="my_tripdata_data_asset", batching_regex=my_batching_regex
)

# Build a batch request with no extra options, i.e. covering every batch the
# asset exposes, then resolve it into the concrete list of batches.
my_batch_request = my_asset.build_batch_request()
batches = my_asset.get_batch_list_from_batch_request(my_batch_request)

In [4]:
# Inspect every batch resolved above by printing its batch spec; in the
# recorded output the spec shows the file path, reader method and reader options.
for batch in batches:
    print(batch.batch_spec)

{'path': '../data/yellow_tripdata/yellow_tripdata_2022-12.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}


## Validate using our default expectation suite

In [5]:
# Pair the batch request with the expectation suite named "default" to obtain
# a validator, then preview the first rows of the data it will check.
# NOTE(review): the "default" suite is not created in this section; presumably
# it already exists in the context. Confirm before running on a fresh project,
# or create a dedicated suite first via context.add_or_update_expectation_suite.
asset_validator = context.get_validator(
    expectation_suite_name="default",
    batch_request=my_batch_request,
)
asset_validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-12-01 00:37:35,2022-12-01 00:47:35,1.0,2.0,1.0,N,170,237,1,8.5,3.0,0.5,3.1,0.0,0.3,15.4,2.5,0.0
1,1,2022-12-01 00:34:35,2022-12-01 00:55:21,0.0,8.4,1.0,N,138,141,2,26.0,4.25,0.5,0.0,0.0,0.3,31.05,2.5,1.25
2,1,2022-12-01 00:33:26,2022-12-01 00:37:34,1.0,0.8,1.0,N,140,140,1,5.0,3.0,0.5,1.76,0.0,0.3,10.56,2.5,0.0
3,1,2022-12-01 00:45:51,2022-12-01 00:53:16,1.0,3.0,1.0,N,141,79,3,10.0,3.0,0.5,0.0,0.0,0.3,13.8,2.5,0.0
4,2,2022-12-01 00:49:49,2022-12-01 00:54:13,1.0,0.76,1.0,N,261,231,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5,0.0


In [6]:
# As in the single-file case, validation is driven through a checkpoint.
# Register (or replace) a checkpoint bound to the validator built above.
checkpoint = context.add_or_update_checkpoint(
    validator=asset_validator,
    name="yellow_tripdata_asset_checkpoint",
)

# Run the checkpoint and keep its result for inspection.
checkpoint_result = checkpoint.run()

# Show the validation result; in the recorded output this opens in a browser session.
context.view_validation_result(checkpoint_result)

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

[1009/210456.059107:ERROR:file_io_posix.cc(152)] open /home/quandv/.config/BraveSoftware/Brave-Browser/Crash Reports/pending/6e3039b9-5d88-44cb-9730-f8d9e67ee50d.lock: File exists (17)


Opening in existing browser session.
