# Great Expectations - S3

## Install & Import Dependencies

In [1]:
%%bash

# Constrain the install to the release line this notebook was run against.
# The recorded outputs show 0.17.19, and the cells below use the fluent
# `context.sources.*` entry points, which the 1.0 release reworked.
pip install 'great_expectations[s3]>=0.17.19,<1.0'

Collecting boto3>=1.17.106 (from great_expectations[s3])
  Obtaining dependency information for boto3>=1.17.106 from https://files.pythonhosted.org/packages/fe/15/fa88dc3bf239fe047fcd9f7c1c38b655f533d453d4d6c3f72ab5b145d44d/boto3-1.28.53-py3-none-any.whl.metadata
  Using cached boto3-1.28.53-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.32.0,>=1.31.53 (from boto3>=1.17.106->great_expectations[s3])
  Obtaining dependency information for botocore<1.32.0,>=1.31.53 from https://files.pythonhosted.org/packages/21/fd/d8591332f5a6b3a94a55dc88a8374518ef7caa24302562558e4eac07ec2e/botocore-1.31.53-py3-none-any.whl.metadata
  Using cached botocore-1.31.53-py3-none-any.whl.metadata (6.0 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.17.106->great_expectations[s3])
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0 (from boto3>=1.17.106->great_expectations[s3])
  Obtaining dependency information for s3transfer<0.7.0,>=0.6.0 from https://files

In [None]:
%%bash

# Pull out the token that follows the word "version" in the CLI's version banner.
installed_version="$(great_expectations --version | grep -oP 'version \K[^ ]+')"

echo -e "Verify your installed version ${installed_version} is higher than 0.17.19"

In [2]:
import os
import pandas as pd
import great_expectations as gx
from great_expectations.data_context import FileDataContext

## Initialize local variables

In [3]:
# variables
# Directory used as the GX project root. Export GX_PROJECT_ROOT_DIR before
# starting Jupyter to point the notebook at your own checkout without editing
# this cell; the fallback is the original author's path.
path_to_repo_dir = os.environ.get(
    "GX_PROJECT_ROOT_DIR",
    "/home/anku/sandbox/DAMG7245-Fall2023",  # TODO: change this to your local path
)

## Initialize GX dir

In [4]:
# Build the GX context through FileDataContext.create, using
# path_to_repo_dir as the project root directory.
context = FileDataContext.create(
    project_root_dir=path_to_repo_dir,
)

## GX : Create Data Source

Here the data source is a set of CSV files uploaded to S3. The file names follow the pattern `yellow_tripdata_YYYY-MM.csv`.

In [5]:
# Settings for the S3 datasource registered in the next cell.
bucket_name = "gx-nyc-trip-lab-demo"
datasource_name = "S3_NYC_Yellow_Taxi2"
# No additional boto3 options are supplied.
boto3_options = dict()

In [6]:
# Register the pandas-on-S3 datasource with the context.
# Use add_or_update_* rather than add_*: the context is persisted in the
# project directory, so a plain add fails on a re-run once a datasource with
# this name is already registered. This also matches the add_or_update_*
# calls used for the suite and checkpoint later in the notebook.
datasource = context.sources.add_or_update_pandas_s3(
    name=datasource_name,
    bucket=bucket_name,
    boto3_options=boto3_options,
)

In [7]:
# Name under which the CSV asset is registered on the datasource.
asset_name = "S3_NYC_Yellow_Taxi_Data_Asset2"
# Empty prefix is passed through to add_csv_asset as-is.
s3_prefix = ""
# Matches the monthly file names and captures `year` and `month` as named groups.
batching_regex = r"yellow_tripdata_(?P<year>\d{4})-(?P<month>\d{2})\.csv"

data_asset = datasource.add_csv_asset(
    name=asset_name,
    batching_regex=batching_regex,
    s3_prefix=s3_prefix,
)

In [None]:
# Build a batch request from the asset with no extra arguments.
# NOTE(review): this cell has no execution count and the same request is
# rebuilt in the "Create Expectations Suite" section below, so it looks
# redundant - confirm before removing.
batch_request = data_asset.build_batch_request()

## GX : Create Expectations Suite

An expectation suite holds the validations / checks to run against the data.

In [8]:
# Name of the expectation suite created and populated in this section.
expectation_suite_name = "S3_NYC_Yellow_Taxi_Data_Asset_Expectation_Suite"

In [9]:
# Look the datasource and asset up by name through the context rather than
# reusing the objects created above, then build the batch request from it.
registered_datasource = context.get_datasource(datasource_name)
data_asset = registered_datasource.get_asset(asset_name)

batch_request = data_asset.build_batch_request()

In [10]:
# Show the suite names currently known to the context (displayed as the cell output).
context.list_expectation_suite_names()

['NOAA_Station_Data_Expectation_Suite', 'NYC_Yellow_Taxi_Expecation_Suite']

In [11]:
# Create the suite, or update it if one with this name already exists.
# The returned suite is displayed as the cell output.
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

{
  "expectation_suite_name": "S3_NYC_Yellow_Taxi_Data_Asset_Expectation_Suite",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.17.19"
  }
}

In [12]:
# Bind the batch request to the suite so expectations can be declared against the data.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows of the batch the validator loaded.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,01-01-2023 00:32,01-01-2023 00:40,1,0.97,1,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,01-01-2023 00:55,01-01-2023 01:01,1,1.1,1,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,01-01-2023 00:25,01-01-2023 00:37,1,2.51,1,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,01-01-2023 00:03,01-01-2023 00:13,0,1.9,1,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,01-01-2023 00:10,01-01-2023 00:21,1,1.43,1,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [13]:
# Allowed values for the categorical columns, named so the checks read as rules.
vendor_ids = [1, 2]
rate_codes = [1, 2, 3, 4, 5, 6]
store_and_fwd_flags = ["Y", "N"]
payment_types = [1, 2, 3, 4, 5, 6]

# Expectations are declared in the same order as before.
validator.expect_column_values_to_not_be_null(column="VendorID")
validator.expect_column_values_to_be_in_set(column="VendorID", value_set=vendor_ids)
validator.expect_column_values_to_not_be_null(column="tpep_pickup_datetime")
validator.expect_column_values_to_not_be_null(column="tpep_dropoff_datetime")
validator.expect_column_values_to_be_in_set(column="RatecodeID", value_set=rate_codes)
validator.expect_column_values_to_be_in_set(column="store_and_fwd_flag", value_set=store_and_fwd_flags)
validator.expect_column_values_to_be_in_set(column="payment_type", value_set=payment_types)

# auto=True is passed instead of explicit bounds; this last result is the cell output.
validator.expect_column_values_to_be_between(column="passenger_count", auto=True)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "passenger_count",
      "min_value": 0,
      "max_value": 8,
      "mostly": 1.0,
      "strict_min": false,
      "strict_max": false
    },
    "meta": {
      "auto_generated_at": "20230925T180649.561700Z",
      "great_expectations_version": "0.17.19"
    }
  },
  "result": {
    "element_count": 1048575,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

> Explore Expectations - https://greatexpectations.io/expectations/?filterType=Backend%20support&gotoPage=1&showFilters=true&viewType=Summary

In [14]:
# Save the expectations declared on the validator back to the expectation suite.
# NOTE(review): called with default arguments - confirm how failed expectations
# are handled by default before relying on them being kept.
validator.save_expectation_suite()

## GX : Create Checkpoint

A checkpoint runs the validations defined in the expectation suite against the data source and produces a report.

In [15]:
# Create the checkpoint for this validator, or update it if the name is already in use.
checkpoint_name = "S3_NYC_Yellow_Taxi_Checkpoint_v1"
checkpoint = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

In [16]:
# Label attached to this checkpoint run.
run_name = "Manual_run for S3 data"

# Run the checkpoint and keep the result object for later inspection.
checkpoint_result = checkpoint.run(run_name=run_name)

Calculating Metrics:   0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
# Build the Data Docs report; the return value is shown as the cell output.
# NOTE(review): presumably written under the project directory as static HTML - confirm the location.
context.build_data_docs()

> Install the `Live Server` VS Code extension ([link](https://marketplace.visualstudio.com/items?itemName=ritwickdey.LiveServer)) to view the report.