# Great Expectations - Local Filesystem

## Install & Import Dependencies

In [None]:
%%bash

pip install 'great_expectations'

In [None]:
# pyarrow is installed for the pd.read_parquet call in the download step further down
# (presumably as pandas' parquet engine — verify if another engine is preferred).
%pip install pyarrow

In [None]:
%%bash

# Print the installed Great Expectations CLI version so it can be compared with
# the version this notebook was authored against (0.17.19, per its saved outputs).
# grep -oP with \K keeps only the token that follows the word "version".
gx_version="$(great_expectations --version | grep -oP 'version \K[^ ]+')"
echo "Verify your installed version ${gx_version} is 0.17.19 or higher"

In [1]:
import os
import pandas as pd
import great_expectations as gx
from great_expectations.data_context import FileDataContext

## Initialize local variables

In [2]:
# variables
# Root of the local checkout. Export GX_REPO_DIR before starting Jupyter to
# override it without editing the notebook; the fallback is the original
# author's path and will not exist on other machines.
path_to_repo_dir = os.environ.get("GX_REPO_DIR", "/home/anku/sandbox/DAMG7245-Fall2023") # TODO: change this to your local path
# Directory the downloaded CSV file is written to and the datasource reads from.
path_to_data_dir = f"{path_to_repo_dir}/gx/data"
# Suite name used throughout the notebook. The spelling is kept as-is because
# the saved suite file (gx/expectations/...) is named after it.
expectation_suite_name = "NYC_Yellow_Taxi_Expecation_Suite"

# url
# Parquet file with the January 2023 yellow-taxi trip records.
nyc_yellow_taxi_trip_data_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"

## Initialize GX dir

In [3]:
# Set up the file-backed Great Expectations context rooted at path_to_repo_dir.
# Later outputs in this notebook show its artifacts under `<repo>/gx/`
# (expectations, checkpoints, data docs).
context = FileDataContext.create(project_root_dir=path_to_repo_dir)

## Download raw data

NYC Yellow Trips data
* Source - https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Data Dictionary - https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [None]:
# Create the data directory if it is missing. exist_ok=True keeps this cell
# re-runnable; os.mkdir raised FileExistsError on every run after the first.
os.makedirs(path_to_data_dir, exist_ok=True) # Add data dir to gitignore file

# Derive the CSV path from the URL's file name, e.g.
# ".../yellow_tripdata_2023-01.parquet" -> "<data dir>/yellow_tripdata_2023-01.csv".
parquet_file_name = nyc_yellow_taxi_trip_data_url.split('/')[-1]
csv_file_path = f"{path_to_data_dir}/{parquet_file_name.split('.')[0]}.csv"

# Download the trip records and store them locally as CSV without the index column.
df = pd.read_parquet(nyc_yellow_taxi_trip_data_url)
df.to_csv(csv_file_path, index=False)

In [None]:
# Preview the first rows of the downloaded frame.
df.head()

## GX : Create Data Source

Here the data source is a set of CSV files on the local file system; the file names follow the pattern `yellow_tripdata_YYYY-MM.csv`.

In [4]:
# Give your Datasource a name
datasource_name = "Local_FileSystem_Source3"
# add_or_update_* keeps this cell re-runnable: the file-backed context persists
# datasources between kernel sessions, and a plain add_* call rejects a name
# that is already registered.
datasource = context.sources.add_or_update_pandas_filesystem(
    name=datasource_name, base_directory=path_to_data_dir
)

# Give your first Asset a name
asset_name = "NYC_Yellow_Taxi_Trip_Data3"
# Matches the monthly CSV files; the named groups capture year and month from the file name.
batching_regex = r"yellow_tripdata_(?P<year>\d{4})-(?P<month>\d{2})\.csv"

# Reuse the asset when the datasource still carries it from an earlier run,
# otherwise register it.
# NOTE(review): a reused asset keeps whatever regex it was stored with — delete
# the asset first if batching_regex has been changed.
try:
    asset = datasource.get_asset(asset_name)
except LookupError:
    asset = datasource.add_csv_asset(name=asset_name, batching_regex=batching_regex)

# Build batch request
batch_request = asset.build_batch_request()

## GX : Create Expectations Suite

An Expectation Suite holds the validations (checks) to be run against the data.

In [5]:
# Look the datasource and asset up by name from the context, then build a
# batch request from the asset.
registered_datasource = context.get_datasource(datasource_name)
data_asset = registered_datasource.get_asset(asset_name)
batch_request = data_asset.build_batch_request()

In [6]:
# Show the names of the suites the context already knows about.
context.list_expectation_suite_names()

['NYC_Yellow_Taxi_Expecation_Suite']

In [7]:
# Create the suite, or update it if the name is already registered.
# NOTE(review): the saved output shows an empty "expectations" list even though the
# suite name was already listed before this call — presumably an existing suite is reset here; confirm.
context.add_or_update_expectation_suite(expectation_suite_name)

{
  "expectation_suite_name": "NYC_Yellow_Taxi_Expecation_Suite",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.17.19"
  }
}

In [8]:
# Obtain a validator for the batch request and the named suite; the
# expectations in the next cell are called on it.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# Peek at the first rows of the batch to confirm the CSV was picked up.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [9]:
# Vendor identifier: always present and one of the two listed codes.
validator.expect_column_values_to_not_be_null("VendorID")
validator.expect_column_values_to_be_in_set("VendorID", [1, 2])

# Pickup and dropoff timestamps must always be populated.
for timestamp_column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    validator.expect_column_values_to_not_be_null(timestamp_column)

# Coded columns restricted to their allowed value sets.
validator.expect_column_values_to_be_in_set("RatecodeID", [1, 2, 3, 4, 5, 6])
validator.expect_column_values_to_be_in_set("store_and_fwd_flag", ["Y", "N"])
validator.expect_column_values_to_be_in_set("payment_type", [1, 2, 3, 4, 5, 6])

# auto=True asks GX to pick the bounds itself (the saved output shows 0.0 to 9.0).
# Kept as the last expression so its result is the cell's displayed output.
validator.expect_column_values_to_be_between("passenger_count", auto=True)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "passenger_count",
      "min_value": 0.0,
      "max_value": 9.0,
      "mostly": 1.0,
      "strict_min": false,
      "strict_max": false
    },
    "meta": {
      "auto_generated_at": "20230925T173959.008828Z",
      "great_expectations_version": "0.17.19"
    }
  },
  "result": {
    "element_count": 3066766,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 71743,
    "missing_percent": 2.3393698769322473,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

> Explore Expectations - https://greatexpectations.io/expectations/?filterType=Backend%20support&gotoPage=1&showFilters=true&viewType=Summary

In [10]:
# Persist the suite built on the validator.
# discard_failed_expectations=False keeps every expectation defined above; with
# the default (True), expectations that failed against the current batch would be
# silently left out of the saved suite and never checked by the checkpoint.
validator.save_expectation_suite(discard_failed_expectations=False)

> Saved at `gx/expectations/NYC_Yellow_Taxi_Expecation_Suite.json`

## GX : Create Checkpoint

A Checkpoint runs the validations defined in the Expectation Suite against the data source and produces a report.

In [11]:
# Register the checkpoint under a fixed name (updating it if it already exists),
# built from the validator created above.
checkpoint_name = "NYC_Yellow_Taxi_Checkpoint_v1"
checkpoint = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

In [14]:
# Execute the checkpoint and keep the result object; run_name is the name given to this run.
checkpoint_result = checkpoint.run(run_name="Manual_run 2")

Calculating Metrics:   0%|          | 0/37 [00:00<?, ?it/s]

> Saved at `gx/checkpoints/NYC_Yellow_Taxi_Checkpoint_v1.yml`

In [13]:
# Build the Data Docs site; the output below maps the site name to the
# file:// URL of the generated index.html.
context.build_data_docs()

{'local_site': 'file:///home/anku/sandbox/DAMG7245-Fall2023/gx/uncommitted/data_docs/local_site/index.html'}

> Install the `Live Server` VSCode extension ([link](https://marketplace.visualstudio.com/items?itemName=ritwickdey.LiveServer)) to view the report