In [None]:
import datetime
from pprint import pprint

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import great_expectations as gx
from great_expectations.data_context import EphemeralDataContext, FileDataContext

%matplotlib inline

# Download and Explore Data

In [None]:
# https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [None]:
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2019-01.parquet -O ../data/yellow_tripdata_2019-01.parquet

In [None]:
# Read a reproducible 10,000-row sample of the January 2019 trip file.
# A fixed random_state makes Restart & Run All produce the same sample.
df = pd.read_parquet('../data/yellow_tripdata_2019-01.parquet').sample(10000, random_state=42)

# Prepare 'test.csv' for an example in Validate Data.
# index=False keeps the sampled row index out of the file, so reading it back
# does not produce an extra "Unnamed: 0" column.
df.to_csv('../data/test.csv', index=False)

In [None]:
# Show the first three sampled rows, transposed so each column becomes a row.
df.head(3).T

In [None]:
# Print the DataFrame summary for the sample.
df.info()

In [None]:
# Histogram of the payment_type column in the sample.
# Explicit fig/ax with labels so the figure stands alone; the trailing ';'
# suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(df['payment_type'])
ax.set(title='payment_type', xlabel='payment_type', ylabel='count');

In [None]:
# Histogram of the passenger_count column in the sample.
# Explicit fig/ax with labels so the figure stands alone; the trailing ';'
# suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(df['passenger_count'])
ax.set(title='passenger_count', xlabel='passenger_count', ylabel='count');

In [None]:
# Histogram of the trip_distance column in the sample.
# Explicit fig/ax with labels so the figure stands alone; the trailing ';'
# suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(df['trip_distance'])
ax.set(title='trip_distance', xlabel='trip_distance', ylabel='count');

In [None]:
# Histogram of the total_amount column in the sample.
# Explicit fig/ax with labels so the figure stands alone; the trailing ';'
# suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(df['total_amount'])
ax.set(title='total_amount', xlabel='total_amount', ylabel='count');

# Create a Data Context 

## Ephemeral Data Context

An Ephemeral Data Context is an in-memory Data Context that is not intended to persist beyond the current Python session. However, if you decide that you would like to save its contents for future use, you can do so by converting it to a Filesystem Data Context ([docs](https://docs.greatexpectations.io/docs/oss/guides/setup/configuring_data_contexts/instantiating_data_contexts/instantiate_data_context)).

In [None]:
# Obtain a Data Context with default arguments.
context = gx.get_context()

In [None]:
# Display the context object returned by gx.get_context().
context

In [None]:
# Report whether the context obtained above is the in-memory (ephemeral) kind.
is_ephemeral = isinstance(context, EphemeralDataContext)
if is_ephemeral:
    print("It's Ephemeral!")

In [None]:
# Convert the Ephemeral Data Context into a Filesystem Data Context.
# Only an EphemeralDataContext is converted; any other context type is left
# as-is so the cell does not fail when there is nothing to convert.
# NOTE(review): presumably gx.get_context() returns a file-backed context on
# re-runs once a project directory exists — confirm.
if isinstance(context, EphemeralDataContext):
    context = context.convert_to_file_context()

## Filesystem Data Context 

In [None]:
# Create the Filesystem Data Context with its project root at the parent
# directory (".."). FileDataContext is imported in the notebook's top import cell.
context = FileDataContext.create(project_root_dir="..")

# Same as
# context = gx.get_context(project_root_dir="..")

In [None]:
# Display the Filesystem Data Context created above.
context

# Connect to Data

## Use Validator (simple)

In [None]:
# Build a validator directly from the parquet file via the default pandas source.
parquet_path = '../data/yellow_tripdata_2019-01.parquet'
validator = context.sources.pandas_default.read_parquet(parquet_path)

In [None]:
# Preview the data behind the validator.
validator.head()

## Use Datasource 

In [None]:
# Display the datasources currently held by the context.
context.datasources

In [None]:
# A Data Source provides a standard API for accessing and interacting with data
# from a wide variety of source systems.

# Clear the context's datasources before registering "local_dir".
# This line is active; comment it out to keep datasources registered earlier.
# NOTE(review): presumably needed so re-running this cell can add "local_dir"
# again without a name clash — confirm.
context.datasources.clear()

datasource = context.sources.add_pandas_filesystem(
    name='local_dir',
    base_directory="../data"
)
print("DATASOURCE:\n", datasource)

# The batching_regex should match file names in base_directory. The named
# groups (year, month) become batch request options; the dot before "parquet"
# is escaped so it only matches a literal ".".
asset = datasource.add_parquet_asset(
    name="parquet_asset",
    batching_regex=r"yellow_tripdata_(?P<year>\d{4})-(?P<month>\d{2})\.parquet",
    order_by=["year", "month"],
)
print("ASSET:\n", asset)

# Request the single batch for January 2019.
batch_request = asset.build_batch_request(options={"year": "2019", "month": "01"})
# options = asset.batch_request_options
# print("OPTIONS:\n", options)


In [None]:
# Look up the "local_dir" datasource on the context and print it.
local_dir_datasource = context.datasources["local_dir"]
print(local_dir_datasource)


In [None]:
# Build the validator from the batch request alone. The request was built from
# the "parquet_asset" asset of the "local_dir" datasource, so separate
# datasource/asset names are not passed; the data_asset_name used previously
# was a file name and did not match the registered asset.
validator = context.get_validator(batch_request=batch_request)

validator

In [None]:
# Preview the data behind the validator built from the batch request.
validator.head()

# Create Expectations

In [None]:
# Pickup timestamps must always be present.
validator.expect_column_values_to_not_be_null(column="tpep_pickup_datetime")

In [None]:
# passenger_count is expected to lie between 1 and 6.
passenger_bounds = {"min_value": 1, "max_value": 6}
validator.expect_column_values_to_be_between("passenger_count", **passenger_bounds)

In [None]:
# payment_type is expected to lie between 0 and 3, with mostly=0.9.
payment_type_kwargs = {
    "column": 'payment_type',
    "min_value": 0,
    "max_value": 3,
    "mostly": 0.9,
}
validator.expect_column_values_to_be_between(**payment_type_kwargs)


In [None]:
# payment_type is expected to be stored as int64.
validator.expect_column_values_to_be_of_type('payment_type', type_='int64')

# Save Expectation Suite

In [None]:
# Write the expectations collected on the validator to the project's expectations folder.
suite_path = '../gx/expectations/first_expectation_suite.json'
validator.save_expectation_suite(suite_path)

# Validate Data

In [None]:
# # DEBUG

# validation_context.sources.delete("taxi_test_datasource")
# del validation_context
# del test_data_validator


In [None]:
# Get a separate context for validation, rooted at the ../gx directory.
validation_context = gx.get_context(context_root_dir="../gx")

In [None]:
# List the names of the expectation suites available in the validation context.
validation_context.list_expectation_suite_names()

In [None]:
# Fetch the saved suite by name and display it.
suite_name = "first_expectation_suite"
validation_suite = validation_context.get_expectation_suite(suite_name)
validation_suite

In [None]:
# Load the sample CSV that the data-loading cell wrote to ../data/test.csv.
test = pd.read_csv("../data/test.csv")

# add_or_update_pandas (instead of add_pandas) keeps this cell re-runnable when
# "taxi_test_datasource" is already registered on the validation context.
test_data_validator = (
    validation_context.sources.add_or_update_pandas("taxi_test_datasource")
    .read_dataframe(
        test,
        asset_name="taxi_test",
        batch_metadata={"type": "test"},
    )
)


In [None]:
# Display the expectation suite currently attached to the test validator.
test_data_validator.expectation_suite


In [None]:
# Set the suite name on the test validator to "first_expectation_suite".
# NOTE(review): this assigns the name only; whether it also loads the saved
# expectations is not visible here — confirm.
test_data_validator.expectation_suite_name = "first_expectation_suite"

In [None]:
# Display the test validator's expectation suite after the name assignment.
test_data_validator.expectation_suite

In [None]:
# Display the test validator's expectation suite.
test_data_validator.expectation_suite

In [None]:
# Register (or refresh) a checkpoint that binds the test validator to the saved
# suite, then execute it.
checkpoint_kwargs = dict(
    name="my_taxi_validator_checkpoint",
    validator=test_data_validator,
    expectation_suite_name="first_expectation_suite",
)
checkpoint = validation_context.add_or_update_checkpoint(**checkpoint_kwargs)

checkpoint_result = checkpoint.run()

In [None]:
# Display the result object returned by checkpoint.run().
checkpoint_result

In [None]:
# validation_context.build_data_docs()

In [None]:
# Open the Data Docs for the validation context.
validation_context.open_data_docs()