### DATA QUALITY CHECK


##### Load libraries

In [1]:
import great_expectations as gx
import great_expectations.expectations as gxe
import glob
import pandas as pd

##### Create the GX context and load the CSV into a pandas DataFrame

In [2]:
# Create Data Context.
context = gx.get_context()

# Import sample data into a pandas DataFrame.
# The input is a pipe-delimited CSV found in the output directory. glob returns
# matches in arbitrary order, so sort them to make the pick deterministic, and
# fail with a clear message (rather than a bare IndexError) when nothing matches.
csv_pattern = '/code/data/output' + "/*.csv"
csv_files = sorted(glob.glob(csv_pattern))
if not csv_files:
    raise FileNotFoundError("No CSV file found matching {0}".format(csv_pattern))
df = pd.read_csv(csv_files[0], sep='|')


##### Create Data Source, Data Asset, Batch Definition, and Batch

In [3]:
# Connect the in-memory DataFrame to GX by building, in order:
# Data Source -> Data Asset -> Batch Definition -> Batch.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd_cc_list")

# The batch definition covers the whole DataFrame; the DataFrame itself is
# handed over only when the batch is requested, through batch_parameters.
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch_parameters = {"dataframe": df}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

##### Create different types of validation.
##### The scenario below shows 2 different types of validation (ExpectColumnValuesToNotBeNull & ExpectColumnDistinctValuesToContainSet).
##### For additional validations, please refer to https://greatexpectations.io/expectations/

In [4]:
# ExpectColumnValuesToNotBeNull
# One expectation per column, each built with mostly=1. The list order below
# is the order in which the results are produced.
not_null_columns = ["city", "gender", "merchant", "cc_bic", "trans_date_trans_time"]

ge_NotBeNull = [
    gxe.ExpectColumnValuesToNotBeNull(column=column_name, mostly=1)
    for column_name in not_null_columns
]


# ExpectColumnDistinctValuesToContainSet
# Maps each column to the value_set passed to its expectation.
required_distinct_values = {
    "gender": ['F', 'M'],
    "is_fraud": [0, 1],
}

ge_ContainSet = [
    gxe.ExpectColumnDistinctValuesToContainSet(column=column_name, value_set=value_set)
    for column_name, value_set in required_distinct_values.items()
]


# Validate every Expectation against the Batch, collecting one result per
# Expectation (not-null checks first, then contain-set checks).
combined_results = [
    batch.validate(expectation) for expectation in ge_NotBeNull + ge_ContainSet
]


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

##### Simple aggregate of test summaries  

In [5]:
# Print one summary line per validation result:
# the column checked, the expectation type, and whether it passed.
summary_template = "Column : {0} [{1}] - {2}"
for result in combined_results:
    config = result['expectation_config']
    print(summary_template.format(config['kwargs']['column'], config['type'], result['success']))
    

Column : city [expect_column_values_to_not_be_null] - True
Column : gender [expect_column_values_to_not_be_null] - True
Column : merchant [expect_column_values_to_not_be_null] - True
Column : cc_bic [expect_column_values_to_not_be_null] - False
Column : trans_date_trans_time [expect_column_values_to_not_be_null] - True
Column : gender [expect_column_distinct_values_to_contain_set] - True
Column : is_fraud [expect_column_distinct_values_to_contain_set] - True


##### Below is the raw JSON response from the tests (it provides meaningful information when a test has FAILED)

In [6]:
# Display the full list of validation results as the cell output; each entry
# carries the expectation config, the success flag and the detailed result.
combined_results

[{
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_not_be_null",
     "kwargs": {
       "batch_id": "pandas-pd_cc_list",
       "column": "city"
     },
     "meta": {}
   },
   "result": {
     "element_count": 5000,
     "unexpected_count": 0,
     "unexpected_percent": 0.0,
     "partial_unexpected_list": [],
     "partial_unexpected_counts": [],
     "partial_unexpected_index_list": []
   },
   "meta": {},
   "exception_info": {
     "raised_exception": false,
     "exception_traceback": null,
     "exception_message": null
   }
 },
 {
   "success": true,
   "expectation_config": {
     "type": "expect_column_values_to_not_be_null",
     "kwargs": {
       "batch_id": "pandas-pd_cc_list",
       "column": "gender"
     },
     "meta": {}
   },
   "result": {
     "element_count": 5000,
     "unexpected_count": 0,
     "unexpected_percent": 0.0,
     "partial_unexpected_list": [],
     "partial_unexpected_counts": [],
     "partial_unexpected_in