In [1]:
import os
import json
import sys
import great_expectations as ge
import great_expectations.jupyter_ux
from datetime import datetime
import math
import pandas as pd
# Work from the Great Expectations project directory (the folder holding great_expectations.yml).
# Set the GE_PROJECT_DIR environment variable to run this notebook on another machine;
# when it is unset, the original hard-coded location is used, so existing behaviour is unchanged.
GE_PROJECT_DIR = os.environ.get(
    'GE_PROJECT_DIR',
    '/Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations',
)
os.chdir(GE_PROJECT_DIR)

2020-02-24T13:37:16-0500 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.


In [2]:
import ge_prod.ge_data_access as gda
import ge_prod.queries as queries

In [3]:
# Index the query registry directly: a missing table or query name then fails right here with a
# KeyError naming the key, instead of .get() returning None and the failure surfacing later.
rule_query = queries.queries['network_claims']['create_expectations_2019Q4']

In [4]:
rule_query

"SELECT * FROM network_claims WHERE modified_date_utc >= to_date('2019-10-01') and modified_date_utc <= to_date('2019-12-31');"

# Author Expectations



[**Watch a short tutorial video**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#video)

[**Read more in the tutorial**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)

**Reach out for help on** [**Great Expectations Slack**](https://tinyurl.com/great-expectations-slack)


### Get a DataContext object
[Read more in the tutorial](https://great-expectations.readthedocs.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-datacontext-object)




In [5]:
context = ge.data_context.DataContext()

2020-02-24T13:37:23-0500 - INFO - Using project config: /Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations/great_expectations.yml


### List data assets in your project

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#data-assets)


In [6]:
great_expectations.jupyter_ux.list_available_data_asset_names(context)

Inspecting your data sources. This may take a moment...


#### Pick one of the data asset names above and use it as the value of the `data_asset_name` argument below

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-batch)


### Specify data_asset & expectation_suite_name

In [7]:
data_asset_name = 'network_claims'
normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)
print(normalized_data_asset_name)

NormalizedDataAssetName(datasource='agero_dsa_pandas', generator='default', generator_asset='network_claims')


### Create a new empty expectation suite

In [8]:
# Name of the expectation suite that this notebook creates and then requests batches against.
expectation_suite_name = 'warnings_2019Q4'
# overwrite_existing=True is passed here — presumably this replaces any suite already stored
# under the same name (the output shows an empty 'expectations' list); confirm before re-running.
context.create_expectation_suite(data_asset_name=data_asset_name, expectation_suite_name=expectation_suite_name,
                                overwrite_existing=True)

{'data_asset_name': 'agero_dsa_pandas/default/network_claims',
 'meta': {'great_expectations.__version__': '0.8.8'},
 'expectations': []}

In [9]:
context.list_expectation_suite_keys()

[{'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,

### Get batch to create expectations against

In [10]:
rule_df = gda.snowflake_connector_to_df(rule_query)
# rule_df.to_pickle('temp_data/network_claims_2019Q4.pkl')

In [11]:
# pickle_df = pd.read_pickle('temp_data/network_claims_2019Q4.pkl')

In [12]:
# Hand the in-memory DataFrame (rule_df) to the data context as the batch's dataset.
b_kwargs = {"dataset": rule_df}
# `batch` is the object every expect_* call in the rest of the notebook is made on.
batch = context.get_batch(normalized_data_asset_name, expectation_suite_name=expectation_suite_name,
                         batch_kwargs=b_kwargs)

In [13]:
batch.get_row_count()

2678943

In [14]:
print(rule_df.shape)

(2678943, 39)


In [15]:
[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']

['agero_dsa_pandas']

In [16]:
# getting rule_df batchId & fingerprint
rule_batch_fingerprint = batch.batch_fingerprint
rule_batch_id = batch.batch_id

In [17]:
print('rule_batch_fingerprint: ', rule_batch_fingerprint, sep='\n')
print('rule_batch_id: ', rule_batch_id, sep='\n')

rule_batch_fingerprint: 
{'partition_id': '20200224T183814.055708Z', 'fingerprint': 'afa0a1402df775cdc86068bc697680a0'}
rule_batch_id: 
{'timestamp': 1582569491.5552058, 'PandasInMemoryDF': True, 'fingerprint': '7303977713c3205caa9594effecc1260'}


## Author Expectations

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#create-expectations)

See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations)


### Dataset exploration & understanding of fields to ensure rules reflect behavior of data
- Validating columns to exist in table shape
- Expected column count in table shape
- Expected set values to be seen in given column
- Expected columns to have null or non-null values X percentage of the time
- Expect column values to be of certain data type(s)
- Placing max and min values limits on numerical columns
- Average or median column value to be within a certain range
- Expecting column A to be larger/less than column B

### 1. Validating to see if every column exists in table

In [18]:
# Capture the column names present in the batch; they are compared against
# master_column_names in the column-count check further down.
column_names = batch.get_table_columns()
print(column_names)

['CASE_ID', 'TASK_ID', 'VENDOR_ID', 'CLAIM_ID', 'VENDOR_ADDRESS_ID', 'CLAIM_PO_NUMBER_ENTERED', 'ORIGINAL_CLAIM_PAYMENT_AMOUNT', 'ORIGINAL_CLAIM_STATUS_CODE', 'ORIGINAL_CLAIM_TYPE_CODE', 'SUBMITTED_CLAIM_AMOUNT', 'SUBMITTED_TOW_MILES', 'SUBMITTED_ENROUTE_MILES', 'SUBMITTED_LABOR_HOURS', 'SERVICE_ID', 'WAS_AUDITED', 'ADDCHARGE_AMOUNT', 'IS_CLAIM_APPROVED', 'ORIGINAL_CLAIM_APPROVED_PAYMENT', 'ADDCHARGE_DETAILS', 'ADDCHARGE_COUNT', 'BASE_TOTAL_FROM_RATES', 'ADDPAY_DETAILS', 'ADDPAY_COUNT', 'ADDPAY_PAYMENT_AMOUNT', 'ADDPAY_APPROVED_PAYMENT', 'COMPLETE_APPROVED_PAYMENT', 'IS_VCC', 'ORIGINAL_CLAIM_APPROVED_DATE_EASTERN', 'ORIGINAL_CLAIM_APPROVED_DATE_UTC', 'SERVICE_DATE_EASTERN', 'SERVICE_DATE_UTC', 'ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN', 'ORIGINAL_CLAIM_SUBMITTED_DATE_UTC', 'ADDPAY_APPROVED_DATE_EASTERN', 'ADDPAY_APPROVED_DATE_UTC', 'ADDPAY_SUBMITTED_DATE_EASTERN', 'ADDPAY_SUBMITTED_DATE_UTC', 'MODIFIED_DATE_EASTERN', 'MODIFIED_DATE_UTC']


In [19]:
# columns with multiple data types, will be changing them to int types in future
# case_id, po_number, vendor_id

In [20]:
# Reviewed list of the columns network_claims is expected to contain (alphabetical order).
master_column_names = [
    # additional-charge fields
    'ADDCHARGE_AMOUNT',
    'ADDCHARGE_COUNT',
    'ADDCHARGE_DETAILS',
    # additional-payment fields
    'ADDPAY_APPROVED_DATE_EASTERN',
    'ADDPAY_APPROVED_DATE_UTC',
    'ADDPAY_APPROVED_PAYMENT',
    'ADDPAY_COUNT',
    'ADDPAY_DETAILS',
    'ADDPAY_PAYMENT_AMOUNT',
    'ADDPAY_SUBMITTED_DATE_EASTERN',
    'ADDPAY_SUBMITTED_DATE_UTC',
    # identifiers, flags and totals
    'BASE_TOTAL_FROM_RATES',
    'CASE_ID',
    'CLAIM_ID',
    'CLAIM_PO_NUMBER_ENTERED',
    'COMPLETE_APPROVED_PAYMENT',
    'IS_CLAIM_APPROVED',
    'IS_VCC',
    'MODIFIED_DATE_EASTERN',
    'MODIFIED_DATE_UTC',
    # original-claim fields
    'ORIGINAL_CLAIM_APPROVED_DATE_EASTERN',
    'ORIGINAL_CLAIM_APPROVED_DATE_UTC',
    'ORIGINAL_CLAIM_APPROVED_PAYMENT',
    'ORIGINAL_CLAIM_PAYMENT_AMOUNT',
    'ORIGINAL_CLAIM_STATUS_CODE',
    'ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN',
    'ORIGINAL_CLAIM_SUBMITTED_DATE_UTC',
    'ORIGINAL_CLAIM_TYPE_CODE',
    # service and submitted fields
    'SERVICE_DATE_EASTERN',
    'SERVICE_DATE_UTC',
    'SERVICE_ID',
    'SUBMITTED_CLAIM_AMOUNT',
    'SUBMITTED_ENROUTE_MILES',
    'SUBMITTED_LABOR_HOURS',
    'SUBMITTED_TOW_MILES',
    # task / vendor fields
    'TASK_ID',
    'VENDOR_ADDRESS_ID',
    'VENDOR_ID',
    'WAS_AUDITED',
]

In [21]:
len(master_column_names)

39

In [22]:
len(column_names)

39

In [23]:
# Add an existence expectation for every column in the reviewed master list and show each result
for column in master_column_names:
    existence_result = batch.expect_column_to_exist(column, result_format='BASIC', catch_exceptions=True)
    print(f"{column}:", existence_result, sep='\n')

ADDCHARGE_AMOUNT:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDCHARGE_COUNT:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDCHARGE_DETAILS:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_APPROVED_DATE_EASTERN:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_APPROVED_DATE_UTC:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_APPROVED_PAYMENT:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_COUNT:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDP

### 2. Validating column count in table is always the same

In [24]:
print('# of columns in network_claims: ', len(column_names))

# of columns in network_claims:  39


In [25]:
# Pin the table's column count to the size of the reviewed master list.
# The earlier if/else passed the same number on both branches (when the two lengths are equal,
# len(column_names) *is* len(master_column_names)), so a single call is equivalent.
expected_column_count = len(master_column_names)
print('# of columns in {}: '.format('network_claims'), expected_column_count, '\n')
print(batch.expect_table_column_count_to_equal(expected_column_count, result_format='SUMMARY'))

# of columns in network_claims:  39 

{'success': True, 'result': {'observed_value': 39}}


### 3. Checking which columns should not have null values

In [26]:
# identifying which columns should not be null
print(column_names)

['CASE_ID', 'TASK_ID', 'VENDOR_ID', 'CLAIM_ID', 'VENDOR_ADDRESS_ID', 'CLAIM_PO_NUMBER_ENTERED', 'ORIGINAL_CLAIM_PAYMENT_AMOUNT', 'ORIGINAL_CLAIM_STATUS_CODE', 'ORIGINAL_CLAIM_TYPE_CODE', 'SUBMITTED_CLAIM_AMOUNT', 'SUBMITTED_TOW_MILES', 'SUBMITTED_ENROUTE_MILES', 'SUBMITTED_LABOR_HOURS', 'SERVICE_ID', 'WAS_AUDITED', 'ADDCHARGE_AMOUNT', 'IS_CLAIM_APPROVED', 'ORIGINAL_CLAIM_APPROVED_PAYMENT', 'ADDCHARGE_DETAILS', 'ADDCHARGE_COUNT', 'BASE_TOTAL_FROM_RATES', 'ADDPAY_DETAILS', 'ADDPAY_COUNT', 'ADDPAY_PAYMENT_AMOUNT', 'ADDPAY_APPROVED_PAYMENT', 'COMPLETE_APPROVED_PAYMENT', 'IS_VCC', 'ORIGINAL_CLAIM_APPROVED_DATE_EASTERN', 'ORIGINAL_CLAIM_APPROVED_DATE_UTC', 'SERVICE_DATE_EASTERN', 'SERVICE_DATE_UTC', 'ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN', 'ORIGINAL_CLAIM_SUBMITTED_DATE_UTC', 'ADDPAY_APPROVED_DATE_EASTERN', 'ADDPAY_APPROVED_DATE_UTC', 'ADDPAY_SUBMITTED_DATE_EASTERN', 'ADDPAY_SUBMITTED_DATE_UTC', 'MODIFIED_DATE_EASTERN', 'MODIFIED_DATE_UTC']


In [27]:
rule_df.isnull().sum()

CASE_ID                                        0
TASK_ID                                        0
VENDOR_ID                                      0
CLAIM_ID                                       0
VENDOR_ADDRESS_ID                           5435
CLAIM_PO_NUMBER_ENTERED                     2646
ORIGINAL_CLAIM_PAYMENT_AMOUNT                  0
ORIGINAL_CLAIM_STATUS_CODE                     0
ORIGINAL_CLAIM_TYPE_CODE                       0
SUBMITTED_CLAIM_AMOUNT                         0
SUBMITTED_TOW_MILES                        58878
SUBMITTED_ENROUTE_MILES                    15740
SUBMITTED_LABOR_HOURS                     167879
SERVICE_ID                                     0
WAS_AUDITED                                    0
ADDCHARGE_AMOUNT                         2157721
IS_CLAIM_APPROVED                              0
ORIGINAL_CLAIM_APPROVED_PAYMENT                0
ADDCHARGE_DETAILS                        2157721
ADDCHARGE_COUNT                                0
BASE_TOTAL_FROM_RATE

In [31]:
# Separating null & non-null columns.
# The per-column null counts are computed once and reused; the previous form rebuilt the same
# Series four times, each time scanning the whole batch.
null_counts = batch.isnull().sum()
null_cols = list(null_counts[null_counts > 0].keys())       # columns with at least one null
not_null_cols = list(null_counts[null_counts == 0].keys())  # columns with no nulls in this batch

In [32]:
not_null_cols

['CASE_ID',
 'TASK_ID',
 'VENDOR_ID',
 'CLAIM_ID',
 'ORIGINAL_CLAIM_PAYMENT_AMOUNT',
 'ORIGINAL_CLAIM_STATUS_CODE',
 'ORIGINAL_CLAIM_TYPE_CODE',
 'SUBMITTED_CLAIM_AMOUNT',
 'SERVICE_ID',
 'WAS_AUDITED',
 'IS_CLAIM_APPROVED',
 'ORIGINAL_CLAIM_APPROVED_PAYMENT',
 'ADDCHARGE_COUNT',
 'COMPLETE_APPROVED_PAYMENT',
 'IS_VCC',
 'MODIFIED_DATE_EASTERN',
 'MODIFIED_DATE_UTC']

In [33]:
# Add a strict not-null expectation for each column that had zero nulls in this batch
for column in not_null_cols:
    not_null_result = batch.expect_column_values_to_not_be_null(column, result_format='BASIC')
    print(column, '\n', not_null_result)

CASE_ID 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
TASK_ID 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
VENDOR_ID 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
CLAIM_ID 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
ORIGINAL_CLAIM_PAYMENT_AMOUNT 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
ORIGINAL_CLAIM_STATUS_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
ORIGINAL_CLAIM_TYPE_CODE 
 {'success': True, 'result': {'element_count': 2

In [34]:
print(f"capturing the following columns to not be null: \n {not_null_cols}")

capturing the following columns to not be null: 
 ['CASE_ID', 'TASK_ID', 'VENDOR_ID', 'CLAIM_ID', 'ORIGINAL_CLAIM_PAYMENT_AMOUNT', 'ORIGINAL_CLAIM_STATUS_CODE', 'ORIGINAL_CLAIM_TYPE_CODE', 'SUBMITTED_CLAIM_AMOUNT', 'SERVICE_ID', 'WAS_AUDITED', 'IS_CLAIM_APPROVED', 'ORIGINAL_CLAIM_APPROVED_PAYMENT', 'ADDCHARGE_COUNT', 'COMPLETE_APPROVED_PAYMENT', 'IS_VCC', 'MODIFIED_DATE_EASTERN', 'MODIFIED_DATE_UTC']


In [35]:
null_cols

['VENDOR_ADDRESS_ID',
 'CLAIM_PO_NUMBER_ENTERED',
 'SUBMITTED_TOW_MILES',
 'SUBMITTED_ENROUTE_MILES',
 'SUBMITTED_LABOR_HOURS',
 'ADDCHARGE_AMOUNT',
 'ADDCHARGE_DETAILS',
 'BASE_TOTAL_FROM_RATES',
 'ADDPAY_DETAILS',
 'ADDPAY_COUNT',
 'ADDPAY_PAYMENT_AMOUNT',
 'ADDPAY_APPROVED_PAYMENT',
 'ORIGINAL_CLAIM_APPROVED_DATE_EASTERN',
 'ORIGINAL_CLAIM_APPROVED_DATE_UTC',
 'SERVICE_DATE_EASTERN',
 'SERVICE_DATE_UTC',
 'ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN',
 'ORIGINAL_CLAIM_SUBMITTED_DATE_UTC',
 'ADDPAY_APPROVED_DATE_EASTERN',
 'ADDPAY_APPROVED_DATE_UTC',
 'ADDPAY_SUBMITTED_DATE_EASTERN',
 'ADDPAY_SUBMITTED_DATE_UTC']

In [36]:
len(null_cols)

22

### 4. Validating columns to have null values
- **columns to check:**
    - VENDOR_ADDRESS_ID
    - CLAIM_PO_NUMBER_ENTERED
    - SUBMITTED_TOW_MILES 
    - SUBMITTED_ENROUTE_MILES
    - SUBMITTED_LABOR_HOURS
    - ADDCHARGE_AMOUNT
    - ADDCHARGE_DETAILS 
    - BASE_TOTAL_FROM_RATES 
    - ADDPAY_DETAILS 
    - ADDPAY_COUNT 
    - ADDPAY_PAYMENT_AMOUNT 
    - ADDPAY_APPROVED_PAYMENT 
    - ORIGINAL_CLAIM_APPROVED_DATE_EASTERN 
    - ORIGINAL_CLAIM_APPROVED_DATE_UTC
    - SERVICE_DATE_EASTERN
    - SERVICE_DATE_UTC
    - ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN 
    - ORIGINAL_CLAIM_SUBMITTED_DATE_UTC
    - ADDPAY_APPROVED_DATE_EASTERN
    - ADDPAY_APPROVED_DATE_UTC
    - ADDPAY_SUBMITTED_DATE_EASTERN
    - ADDPAY_SUBMITTED_DATE_UTC

In [39]:
rule_df.isnull().sum()

CASE_ID                                        0
TASK_ID                                        0
VENDOR_ID                                      0
CLAIM_ID                                       0
VENDOR_ADDRESS_ID                           5435
CLAIM_PO_NUMBER_ENTERED                     2646
ORIGINAL_CLAIM_PAYMENT_AMOUNT                  0
ORIGINAL_CLAIM_STATUS_CODE                     0
ORIGINAL_CLAIM_TYPE_CODE                       0
SUBMITTED_CLAIM_AMOUNT                         0
SUBMITTED_TOW_MILES                        58878
SUBMITTED_ENROUTE_MILES                    15740
SUBMITTED_LABOR_HOURS                     167879
SERVICE_ID                                     0
WAS_AUDITED                                    0
ADDCHARGE_AMOUNT                         2157721
IS_CLAIM_APPROVED                              0
ORIGINAL_CLAIM_APPROVED_PAYMENT                0
ADDCHARGE_DETAILS                        2157721
ADDCHARGE_COUNT                                0
BASE_TOTAL_FROM_RATE

In [86]:
# Observed fraction of NON-null values for every column that contains at least one null.
# (Despite its name, null_percents holds non-null fractions; the name is kept unchanged.)
null_fraction = batch.isnull().sum() / len(batch)  # computed once and reused for the mask
null_percents = 1 - null_fraction[null_fraction > 0]
for x, y in null_percents.items():
    print(x, y, sep='\n')

# Slack subtracted from each observed non-null fraction before it is used as a threshold.
# 0.01 matches the hand-set thresholds in the following cell (e.g. 0.998 -> 0.988).
NOT_NULL_MARGIN = 0.01

# These weights are used as `mostly` for expect_column_values_to_not_be_null, so each must be a
# minimum NON-null fraction. The earlier round(1 - weight, 3) stored the null fraction instead.
# max(..., 0.0) keeps mostly-null columns from getting a negative threshold.
not_null_weights = {
    column: round(max(float(observed) - NOT_NULL_MARGIN, 0.0), 3)
    for column, observed in null_percents.items()
}

print('---------------------------------------')
print('not null weights:')
for x, y in not_null_weights.items():
    print(x,y)

VENDOR_ADDRESS_ID
0.9979712147664209
CLAIM_PO_NUMBER_ENTERED
0.9990122970141582
SUBMITTED_TOW_MILES
0.9780219287980372
SUBMITTED_ENROUTE_MILES
0.9941245483759826
SUBMITTED_LABOR_HOURS
0.9373338663793892
ADDCHARGE_AMOUNT
0.19456255694876678
ADDCHARGE_DETAILS
0.19456255694876678
BASE_TOTAL_FROM_RATES
0.6519869963638645
ADDPAY_DETAILS
0.039364779317813015
ADDPAY_COUNT
0.039364779317813015
ADDPAY_PAYMENT_AMOUNT
0.039364406036261346
ADDPAY_APPROVED_PAYMENT
0.039364406036261346
ORIGINAL_CLAIM_APPROVED_DATE_EASTERN
0.9759229666327354
ORIGINAL_CLAIM_APPROVED_DATE_UTC
0.9759229666327354
SERVICE_DATE_EASTERN
0.9999212375925878
SERVICE_DATE_UTC
0.9999212375925878
ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN
0.998713671772785
ORIGINAL_CLAIM_SUBMITTED_DATE_UTC
0.998713671772785
ADDPAY_APPROVED_DATE_EASTERN
0.03733823377354428
ADDPAY_APPROVED_DATE_UTC
0.03733823377354428
ADDPAY_SUBMITTED_DATE_EASTERN
0.03936328619160612
ADDPAY_SUBMITTED_DATE_UTC
0.03936328619160612
---------------------------------------
n

In [68]:
# Hand-set minimum non-null fractions per column, adjusted to take into account addpays over
# daily batch data. Assigning here replaces any not_null_weights computed earlier.
not_null_weights = dict(
    VENDOR_ADDRESS_ID=0.988,
    CLAIM_PO_NUMBER_ENTERED=0.989,
    SUBMITTED_TOW_MILES=0.968,
    SUBMITTED_ENROUTE_MILES=0.984,
    SUBMITTED_LABOR_HOURS=0.927,
    ADDCHARGE_AMOUNT=0.185,
    ADDCHARGE_DETAILS=0.185,
    BASE_TOTAL_FROM_RATES=0.642,
    ADDPAY_DETAILS=0.005,
    ADDPAY_COUNT=0.005,
    ADDPAY_PAYMENT_AMOUNT=0.005,
    ADDPAY_APPROVED_PAYMENT=0.005,
    ORIGINAL_CLAIM_APPROVED_DATE_EASTERN=0.966,
    ORIGINAL_CLAIM_APPROVED_DATE_UTC=0.966,
    SERVICE_DATE_EASTERN=0.99,
    SERVICE_DATE_UTC=0.99,
    ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN=0.989,
    ORIGINAL_CLAIM_SUBMITTED_DATE_UTC=0.989,
    ADDPAY_APPROVED_DATE_EASTERN=0.002,
    ADDPAY_APPROVED_DATE_UTC=0.002,
    ADDPAY_SUBMITTED_DATE_EASTERN=0.005,
    ADDPAY_SUBMITTED_DATE_UTC=0.005,
)

In [71]:
# Validate the partially-null columns: each weight is passed as `mostly`, the minimum
# fraction of values that must be non-null for the expectation to succeed
for column, min_non_null_fraction in not_null_weights.items():
    tolerance_result = batch.expect_column_values_to_not_be_null(
        column,
        mostly=min_non_null_fraction,
        include_config=True,
        catch_exceptions=True,
        result_format='SUMMARY',
    )
    print(column, tolerance_result, sep='\n')

VENDOR_ADDRESS_ID
{'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 5435, 'unexpected_percent': 0.20287852335790646, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'VENDOR_ADDRESS_ID', 'mostly': 0.988, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CLAIM_PO_NUMBER_ENTERED
{'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 2646, 'unexpected_percent': 0.0987702985841804, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'CLAIM_PO_NUMBER_ENTERED', 'mostly': 0.989, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
SUBMITTED_TOW_MILES
{'success': True, 'result': {'element_count': 2678943, 'unexpected_count'

ADDPAY_APPROVED_DATE_EASTERN
{'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 2578916, 'unexpected_percent': 96.26617662264557, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'ADDPAY_APPROVED_DATE_EASTERN', 'mostly': 0.002, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_APPROVED_DATE_UTC
{'success': True, 'result': {'element_count': 2678943, 'unexpected_count': 2578916, 'unexpected_percent': 96.26617662264557, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'ADDPAY_APPROVED_DATE_UTC', 'mostly': 0.002, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_SUBMITTED_DATE_EASTERN
{'success': True, 'result': {'elemen

### 5. Expecting column values to be in a set
- original_claim_status_code
- original_claim_type_code
- service_id

In [72]:
batch.head()

Unnamed: 0,CASE_ID,TASK_ID,VENDOR_ID,CLAIM_ID,VENDOR_ADDRESS_ID,CLAIM_PO_NUMBER_ENTERED,ORIGINAL_CLAIM_PAYMENT_AMOUNT,ORIGINAL_CLAIM_STATUS_CODE,ORIGINAL_CLAIM_TYPE_CODE,SUBMITTED_CLAIM_AMOUNT,...,SERVICE_DATE_EASTERN,SERVICE_DATE_UTC,ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN,ORIGINAL_CLAIM_SUBMITTED_DATE_UTC,ADDPAY_APPROVED_DATE_EASTERN,ADDPAY_APPROVED_DATE_UTC,ADDPAY_SUBMITTED_DATE_EASTERN,ADDPAY_SUBMITTED_DATE_UTC,MODIFIED_DATE_EASTERN,MODIFIED_DATE_UTC
0,2260715166,1,56865,206510333,1.0,961863749.0,34.24,DP,VND,34.24,...,2018-12-05 00:00:00,2018-12-05 05:00:00,2019-11-22 14:24:25,2019-11-22 19:24:25,NaT,NaT,NaT,NaT,2019-11-22 14:24:25,2019-11-22 19:24:25
1,2260643050,1,56865,206510229,6.0,231268953.0,65.27,DP,VND,65.27,...,2018-12-04 00:00:00,2018-12-04 05:00:00,2019-11-22 14:12:54,2019-11-22 19:12:54,NaT,NaT,NaT,NaT,2019-11-22 14:12:54,2019-11-22 19:12:54
2,2233051739,1,56865,112937786,1.0,952577978.0,69.02,DP,VND,69.02,...,2018-02-19 00:00:00,2018-02-19 05:00:00,2019-10-03 11:51:05,2019-10-03 15:51:05,NaT,NaT,NaT,NaT,2019-10-03 11:51:05,2019-10-03 15:51:05
3,2232812263,1,56865,204367999,1.0,170214814.0,42.27,DP,VND,42.27,...,2018-02-16 00:00:00,2018-02-16 05:00:00,2019-10-02 14:42:16,2019-10-02 18:42:16,NaT,NaT,NaT,NaT,2019-10-02 14:42:16,2019-10-02 18:42:16
4,2289358827,2,127957,89538584,1.0,,-507.69,PD,VND,-507.69,...,2018-12-21 12:00:00,2018-12-21 17:00:00,2019-10-21 17:28:51,2019-10-21 21:28:51,NaT,NaT,NaT,NaT,2019-10-21 17:48:55,2019-10-21 21:48:55


In [73]:
category_cols = ['ORIGINAL_CLAIM_STATUS_CODE', 'ORIGINAL_CLAIM_TYPE_CODE', 'SERVICE_ID']

In [74]:
# Show the distinct values observed in each categorical column (counts sorted by value)
for column in category_cols:
    observed_counts = batch.get_column_value_counts(column, sort='value')
    print(list(observed_counts.keys()), '\n')

['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'] 

['OWN', 'VND'] 

['GOA', 'LOCK', 'Lock', 'NA', 'REPO', 'ROAD', 'Road', 'TOW', 'Tow', 'Winch', 'null', 'tow'] 



In [75]:
print(batch.get_column_value_counts('VENDOR_ADDRESS_ID', sort='value'))

value
-1.0           6
 0.0       46432
 1.0     2331235
 2.0      143810
 3.0       60882
 4.0       64553
 5.0       10378
 6.0        4157
 7.0        6718
 8.0        1401
 9.0         783
 10.0       1727
 11.0        781
 12.0        330
 13.0         50
 14.0        121
 15.0         45
 17.0         87
 19.0         12
Name: count, dtype: int64


In [76]:
# One list of observed values per categorical column, in the same order as category_cols
val_set_list = [
    list(batch.get_column_value_counts(column, sort='value').keys())
    for column in category_cols
]

In [77]:
val_set_list

[['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'],
 ['OWN', 'VND'],
 ['GOA',
  'LOCK',
  'Lock',
  'NA',
  'REPO',
  'ROAD',
  'Road',
  'TOW',
  'Tow',
  'Winch',
  'null',
  'tow']]

In [78]:
for col, x in zip(category_cols,val_set_list):
    print(col, '\n', x, '\n')

ORIGINAL_CLAIM_STATUS_CODE 
 ['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'] 

ORIGINAL_CLAIM_TYPE_CODE 
 ['OWN', 'VND'] 

SERVICE_ID 
 ['GOA', 'LOCK', 'Lock', 'NA', 'REPO', 'ROAD', 'Road', 'TOW', 'Tow', 'Winch', 'null', 'tow'] 



In [79]:
# Pair each categorical column with its observed value list and record an in-set expectation.
# NOTE(review): the value sets come from this same batch, so they carry its case variants
# (e.g. 'TOW' / 'Tow' / 'tow') and the literal 'null' — confirm that is the intended rule.
for column, allowed_values in zip(category_cols, val_set_list):
    in_set_result = batch.expect_column_values_to_be_in_set(
        column, allowed_values, result_format='BASIC', include_config=True, catch_exceptions=True
    )
    print(column, '\n', in_set_result, '\n')

ORIGINAL_CLAIM_STATUS_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_STATUS_CODE', 'value_set': ['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'], 'result_format': 'BASIC'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}} 

ORIGINAL_CLAIM_TYPE_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_TYPE_CODE', 'value_set': ['OWN', 'VND'

### 6. Determine if columns are json parseable
- addcharge_details
- addpay_details

In [106]:
# Raise pandas' display limits (row count, column count, line width) for the cells that follow
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [108]:
# expect_column_values_to_be_json_parseable:
# the two detail columns must hold JSON-parseable values (mostly=0.98)
json_columns = ["ADDCHARGE_DETAILS", "ADDPAY_DETAILS"]
for column in json_columns:
    parse_result = batch.expect_column_values_to_be_json_parseable(
        column, result_format='BASIC', mostly=0.98, catch_exceptions=True
    )
    print(column, parse_result)

ADDCHARGE_DETAILS {'success': True, 'result': {'element_count': 2674621, 'missing_count': 2152143, 'missing_percent': 80.46534443571632, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ADDPAY_DETAILS {'success': True, 'result': {'element_count': 2674621, 'missing_count': 2573850, 'missing_percent': 96.23232600058101, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}


### 7. Determine if column values match a JSON schema
- addcharge_details: TODO
- addpay_details: see below

In [109]:
# JSON Schema (draft-07) for the ADDPAY_DETAILS column: an array (or null) of addpay objects
# (or nulls). All 13 properties are listed under "required", and every property type also
# admits null. The boolean default/example use Python literals because this is a Python dict.
addpay_schema = {
    "definitions": {},
    "$schema": "http://json-schema.org/draft-07/schema#",
    "$id": "http://example.com/root.json",
    "type": ["array", "null"],
    "title": "The Root Schema",
    "items": {
        "$id": "#/items",
        "type": ["object", "null"],
        "title": "The Items Schema",
        "required": [
            "addpay_approved_date",
            "addpay_approved_payment_amount",
            "addpay_claim_id",
            "addpay_is_approved",
            "addpay_payment_amount",
            "addpay_submitted_date",
            "addpay_task_id",
            "addpay_vendor_id",
            "original_claim_amount_paid",
            "original_claim_amount_requested",
            "original_claim_id",
            "original_claim_vendor_id",
            "service_date",
        ],
        "properties": {
            "addpay_approved_date": {
                "$id": "#/items/properties/addpay_approved_date",
                "type": ["string", "null"],
                "format": "date-time",
                "title": "The Addpay_approved_date Schema",
                "default": "",
                "examples": ["2019-02-20T04:02:00"],
                "pattern": "^(.*)$",
            },
            "addpay_approved_payment_amount": {
                "$id": "#/items/properties/addpay_approved_payment_amount",
                "type": ["number", "null"],
                "title": "The Addpay_approved_payment_amount Schema",
                "default": 0,
                "examples": [27],
            },
            "addpay_claim_id": {
                "$id": "#/items/properties/addpay_claim_id",
                "type": ["number", "null"],
                "title": "The Addpay_claim_id Schema",
                "default": 0,
                "examples": [88551584],
            },
            "addpay_is_approved": {
                "$id": "#/items/properties/addpay_is_approved",
                "type": ["boolean", "null"],
                "title": "The Addpay_is_approved Schema",
                "default": False,
                "examples": [True],
            },
            "addpay_payment_amount": {
                "$id": "#/items/properties/addpay_payment_amount",
                "type": ["number", "null"],
                "title": "The Addpay_payment_amount Schema",
                "default": 0,
                "examples": [27],
            },
            "addpay_submitted_date": {
                "$id": "#/items/properties/addpay_submitted_date",
                "type": ["string", "null"],
                "format": "date-time",
                "title": "The Addpay_submitted_date Schema",
                "default": "",
                "examples": ["2019-02-19T00:00:00"],
                "pattern": "^(.*)$",
            },
            "addpay_task_id": {
                "$id": "#/items/properties/addpay_task_id",
                "type": ["number", "null"],
                "title": "The Addpay_task_id Schema",
                "default": 0,
                "examples": [2],
            },
            "addpay_vendor_id": {
                "$id": "#/items/properties/addpay_vendor_id",
                "type": ["string", "null"],
                "title": "The Addpay_vendor_id Schema",
                "default": "",
                "examples": ["133573"],
                "pattern": "^(.*)$",
            },
            "original_claim_amount_paid": {
                "$id": "#/items/properties/original_claim_amount_paid",
                "type": ["number", "null"],
                "title": "The Original_claim_amount_paid Schema",
                "default": 0.0,
                "examples": [57.95],
            },
            "original_claim_amount_requested": {
                "$id": "#/items/properties/original_claim_amount_requested",
                "type": ["number", "null"],
                "title": "The Original_claim_amount_requested Schema",
                "default": 0,
                "examples": [59],
            },
            "original_claim_id": {
                "$id": "#/items/properties/original_claim_id",
                "type": ["number", "null"],
                "title": "The Original_claim_id Schema",
                "default": 0,
                "examples": [195082625],
            },
            "original_claim_vendor_id": {
                "$id": "#/items/properties/original_claim_vendor_id",
                "type": ["string", "null"],
                "title": "The Original_claim_vendor_id Schema",
                "default": "",
                "examples": ["133573"],
                "pattern": "^(.*)$",
            },
            "service_date": {
                "$id": "#/items/properties/service_date",
                "type": ["string", "null"],
                "format": "date-time",
                "title": "The Service_date Schema",
                "default": "",
                "examples": ["2019-02-14T11:50:21"],
                "pattern": "^(.*)$",
            },
        },
    },
}

In [110]:
# Validate the ADDPAY_DETAILS column against addpay_schema.
# mostly=0.92: the expectation succeeds when at least 92% of the non-missing
# values conform; result_format='BASIC' keeps the returned summary short.
batch.expect_column_values_to_match_json_schema(
    column='ADDPAY_DETAILS',
    json_schema=addpay_schema,
    mostly=0.92,
    result_format='BASIC',
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 2573850,
  'missing_percent': 96.23232600058101,
  'unexpected_count': 2647,
  'unexpected_percent': 0.09896729293608329,
  'unexpected_percent_nonmissing': 2.6267477746573915,
  'partial_unexpected_list': ['[\n  {\n    "addpay_approved_date": "2019-11-02T04:03:05",\n    "addpay_approved_payment_amount": 21,\n    "addpay_claim_id": 89581222,\n    "addpay_is_approved": true,\n    "addpay_payment_amount": 21,\n    "addpay_submitted_date": "2019-11-01T00:00:00",\n    "addpay_vendor_id": "300733",\n    "original_claim_amount_paid": null,\n    "original_claim_amount_requested": null,\n    "original_claim_id": null,\n    "original_claim_task_id": null,\n    "original_claim_vendor_id": null,\n    "service_date": "2019-10-10T14:48:38"\n  }\n]',
   '[\n  {\n    "addpay_approved_date": "2019-10-24T12:22:04",\n    "addpay_approved_payment_amount": 100,\n    "addpay_claim_id": 89549080,\n    "addpay_is_approved": true,\n   

In [112]:
# JSON Schema (draft-07) used to validate ADDCHARGE_DETAILS values:
# a nullable array whose items are nullable objects carrying the optional
# `addcharge_amount` (number/null) and `addcharge_type` (string/null) fields.
addcharge_schema = {
    "definitions": {},
    "$schema": "http://json-schema.org/draft-07/schema#",
    "$id": "http://example.com/root.json",
    "type": [
        "array",
        "null",
    ],
    "title": "The Root Schema",
    "items": {
        "$id": "#/items",
        "type": [
            "object",
            "null",
        ],
        "title": "The Items Schema",
        "properties": {
            "addcharge_amount": {
                "$id": "#/items/properties/addcharge_amount",
                "type": [
                    "number",
                    "null",
                ],
                "title": "The Addcharge_amount Schema",
            },
            "addcharge_type": {
                "$id": "#/items/properties/addcharge_type",
                "type": [
                    "string",
                    "null",
                ],
                "title": "The Addcharge_type Schema",
            },
        },
        # Any property not declared above must be a string.
        "additionalProperties": {"type": "string"},
    },
}

In [113]:
# Validate the ADDCHARGE_DETAILS column against addcharge_schema; the
# expectation succeeds when at least 80% of the non-missing values conform.
batch.expect_column_values_to_match_json_schema(
    column='ADDCHARGE_DETAILS',
    json_schema=addcharge_schema,
    mostly=0.8,
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 2152143,
  'missing_percent': 80.46534443571632,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': [],
  'unexpected_list': [],
  'unexpected_index_list': []}}

### 8. Determine if columns are unique per row
- CASE_ID, TASK_ID
- CASE_ID, TASK_ID, VENDOR_ID
- CLAIM_ID

In [114]:
# NOTE(review): per the Great Expectations docs this expectation checks that
# the listed columns hold distinct values *within each row*; it does not test
# that the (CASE_ID, TASK_ID) combination is unique across rows -- confirm
# this is the intended check.
batch.expect_multicolumn_values_to_be_unique(
    column_list=['CASE_ID', 'TASK_ID'],
    result_format='SUMMARY',
    catch_exceptions=True,   # report errors in 'exception_info' instead of raising
    include_config=True,     # echo the expectation config in the result
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'expectation_config': {'expectation_type': 'expect_multicolumn_values_to_be_unique',
  'kwargs': {'column_list': ['CASE_ID', 'TASK_ID'],
   'result_format': 'SUMMARY'}},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

In [115]:
# NOTE(review): per the Great Expectations docs this expectation checks that
# the listed columns hold distinct values *within each row*; it does not test
# that the (CASE_ID, TASK_ID, VENDOR_ID) combination is unique across rows --
# confirm this is the intended check.
batch.expect_multicolumn_values_to_be_unique(
    column_list=['CASE_ID', 'TASK_ID', 'VENDOR_ID'],
    result_format='SUMMARY',
    catch_exceptions=True,   # report errors in 'exception_info' instead of raising
    include_config=True,     # echo the expectation config in the result
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'expectation_config': {'expectation_type': 'expect_multicolumn_values_to_be_unique',
  'kwargs': {'column_list': ['CASE_ID', 'TASK_ID', 'VENDOR_ID'],
   'result_format': 'SUMMARY'}},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

In [132]:
# Expect every CLAIM_ID value to be unique and print the plain-text result.
print(
    batch.expect_column_values_to_be_unique(
        column='CLAIM_ID',
        catch_exceptions=True,
    )
)

{'success': True, 'result': {'element_count': 2674621, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}


### 9. Expect column A to be greater than column B
- ORIGINAL_CLAIM_SUBMITTED_DATE_UTC > SERVICE_DATE_UTC
- ORIGINAL_CLAIM_APPROVED_DATE_UTC > ORIGINAL_CLAIM_SUBMITTED_DATE_UTC
- MODIFIED_DATE_UTC >= ADDPAY_APPROVED_DATE_UTC

In [118]:
# Expect ORIGINAL_CLAIM_SUBMITTED_DATE_UTC to be strictly greater than
# SERVICE_DATE_UTC in at least 99% of the rows where both values are present.
batch.expect_column_pair_values_A_to_be_greater_than_B(
    column_A='ORIGINAL_CLAIM_SUBMITTED_DATE_UTC',
    column_B='SERVICE_DATE_UTC',
    mostly=0.99,
    ignore_row_if='either_value_is_missing',
    result_format='SUMMARY',
    catch_exceptions=True,
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 3194,
  'missing_percent': 0.11941878868071401,
  'unexpected_count': 116,
  'unexpected_percent': 0.004337063082956427,
  'unexpected_percent_nonmissing': 0.004342248543568662,
  'partial_unexpected_list': [['2019-10-27 04:00:00', '2019-10-28 01:01:47'],
   ['2019-11-22 05:00:00', '2019-11-23 04:16:49'],
   ['2019-11-18 05:00:00', '2019-11-18 14:47:05'],
   ['2019-11-18 05:00:00', '2019-11-18 21:19:46'],
   ['2019-11-08 00:29:51', '2019-11-08 05:00:00'],
   ['2019-11-04 05:00:00', '2019-11-05 11:08:28'],
   ['2019-11-16 05:00:00', '2019-11-16 09:39:48'],
   ['2019-11-11 05:00:00', '2019-11-11 05:00:00'],
   ['2019-11-04 05:00:00', '2019-11-04 13:27:36'],
   ['2019-11-07 05:00:00', '2019-11-07 14:42:09'],
   ['2019-11-04 05:00:00', '2019-11-06 00:01:14'],
   ['2019-11-18 05:00:00', '2019-11-18 16:19:18'],
   ['2019-11-23 05:00:00', '2019-11-23 07:38:28'],
   ['2019-12-30 05:00:00', '2019-12-30 12:17:37'],
   ['2

In [119]:
# Expect ORIGINAL_CLAIM_APPROVED_DATE_UTC to be strictly greater than
# ORIGINAL_CLAIM_SUBMITTED_DATE_UTC in at least 99% of the rows where both
# values are present.
batch.expect_column_pair_values_A_to_be_greater_than_B(
    column_A='ORIGINAL_CLAIM_APPROVED_DATE_UTC',
    column_B='ORIGINAL_CLAIM_SUBMITTED_DATE_UTC',
    mostly=0.99,
    ignore_row_if='either_value_is_missing',
    result_format='SUMMARY',
    catch_exceptions=True,
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 66043,
  'missing_percent': 2.4692470447214765,
  'unexpected_count': 151,
  'unexpected_percent': 0.005645659702813969,
  'unexpected_percent_nonmissing': 0.005788594398940726,
  'partial_unexpected_list': [['2019-10-16 14:39:34', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-12-17 05:10:42', '2019-12-17 05:10:42'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-10-16 08:18:36', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2020-01-03 13:53:34', '2020-12-31 05:00:00'],
   ['2019-12-15 05:01:23', '2019-12-15 05:01:23'],
   ['2019-12-28 05:00:42', '2019-12-28 05:00:42'],
   ['2020-01-03 13:53:34', '2020-12-31 05:00:00'],
   ['2020-01-23 17:00:13', '2020-01-23 17:00:13'],
   ['2020-01-01 05:14:15', '2020-01-01 05:14:15'],
   ['2020-01-21 05:18:08', '2020-01-21 05:18:08'],
   ['2

In [124]:
# Expect MODIFIED_DATE_UTC >= ADDPAY_APPROVED_DATE_UTC (or_equal=True allows
# ties) for every row where both values are present; no `mostly` tolerance.
batch.expect_column_pair_values_A_to_be_greater_than_B(
    column_A='MODIFIED_DATE_UTC',
    column_B='ADDPAY_APPROVED_DATE_UTC',
    or_equal=True,
    ignore_row_if='either_value_is_missing',
    result_format='SUMMARY',
    catch_exceptions=True,
)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 2578267,
  'missing_percent': 96.39747089400704,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

### 10. Expect columns to be of a certain data type

In [125]:
# Display the dtype of every column in rule_df
# (presumably the raw DataFrame that `batch` was built from -- TODO confirm).
rule_df.dtypes

CASE_ID                                           int64
TASK_ID                                            int8
VENDOR_ID                                        object
CLAIM_ID                                          int32
VENDOR_ADDRESS_ID                               float64
CLAIM_PO_NUMBER_ENTERED                         float64
ORIGINAL_CLAIM_PAYMENT_AMOUNT                   float64
ORIGINAL_CLAIM_STATUS_CODE                       object
ORIGINAL_CLAIM_TYPE_CODE                         object
SUBMITTED_CLAIM_AMOUNT                          float64
SUBMITTED_TOW_MILES                             float64
SUBMITTED_ENROUTE_MILES                         float64
SUBMITTED_LABOR_HOURS                           float64
SERVICE_ID                                       object
WAS_AUDITED                                        bool
ADDCHARGE_AMOUNT                                float64
IS_CLAIM_APPROVED                                  bool
ORIGINAL_CLAIM_APPROVED_PAYMENT                 

In [126]:
# Print each column name next to its pandas dtype.
# Series.items() is used instead of iteritems(): iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0, while items() works on every version.
for column_name, column_dtype in batch.dtypes.items():
    print(column_name, column_dtype)

CASE_ID int64
TASK_ID int8
VENDOR_ID object
CLAIM_ID int32
VENDOR_ADDRESS_ID float64
CLAIM_PO_NUMBER_ENTERED float64
ORIGINAL_CLAIM_PAYMENT_AMOUNT float64
ORIGINAL_CLAIM_STATUS_CODE object
ORIGINAL_CLAIM_TYPE_CODE object
SUBMITTED_CLAIM_AMOUNT float64
SUBMITTED_TOW_MILES float64
SUBMITTED_ENROUTE_MILES float64
SUBMITTED_LABOR_HOURS float64
SERVICE_ID object
WAS_AUDITED bool
ADDCHARGE_AMOUNT float64
IS_CLAIM_APPROVED bool
ORIGINAL_CLAIM_APPROVED_PAYMENT float64
ADDCHARGE_DETAILS object
ADDCHARGE_COUNT int16
BASE_TOTAL_FROM_RATES float64
ADDPAY_DETAILS object
ADDPAY_COUNT float64
ADDPAY_PAYMENT_AMOUNT float64
ADDPAY_APPROVED_PAYMENT float64
COMPLETE_APPROVED_PAYMENT float64
IS_VCC bool
ORIGINAL_CLAIM_APPROVED_DATE_EASTERN datetime64[ns]
ORIGINAL_CLAIM_APPROVED_DATE_UTC datetime64[ns]
SERVICE_DATE_EASTERN datetime64[ns]
SERVICE_DATE_UTC datetime64[ns]
ORIGINAL_CLAIM_SUBMITTED_DATE_EASTERN datetime64[ns]
ORIGINAL_CLAIM_SUBMITTED_DATE_UTC datetime64[ns]
ADDPAY_APPROVED_DATE_EASTERN datetime

In [127]:
# Map each column name to its dtype object.
# Series.items() replaces iteritems(), which pandas deprecated in 1.5 and
# removed in 2.0.
network_claims_data_types = dict(batch.dtypes.items())

In [128]:
# Replace every dtype value with its string name (e.g. 'int64'), updating the
# existing dict in place so the mapping can be passed to type expectations.
network_claims_data_types.update(
    {column_name: str(column_dtype)
     for column_name, column_dtype in network_claims_data_types.items()}
)

In [129]:
# Display the column-name -> dtype-name mapping for a visual check.
network_claims_data_types

{'CASE_ID': 'int64',
 'TASK_ID': 'int8',
 'VENDOR_ID': 'object',
 'CLAIM_ID': 'int32',
 'VENDOR_ADDRESS_ID': 'float64',
 'CLAIM_PO_NUMBER_ENTERED': 'float64',
 'ORIGINAL_CLAIM_PAYMENT_AMOUNT': 'float64',
 'ORIGINAL_CLAIM_STATUS_CODE': 'object',
 'ORIGINAL_CLAIM_TYPE_CODE': 'object',
 'SUBMITTED_CLAIM_AMOUNT': 'float64',
 'SUBMITTED_TOW_MILES': 'float64',
 'SUBMITTED_ENROUTE_MILES': 'float64',
 'SUBMITTED_LABOR_HOURS': 'float64',
 'SERVICE_ID': 'object',
 'WAS_AUDITED': 'bool',
 'ADDCHARGE_AMOUNT': 'float64',
 'IS_CLAIM_APPROVED': 'bool',
 'ORIGINAL_CLAIM_APPROVED_PAYMENT': 'float64',
 'ADDCHARGE_DETAILS': 'object',
 'ADDCHARGE_COUNT': 'int16',
 'BASE_TOTAL_FROM_RATES': 'float64',
 'ADDPAY_DETAILS': 'object',
 'ADDPAY_COUNT': 'float64',
 'ADDPAY_PAYMENT_AMOUNT': 'float64',
 'ADDPAY_APPROVED_PAYMENT': 'float64',
 'COMPLETE_APPROVED_PAYMENT': 'float64',
 'IS_VCC': 'bool',
 'ORIGINAL_CLAIM_APPROVED_DATE_EASTERN': 'datetime64[ns]',
 'ORIGINAL_CLAIM_APPROVED_DATE_UTC': 'datetime64[ns]',
 'SE

In [130]:
# For every column, expect its values to be of the dtype name recorded in
# network_claims_data_types, and print each validation result as it is produced.
for column_name, expected_dtype in network_claims_data_types.items():
    dtype_result = batch.expect_column_values_to_be_of_type(
        column_name,
        expected_dtype,
        result_format='SUMMARY',
        catch_exceptions=True,
    )
    print(dtype_result)

{'success': True, 'result': {'observed_value': 'int64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int8'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'object_'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int32'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'obser

### Review the expectations

Expectations that were true on this data batch were added. To view all the expectations you added so far about this data asset, do:

In [131]:
# Show the expectation suite accumulated on this batch. Per the log line this
# call emits, expectations that failed on their last run are omitted unless
# discard_failed_expectations=False is passed.
batch.get_expectation_suite()

2020-02-24T12:38:40-0500 - INFO - 	128 expectation(s) included in expectation_suite. Omitting 3 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.


{'data_asset_name': 'agero_dsa_pandas/default/network_claims',
 'meta': {'great_expectations.__version__': '0.8.8'},
 'expectations': [{'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_AMOUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_EASTERN'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_UTC'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_PAYMENT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_PAYME

In [41]:
# Persist the batch's expectation suite (presumably to the project's
# expectations store configured in great_expectations.yml -- TODO confirm).
# NOTE(review): the recorded output for this cell (In [41], 64 expectations) is
# older than the 128-expectation suite reported by get_expectation_suite();
# re-run this cell last so the saved suite matches the notebook.
batch.save_expectation_suite()

2020-01-31T18:26:29-0500 - INFO - 	64 expectation(s) included in expectation_suite. result_format settings filtered.


### You created and saved expectations for at least one of the data assets.

### We will show you how to set up validation - the process of checking if new files of this type conform to your expectations before they are processed by your pipeline's code. 

### Go to [integrate_validation_into_pipeline.ipynb](integrate_validation_into_pipeline.ipynb) to proceed.


