In [1]:
import os
import json
import sys
import great_expectations as ge
import great_expectations.jupyter_ux
from datetime import datetime
import math
import pandas as pd
# Work from the Great Expectations project root: the DataContext created below
# picks up great_expectations.yml from here, and the relative 'temp_data/...'
# paths used later are resolved against it. Set GE_PROJECT_DIR to run this
# notebook from another checkout; the default is the original local path.
GE_PROJECT_DIR = os.environ.get(
    'GE_PROJECT_DIR',
    '/Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations',
)
os.chdir(GE_PROJECT_DIR)

2020-03-01T21:37:24-0500 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.


In [2]:
import ge_prod.ge_data_access as gda
import ge_prod.queries as queries

In [3]:
# Look up the SQL text for this table / period. A missing entry is reported
# here, by name, instead of leaving rule_query as None (or raising an
# AttributeError on None) and failing later when the query is executed.
rule_query = queries.queries.get('service_progress', {}).get('create_expectations_2019Q4')
if rule_query is None:
    raise KeyError("no query registered for 'service_progress' / 'create_expectations_2019Q4'")

In [4]:
rule_query

"SELECT * FROM service_progress where completed_time_utc >= to_date('2019-10-01') and completed_time_utc <= to_date('2019-12-31');"

# Author Expectations



[**Watch a short tutorial video**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#video)

[**Read more in the tutorial**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)

**Reach out for help on** [**Great Expectations Slack**](https://tinyurl.com/great-expectations-slack)


### Get a DataContext object
[Read more in the tutorial](https://great-expectations.readthedocs.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-datacontext-object)




In [5]:
context = ge.data_context.DataContext()

2020-03-01T21:38:07-0500 - INFO - Using project config: /Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations/great_expectations.yml


### List data assets in your project

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#data-assets)


In [6]:
great_expectations.jupyter_ux.list_available_data_asset_names(context)

Inspecting your data sources. This may take a moment...


#### Pick one of the data asset names listed above and use it as the value of the `data_asset_name` argument below

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-batch)


### Specify data_asset & expectation_suite_name

In [7]:
# Short asset name for this notebook; normalize_data_asset_name expands it to
# the (datasource, generator, generator_asset) form printed below.
data_asset_name = 'service_progress'
normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)
print(normalized_data_asset_name)

NormalizedDataAssetName(datasource='agero_dsa_pandas', generator='default', generator_asset='service_progress')


### Create a new empty expectation suite

In [10]:
# NOTE(review): overwrite_existing=True is passed, so re-running this cell
# presumably replaces any suite already stored under this name with the empty
# one shown in the output — confirm before re-running against a curated suite.
expectation_suite_name = 'warnings_2019Q4'
context.create_expectation_suite(data_asset_name=normalized_data_asset_name, expectation_suite_name=expectation_suite_name,
                                overwrite_existing=True)

{'data_asset_name': 'agero_dsa_pandas/default/service_progress',
 'meta': {'great_expectations.__version__': '0.8.8'},
 'expectations': []}

In [11]:
context.list_expectation_suite_keys()

[{'data_asset_name': agero_dsa_pandas/default/customer_experience,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/customer_complaints,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,
 {'data_asset_name': agero_dsa_pandas/default/service_progress,

### Get batch to create expectations against

In [12]:
# Pull the 2019Q4 service_progress extract (rule_query) into memory. The frame
# is pickled to temp_data/service_progress_2019Q4.pkl in a following cell.
rule_df = gda.snowflake_connector_to_df(rule_query)

In [20]:
rule_df.to_pickle('temp_data/service_progress_2019Q4.pkl')

In [13]:
rule_df.shape

(2125493, 42)

In [14]:
# Hand the in-memory frame to the DataContext as the batch for this asset and
# suite; the expect_* calls below are made on `batch`.
b_kwargs = {"dataset": rule_df}
batch = context.get_batch(normalized_data_asset_name, expectation_suite_name=expectation_suite_name,
                         batch_kwargs=b_kwargs)

In [21]:
batch.get_row_count()

2125493

In [22]:
print(rule_df.shape)

(2125493, 42)


In [23]:
[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']

['agero_dsa_pandas']

In [24]:
# getting rule_df batchId & fingerprint
rule_batch_fingerprint = batch.batch_fingerprint
rule_batch_id = batch.batch_id

In [25]:
print('rule_batch_fingerprint: ', rule_batch_fingerprint, sep='\n')
print('rule_batch_id: ', rule_batch_id, sep='\n')

rule_batch_fingerprint: 
{'partition_id': '20200302T024059.144845Z', 'fingerprint': '7226722f7525871f1fcf9480acec6a89'}
rule_batch_id: 
{'timestamp': 1583116829.278037, 'PandasInMemoryDF': True, 'fingerprint': 'e1ed5ce77da0b50a6823c67f5dce8c6c'}


## Author Expectations

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#create-expectations)

See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations)


### Dataset exploration & understanding of fields to ensure rules reflect behavior of data
- Validating columns to exist in table shape
- Expected column count in table shape
- Expected set values to be seen in given column
- Expected columns to have null or non-null values X percentage of the time
- Expect column values to be of certain data type(s)
- Placing max and min values limits on numerical columns
- Average or median column value to be within a certain range
- Expecting column A to be larger/smaller than column B

### 1. Validating to see if every column exists in table

In [26]:
# add more expectations here
column_names = batch.get_table_columns()
column_names

['CASE_ID',
 'TASK_ID',
 'DRIVER_PROFILE_ID',
 'DISPATCHER_PROFILE_ID',
 'ENROUTE_LATITUDE',
 'ENROUTE_LONGITUDE',
 'ONSCENE_LATITUDE',
 'ONSCENE_LONGITUDE',
 'TOW_IN_PROGRESS_LATITUDE',
 'TOW_IN_PROGRESS_LONGITUDE',
 'TOW_ARRIVED_LATITUDE',
 'TOW_ARRIVED_LONGITUDE',
 'COMPLETED_LATITUDE',
 'COMPLETED_LONGITUDE',
 'STATUS_SOURCE',
 'ASSIGNED_SOURCE_DETAILS',
 'ENROUTE_SOURCE_DETAILS',
 'ONSCENE_SOURCE_DETAILS',
 'TOW_IN_PROGRESS_SOURCE_DETAILS',
 'TOW_ARRIVED_SOURCE_DETAILS',
 'COMPLETED_SOURCE_DETAILS',
 'SERVICE_TIME_EASTERN',
 'SERVICE_TIME_UTC',
 'SERVICE_TIME_LOCAL',
 'ASSIGNED_TIME_EASTERN',
 'ASSIGNED_TIME_UTC',
 'ASSIGNED_TIME_LOCAL',
 'ENROUTE_TIME_EASTERN',
 'ENROUTE_TIME_UTC',
 'ENROUTE_TIME_LOCAL',
 'ONSCENE_TIME_EASTERN',
 'ONSCENE_TIME_UTC',
 'ONSCENE_TIME_LOCAL',
 'TOW_IN_PROGRESS_TIME_EASTERN',
 'TOW_IN_PROGRESS_TIME_UTC',
 'TOW_IN_PROGRESS_TIME_LOCAL',
 'TOW_ARRIVED_TIME_EASTERN',
 'TOW_ARRIVED_TIME_UTC',
 'TOW_ARRIVED_TIME_LOCAL',
 'COMPLETED_TIME_EASTERN',
 'COMPLETE

In [30]:
colnames = list(batch.columns)
# colnames.sort()

In [31]:
print(colnames)

['CASE_ID', 'TASK_ID', 'DRIVER_PROFILE_ID', 'DISPATCHER_PROFILE_ID', 'ENROUTE_LATITUDE', 'ENROUTE_LONGITUDE', 'ONSCENE_LATITUDE', 'ONSCENE_LONGITUDE', 'TOW_IN_PROGRESS_LATITUDE', 'TOW_IN_PROGRESS_LONGITUDE', 'TOW_ARRIVED_LATITUDE', 'TOW_ARRIVED_LONGITUDE', 'COMPLETED_LATITUDE', 'COMPLETED_LONGITUDE', 'STATUS_SOURCE', 'ASSIGNED_SOURCE_DETAILS', 'ENROUTE_SOURCE_DETAILS', 'ONSCENE_SOURCE_DETAILS', 'TOW_IN_PROGRESS_SOURCE_DETAILS', 'TOW_ARRIVED_SOURCE_DETAILS', 'COMPLETED_SOURCE_DETAILS', 'SERVICE_TIME_EASTERN', 'SERVICE_TIME_UTC', 'SERVICE_TIME_LOCAL', 'ASSIGNED_TIME_EASTERN', 'ASSIGNED_TIME_UTC', 'ASSIGNED_TIME_LOCAL', 'ENROUTE_TIME_EASTERN', 'ENROUTE_TIME_UTC', 'ENROUTE_TIME_LOCAL', 'ONSCENE_TIME_EASTERN', 'ONSCENE_TIME_UTC', 'ONSCENE_TIME_LOCAL', 'TOW_IN_PROGRESS_TIME_EASTERN', 'TOW_IN_PROGRESS_TIME_UTC', 'TOW_IN_PROGRESS_TIME_LOCAL', 'TOW_ARRIVED_TIME_EASTERN', 'TOW_ARRIVED_TIME_UTC', 'TOW_ARRIVED_TIME_LOCAL', 'COMPLETED_TIME_EASTERN', 'COMPLETED_TIME_UTC', 'COMPLETED_TIME_LOCAL']


In [32]:
master_column_names = ['CASE_ID', 'TASK_ID', 'DRIVER_PROFILE_ID', 'DISPATCHER_PROFILE_ID', 'ENROUTE_LATITUDE',
                        'ENROUTE_LONGITUDE', 'ONSCENE_LATITUDE', 'ONSCENE_LONGITUDE', 'TOW_IN_PROGRESS_LATITUDE',
                        'TOW_IN_PROGRESS_LONGITUDE', 'TOW_ARRIVED_LATITUDE', 'TOW_ARRIVED_LONGITUDE',
                        'COMPLETED_LATITUDE', 'COMPLETED_LONGITUDE', 'STATUS_SOURCE', 'ASSIGNED_SOURCE_DETAILS',
                        'ENROUTE_SOURCE_DETAILS', 'ONSCENE_SOURCE_DETAILS', 'TOW_IN_PROGRESS_SOURCE_DETAILS',
                        'TOW_ARRIVED_SOURCE_DETAILS', 'COMPLETED_SOURCE_DETAILS', 'SERVICE_TIME_EASTERN',
                        'SERVICE_TIME_UTC', 'SERVICE_TIME_LOCAL', 'ASSIGNED_TIME_EASTERN', 'ASSIGNED_TIME_UTC',
                        'ASSIGNED_TIME_LOCAL', 'ENROUTE_TIME_EASTERN', 'ENROUTE_TIME_UTC', 'ENROUTE_TIME_LOCAL',
                        'ONSCENE_TIME_EASTERN', 'ONSCENE_TIME_UTC', 'ONSCENE_TIME_LOCAL',
                        'TOW_IN_PROGRESS_TIME_EASTERN', 'TOW_IN_PROGRESS_TIME_UTC', 'TOW_IN_PROGRESS_TIME_LOCAL',
                        'TOW_ARRIVED_TIME_EASTERN', 'TOW_ARRIVED_TIME_UTC', 'TOW_ARRIVED_TIME_LOCAL',
                        'COMPLETED_TIME_EASTERN', 'COMPLETED_TIME_UTC', 'COMPLETED_TIME_LOCAL']

In [33]:
len(master_column_names)

42

In [34]:
len(column_names)

42

In [35]:
# Check that every column of the reference schema is present in the batch and
# print each column name followed by its validation result.
for col in master_column_names:
    existence_result = batch.expect_column_to_exist(
        col,
        result_format='BASIC',
        catch_exceptions=True,
    )
    print(col + ':', existence_result, sep='\n')

CASE_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
TASK_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
DRIVER_PROFILE_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
DISPATCHER_PROFILE_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ENROUTE_LATITUDE:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ENROUTE_LONGITUDE:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ONSCENE_LATITUDE:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ONSCENE_LONGITUDE:
{'success': True, '

### 2. Validating column count in table is always the same

In [36]:
# Label the count with the asset actually loaded in this notebook; the
# hard-coded 'customer_complaints' label referred to a different table.
print('# of columns in {}: '.format(data_asset_name), len(column_names))

# of columns in customer_complaints:  42


In [27]:
# master_column_names is the reference schema, so its length is the expected
# column count. The previous if/else passed this same number on both paths
# (len(column_names) only when it equalled len(master_column_names)), and the
# label named 'network_claims' instead of the asset loaded here.
expected_column_count = len(master_column_names)
print('# of columns in {}: '.format(data_asset_name), expected_column_count, '\n')
print(batch.expect_table_column_count_to_equal(expected_column_count, result_format='SUMMARY'))

# of columns in network_claims:  22 

{'success': True, 'result': {'observed_value': 22}}


### 3. Checking which columns should not have null values

In [37]:
# identifying which columns should not be null
print(column_names)

['CASE_ID', 'TASK_ID', 'DRIVER_PROFILE_ID', 'DISPATCHER_PROFILE_ID', 'ENROUTE_LATITUDE', 'ENROUTE_LONGITUDE', 'ONSCENE_LATITUDE', 'ONSCENE_LONGITUDE', 'TOW_IN_PROGRESS_LATITUDE', 'TOW_IN_PROGRESS_LONGITUDE', 'TOW_ARRIVED_LATITUDE', 'TOW_ARRIVED_LONGITUDE', 'COMPLETED_LATITUDE', 'COMPLETED_LONGITUDE', 'STATUS_SOURCE', 'ASSIGNED_SOURCE_DETAILS', 'ENROUTE_SOURCE_DETAILS', 'ONSCENE_SOURCE_DETAILS', 'TOW_IN_PROGRESS_SOURCE_DETAILS', 'TOW_ARRIVED_SOURCE_DETAILS', 'COMPLETED_SOURCE_DETAILS', 'SERVICE_TIME_EASTERN', 'SERVICE_TIME_UTC', 'SERVICE_TIME_LOCAL', 'ASSIGNED_TIME_EASTERN', 'ASSIGNED_TIME_UTC', 'ASSIGNED_TIME_LOCAL', 'ENROUTE_TIME_EASTERN', 'ENROUTE_TIME_UTC', 'ENROUTE_TIME_LOCAL', 'ONSCENE_TIME_EASTERN', 'ONSCENE_TIME_UTC', 'ONSCENE_TIME_LOCAL', 'TOW_IN_PROGRESS_TIME_EASTERN', 'TOW_IN_PROGRESS_TIME_UTC', 'TOW_IN_PROGRESS_TIME_LOCAL', 'TOW_ARRIVED_TIME_EASTERN', 'TOW_ARRIVED_TIME_UTC', 'TOW_ARRIVED_TIME_LOCAL', 'COMPLETED_TIME_EASTERN', 'COMPLETED_TIME_UTC', 'COMPLETED_TIME_LOCAL']


In [38]:
rule_df.isnull().sum()

CASE_ID                                 0
TASK_ID                                 0
DRIVER_PROFILE_ID                   75900
DISPATCHER_PROFILE_ID                 157
ENROUTE_LATITUDE                   477148
ENROUTE_LONGITUDE                  477148
ONSCENE_LATITUDE                   458142
ONSCENE_LONGITUDE                  458142
TOW_IN_PROGRESS_LATITUDE          1249104
TOW_IN_PROGRESS_LONGITUDE         1249104
TOW_ARRIVED_LATITUDE              1624380
TOW_ARRIVED_LONGITUDE             1624380
COMPLETED_LATITUDE                 118180
COMPLETED_LONGITUDE                118180
STATUS_SOURCE                         683
ASSIGNED_SOURCE_DETAILS           2090301
ENROUTE_SOURCE_DETAILS             444402
ONSCENE_SOURCE_DETAILS             464771
TOW_IN_PROGRESS_SOURCE_DETAILS    1235314
TOW_ARRIVED_SOURCE_DETAILS        1615565
COMPLETED_SOURCE_DETAILS           161466
SERVICE_TIME_EASTERN               256088
SERVICE_TIME_UTC                   256088
SERVICE_TIME_LOCAL                

In [67]:
# Separating null & non-null columns.
# The per-column null count is computed once and reused for both splits; the
# batch has ~2.1M rows, so rescanning it for every mask and selection is costly.
null_counts = batch.isnull().sum()
null_cols = list(null_counts[null_counts > 0].keys())
not_null_cols = list(null_counts[null_counts == 0].keys())

In [40]:
print('Viewing column null value counts: ', batch.isnull().sum(), sep='\n')

Viewing column null value counts: 
CASE_ID                                 0
TASK_ID                                 0
DRIVER_PROFILE_ID                   75900
DISPATCHER_PROFILE_ID                 157
ENROUTE_LATITUDE                   477148
ENROUTE_LONGITUDE                  477148
ONSCENE_LATITUDE                   458142
ONSCENE_LONGITUDE                  458142
TOW_IN_PROGRESS_LATITUDE          1249104
TOW_IN_PROGRESS_LONGITUDE         1249104
TOW_ARRIVED_LATITUDE              1624380
TOW_ARRIVED_LONGITUDE             1624380
COMPLETED_LATITUDE                 118180
COMPLETED_LONGITUDE                118180
STATUS_SOURCE                         683
ASSIGNED_SOURCE_DETAILS           2090301
ENROUTE_SOURCE_DETAILS             444402
ONSCENE_SOURCE_DETAILS             464771
TOW_IN_PROGRESS_SOURCE_DETAILS    1235314
TOW_ARRIVED_SOURCE_DETAILS        1615565
COMPLETED_SOURCE_DETAILS           161466
SERVICE_TIME_EASTERN               256088
SERVICE_TIME_UTC                   256088

In [51]:
not_null_cols.sort()

In [45]:
print(not_null_cols)

['CASE_ID', 'TASK_ID', 'COMPLETED_TIME_EASTERN', 'COMPLETED_TIME_UTC']


In [47]:
null_cols.sort()

In [68]:
print(null_cols)

['DRIVER_PROFILE_ID', 'DISPATCHER_PROFILE_ID', 'ENROUTE_LATITUDE', 'ENROUTE_LONGITUDE', 'ONSCENE_LATITUDE', 'ONSCENE_LONGITUDE', 'TOW_IN_PROGRESS_LATITUDE', 'TOW_IN_PROGRESS_LONGITUDE', 'TOW_ARRIVED_LATITUDE', 'TOW_ARRIVED_LONGITUDE', 'COMPLETED_LATITUDE', 'COMPLETED_LONGITUDE', 'STATUS_SOURCE', 'ASSIGNED_SOURCE_DETAILS', 'ENROUTE_SOURCE_DETAILS', 'ONSCENE_SOURCE_DETAILS', 'TOW_IN_PROGRESS_SOURCE_DETAILS', 'TOW_ARRIVED_SOURCE_DETAILS', 'COMPLETED_SOURCE_DETAILS', 'SERVICE_TIME_EASTERN', 'SERVICE_TIME_UTC', 'SERVICE_TIME_LOCAL', 'ASSIGNED_TIME_EASTERN', 'ASSIGNED_TIME_UTC', 'ASSIGNED_TIME_LOCAL', 'ENROUTE_TIME_EASTERN', 'ENROUTE_TIME_UTC', 'ENROUTE_TIME_LOCAL', 'ONSCENE_TIME_EASTERN', 'ONSCENE_TIME_UTC', 'ONSCENE_TIME_LOCAL', 'TOW_IN_PROGRESS_TIME_EASTERN', 'TOW_IN_PROGRESS_TIME_UTC', 'TOW_IN_PROGRESS_TIME_LOCAL', 'TOW_ARRIVED_TIME_EASTERN', 'TOW_ARRIVED_TIME_UTC', 'TOW_ARRIVED_TIME_LOCAL', 'COMPLETED_TIME_LOCAL']


In [69]:
# Strict not-null check for every column that had zero nulls in this extract;
# each column name is printed with its result on the following line.
for col in not_null_cols:
    not_null_result = batch.expect_column_values_to_not_be_null(col, result_format='BASIC')
    print(col, '\n', not_null_result)

CASE_ID 
 {'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
TASK_ID 
 {'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
COMPLETED_TIME_EASTERN 
 {'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
COMPLETED_TIME_UTC 
 {'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}


In [49]:
print(f"capturing the following columns to not be null: \n {not_null_cols}")

capturing the following columns to not be null: 
 ['CASE_ID', 'TASK_ID', 'COMPLETED_TIME_EASTERN', 'COMPLETED_TIME_UTC']


### 4. Validating columns to have null values
- **columns to check:**
    - 'ASSIGNED_SOURCE_DETAILS', 'ASSIGNED_TIME_EASTERN', 'ASSIGNED_TIME_LOCAL', 'ASSIGNED_TIME_UTC',
'COMPLETED_LATITUDE', 'COMPLETED_LONGITUDE', 'COMPLETED_SOURCE_DETAILS', 'COMPLETED_TIME_LOCAL',
'DISPATCHER_PROFILE_ID', 'DRIVER_PROFILE_ID', 'ENROUTE_LATITUDE', 'ENROUTE_LONGITUDE',
'ENROUTE_SOURCE_DETAILS', 'ENROUTE_TIME_EASTERN', 'ENROUTE_TIME_LOCAL', 'ENROUTE_TIME_UTC',
'ONSCENE_LATITUDE', 'ONSCENE_LONGITUDE', 'ONSCENE_SOURCE_DETAILS', 'ONSCENE_TIME_EASTERN',
'ONSCENE_TIME_LOCAL', 'ONSCENE_TIME_UTC', 'SERVICE_TIME_EASTERN', 'SERVICE_TIME_LOCAL',
'SERVICE_TIME_UTC', 'STATUS_SOURCE', 'TOW_ARRIVED_LATITUDE', 'TOW_ARRIVED_LONGITUDE',
'TOW_ARRIVED_SOURCE_DETAILS', 'TOW_ARRIVED_TIME_EASTERN', 'TOW_ARRIVED_TIME_LOCAL',
'TOW_ARRIVED_TIME_UTC', 'TOW_IN_PROGRESS_LATITUDE', 'TOW_IN_PROGRESS_LONGITUDE',
'TOW_IN_PROGRESS_SOURCE_DETAILS', 'TOW_IN_PROGRESS_TIME_EASTERN', 'TOW_IN_PROGRESS_TIME_LOCAL',
'TOW_IN_PROGRESS_TIME_UTC'

In [242]:
from typing import Union
from great_expectations.dataset import PandasDataset
def get_df_not_null_weights(df: Union[pd.DataFrame, PandasDataset], groupby_col: str, not_null_col: str) -> float:
    """
    Estimate a conservative "mostly not null" fraction for one column.

    Rows are bucketed by the calendar date of ``groupby_col``. For each date the
    fraction of rows in which ``not_null_col`` is not null is computed, and the
    10th percentile of those daily fractions is returned.

    Parameters
    -----------
    df: pd.DataFrame or great_expectations.dataset.PandasDataset
        dataframe object to look at
    groupby_col: str
        datetime column whose date part (``.dt.date``) defines the groups
    not_null_col: str
        column whose per-date not-null fraction is measured

    Returns
    ------------
    float
        10th percentile of the daily not-null fractions, taken with
        ``interpolation='lower'`` and rounded to 4 decimals. No additional
        margin is subtracted from it.
    """
    # Grouping the boolean not-null mask directly yields the same per-date means
    # as DataFrameGroupBy.apply with a Python lambda, without handing a full
    # sub-frame of every column to Python for each date.
    daily_not_null = df[not_null_col].notnull().groupby(df[groupby_col].dt.date).mean()

    weight = daily_not_null.quantile(0.1, interpolation='lower')
    # quantile returns a numpy scalar; convert so the value matches the
    # annotated return type.
    return float(weight.round(4))

In [115]:
from typing import Union
from great_expectations.dataset import PandasDataset
def get_df_not_null_weights(df: Union[pd.DataFrame, PandasDataset], groupby_col: str, not_null_col: str,
                            quantile: float = 0.1, margin: float = 0.009,
                            min_adjusted: float = 0.005) -> float:
    """
    Estimate a conservative "mostly not null" fraction for one column.

    Rows are bucketed by the calendar date of ``groupby_col``. For each date the
    fraction of rows in which ``not_null_col`` is not null is computed; the
    ``quantile`` of those daily fractions is then lowered by ``margin``.

    Parameters
    -----------
    df: pd.DataFrame or great_expectations.dataset.PandasDataset
        dataframe object to look at
    groupby_col: str
        datetime column whose date part (``.dt.date``) defines the groups
    not_null_col: str
        column whose per-date not-null fraction is measured
    quantile: float
        quantile of the daily fractions used as the base weight (default 0.1,
        taken with ``interpolation='midpoint'``)
    margin: float
        absolute amount subtracted from the base weight (default 0.009, i.e.
        0.9 percentage points)
    min_adjusted: float
        if the lowered weight would fall below this value (default 0.005), the
        unlowered base weight is returned instead

    Returns
    ------------
    float
        The base weight minus ``margin`` (or the base weight itself when the
        difference is below ``min_adjusted``), rounded to 4 decimals.
    """
    # Grouping the boolean not-null mask directly yields the same per-date means
    # as DataFrameGroupBy.apply with a Python lambda, without handing a full
    # sub-frame of every column to Python for each date.
    daily_not_null = df[not_null_col].notnull().groupby(df[groupby_col].dt.date).mean()

    base_weight = daily_not_null.quantile(quantile, interpolation='midpoint')
    adjusted_weight = base_weight - margin
    # Columns that are almost always null keep their base weight; subtracting
    # the margin there would drive the threshold to (or below) zero.
    final_weight = base_weight if adjusted_weight < min_adjusted else adjusted_weight
    return float(final_weight.round(4))

In [116]:
# Daily-based not-null weight for each nullable column, with days taken from
# COMPLETED_TIME_UTC. NOTE: the saved output of this cell ends in
# KeyboardInterrupt, so the run shown did not get through every column.
for col in null_cols:
    w = get_df_not_null_weights(rule_df, 'COMPLETED_TIME_UTC', col)
    print(col, w, sep='\n')

DRIVER_PROFILE_ID
0.9513
DISPATCHER_PROFILE_ID
0.9909
ENROUTE_LATITUDE
0.7595
ENROUTE_LONGITUDE
0.7595
ONSCENE_LATITUDE
0.7679
ONSCENE_LONGITUDE
0.7679
TOW_IN_PROGRESS_LATITUDE
0.3144
TOW_IN_PROGRESS_LONGITUDE
0.3144
TOW_ARRIVED_LATITUDE
0.1772
TOW_ARRIVED_LONGITUDE
0.1772
COMPLETED_LATITUDE
0.9313
COMPLETED_LONGITUDE
0.9313
STATUS_SOURCE
0.9905


KeyboardInterrupt: 

In [124]:
# Fraction of NON-null rows for every column that contains at least one null.
# NOTE: despite its name, null_percents holds not-null fractions
# (1 - null fraction); the name is kept because other cells read it.
null_fraction = batch.isnull().sum() / len(batch)  # scanned once, reused for the mask
null_percents = dict(1 - null_fraction[null_fraction > 0])

print(null_percents)

# Candidate `mostly` thresholds: each observed not-null fraction lowered by
# 0.009 (0.9 percentage points) of headroom and rounded to 4 decimals.
not_null_weights = {key: round(weight - 0.009, 4) for key, weight in null_percents.items()}

print('---------------------------------------')
print('not null weights:')
print(not_null_weights)

{'DRIVER_PROFILE_ID': 0.9642906375132734, 'DISPATCHER_PROFILE_ID': 0.999926134783789, 'ENROUTE_LATITUDE': 0.7755118459576202, 'ENROUTE_LONGITUDE': 0.7755118459576202, 'ONSCENE_LATITUDE': 0.7844537714309103, 'ONSCENE_LONGITUDE': 0.7844537714309103, 'TOW_IN_PROGRESS_LATITUDE': 0.41232269407615085, 'TOW_IN_PROGRESS_LONGITUDE': 0.41232269407615085, 'TOW_ARRIVED_LATITUDE': 0.23576318529395301, 'TOW_ARRIVED_LONGITUDE': 0.23576318529395301, 'COMPLETED_LATITUDE': 0.9443987818355554, 'COMPLETED_LONGITUDE': 0.9443987818355554, 'STATUS_SOURCE': 0.9996786627855279, 'ASSIGNED_SOURCE_DETAILS': 0.016557099929287, 'ENROUTE_SOURCE_DETAILS': 0.7909181540470847, 'ONSCENE_SOURCE_DETAILS': 0.7813349655821026, 'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.4188106006465323, 'TOW_ARRIVED_SOURCE_DETAILS': 0.23991045842070524, 'COMPLETED_SOURCE_DETAILS': 0.9240336241991858, 'SERVICE_TIME_EASTERN': 0.8795159522990667, 'SERVICE_TIME_UTC': 0.8795159522990667, 'SERVICE_TIME_LOCAL': 0.8771174499280873, 'ASSIGNED_TIME_EASTERN'

In [143]:
# 4-decimal copies for display. not_null_weights is already rounded to 4
# decimals where it is built, so the second dict carries the same values.
round_null_percents = {k:round(v, 4) for k, v in null_percents.items()}
round_not_null_weights = {k:round(v, 4) for k, v in not_null_weights.items()}

In [144]:
print(round_null_percents)

{'DRIVER_PROFILE_ID': 0.9643, 'DISPATCHER_PROFILE_ID': 0.9999, 'ENROUTE_LATITUDE': 0.7755, 'ENROUTE_LONGITUDE': 0.7755, 'ONSCENE_LATITUDE': 0.7845, 'ONSCENE_LONGITUDE': 0.7845, 'TOW_IN_PROGRESS_LATITUDE': 0.4123, 'TOW_IN_PROGRESS_LONGITUDE': 0.4123, 'TOW_ARRIVED_LATITUDE': 0.2358, 'TOW_ARRIVED_LONGITUDE': 0.2358, 'COMPLETED_LATITUDE': 0.9444, 'COMPLETED_LONGITUDE': 0.9444, 'STATUS_SOURCE': 0.9997, 'ASSIGNED_SOURCE_DETAILS': 0.0166, 'ENROUTE_SOURCE_DETAILS': 0.7909, 'ONSCENE_SOURCE_DETAILS': 0.7813, 'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.4188, 'TOW_ARRIVED_SOURCE_DETAILS': 0.2399, 'COMPLETED_SOURCE_DETAILS': 0.924, 'SERVICE_TIME_EASTERN': 0.8795, 'SERVICE_TIME_UTC': 0.8795, 'SERVICE_TIME_LOCAL': 0.8771, 'ASSIGNED_TIME_EASTERN': 0.8795, 'ASSIGNED_TIME_UTC': 0.8795, 'ASSIGNED_TIME_LOCAL': 0.8771, 'ENROUTE_TIME_EASTERN': 0.8064, 'ENROUTE_TIME_UTC': 0.8064, 'ENROUTE_TIME_LOCAL': 0.804, 'ONSCENE_TIME_EASTERN': 0.8136, 'ONSCENE_TIME_UTC': 0.8136, 'ONSCENE_TIME_LOCAL': 0.8112, 'TOW_IN_PROGRESS_TI

In [145]:
print(round_not_null_weights)

{'DRIVER_PROFILE_ID': 0.9553, 'DISPATCHER_PROFILE_ID': 0.9909, 'ENROUTE_LATITUDE': 0.7665, 'ENROUTE_LONGITUDE': 0.7665, 'ONSCENE_LATITUDE': 0.7755, 'ONSCENE_LONGITUDE': 0.7755, 'TOW_IN_PROGRESS_LATITUDE': 0.4033, 'TOW_IN_PROGRESS_LONGITUDE': 0.4033, 'TOW_ARRIVED_LATITUDE': 0.2268, 'TOW_ARRIVED_LONGITUDE': 0.2268, 'COMPLETED_LATITUDE': 0.9354, 'COMPLETED_LONGITUDE': 0.9354, 'STATUS_SOURCE': 0.9907, 'ASSIGNED_SOURCE_DETAILS': 0.0076, 'ENROUTE_SOURCE_DETAILS': 0.7819, 'ONSCENE_SOURCE_DETAILS': 0.7723, 'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.4098, 'TOW_ARRIVED_SOURCE_DETAILS': 0.2309, 'COMPLETED_SOURCE_DETAILS': 0.915, 'SERVICE_TIME_EASTERN': 0.8705, 'SERVICE_TIME_UTC': 0.8705, 'SERVICE_TIME_LOCAL': 0.8681, 'ASSIGNED_TIME_EASTERN': 0.8705, 'ASSIGNED_TIME_UTC': 0.8705, 'ASSIGNED_TIME_LOCAL': 0.8681, 'ENROUTE_TIME_EASTERN': 0.7974, 'ENROUTE_TIME_UTC': 0.7974, 'ENROUTE_TIME_LOCAL': 0.795, 'ONSCENE_TIME_EASTERN': 0.8046, 'ONSCENE_TIME_UTC': 0.8046, 'ONSCENE_TIME_LOCAL': 0.8022, 'TOW_IN_PROGRESS_TI

In [130]:
print(null_percents)

{'DRIVER_PROFILE_ID': 0.9642906375132734, 'DISPATCHER_PROFILE_ID': 0.999926134783789, 'ENROUTE_LATITUDE': 0.7755118459576202, 'ENROUTE_LONGITUDE': 0.7755118459576202, 'ONSCENE_LATITUDE': 0.7844537714309103, 'ONSCENE_LONGITUDE': 0.7844537714309103, 'TOW_IN_PROGRESS_LATITUDE': 0.41232269407615085, 'TOW_IN_PROGRESS_LONGITUDE': 0.41232269407615085, 'TOW_ARRIVED_LATITUDE': 0.23576318529395301, 'TOW_ARRIVED_LONGITUDE': 0.23576318529395301, 'COMPLETED_LATITUDE': 0.9443987818355554, 'COMPLETED_LONGITUDE': 0.9443987818355554, 'STATUS_SOURCE': 0.9996786627855279, 'ASSIGNED_SOURCE_DETAILS': 0.016557099929287, 'ENROUTE_SOURCE_DETAILS': 0.7909181540470847, 'ONSCENE_SOURCE_DETAILS': 0.7813349655821026, 'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.4188106006465323, 'TOW_ARRIVED_SOURCE_DETAILS': 0.23991045842070524, 'COMPLETED_SOURCE_DETAILS': 0.9240336241991858, 'SERVICE_TIME_EASTERN': 0.8795159522990667, 'SERVICE_TIME_UTC': 0.8795159522990667, 'SERVICE_TIME_LOCAL': 0.8771174499280873, 'ASSIGNED_TIME_EASTERN'

In [131]:
print(not_null_weights)

{'DRIVER_PROFILE_ID': 0.9553, 'DISPATCHER_PROFILE_ID': 0.9909, 'ENROUTE_LATITUDE': 0.7665, 'ENROUTE_LONGITUDE': 0.7665, 'ONSCENE_LATITUDE': 0.7755, 'ONSCENE_LONGITUDE': 0.7755, 'TOW_IN_PROGRESS_LATITUDE': 0.4033, 'TOW_IN_PROGRESS_LONGITUDE': 0.4033, 'TOW_ARRIVED_LATITUDE': 0.2268, 'TOW_ARRIVED_LONGITUDE': 0.2268, 'COMPLETED_LATITUDE': 0.9354, 'COMPLETED_LONGITUDE': 0.9354, 'STATUS_SOURCE': 0.9907, 'ASSIGNED_SOURCE_DETAILS': 0.0076, 'ENROUTE_SOURCE_DETAILS': 0.7819, 'ONSCENE_SOURCE_DETAILS': 0.7723, 'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.4098, 'TOW_ARRIVED_SOURCE_DETAILS': 0.2309, 'COMPLETED_SOURCE_DETAILS': 0.915, 'SERVICE_TIME_EASTERN': 0.8705, 'SERVICE_TIME_UTC': 0.8705, 'SERVICE_TIME_LOCAL': 0.8681, 'ASSIGNED_TIME_EASTERN': 0.8705, 'ASSIGNED_TIME_UTC': 0.8705, 'ASSIGNED_TIME_LOCAL': 0.8681, 'ENROUTE_TIME_EASTERN': 0.7974, 'ENROUTE_TIME_UTC': 0.7974, 'ENROUTE_TIME_LOCAL': 0.795, 'ONSCENE_TIME_EASTERN': 0.8046, 'ONSCENE_TIME_UTC': 0.8046, 'ONSCENE_TIME_LOCAL': 0.8022, 'TOW_IN_PROGRESS_TI

In [146]:
null_col_weights = {'DRIVER_PROFILE_ID': 0.95, 'DISPATCHER_PROFILE_ID': 0.99, 'ENROUTE_LATITUDE': 0.75,
	                    'ENROUTE_LONGITUDE': 0.75, 'ONSCENE_LATITUDE': 0.75, 'ONSCENE_LONGITUDE': 0.75,
	                    'TOW_IN_PROGRESS_LATITUDE': 0.4, 'TOW_IN_PROGRESS_LONGITUDE': 0.4,
	                    'TOW_ARRIVED_LATITUDE': 0.2, 'TOW_ARRIVED_LONGITUDE': 0.2, 'COMPLETED_LATITUDE': 0.92,
	                    'COMPLETED_LONGITUDE': 0.92, 'STATUS_SOURCE': 0.99, 'ASSIGNED_SOURCE_DETAILS': 0.01,
	                    'ENROUTE_SOURCE_DETAILS': 0.75, 'ONSCENE_SOURCE_DETAILS': 0.75,
	                    'TOW_IN_PROGRESS_SOURCE_DETAILS': 0.40, 'TOW_ARRIVED_SOURCE_DETAILS': 0.2,
	                    'COMPLETED_SOURCE_DETAILS': 0.9, 'SERVICE_TIME_EASTERN': 0.85, 'SERVICE_TIME_UTC': 0.85,
	                    'SERVICE_TIME_LOCAL': 0.85, 'ASSIGNED_TIME_EASTERN': 0.85, 'ASSIGNED_TIME_UTC': 0.85,
	                    'ASSIGNED_TIME_LOCAL': 0.85, 'ENROUTE_TIME_EASTERN': 0.78, 'ENROUTE_TIME_UTC': 0.78,
	                    'ENROUTE_TIME_LOCAL': 0.78, 'ONSCENE_TIME_EASTERN': 0.79, 'ONSCENE_TIME_UTC': 0.79,
	                    'ONSCENE_TIME_LOCAL': 0.79, 'TOW_IN_PROGRESS_TIME_EASTERN': 0.4,
	                    'TOW_IN_PROGRESS_TIME_UTC': 0.4, 'TOW_IN_PROGRESS_TIME_LOCAL': 0.4,
	                    'TOW_ARRIVED_TIME_EASTERN': 0.21, 'TOW_ARRIVED_TIME_UTC': 0.21,
	                    'TOW_ARRIVED_TIME_LOCAL': 0.21, 'COMPLETED_TIME_LOCAL': 0.99}

In [147]:
# Evaluate a "mostly not null" expectation per column, passing the threshold
# from null_col_weights as `mostly`, and print each result under its column name.
for col, weight in null_col_weights.items():
    mostly_result = batch.expect_column_values_to_not_be_null(
        col,
        mostly=weight,
        include_config=True,
        catch_exceptions=True,
        result_format='SUMMARY',
    )
    print(col, mostly_result, sep='\n')

DRIVER_PROFILE_ID
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 75900, 'unexpected_percent': 3.5709362486726612, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'DRIVER_PROFILE_ID', 'mostly': 0.95, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
DISPATCHER_PROFILE_ID
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 157, 'unexpected_percent': 0.007386521621101552, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'DISPATCHER_PROFILE_ID', 'mostly': 0.99, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
ENROUTE_LATITUDE
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 477148

SERVICE_TIME_EASTERN
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 256088, 'unexpected_percent': 12.048404770093338, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'SERVICE_TIME_EASTERN', 'mostly': 0.85, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
SERVICE_TIME_UTC
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 256088, 'unexpected_percent': 12.048404770093338, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'SERVICE_TIME_UTC', 'mostly': 0.85, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
SERVICE_TIME_LOCAL
{'success': True, 'result': {'element_count': 2125493, 'unexpected_count': 261186

KeyboardInterrupt: 

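### 5. Expecting column values to be in a set

**NOTE:** the columns listed below do not appear in the `service_progress` column list above; they look like a carry-over from another table's notebook and should be replaced with the `service_progress` columns to check.

- COMPLAINT_CATEGORY
- COMPLAINT_REASON
- COMPLAINT_REASON_DETAILS
- COMPLAINT_ORGIN
- CASE_RESOLUTION
- COMPLAINT_TYPE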

In [134]:
from snowflake.connector.converter_null import SnowflakeNoConverterToPython
def snowflake_connector_to_df(query: str) -> pd.DataFrame:
    """
    Run ``query`` on Snowflake and return the whole result set as a DataFrame.

    Connection settings are read from the mapping returned by
    ``gda.load_credentials()`` (keys ``dsa_username``, ``dsa_password``,
    ``dsa_account``, ``dsa_master_database``, ``dsa_schema``,
    ``dsa_warehouse``, ``dsa_etl_role``).

    Parameters
    ----------
    query: str
        SQL text passed unchanged to ``cursor.execute``.

    Returns
    -------
    pd.DataFrame
        The result of ``cursor.fetch_pandas_all()``.
    """
    # Bind the package name here: the cell-level
    # `from snowflake.connector.converter_null import ...` imports the
    # converter class only and does not define the name `snowflake`.
    import snowflake.connector

    # The bare name load_credentials is not defined in this notebook; the
    # loader is reached through the data-access module imported as `gda`.
    sf_creds = gda.load_credentials()
    ctx = snowflake.connector.connect(
        user=sf_creds['dsa_username'],
        password=sf_creds['dsa_password'],
        account=sf_creds['dsa_account'],
        database=sf_creds['dsa_master_database'],
        schema=sf_creds['dsa_schema'],
        warehouse=sf_creds['dsa_warehouse'],
        role=sf_creds['dsa_etl_role'],
        converter_class=SnowflakeNoConverterToPython
    )
    try:
        cur = ctx.cursor()
        try:
            cur.execute(query)
            df = cur.fetch_pandas_all()
        finally:
            cur.close()
    finally:
        # Close the connection as well as the cursor, including when the
        # query fails, so repeated calls do not leave sessions open.
        ctx.close()
    return df

In [135]:
from snowflake.connector.converter_null import SnowflakeNoConverterToPython
def snowflake_connector_to_df(query: str) -> pd.DataFrame:
    """Run ``query`` against Snowflake and return the whole result set as a DataFrame.

    Credentials and connection settings are read from ``gda.load_credentials()``.
    The cursor and the connection are both closed before returning, including
    when the query raises.

    Args:
        query: SQL statement to execute.

    Returns:
        The result of ``cursor.fetch_pandas_all()`` for ``query``.
    """
    # `from snowflake.connector.converter_null import ...` does not bind the
    # top-level name `snowflake`, so import the package explicitly here.
    import snowflake.connector

    sf_creds = gda.load_credentials()
    ctx = snowflake.connector.connect(
        user=sf_creds['dsa_username'],
        password=sf_creds['dsa_password'],
        account=sf_creds['dsa_account'],
        database=sf_creds['dsa_master_database'],
        schema=sf_creds['dsa_schema'],
        warehouse=sf_creds['dsa_warehouse'],
        role=sf_creds['dsa_etl_role'],
        # NOTE(review): presumably disables the connector's per-value Python
        # type conversion - confirm it is still wanted with fetch_pandas_all().
        converter_class=SnowflakeNoConverterToPython
    )
    try:
        cur = ctx.cursor()
        try:
            cur.execute(query)
            return cur.fetch_pandas_all()
        finally:
            cur.close()
    finally:
        # Closing only the cursor would leave the session open on every call.
        ctx.close()

In [148]:
# Show the column names that the per-column checks in this notebook iterate over.
master_column_names


['CASE_ID',
 'TASK_ID',
 'DRIVER_PROFILE_ID',
 'DISPATCHER_PROFILE_ID',
 'ENROUTE_LATITUDE',
 'ENROUTE_LONGITUDE',
 'ONSCENE_LATITUDE',
 'ONSCENE_LONGITUDE',
 'TOW_IN_PROGRESS_LATITUDE',
 'TOW_IN_PROGRESS_LONGITUDE',
 'TOW_ARRIVED_LATITUDE',
 'TOW_ARRIVED_LONGITUDE',
 'COMPLETED_LATITUDE',
 'COMPLETED_LONGITUDE',
 'STATUS_SOURCE',
 'ASSIGNED_SOURCE_DETAILS',
 'ENROUTE_SOURCE_DETAILS',
 'ONSCENE_SOURCE_DETAILS',
 'TOW_IN_PROGRESS_SOURCE_DETAILS',
 'TOW_ARRIVED_SOURCE_DETAILS',
 'COMPLETED_SOURCE_DETAILS',
 'SERVICE_TIME_EASTERN',
 'SERVICE_TIME_UTC',
 'SERVICE_TIME_LOCAL',
 'ASSIGNED_TIME_EASTERN',
 'ASSIGNED_TIME_UTC',
 'ASSIGNED_TIME_LOCAL',
 'ENROUTE_TIME_EASTERN',
 'ENROUTE_TIME_UTC',
 'ENROUTE_TIME_LOCAL',
 'ONSCENE_TIME_EASTERN',
 'ONSCENE_TIME_UTC',
 'ONSCENE_TIME_LOCAL',
 'TOW_IN_PROGRESS_TIME_EASTERN',
 'TOW_IN_PROGRESS_TIME_UTC',
 'TOW_IN_PROGRESS_TIME_LOCAL',
 'TOW_ARRIVED_TIME_EASTERN',
 'TOW_ARRIVED_TIME_UTC',
 'TOW_ARRIVED_TIME_LOCAL',
 'COMPLETED_TIME_EASTERN',
 'COMPLETE

In [152]:
# Mean share (in percent) of a column's distinct non-null values in rule_df.
# The normalised counts sum to 100, so this is effectively 100 / n_distinct:
# the larger the number, the fewer distinct values the column has.
c_weights = {
    column: (rule_df[column].value_counts(normalize=True) * 100).values.mean().round(5)
    for column in master_column_names
}

In [153]:
# Inspect the mean per-value share (percent) computed for every column.
c_weights

{'CASE_ID': 5e-05,
 'TASK_ID': 6.66667,
 'DRIVER_PROFILE_ID': 0.00392,
 'DISPATCHER_PROFILE_ID': 0.02138,
 'ENROUTE_LATITUDE': 7e-05,
 'ENROUTE_LONGITUDE': 7e-05,
 'ONSCENE_LATITUDE': 7e-05,
 'ONSCENE_LONGITUDE': 7e-05,
 'TOW_IN_PROGRESS_LATITUDE': 0.00012,
 'TOW_IN_PROGRESS_LONGITUDE': 0.00012,
 'TOW_ARRIVED_LATITUDE': 0.00021,
 'TOW_ARRIVED_LONGITUDE': 0.00021,
 'COMPLETED_LATITUDE': 6e-05,
 'COMPLETED_LONGITUDE': 6e-05,
 'STATUS_SOURCE': 8.33333,
 'ASSIGNED_SOURCE_DETAILS': 50.0,
 'ENROUTE_SOURCE_DETAILS': 11.11111,
 'ONSCENE_SOURCE_DETAILS': 7.14286,
 'TOW_IN_PROGRESS_SOURCE_DETAILS': 12.5,
 'TOW_ARRIVED_SOURCE_DETAILS': 14.28571,
 'COMPLETED_SOURCE_DETAILS': 9.09091,
 'SERVICE_TIME_EASTERN': 7e-05,
 'SERVICE_TIME_UTC': 7e-05,
 'SERVICE_TIME_LOCAL': 7e-05,
 'ASSIGNED_TIME_EASTERN': 6e-05,
 'ASSIGNED_TIME_UTC': 6e-05,
 'ASSIGNED_TIME_LOCAL': 6e-05,
 'ENROUTE_TIME_EASTERN': 7e-05,
 'ENROUTE_TIME_UTC': 7e-05,
 'ENROUTE_TIME_LOCAL': 7e-05,
 'ONSCENE_TIME_EASTERN': 7e-05,
 'ONSCENE_TIME

In [164]:
def get_categorical_columns_values(df: Union[pd.DataFrame, PandasDataset], cols: list, table_name: str) -> dict:
    """Look up the distinct values of the low-cardinality columns of ``table_name``.

    A column is treated as categorical when the mean share of its distinct
    values in ``df`` exceeds 0.9 percent. The normalised value counts sum to
    100 percent, so that mean is ``100 / n_distinct``; the rule therefore keeps
    columns with at most 111 distinct non-null values. Boolean columns and the
    ``TASK_ID`` / ``CLIENT_ID`` columns are never treated as categorical.

    Args:
        df: Data used only to decide which columns qualify.
        cols: Names of the columns of ``df`` to consider.
        table_name: Table queried (via ``gda.get_snowflake_connector()``) for
            the distinct values of each qualifying column.

    Returns:
        Dict mapping each queried column name, as reported by the cursor
        description, to the list of its distinct non-NULL values. Empty when
        no column qualifies.
    """
    excluded_cols = {'TASK_ID', 'task_id', 'CLIENT_ID', 'client_id'}

    categorical_cols = []
    # dict.fromkeys de-duplicates the requested names while keeping their order.
    for col in dict.fromkeys(cols):
        if col in excluded_cols or df[col].dtypes == bool:
            continue
        value_shares = df[col].value_counts(normalize=True) * 100
        if value_shares.empty:
            # All-null column: nothing to enumerate, and the mean of an empty
            # array would only produce NaN plus a RuntimeWarning.
            continue
        if value_shares.values.mean().round(5) > 0.9:
            categorical_cols.append(col)

    if not categorical_cols:
        # Nothing qualified: skip the round-trip instead of sending an empty script.
        return {}

    # Identifiers cannot be bound as query parameters, so the column and table
    # names are interpolated directly; only pass trusted names.
    execute_strings = ' '.join(f"SELECT DISTINCT {c_name} FROM {table_name};" for c_name in categorical_cols)
    # NOTE(review): the connection is left open because its ownership is not
    # visible from here - confirm whether the caller or gda should close it.
    ctx = gda.get_snowflake_connector()

    category_col_values = {}
    for cur in ctx.execute_string(execute_strings, remove_comments=True, return_cursors=True):
        try:
            col_names = ','.join(desc[0] for desc in cur.description)
            # Drop only SQL NULLs; falsy-but-real values such as 0, False or ''
            # are legitimate members of a column's value set.
            category_col_values[col_names] = [row[0] for row in cur.fetchall() if row[0] is not None]
        finally:
            cur.close()
    return category_col_values

In [157]:
# Collect the distinct values of the low-cardinality service_progress columns
# (this call queries Snowflake once per qualifying column).
dtest = get_categorical_columns_values(rule_df, master_column_names, 'service_progress')

In [168]:
# Dump the column -> distinct-values mapping returned above.
print(dtest)

{'STATUS_SOURCE': ['pulsar-connection-app', 'swoop_digital_dispatch', 'towmagic', 'roadsideconnect', 'towxchangetopsinterfaceprod', 'rangersst', 'towmagic4-prod', 'inman_app_issc', '&src:automaticstatusupdate', 'towbook', 'roadsideconnect-ios-app', 'agero_html5-agerosupport-web-portal', 'progrssive-platforms-dispatch-app', 'trackerddlive'], 'ASSIGNED_SOURCE_DETAILS': ['autoassigned', 'autoassi', 'autoas'], 'ENROUTE_SOURCE_DETAILS': ['tomtom; error', 'unknown', 'android', 'pulsar', 'ranger', 'txi', 'tomtom', 'ios', 'swoop', 'towmagic4'], 'ONSCENE_SOURCE_DETAILS': ['automaticstatusupd', 'tomtom; error', 'ios-geofence-ss', 'unknown', 'automaticstatusupdate aws-g', 'issc', 'system', 'automaticstatusupda', 'android', 'pulsar', 'automaticstatu', 'towbook', 'automaticstatusupdate aws-gf', 'txi', 'ranger', 'automa', 'tomtom', 'automaticstatusupdate aws-', 'ios', 'swoop', 'towmagic4', 'android-geofence-ss', '-geofence-ss'], 'TOW_IN_PROGRESS_SOURCE_DETAILS': ['automaticstatusupd', 'tomtom; error

In [419]:
# Size of each collected value set, in the order the sets were gathered.
list(map(len, val_set_list))

[8, 73, 8, 13, 34, 39]

In [248]:
# One SELECT DISTINCT statement per entry of cat_dict, joined into a single
# multi-statement script.
distinct_statements = [f"SELECT DISTINCT {col} FROM customer_complaints;" for col in cat_dict]
execute_strings = ' '.join(distinct_statements)

In [249]:
# Inspect the generated multi-statement SQL string.
execute_strings

'SELECT DISTINCT CASE_RESOLUTION FROM customer_complaints; SELECT DISTINCT COMPLAINT_CATEGORY FROM customer_complaints; SELECT DISTINCT COMPLAINT_ORIGIN FROM customer_complaints; SELECT DISTINCT COMPLAINT_REASON FROM customer_complaints; SELECT DISTINCT COMPLAINT_REASON_DETAILS FROM customer_complaints; SELECT DISTINCT COMPLAINT_TYPE FROM customer_complaints;'

In [292]:
# Number of distinct values collected for each categorical column.
for value_count in map(len, category_col_values.values()):
    print(value_count)

42
11
23
89
38
62


In [269]:
# Pull every cursor's result set into its own frame, then stack them into one.
frames = [cur.fetch_pandas_all() for cur in cursor_list]
cat_df = pd.concat(frames)

In [272]:
# Remove fully duplicated rows from the stacked distinct-value frame.
cat_df3 = cat_df.drop_duplicates()

In [271]:
# Count the distinct non-null COMPLAINT_CATEGORY values in the stacked frame.
cat_df['COMPLAINT_CATEGORY'].nunique()

12

In [428]:
# For each categorical column, the values observed in the batch (the keys of
# its value counts), kept in the same order as master_categorical_columns.
val_set_list = [
    list(batch.get_column_value_counts(column, sort='count').keys())
    for column in master_categorical_columns
]

In [429]:
# Report how many distinct values were observed for each categorical column.
for column, value_set in zip(master_categorical_columns, val_set_list):
    print(column, len(value_set))

COMPLAINT_CATEGORY 8
COMPLAINT_REASON 73
COMPLAINT_REASON_DETAILS 8
COMPLAINT_ORIGIN 13
CASE_RESOLUTION 34
COMPLAINT_TYPE 39


In [79]:
# Register one in-set expectation per categorical column and print each result.
for column, value_set in zip(category_cols, val_set_list):
    in_set_result = batch.expect_column_values_to_be_in_set(
        column, value_set, result_format='BASIC', include_config=True, catch_exceptions=True
    )
    print(column, '\n', in_set_result, '\n')

ORIGINAL_CLAIM_STATUS_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_STATUS_CODE', 'value_set': ['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'], 'result_format': 'BASIC'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}} 

ORIGINAL_CLAIM_TYPE_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_TYPE_CODE', 'value_set': ['OWN', 'VND'

In [431]:
# Raise pandas' display limits so long and wide frames are shown more fully.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

### 8. Determine if columns are unique per row
- CASE_ID, TASK_ID

In [171]:
# Expect values to be unique across CASE_ID and TASK_ID on every row.
unique_key_columns = ['CASE_ID', 'TASK_ID']
batch.expect_multicolumn_values_to_be_unique(
    column_list=unique_key_columns,
    result_format='SUMMARY',
    include_config=True,
    catch_exceptions=True,
)

{'success': True,
 'result': {'element_count': 2125493,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'expectation_config': {'expectation_type': 'expect_multicolumn_values_to_be_unique',
  'kwargs': {'column_list': ['CASE_ID', 'TASK_ID'],
   'result_format': 'SUMMARY'}},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

### 9. Expecting column A to be greater than (or equal to) column B
- COMPLETED_TIME_UTC > SERVICE_TIME_UTC
- ASSIGNED_TIME_UTC > SERVICE_TIME_UTC
- ENROUTE_TIME_UTC > ASSIGNED_TIME_UTC
- ONSCENE_TIME_UTC > ENROUTE_TIME_UTC
- TOW_IN_PROGRESS_TIME_UTC > ONSCENE_TIME_UTC
- TOW_ARRIVED_TIME_UTC > TOW_IN_PROGRESS_TIME_UTC
- COMPLETED_TIME_UTC > TOW_ARRIVED_TIME_UTC

In [192]:
# Show the last (later, earlier) timestamp pair.
last_pair = larger_than_cols[-1]
print(last_pair[0], last_pair[1])

COMPLETED_TIME_UTC TOW_ARRIVED_TIME_UTC


In [194]:
# Each pair is [column_A, column_B], where A is expected to be >= B.
larger_than_cols = [
    ["COMPLETED_TIME_UTC", "SERVICE_TIME_UTC"],
    ["ASSIGNED_TIME_UTC", "SERVICE_TIME_UTC"],
    ["ENROUTE_TIME_UTC", "ASSIGNED_TIME_UTC"],
    ["ONSCENE_TIME_UTC", "ENROUTE_TIME_UTC"],
    ["TOW_IN_PROGRESS_TIME_UTC", "ONSCENE_TIME_UTC"],
    ["TOW_ARRIVED_TIME_UTC", "TOW_IN_PROGRESS_TIME_UTC"],
    ["COMPLETED_TIME_UTC", "TOW_ARRIVED_TIME_UTC"],
]

# The last pair is checked on its own with a looser `mostly` of 0.94, because
# about 5.2% of its non-missing rows violate the ordering.
later_col, earlier_col = larger_than_cols[-1]
batch.expect_column_pair_values_A_to_be_greater_than_B(
    later_col,
    earlier_col,
    mostly=0.94,
    ignore_row_if='either_value_is_missing',
    or_equal=True,
    result_format='SUMMARY',
    catch_exceptions=True,
)

{'success': True,
 'result': {'element_count': 2125493,
  'missing_count': 1615199,
  'missing_percent': 75.99173462344972,
  'unexpected_count': 26604,
  'unexpected_percent': 1.251662555463603,
  'unexpected_percent_nonmissing': 5.213465178896872,
  'partial_unexpected_list': [['2019-10-06 18:19:46', '2019-10-06 18:19:47'],
   ['2019-10-06 18:23:01', '2019-10-06 18:23:02'],
   ['2019-10-06 20:21:42', '2019-10-06 20:21:44'],
   ['2019-10-06 18:41:25', '2019-10-06 18:41:26'],
   ['2019-10-06 19:08:42', '2019-10-06 19:08:43'],
   ['2019-10-06 18:48:17', '2019-10-06 18:48:19'],
   ['2019-10-06 18:47:33', '2019-10-06 19:10:24'],
   ['2019-10-06 18:31:25', '2019-10-06 18:31:27'],
   ['2019-10-06 19:12:37', '2019-10-06 19:12:38'],
   ['2019-10-06 19:22:11', '2019-10-06 19:22:13'],
   ['2019-10-06 19:17:05', '2019-10-06 20:11:40'],
   ['2019-10-06 17:38:59', '2019-10-06 19:02:42'],
   ['2019-10-06 18:52:57', '2019-10-06 18:52:58'],
   ['2019-10-06 18:54:02', '2019-10-06 18:54:04'],
   ['2019

In [189]:
# Check every (A, B) timestamp pair with A >= B on at least 98% of the rows
# where both values are present, printing each pair above its result.
for pair in larger_than_cols:
    pair_result = batch.expect_column_pair_values_A_to_be_greater_than_B(
        pair[0],
        pair[1],
        mostly=0.98,
        ignore_row_if='either_value_is_missing',
        or_equal=True,
        result_format='SUMMARY',
        catch_exceptions=True,
    )
    print(pair, pair_result, sep='\n')

['COMPLETED_TIME_UTC', 'SERVICE_TIME_UTC']
{'success': True, 'result': {'element_count': 2125493, 'missing_count': 256088, 'missing_percent': 12.048404770093338, 'unexpected_count': 7320, 'unexpected_percent': 0.344390689595308, 'unexpected_percent_nonmissing': 0.3915684402256333, 'partial_unexpected_list': [['2019-10-06 23:24:32', '2019-10-06 23:30:00'], ['2019-10-06 20:40:58', '2019-10-06 21:30:00'], ['2019-10-06 20:21:41', '2019-10-06 21:00:00'], ['2019-10-06 21:10:52', '2019-10-06 21:30:00'], ['2019-10-06 22:47:12', '2019-10-06 23:30:00'], ['2019-10-06 23:20:02', '2019-10-06 23:30:00'], ['2019-10-07 00:08:55', '2019-10-07 00:30:00'], ['2019-10-07 00:42:52', '2019-10-07 01:00:00'], ['2019-10-07 03:34:48', '2019-10-07 04:00:00'], ['2019-10-15 14:03:42', '2019-10-15 14:30:00'], ['2019-10-15 14:15:16', '2019-10-15 15:00:00'], ['2019-10-15 19:19:46', '2019-10-15 20:00:00'], ['2019-10-15 19:23:51', '2019-10-15 20:00:00'], ['2019-10-15 20:25:36', '2019-10-15 20:30:00'], ['2019-10-15 16:19

['ONSCENE_TIME_UTC', 'ENROUTE_TIME_UTC']
{'success': True, 'result': {'element_count': 2125493, 'missing_count': 576747, 'missing_percent': 27.13474003442966, 'unexpected_count': 12751, 'unexpected_percent': 0.5999078801953242, 'unexpected_percent_nonmissing': 0.8233112466472875, 'partial_unexpected_list': [['2019-10-06 18:07:11', '2019-10-06 18:19:45'], ['2019-10-06 16:25:31', '2019-10-06 16:25:34'], ['2019-10-06 16:54:26', '2019-10-06 17:00:20'], ['2019-10-06 16:35:34', '2019-10-06 16:48:55'], ['2019-10-06 17:44:09', '2019-10-06 17:46:03'], ['2019-10-06 17:43:13', '2019-10-06 17:52:09'], ['2019-10-06 17:40:47', '2019-10-06 17:55:38'], ['2019-10-06 18:27:20', '2019-10-06 18:27:44'], ['2019-10-06 18:20:01', '2019-10-06 18:23:25'], ['2019-10-06 18:02:54', '2019-10-06 18:35:24'], ['2019-10-06 18:48:11', '2019-10-06 18:53:30'], ['2019-10-06 18:56:54', '2019-10-06 20:20:17'], ['2019-10-06 19:58:14', '2019-10-06 20:08:38'], ['2019-10-06 19:40:22', '2019-10-06 19:40:23'], ['2019-10-06 19:43:

['COMPLETED_TIME_UTC', 'TOW_ARRIVED_TIME_UTC']
{'success': False, 'result': {'element_count': 2125493, 'missing_count': 1615199, 'missing_percent': 75.99173462344972, 'unexpected_count': 26604, 'unexpected_percent': 1.251662555463603, 'unexpected_percent_nonmissing': 5.213465178896872, 'partial_unexpected_list': [['2019-10-06 18:19:46', '2019-10-06 18:19:47'], ['2019-10-06 18:23:01', '2019-10-06 18:23:02'], ['2019-10-06 20:21:42', '2019-10-06 20:21:44'], ['2019-10-06 18:41:25', '2019-10-06 18:41:26'], ['2019-10-06 19:08:42', '2019-10-06 19:08:43'], ['2019-10-06 18:48:17', '2019-10-06 18:48:19'], ['2019-10-06 18:47:33', '2019-10-06 19:10:24'], ['2019-10-06 18:31:25', '2019-10-06 18:31:27'], ['2019-10-06 19:12:37', '2019-10-06 19:12:38'], ['2019-10-06 19:22:11', '2019-10-06 19:22:13'], ['2019-10-06 19:17:05', '2019-10-06 20:11:40'], ['2019-10-06 17:38:59', '2019-10-06 19:02:42'], ['2019-10-06 18:52:57', '2019-10-06 18:52:58'], ['2019-10-06 18:54:02', '2019-10-06 18:54:04'], ['2019-10-06 

### 10. Expecting columns to be of a certain data type

In [196]:
# Inspect the pandas dtype of every column of rule_df.
rule_df.dtypes

CASE_ID                                    int64
TASK_ID                                     int8
DRIVER_PROFILE_ID                        float64
DISPATCHER_PROFILE_ID                    float64
ENROUTE_LATITUDE                         float64
ENROUTE_LONGITUDE                        float64
ONSCENE_LATITUDE                         float64
ONSCENE_LONGITUDE                        float64
TOW_IN_PROGRESS_LATITUDE                 float64
TOW_IN_PROGRESS_LONGITUDE                float64
TOW_ARRIVED_LATITUDE                     float64
TOW_ARRIVED_LONGITUDE                    float64
COMPLETED_LATITUDE                       float64
COMPLETED_LONGITUDE                      float64
STATUS_SOURCE                             object
ASSIGNED_SOURCE_DETAILS                   object
ENROUTE_SOURCE_DETAILS                    object
ONSCENE_SOURCE_DETAILS                    object
TOW_IN_PROGRESS_SOURCE_DETAILS            object
TOW_ARRIVED_SOURCE_DETAILS                object
COMPLETED_SOURCE_DET

In [197]:
# Print each column name next to its dtype.
# Series.iteritems() was removed in pandas 2.0; items() yields the same pairs.
for column, dtype in batch.dtypes.items():
    print(column, dtype)

CASE_ID int64
TASK_ID int8
DRIVER_PROFILE_ID float64
DISPATCHER_PROFILE_ID float64
ENROUTE_LATITUDE float64
ENROUTE_LONGITUDE float64
ONSCENE_LATITUDE float64
ONSCENE_LONGITUDE float64
TOW_IN_PROGRESS_LATITUDE float64
TOW_IN_PROGRESS_LONGITUDE float64
TOW_ARRIVED_LATITUDE float64
TOW_ARRIVED_LONGITUDE float64
COMPLETED_LATITUDE float64
COMPLETED_LONGITUDE float64
STATUS_SOURCE object
ASSIGNED_SOURCE_DETAILS object
ENROUTE_SOURCE_DETAILS object
ONSCENE_SOURCE_DETAILS object
TOW_IN_PROGRESS_SOURCE_DETAILS object
TOW_ARRIVED_SOURCE_DETAILS object
COMPLETED_SOURCE_DETAILS object
SERVICE_TIME_EASTERN datetime64[ns]
SERVICE_TIME_UTC datetime64[ns]
SERVICE_TIME_LOCAL datetime64[ns]
ASSIGNED_TIME_EASTERN datetime64[ns]
ASSIGNED_TIME_UTC datetime64[ns]
ASSIGNED_TIME_LOCAL datetime64[ns]
ENROUTE_TIME_EASTERN datetime64[ns]
ENROUTE_TIME_UTC datetime64[ns]
ENROUTE_TIME_LOCAL datetime64[ns]
ONSCENE_TIME_EASTERN datetime64[ns]
ONSCENE_TIME_UTC datetime64[ns]
ONSCENE_TIME_LOCAL datetime64[ns]
TOW_IN_

In [198]:
# Column name -> dtype mapping for the batch.
# Series.iteritems() was removed in pandas 2.0; items() yields the same pairs.
service_progress_data_types = dict(batch.dtypes.items())

In [200]:
# Replace every dtype object with its string name, updating the dict in place.
# The items are snapshotted first so the dict is not read while being written.
service_progress_data_types.update(
    (column, str(dtype)) for column, dtype in list(service_progress_data_types.items())
)

In [201]:
# Print the column -> dtype-name mapping.
print(service_progress_data_types)

{'CASE_ID': 'int64', 'TASK_ID': 'int8', 'DRIVER_PROFILE_ID': 'float64', 'DISPATCHER_PROFILE_ID': 'float64', 'ENROUTE_LATITUDE': 'float64', 'ENROUTE_LONGITUDE': 'float64', 'ONSCENE_LATITUDE': 'float64', 'ONSCENE_LONGITUDE': 'float64', 'TOW_IN_PROGRESS_LATITUDE': 'float64', 'TOW_IN_PROGRESS_LONGITUDE': 'float64', 'TOW_ARRIVED_LATITUDE': 'float64', 'TOW_ARRIVED_LONGITUDE': 'float64', 'COMPLETED_LATITUDE': 'float64', 'COMPLETED_LONGITUDE': 'float64', 'STATUS_SOURCE': 'object', 'ASSIGNED_SOURCE_DETAILS': 'object', 'ENROUTE_SOURCE_DETAILS': 'object', 'ONSCENE_SOURCE_DETAILS': 'object', 'TOW_IN_PROGRESS_SOURCE_DETAILS': 'object', 'TOW_ARRIVED_SOURCE_DETAILS': 'object', 'COMPLETED_SOURCE_DETAILS': 'object', 'SERVICE_TIME_EASTERN': 'datetime64[ns]', 'SERVICE_TIME_UTC': 'datetime64[ns]', 'SERVICE_TIME_LOCAL': 'datetime64[ns]', 'ASSIGNED_TIME_EASTERN': 'datetime64[ns]', 'ASSIGNED_TIME_UTC': 'datetime64[ns]', 'ASSIGNED_TIME_LOCAL': 'datetime64[ns]', 'ENROUTE_TIME_EASTERN': 'datetime64[ns]', 'ENROU

In [130]:
# Expect every column to have the dtype recorded for it, printing each result.
# NOTE(review): this iterates network_claims_data_types, not the
# service_progress mapping - confirm it matches the batch being validated.
for column, expected_type in network_claims_data_types.items():
    type_result = batch.expect_column_values_to_be_of_type(
        column, expected_type, result_format='SUMMARY', catch_exceptions=True
    )
    print(type_result)

{'success': True, 'result': {'observed_value': 'int64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int8'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'object_'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int32'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'obser

### Review the expectations

Expectations that were true on this data batch were added. To view all the expectations you added so far about this data asset, do:

In [131]:
# Show the expectation suite accumulated on this batch. Expectations that
# failed on their last run are omitted unless discard_failed_expectations=False.
batch.get_expectation_suite()

2020-02-24T12:38:40-0500 - INFO - 	128 expectation(s) included in expectation_suite. Omitting 3 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.


{'data_asset_name': 'agero_dsa_pandas/default/network_claims',
 'meta': {'great_expectations.__version__': '0.8.8'},
 'expectations': [{'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_AMOUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_EASTERN'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_UTC'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_PAYMENT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_PAYME

In [41]:
# Save the expectation suite built up on this batch.
batch.save_expectation_suite()

2020-01-31T18:26:29-0500 - INFO - 	64 expectation(s) included in expectation_suite. result_format settings filtered.


### You created and saved expectations for at least one of the data assets.

### We will show you how to set up validation - the process of checking if new files of this type conform to your expectations before they are processed by your pipeline's code. 

### Go to [integrate_validation_into_pipeline.ipynb](integrate_validation_into_pipeline.ipynb) to proceed.


