In [1]:
import os
import json
import sys
import great_expectations as ge
import great_expectations.jupyter_ux
from datetime import datetime
import math
import pandas as pd
os.chdir('/Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations')

2020-03-01T18:23:00-0500 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.


In [2]:
import ge_prod.ge_data_access as gda
import ge_prod.queries as queries

In [3]:
rule_query = queries.queries.get('customer_complaints').get('create_expectations_2019Q4')

In [4]:
rule_query

"SELECT * FROM customer_complaints where last_modified_time_utc >= to_date('2019-10-01') and last_modified_time_utc <= to_date('2019-12-31');"

# Author Expectations



[**Watch a short tutorial video**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#video)

[**Read more in the tutorial**](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations)

**Reach out for help on** [**Great Expectations Slack**](https://tinyurl.com/great-expectations-slack)


### Get a DataContext object
[Read more in the tutorial](https://great-expectations.readthedocs.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-datacontext-object)




In [5]:
context = ge.data_context.DataContext()

2020-03-01T18:23:02-0500 - INFO - Using project config: /Users/mparayil/Desktop/Development/dsa-data-workflows/grtexp_agero_dsa/great_expectations/great_expectations.yml


### List data assets in your project

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#data-assets)


In [6]:
great_expectations.jupyter_ux.list_available_data_asset_names(context)

Inspecting your data sources. This may take a moment...


#### Pick one of the data asset names above and use as the value of data_asset_name argument below

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#get-batch)


### Specify data_asset & expectation_suite_name

In [7]:
data_asset_name = 'customer_complaints'
normalized_data_asset_name = context.normalize_data_asset_name(data_asset_name)
print(normalized_data_asset_name)

NormalizedDataAssetName(datasource='agero_dsa_pandas', generator='default', generator_asset='customer_complaints')


### Create a new empty expectation suite

In [8]:
expectation_suite_name = 'warnings_2019Q4'
# context.create_expectation_suite(data_asset_name=normalized_data_asset_name, expectation_suite_name=expectation_suite_name,
#                                 overwrite_existing=True)

In [9]:
context.list_expectation_suite_keys()

[{'data_asset_name': agero_dsa_pandas/default/customer_experience,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/network_outreach,
 {'data_asset_name': agero_dsa_pandas/default/customer_complaints,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,
 {'data_asset_name': agero_dsa_pandas/default/network_claims,

### Get batch to create expectations against

In [9]:
rule_df = gda.snowflake_connector_to_df(rule_query)
# rule_df.to_pickle('temp_data/network_claims_2019Q4.pkl')

In [10]:
rule_df.shape

(21636, 22)

In [11]:
b_kwargs = {"dataset": rule_df}
batch = context.get_batch(normalized_data_asset_name, expectation_suite_name=expectation_suite_name,
                         batch_kwargs=b_kwargs)

In [14]:
batch.get_row_count()

21636

In [15]:
print(rule_df.shape)

(21636, 22)


In [16]:
[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']

['agero_dsa_pandas']

In [17]:
# getting rule_df batchId & fingerprint
rule_batch_fingerprint = batch.batch_fingerprint
rule_batch_id = batch.batch_id

In [18]:
print('rule_batch_fingerprint: ', rule_batch_fingerprint, sep='\n')
print('rule_batch_id: ', rule_batch_id, sep='\n')

rule_batch_fingerprint: 
{'partition_id': '20200301T194106.779337Z', 'fingerprint': 'cc92c920ae085fdd19e1a7570a842b05'}
rule_batch_id: 
{'timestamp': 1583091665.633889, 'PandasInMemoryDF': True, 'fingerprint': 'ef6f117e165851cdd3645bc0dc4d2fb0'}


## Author Expectations

[Read more in the tutorial](https://docs.greatexpectations.io/en/latest/getting_started/create_expectations.html?utm_source=notebook&utm_medium=create_expectations#create-expectations)

See available expectations in the [expectation glossary](https://docs.greatexpectations.io/en/latest/glossary.html?utm_source=notebook&utm_medium=create_expectations)


### Dataset exploration & understanding of fields to ensure rules reflect behavior of data
- Validating columns to exist in table shape
- Expected column count in table shape
- Expected set values to be seen in given column
- Expected columns to have null or non-null values X percentage of the time
- Expect column values to be of certain data type(s)
- Placing max and min values limits on numerical columns
- Average or median column value to be within a certain range
- Expecting column A to be large/less than column B

### 1. Validating to see if every column exists in table

In [19]:
# add more expectations here
column_names = batch.get_table_columns()
column_names

['COMPLAINT_CASE_NUMBER',
 'CASE_ID',
 'TASK_ID',
 'COMPLAINT_CATEGORY',
 'COMPLAINT_REASON',
 'COMPLAINT_REASON_DETAILS',
 'COMPLAINT_ORIGIN',
 'IS_EXTERNAL_COMPLAINT',
 'CASE_COMMENTS',
 'IS_CASE_CLOSED',
 'CASE_RESOLUTION',
 'CASE_SUMMARY',
 'IS_EXECUTIVE_ESCALATION',
 'IS_FROM_SOCIAL_MEDIA',
 'NAME',
 'COMPLAINT_TYPE',
 'COMPLAINT_SUBJECT',
 'CLIENT_ID',
 'CREATE_TIME_EASTERN',
 'CREATE_TIME_UTC',
 'LAST_MODIFIED_TIME_EASTERN',
 'LAST_MODIFIED_TIME_UTC']

In [392]:
colnames = list(batch.columns)
colnames.sort()

In [393]:
colnames

['CASE_COMMENTS',
 'CASE_ID',
 'CASE_RESOLUTION',
 'CASE_SUMMARY',
 'CLIENT_ID',
 'COMPLAINT_CASE_NUMBER',
 'COMPLAINT_CATEGORY',
 'COMPLAINT_ORIGIN',
 'COMPLAINT_REASON',
 'COMPLAINT_REASON_DETAILS',
 'COMPLAINT_SUBJECT',
 'COMPLAINT_TYPE',
 'CREATE_TIME_EASTERN',
 'CREATE_TIME_UTC',
 'IS_CASE_CLOSED',
 'IS_EXECUTIVE_ESCALATION',
 'IS_EXTERNAL_COMPLAINT',
 'IS_FROM_SOCIAL_MEDIA',
 'LAST_MODIFIED_TIME_EASTERN',
 'LAST_MODIFIED_TIME_UTC',
 'NAME',
 'TASK_ID']

In [22]:
master_column_names = ['CASE_COMMENTS', 'CASE_ID', 'CASE_RESOLUTION', 'CASE_SUMMARY', 'CLIENT_ID',
                       'COMPLAINT_CASE_NUMBER', 'COMPLAINT_CATEGORY', 'COMPLAINT_ORIGIN', 'COMPLAINT_REASON',
                       'COMPLAINT_REASON_DETAILS', 'COMPLAINT_SUBJECT', 'COMPLAINT_TYPE', 'CREATE_TIME_EASTERN',
                       'CREATE_TIME_UTC', 'IS_CASE_CLOSED', 'IS_EXECUTIVE_ESCALATION', 'IS_EXTERNAL_COMPLAINT',
                       'IS_FROM_SOCIAL_MEDIA', 'LAST_MODIFIED_TIME_EASTERN', 'LAST_MODIFIED_TIME_UTC', 'NAME',
                       'TASK_ID']

In [23]:
len(master_column_names)

22

In [24]:
len(column_names)

22

In [25]:
# Ensuring columns to exist
for col in master_column_names:
    print(col + ':', batch.expect_column_to_exist(col, result_format='BASIC', catch_exceptions=True), sep='\n')

CASE_COMMENTS:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CASE_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CASE_RESOLUTION:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CASE_SUMMARY:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CLIENT_ID:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
COMPLAINT_CASE_NUMBER:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
COMPLAINT_CATEGORY:
{'success': True, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
COMPLAINT_ORIGIN:
{'success': True, 'excepti

### 2. Validating column count in table is always the same

In [26]:
print('# of columns in customer_complaints: ', len(column_names))

# of columns in customer_complaints:  22


In [27]:
print('# of columns in {}: '.format('network_claims'), len(master_column_names), '\n')
if len(column_names) == len(master_column_names):
    print(batch.expect_table_column_count_to_equal(len(column_names), result_format='SUMMARY'))
else:
    print(batch.expect_table_column_count_to_equal(len(master_column_names), result_format='SUMMARY'))

# of columns in network_claims:  22 

{'success': True, 'result': {'observed_value': 22}}


### 3. Checking which columns should not have null values

In [28]:
# identifying which columns should not be null
print(column_names)

['COMPLAINT_CASE_NUMBER', 'CASE_ID', 'TASK_ID', 'COMPLAINT_CATEGORY', 'COMPLAINT_REASON', 'COMPLAINT_REASON_DETAILS', 'COMPLAINT_ORIGIN', 'IS_EXTERNAL_COMPLAINT', 'CASE_COMMENTS', 'IS_CASE_CLOSED', 'CASE_RESOLUTION', 'CASE_SUMMARY', 'IS_EXECUTIVE_ESCALATION', 'IS_FROM_SOCIAL_MEDIA', 'NAME', 'COMPLAINT_TYPE', 'COMPLAINT_SUBJECT', 'CLIENT_ID', 'CREATE_TIME_EASTERN', 'CREATE_TIME_UTC', 'LAST_MODIFIED_TIME_EASTERN', 'LAST_MODIFIED_TIME_UTC']


In [29]:
rule_df.isnull().sum()

COMPLAINT_CASE_NUMBER             0
CASE_ID                         918
TASK_ID                        1046
COMPLAINT_CATEGORY             4650
COMPLAINT_REASON               4719
COMPLAINT_REASON_DETAILS      16195
COMPLAINT_ORIGIN                  0
IS_EXTERNAL_COMPLAINT             0
CASE_COMMENTS                     7
IS_CASE_CLOSED                    0
CASE_RESOLUTION                  60
CASE_SUMMARY                   4845
IS_EXECUTIVE_ESCALATION           0
IS_FROM_SOCIAL_MEDIA              0
NAME                              0
COMPLAINT_TYPE                   65
COMPLAINT_SUBJECT              5598
CLIENT_ID                        25
CREATE_TIME_EASTERN               0
CREATE_TIME_UTC                   0
LAST_MODIFIED_TIME_EASTERN        0
LAST_MODIFIED_TIME_UTC            0
dtype: int64

In [48]:
# Separating null & non-null columns
null_cols = list(batch.isnull().sum()[batch.isnull().sum() > 0].keys())
not_null_cols = list(batch.isnull().sum()[batch.isnull().sum() == 0].keys())

In [49]:
print('Viewing column null value counts: ', batch.isnull().sum(), sep='\n')

Viewing column null value counts: 
COMPLAINT_CASE_NUMBER             0
CASE_ID                         918
TASK_ID                        1046
COMPLAINT_CATEGORY             4650
COMPLAINT_REASON               4719
COMPLAINT_REASON_DETAILS      16195
COMPLAINT_ORIGIN                  0
IS_EXTERNAL_COMPLAINT             0
CASE_COMMENTS                     7
IS_CASE_CLOSED                    0
CASE_RESOLUTION                  60
CASE_SUMMARY                   4845
IS_EXECUTIVE_ESCALATION           0
IS_FROM_SOCIAL_MEDIA              0
NAME                              0
COMPLAINT_TYPE                   65
COMPLAINT_SUBJECT              5598
CLIENT_ID                        25
CREATE_TIME_EASTERN               0
CREATE_TIME_UTC                   0
LAST_MODIFIED_TIME_EASTERN        0
LAST_MODIFIED_TIME_UTC            0
dtype: int64


In [51]:
not_null_cols.sort()

In [54]:
print(not_null_cols)

['COMPLAINT_CASE_NUMBER', 'COMPLAINT_ORIGIN', 'CREATE_TIME_EASTERN', 'CREATE_TIME_UTC', 'IS_CASE_CLOSED', 'IS_EXECUTIVE_ESCALATION', 'IS_EXTERNAL_COMPLAINT', 'IS_FROM_SOCIAL_MEDIA', 'LAST_MODIFIED_TIME_EASTERN', 'LAST_MODIFIED_TIME_UTC', 'NAME']


In [55]:
null_cols.sort()

In [57]:
print(null_cols)

['CASE_COMMENTS', 'CASE_ID', 'CASE_RESOLUTION', 'CASE_SUMMARY', 'CLIENT_ID', 'COMPLAINT_CATEGORY', 'COMPLAINT_REASON', 'COMPLAINT_REASON_DETAILS', 'COMPLAINT_SUBJECT', 'COMPLAINT_TYPE', 'TASK_ID']


In [58]:
# checking for all columns that shouldn't be null are not
for col in not_null_cols:
    print(col, '\n', batch.expect_column_values_to_not_be_null(col, result_format='BASIC'))

COMPLAINT_CASE_NUMBER 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
COMPLAINT_ORIGIN 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
CREATE_TIME_EASTERN 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
CREATE_TIME_UTC 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
IS_CASE_CLOSED 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
IS_EXECUTIVE_ESCALATION 
 {'success': True, 'result': {'element_count': 21636, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': []}}
IS_EXTERNAL_COMPLAINT 
 {'success': True, 'result': {'element_co

In [35]:
print(f"capturing the following columns to not be null: \n {not_null_cols}")

capturing the following columns to not be null: 
 ['COMPLAINT_CASE_NUMBER', 'COMPLAINT_ORIGIN', 'IS_EXTERNAL_COMPLAINT', 'IS_CASE_CLOSED', 'IS_EXECUTIVE_ESCALATION', 'IS_FROM_SOCIAL_MEDIA', 'NAME', 'CREATE_TIME_EASTERN', 'CREATE_TIME_UTC', 'LAST_MODIFIED_TIME_EASTERN', 'LAST_MODIFIED_TIME_UTC']


### 4. Validating columns to have null values
- **columns to check:**
    - CASE_ID
    - TASK_ID
    - COMPLAINT_CATEGORY 
    - COMPLAINT_REASON
    - COMPLAINT_REASON_DETAILS
    - CASE_COMMENTS
    - CASE_RESOLUTION
    - CASE_SUMMARY 
    - COMPLAINT_TYPE 
    - COMPLAINT_SUBJECT 
    - CLIENT_ID 

In [36]:
batch.head()

Unnamed: 0,COMPLAINT_CASE_NUMBER,CASE_ID,TASK_ID,COMPLAINT_CATEGORY,COMPLAINT_REASON,COMPLAINT_REASON_DETAILS,COMPLAINT_ORIGIN,IS_EXTERNAL_COMPLAINT,CASE_COMMENTS,IS_CASE_CLOSED,...,IS_EXECUTIVE_ESCALATION,IS_FROM_SOCIAL_MEDIA,NAME,COMPLAINT_TYPE,COMPLAINT_SUBJECT,CLIENT_ID,CREATE_TIME_EASTERN,CREATE_TIME_UTC,LAST_MODIFIED_TIME_EASTERN,LAST_MODIFIED_TIME_UTC
0,1431510,2294742000.0,1.0,Network,Missed ETA - Delay/Capacity,> 120 mins,internal,False,Case Comments : \nTask 1 : \n[Service Provider...,True,...,True,False,Aliyah Julien,Contact Center Agent Comp,Verbal Apology,971.0,2019-12-22 07:45:04,2019-12-22 12:45:04,2019-12-23 12:12:47,2019-12-23 17:12:47
1,1431393,2294446000.0,1.0,Network,Customer Service - Unprofessional conduct,,internal,False,Case Comments : \nTask 1 : \n[Service Provider...,True,...,True,False,Nelon Noel,Service Provider Complain,Verbal Apology.,863.0,2019-12-21 11:30:06,2019-12-21 16:30:06,2019-12-24 10:02:24,2019-12-24 15:02:24
2,1431605,2294723000.0,4.0,Network,Customer Service - Rude/Tone,,external,True,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Laura Lacasse,Contact Center Agent Comp,Phone number out of service,863.0,2019-12-23 08:04:06,2019-12-23 13:04:06,2019-12-24 09:06:24,2019-12-24 14:06:24
3,1429159,2294104000.0,1.0,Network,Customer Service - Unprofessional conduct,,text survey,False,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Belinda Bhola,Service Provider Complain,No Customer Contact,863.0,2019-12-16 07:02:55,2019-12-16 12:02:55,2019-12-17 15:05:12,2019-12-17 20:05:12
4,1429886,2294271000.0,1.0,Network,Mishandled Coverage/Overages,,internal,False,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Shadea Scott,Coverage Limits/Overages,Voicemail Apology,942.0,2019-12-17 13:45:42,2019-12-17 18:45:42,2019-12-18 09:39:26,2019-12-18 14:39:26


In [242]:
from typing import Union
from great_expectations.dataset import PandasDataset
def get_df_not_null_weights(df: Union[pd.DataFrame, PandasDataset], groupby_col: str, not_null_col: str) -> float:
    """
    Provides specified column's weight/percentage for it not to be null.

    Parameters
    -----------
    df: pd.DataFrame or great_expectations.dataset.PandasDataset
        dataframe object to look at
    groupby_col: str
        grouping column string to groupby dataframe on when looking at specified column in next parameter
    not_null_col: str
        column used from dataframe to calculate safe weight thresholds of when it would be not null

    Returns
    ------------
    float
        Not null weight of specified column lowered by 5% after looking at the 10% quartile
    """

    df_group = df.groupby(df[groupby_col].dt.date)
    df_group = df_group.apply(lambda x: x[not_null_col].notnull().mean())

    adjusted_weight = df_group.quantile(0.1, interpolation='lower')
    return adjusted_weight.round(4)

In [243]:
for col in null_cols:
    w = get_df_not_null_weights(rule_df, 'LAST_MODIFIED_TIME_UTC', col)
    print(col, w, sep='\n')

CASE_COMMENTS
1.0
CASE_ID
0.9252
CASE_RESOLUTION
0.9933
CASE_SUMMARY
0.617
CLIENT_ID
0.9958
COMPLAINT_CATEGORY
0.6236
COMPLAINT_REASON
0.6234
COMPLAINT_REASON_DETAILS
0.1667
COMPLAINT_SUBJECT
0.5414
COMPLAINT_TYPE
0.9919
TASK_ID
0.9182


In [207]:
batch.head()

Unnamed: 0,COMPLAINT_CASE_NUMBER,CASE_ID,TASK_ID,COMPLAINT_CATEGORY,COMPLAINT_REASON,COMPLAINT_REASON_DETAILS,COMPLAINT_ORIGIN,IS_EXTERNAL_COMPLAINT,CASE_COMMENTS,IS_CASE_CLOSED,...,IS_EXECUTIVE_ESCALATION,IS_FROM_SOCIAL_MEDIA,NAME,COMPLAINT_TYPE,COMPLAINT_SUBJECT,CLIENT_ID,CREATE_TIME_EASTERN,CREATE_TIME_UTC,LAST_MODIFIED_TIME_EASTERN,LAST_MODIFIED_TIME_UTC
0,1431510,2294742000.0,1.0,Network,Missed ETA - Delay/Capacity,> 120 mins,internal,False,Case Comments : \nTask 1 : \n[Service Provider...,True,...,True,False,Aliyah Julien,Contact Center Agent Comp,Verbal Apology,971.0,2019-12-22 07:45:04,2019-12-22 12:45:04,2019-12-23 12:12:47,2019-12-23 17:12:47
1,1431393,2294446000.0,1.0,Network,Customer Service - Unprofessional conduct,,internal,False,Case Comments : \nTask 1 : \n[Service Provider...,True,...,True,False,Nelon Noel,Service Provider Complain,Verbal Apology.,863.0,2019-12-21 11:30:06,2019-12-21 16:30:06,2019-12-24 10:02:24,2019-12-24 15:02:24
2,1431605,2294723000.0,4.0,Network,Customer Service - Rude/Tone,,external,True,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Laura Lacasse,Contact Center Agent Comp,Phone number out of service,863.0,2019-12-23 08:04:06,2019-12-23 13:04:06,2019-12-24 09:06:24,2019-12-24 14:06:24
3,1429159,2294104000.0,1.0,Network,Customer Service - Unprofessional conduct,,text survey,False,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Belinda Bhola,Service Provider Complain,No Customer Contact,863.0,2019-12-16 07:02:55,2019-12-16 12:02:55,2019-12-17 15:05:12,2019-12-17 20:05:12
4,1429886,2294271000.0,1.0,Network,Mishandled Coverage/Overages,,internal,False,Case Comments : \nTask 1 : \n[Disablement Loca...,True,...,True,False,Shadea Scott,Coverage Limits/Overages,Voicemail Apology,942.0,2019-12-17 13:45:42,2019-12-17 18:45:42,2019-12-18 09:39:26,2019-12-18 14:39:26


In [84]:
# calculating weight for columns of how often they should be null
null_percents = (1 -(batch.isnull().sum() / len(batch))[batch.isnull().sum() / len(batch) > 0])
for x, y in sorted(null_percents.items()):
    print(x, y, sep='\n')
    
not_null_weights = dict(null_percents)

# lowering weights by one thousandth of decimal
for key, weight in not_null_weights.items():
    not_null_weights[key] = round((weight - 0.015), 4)
    
print('---------------------------------------')
print('not null weights:')
for x, y in sorted(not_null_weights.items()):
    print(x,y)

CASE_COMMENTS
0.9996764651506748
CASE_ID
0.9575707154742097
CASE_RESOLUTION
0.9972268441486412
CASE_SUMMARY
0.7760676650027731
CLIENT_ID
0.9988445183952671
COMPLAINT_CATEGORY
0.7850804215196894
COMPLAINT_REASON
0.7818912922906267
COMPLAINT_REASON_DETAILS
0.251479016454058
COMPLAINT_SUBJECT
0.7412645590682196
COMPLAINT_TYPE
0.9969957478276946
TASK_ID
0.9516546496579774
---------------------------------------
not null weights:
CASE_COMMENTS 0.9847
CASE_ID 0.9426
CASE_RESOLUTION 0.9822
CASE_SUMMARY 0.7611
CLIENT_ID 0.9838
COMPLAINT_CATEGORY 0.7701
COMPLAINT_REASON 0.7669
COMPLAINT_REASON_DETAILS 0.2365
COMPLAINT_SUBJECT 0.7263
COMPLAINT_TYPE 0.982
TASK_ID 0.9367


In [244]:
null_col_weights = {'CASE_COMMENTS': 0.99, 'CASE_ID': 0.94, 'CASE_RESOLUTION': 0.99, 'CASE_SUMMARY': 0.75,
	                    'CLIENT_ID': 0.99, 'COMPLAINT_CATEGORY': 0.75, 'COMPLAINT_REASON': 0.75,
	                    'COMPLAINT_REASON_DETAILS': 0.22, 'COMPLAINT_SUBJECT': 0.72, 'COMPLAINT_TYPE': 0.99,
	                    'TASK_ID': 0.93}

In [245]:
for col, weight in null_col_weights.items():
    print(col, batch.expect_column_values_to_not_be_null(col, mostly=weight, include_config=True,
                                                           catch_exceptions=True,
                                                           result_format='SUMMARY'), sep='\n')

CASE_COMMENTS
{'success': True, 'result': {'element_count': 21636, 'unexpected_count': 7, 'unexpected_percent': 0.032353484932519876, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'CASE_COMMENTS', 'mostly': 0.99, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CASE_ID
{'success': True, 'result': {'element_count': 21636, 'unexpected_count': 918, 'unexpected_percent': 4.242928452579035, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'CASE_ID', 'mostly': 0.94, 'result_format': 'SUMMARY'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
CASE_RESOLUTION
{'success': True, 'result': {'element_count': 21636, 'unexpected_count': 60, 'unexpected_percent': 0.27731558513588467, 'partia

### 5. Expecting column values to be in a set
- COMPLAINT_CATEGORY
- COMPLAINT_REASON
- COMPLAINT_REASON_DETAILS
- COMPLAINT_ORGIN
- CASE_RESOLUTION
- COMPLAINT_TYPE

In [134]:
from snowflake.connector.converter_null import SnowflakeNoConverterToPython
def snowflake_connector_to_df(query: str) -> pd.DataFrame:
    sf_creds = load_credentials()
    ctx = snowflake.connector.connect(
        user=sf_creds['dsa_username'],
        password=sf_creds['dsa_password'],
        account=sf_creds['dsa_account'],
        database=sf_creds['dsa_master_database'],
        schema=sf_creds['dsa_schema'],
        warehouse=sf_creds['dsa_warehouse'],
        role=sf_creds['dsa_etl_role'],
        converter_class=SnowflakeNoConverterToPython
    )
    cur = ctx.cursor()
    try:
        cur.execute(query)
        df = cur.fetch_pandas_all()
    finally:
        cur.close()
    return df

In [135]:
from snowflake.connector.converter_null import SnowflakeNoConverterToPython
def snowflake_connector_to_df(query: str) -> pd.DataFrame:
    sf_creds = gda.load_credentials()
    ctx = snowflake.connector.connect(
        user=sf_creds['dsa_username'],
        password=sf_creds['dsa_password'],
        account=sf_creds['dsa_account'],
        database=sf_creds['dsa_master_database'],
        schema=sf_creds['dsa_schema'],
        warehouse=sf_creds['dsa_warehouse'],
        role=sf_creds['dsa_etl_role'],
        converter_class=SnowflakeNoConverterToPython
    )
    cur = ctx.cursor()
    try:
        cur.execute(query)
        df = cur.fetch_pandas_all()
    finally:
        cur.close()
    return df

In [409]:
master_categorical_columns = ["COMPLAINT_CATEGORY", "COMPLAINT_REASON", "COMPLAINT_REASON_DETAILS",
                             "COMPLAINT_ORIGIN", "CASE_RESOLUTION", "COMPLAINT_TYPE"]
def get_categorical_columns_values(df: Union[pd.DataFrame, PandasDataset], cols: list, table_name: str):
    categorical_weights = {}
    for col in cols:
        unique_weights = df[col].value_counts(normalize=True) * 100
        categorical_weights[col] = unique_weights.values.mean().round(5)
        cat_weight_dict = {c:w for (c,w) in categorical_weights.items() 
                                    if w > 0.9 if (df[c].dtypes != bool and w != 50.0) 
                                    if c not in ['TASK_ID', 'task_id'] 
                                    if c not in ['CLIENT_ID', 'client_id']}
    execute_strings = ' '.join(
        f"SELECT DISTINCT {col_name} FROM {table_name};" for col_name in cat_weight_dict.keys())
    
    sf_creds = gda.load_credentials()
    ctx = snowflake.connector.connect(
    user=sf_creds['dsa_username'],
    password=sf_creds['dsa_password'],
    account=sf_creds['dsa_account'],
    database=sf_creds['dsa_master_database'],
    schema=sf_creds['dsa_schema'],
    warehouse=sf_creds['dsa_warehouse'],
    role=sf_creds['dsa_etl_role'])
    
    cursor_list = ctx.execute_string(execute_strings, remove_comments=True, return_cursors=True)
    category_col_values = {}
    for cur in cursor_list:
        col_names = ','.join([col[0] for col in cur.description])
        cat_values = [x[0] for x in cur.fetchall() if x[0]]
        category_col_values[col_names] = cat_values
    return category_col_values
        

dtest = get_categorical_columns_values(rule_df, master_column_names, 'customer_complaints')

In [422]:
for x,y in dtest.items():
    print(x, len(y))

CASE_RESOLUTION 42
COMPLAINT_CATEGORY 11
COMPLAINT_ORIGIN 23
COMPLAINT_REASON 89
COMPLAINT_REASON_DETAILS 38
COMPLAINT_TYPE 62


In [419]:
[len(x) for x in val_set_list]

[8, 73, 8, 13, 34, 39]

In [248]:
execute_strings = ' '.join(f"SELECT DISTINCT {col} FROM customer_complaints;" for col in cat_dict)

In [249]:
execute_strings

'SELECT DISTINCT CASE_RESOLUTION FROM customer_complaints; SELECT DISTINCT COMPLAINT_CATEGORY FROM customer_complaints; SELECT DISTINCT COMPLAINT_ORIGIN FROM customer_complaints; SELECT DISTINCT COMPLAINT_REASON FROM customer_complaints; SELECT DISTINCT COMPLAINT_REASON_DETAILS FROM customer_complaints; SELECT DISTINCT COMPLAINT_TYPE FROM customer_complaints;'

In [292]:
for v in category_col_values.values():
    print(len(v))

42
11
23
89
38
62


In [269]:
frames = []
for cur in cursor_list:
    df = cur.fetch_pandas_all()
    frames.append(df)
cat_df = pd.concat(frames)

In [272]:
cat_df3 = cat_df.drop_duplicates()

In [271]:
cat_df.COMPLAINT_CATEGORY.nunique()

12

In [428]:
val_set_list = []

for col in master_categorical_columns:
    list_sets = list(batch.get_column_value_counts(col, sort='count').keys())
    val_set_list.append(list_sets)

In [429]:
for c, v in zip(master_categorical_columns, val_set_list):
    print(c, len(v))

COMPLAINT_CATEGORY 8
COMPLAINT_REASON 73
COMPLAINT_REASON_DETAILS 8
COMPLAINT_ORIGIN 13
CASE_RESOLUTION 34
COMPLAINT_TYPE 39


In [79]:
for col, vals in zip(category_cols, val_set_list):
    print(col, '\n', batch.expect_column_values_to_be_in_set(col, vals, result_format='BASIC', 
                                        include_config=True, catch_exceptions=True), '\n')

ORIGINAL_CLAIM_STATUS_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_STATUS_CODE', 'value_set': ['AP', 'DP', 'FC', 'FP', 'HD', 'IN', 'PA', 'PC', 'PD', 'PJ', 'SP', 'VD'], 'result_format': 'BASIC'}}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}} 

ORIGINAL_CLAIM_TYPE_CODE 
 {'success': True, 'result': {'element_count': 2678943, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}, 'expectation_config': {'expectation_type': 'expect_column_values_to_be_in_set', 'kwargs': {'column': 'ORIGINAL_CLAIM_TYPE_CODE', 'value_set': ['OWN', 'VND'

In [431]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### 8. Determine if columns are unique per row
- CASE_ID, TASK_ID
- CASE_ID
- COMPLAINT_CASE_NUMBER

In [441]:
print(c, batch.expect_column_values_to_be_unique("COMPLAINT_CASE_NUMBER", result_format='BASIC'), sep='\n')

CASE_ID
{'success': True, 'result': {'element_count': 21636, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_list': []}}


In [442]:
batch.expect_multicolumn_values_to_be_unique(column_list=['CASE_ID', 'TASK_ID'], result_format='SUMMARY',
                                            catch_exceptions=True, include_config=True)

{'success': True,
 'result': {'element_count': 21636,
  'missing_count': 918,
  'missing_percent': 4.242928452579035,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'expectation_config': {'expectation_type': 'expect_multicolumn_values_to_be_unique',
  'kwargs': {'column_list': ['CASE_ID', 'TASK_ID'],
   'result_format': 'SUMMARY'}},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

### 9. Looking at column A to be greater than column B
- LAST_MODIFIED_TIME_UTC > CREATE_TIME_UTC

In [443]:
batch.expect_column_pair_values_A_to_be_greater_than_B('LAST_MODIFIED_TIME_UTC', 'CREATE_TIME_UTC', 
                                                       ignore_row_if='either_value_is_missing', 
                                                       result_format='SUMMARY', catch_exceptions=True)

{'success': True,
 'result': {'element_count': 21636,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

In [119]:
batch.expect_column_pair_values_A_to_be_greater_than_B('ORIGINAL_CLAIM_APPROVED_DATE_UTC', 
                                                       'ORIGINAL_CLAIM_SUBMITTED_DATE_UTC',
                                                       mostly=0.99, ignore_row_if='either_value_is_missing', 
                                                       result_format='SUMMARY', catch_exceptions=True)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 66043,
  'missing_percent': 2.4692470447214765,
  'unexpected_count': 151,
  'unexpected_percent': 0.005645659702813969,
  'unexpected_percent_nonmissing': 0.005788594398940726,
  'partial_unexpected_list': [['2019-10-16 14:39:34', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-12-17 05:10:42', '2019-12-17 05:10:42'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2019-10-16 08:18:36', '2019-11-15 05:00:00'],
   ['2019-10-16 13:39:14', '2019-11-15 05:00:00'],
   ['2020-01-03 13:53:34', '2020-12-31 05:00:00'],
   ['2019-12-15 05:01:23', '2019-12-15 05:01:23'],
   ['2019-12-28 05:00:42', '2019-12-28 05:00:42'],
   ['2020-01-03 13:53:34', '2020-12-31 05:00:00'],
   ['2020-01-23 17:00:13', '2020-01-23 17:00:13'],
   ['2020-01-01 05:14:15', '2020-01-01 05:14:15'],
   ['2020-01-21 05:18:08', '2020-01-21 05:18:08'],
   ['2

In [124]:
batch.expect_column_pair_values_A_to_be_greater_than_B('MODIFIED_DATE_UTC','ADDPAY_APPROVED_DATE_UTC',
                                                       or_equal=True,
                                                       ignore_row_if='either_value_is_missing', 
                                                       result_format='SUMMARY', catch_exceptions=True)

{'success': True,
 'result': {'element_count': 2674621,
  'missing_count': 2578267,
  'missing_percent': 96.39747089400704,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': [],
  'partial_unexpected_index_list': [],
  'partial_unexpected_counts': []},
 'exception_info': {'raised_exception': False,
  'exception_message': None,
  'exception_traceback': None}}

### 8. Expecting columns to be certain data type

In [444]:
rule_df.dtypes

COMPLAINT_CASE_NUMBER                 object
CASE_ID                              float64
TASK_ID                              float64
COMPLAINT_CATEGORY                    object
COMPLAINT_REASON                      object
COMPLAINT_REASON_DETAILS              object
COMPLAINT_ORIGIN                      object
IS_EXTERNAL_COMPLAINT                   bool
CASE_COMMENTS                         object
IS_CASE_CLOSED                          bool
CASE_RESOLUTION                       object
CASE_SUMMARY                          object
IS_EXECUTIVE_ESCALATION                 bool
IS_FROM_SOCIAL_MEDIA                    bool
NAME                                  object
COMPLAINT_TYPE                        object
COMPLAINT_SUBJECT                     object
CLIENT_ID                            float64
CREATE_TIME_EASTERN           datetime64[ns]
CREATE_TIME_UTC               datetime64[ns]
LAST_MODIFIED_TIME_EASTERN    datetime64[ns]
LAST_MODIFIED_TIME_UTC        datetime64[ns]
dtype: obj

In [445]:
for x, y in batch.dtypes.iteritems():
    print(x, y)

COMPLAINT_CASE_NUMBER object
CASE_ID float64
TASK_ID float64
COMPLAINT_CATEGORY object
COMPLAINT_REASON object
COMPLAINT_REASON_DETAILS object
COMPLAINT_ORIGIN object
IS_EXTERNAL_COMPLAINT bool
CASE_COMMENTS object
IS_CASE_CLOSED bool
CASE_RESOLUTION object
CASE_SUMMARY object
IS_EXECUTIVE_ESCALATION bool
IS_FROM_SOCIAL_MEDIA bool
NAME object
COMPLAINT_TYPE object
COMPLAINT_SUBJECT object
CLIENT_ID float64
CREATE_TIME_EASTERN datetime64[ns]
CREATE_TIME_UTC datetime64[ns]
LAST_MODIFIED_TIME_EASTERN datetime64[ns]
LAST_MODIFIED_TIME_UTC datetime64[ns]


In [446]:
customer_complaints_data_types = dict(batch.dtypes.iteritems())

In [448]:
for key, val in customer_complaints_data_types.items():
    customer_complaints_data_types[key] = str(val)

In [451]:
print(customer_complaints_data_types)

{'COMPLAINT_CASE_NUMBER': 'object', 'CASE_ID': 'float64', 'TASK_ID': 'float64', 'COMPLAINT_CATEGORY': 'object', 'COMPLAINT_REASON': 'object', 'COMPLAINT_REASON_DETAILS': 'object', 'COMPLAINT_ORIGIN': 'object', 'IS_EXTERNAL_COMPLAINT': 'bool', 'CASE_COMMENTS': 'object', 'IS_CASE_CLOSED': 'bool', 'CASE_RESOLUTION': 'object', 'CASE_SUMMARY': 'object', 'IS_EXECUTIVE_ESCALATION': 'bool', 'IS_FROM_SOCIAL_MEDIA': 'bool', 'NAME': 'object', 'COMPLAINT_TYPE': 'object', 'COMPLAINT_SUBJECT': 'object', 'CLIENT_ID': 'float64', 'CREATE_TIME_EASTERN': 'datetime64[ns]', 'CREATE_TIME_UTC': 'datetime64[ns]', 'LAST_MODIFIED_TIME_EASTERN': 'datetime64[ns]', 'LAST_MODIFIED_TIME_UTC': 'datetime64[ns]'}


In [130]:
for col, typ in network_claims_data_types.items():
    print(batch.expect_column_values_to_be_of_type(col, typ, result_format='SUMMARY', catch_exceptions=True))

{'success': True, 'result': {'observed_value': 'int64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int8'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'object_'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'int32'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'observed_value': 'float64'}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}
{'success': True, 'result': {'obser

### Review the expectations

Expectations that were true on this data batch were added. To view all the expectations you added so far about this data asset, do:

In [131]:
batch.get_expectation_suite()

2020-02-24T12:38:40-0500 - INFO - 	128 expectation(s) included in expectation_suite. Omitting 3 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.


{'data_asset_name': 'agero_dsa_pandas/default/network_claims',
 'meta': {'great_expectations.__version__': '0.8.8'},
 'expectations': [{'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_AMOUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDCHARGE_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_EASTERN'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_DATE_UTC'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_APPROVED_PAYMENT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_COUNT'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_DETAILS'}},
  {'expectation_type': 'expect_column_to_exist',
   'kwargs': {'column': 'ADDPAY_PAYME

In [41]:
batch.save_expectation_suite()

2020-01-31T18:26:29-0500 - INFO - 	64 expectation(s) included in expectation_suite. result_format settings filtered.


### You created and saved expectations for at least one of the data assets.

### We will show you how to set up validation - the process of checking if new files of this type conform to your expectations before they are processed by your pipeline's code. 

### Go to [integrate_validation_into_pipeline.ipynb](integrate_validation_into_pipeline.ipynb) to proceed.


