In [1]:
import datetime
import os
import re

import great_expectations as ge
import numpy as np
import pandas as pd
import rad_pipeline.rad_pipeline as rp
import rad_pipeline.zipcodes as zc

In [2]:
# Reload the pipeline modules and great_expectations in the running kernel
# (presumably so that on-disk edits are picked up without a restart).
import importlib
importlib.reload(rp)
importlib.reload(zc)
importlib.reload(ge)

<module 'great_expectations' from '/Users/alexhasha/miniconda3/envs/rad_pipeline_env/lib/python3.8/site-packages/great_expectations/__init__.py'>

In [3]:
# Display the filesystem location of the great_expectations package in use.
ge.__file__

'/Users/alexhasha/miniconda3/envs/rad_pipeline_env/lib/python3.8/site-packages/great_expectations/__init__.py'

In [4]:
# Directory (relative to this notebook) for expectation suite JSON files;
# the save cells further down write their suites under this same path.
EXPECTATIONS_DIR = "../data/expectations"

In [5]:
# Smoke test: a datetime passed as a $PARAMETER evaluation parameter should
# validate, be saved with the suite, and the suite should load back, all
# inside a throwaway Great Expectations project.
# `os`, `datetime`, `pd` and `ge` come from the notebook's first import cell.
from tempfile import TemporaryDirectory

from great_expectations.data_context.data_context import ExpectationSuite
from great_expectations.dataset import PandasDataset

# Read the clock once so the parameters and the test rows share one reference instant.
now = datetime.datetime.now()
evaluation_params = {"now": now, "now_minus_48h": now - datetime.timedelta(days=2)}
test_data = {"data_refresh": [now, now - datetime.timedelta(days=1)]}
test_df = pd.DataFrame(test_data)
dataset_name = "test_pandas_source"

with TemporaryDirectory() as tempdir:
    # Create the project under the temp dir; it is removed when the block exits.
    ge_path = os.path.join(tempdir, "great_expectations")
    ge.DataContext.create(tempdir, usage_statistics_enabled=False)
    context = ge.DataContext(ge_path)

    context.add_datasource(dataset_name, class_name="PandasDatasource")

    batch_kwargs = {"dataset": test_df, "datasource": dataset_name, "PandasInMemoryDF": True, "ge_batch_id": "test_id",}

    empty_suite = context.create_expectation_suite("test_suite")

    batch = context.get_batch(batch_kwargs=batch_kwargs, expectation_suite_name=empty_suite)
    for param, value in evaluation_params.items():
        batch.set_evaluation_parameter(param, value)

    # Add expectation that will succeed using the datetime in a $PARAMETER
    batch.expect_column_max_to_be_between(column="data_refresh", min_value={"$PARAMETER": "now_minus_48h"})
    result = batch.validate()
    assert result.success
    batch.save_expectation_suite()
    assert isinstance(batch, PandasDataset)

    # Check that we can load the saved expectation suite
    reloaded_expectation_suite = context.get_expectation_suite("test_suite")
    assert isinstance(reloaded_expectation_suite, ExpectationSuite)

#     # Check that we can build Data Docs
#     index_page_locator_infos = context.build_data_docs()
#     assert index_page_locator_infos["local_site"] == f"file://{ge_path}/uncommitted/data_docs/local_site/index.html"

#     # Check that we can reload the expectation suite and validate
#     reloaded_batch = context.get_batch(batch_kwargs=batch_kwargs, expectation_suite_name=reloaded_expectation_suite)

#     run_id = {
#         "run_name": f"reloaded_{dataset_name}_{datetime.datetime.now()}",
#         "run_time": datetime.datetime.now(),
#     }
#     reloaded_results = context.run_validation_operator(
#         "action_list_operator", assets_to_validate=[reloaded_batch], run_id=run_id,
#     )

#     assert reloaded_results.success

## Air-source Heat Pumps (ASHP)

### Raw data expectations

In [6]:
# Load the ASHP table through the pipeline loader and wrap it as a
# Great Expectations pandas dataset so expect_* methods are available.
ashp = ge.from_pandas(rp.load_ashp())

In [7]:
# Later files should only add rows: require at least 19964 and at most 30000.
result = ashp.expect_table_row_count_to_be_between(min_value=19964, max_value=30000)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "min_value": 19964,
      "max_value": 30000,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_be_between"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 19964
  }
}


In [8]:
# Every raw column named in the ASHP field map must be present in the table.
field_map = rp.FIELDS['Air-source Heat Pumps']
for key_col, raw_column in field_map.items():
    result = ashp.expect_column_to_exist(column=raw_column)
    assert result.success
    print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Rebate Amount ",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}
{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Total System Costs",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}
{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Site Zip Code",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "except

In [9]:
# Expect zipcode to be mostly numbers and mostly populated

In [10]:
# The zip code column must be populated on every row.
result = ashp.expect_column_values_to_not_be_null(column=field_map['zip'])
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Site Zip Code",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_not_be_null"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  }
}


In [11]:
# Zip codes may be stored as integers, floats or strings.
result = ashp.expect_column_values_to_be_in_type_list(
    column=field_map['zip'],
    type_list=['int', 'float', 'str'],
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Site Zip Code",
      "type_list": [
        "int",
        "float",
        "str"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__map"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}


In [12]:
# At least 98% of zip codes must be 3-5 digits, optionally followed by ".0"
# and by a "-NNNN" suffix or a bare "-", ignoring surrounding whitespace.
result = ashp.expect_column_values_to_match_regex(
    column=field_map['zip'],
    regex=r"^\s*([0-9]{3,5})(?:[.]0)?(?:-([0-9]{4})|-)?\s*$",
    mostly=0.98,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Site Zip Code",
      "regex": "^\\s*([0-9]{3,5})(?:[.]0)?(?:-([0-9]{4})|-)?\\s*$",
      "mostly": 0.98,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_match_regex"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 3,
    "unexpected_percent": 0.015027048687637747,
    "unexpected_percent_total": 0.015027048687637747,
    "unexpected_percent_nonmissing": 0.015027048687637747,
    "partial_unexpected_list": [
      20,
      "019081047",
      "0212y"
    ]
  }
}


In [13]:
# Expect rebate and cost to be mostly numbers

In [14]:
# At least 99% of rebate values must be numeric (int or float).
result = ashp.expect_column_values_to_be_in_type_list(
    column=field_map['rebate'],
    type_list=['int', 'float'],
    mostly=0.99,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Rebate Amount ",
      "type_list": [
        "int",
        "float"
      ],
      "mostly": 0.99,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__map"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 0.005009016229212583,
    "unexpected_percent_total": 0.005009016229212583,
    "unexpected_percent_nonmissing": 0.005009016229212583,
    "partial_unexpected_list": [
      "Not Applicable"
    ]
  }
}


In [15]:
# At least 99% of system cost values must be numeric (int or float).
result = ashp.expect_column_values_to_be_in_type_list(
    column=field_map['cost'],
    type_list=['int', 'float'],
    mostly=0.99,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Total System Costs",
      "type_list": [
        "int",
        "float"
      ],
      "mostly": 0.99,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__map"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 19,
    "unexpected_percent": 0.09517130835503908,
    "unexpected_percent_total": 0.09517130835503908,
    "unexpected_percent_nonmissing": 0.09517130835503908,
    "partial_unexpected_list": [
      "1000-labor only",
      "1200-labor only",
      "31900-with boiler",
      "925-labor only",
      "18668.75-with 5C42 unit",
      "1000-labor only",
      "25490-with Bosch",
      "57814.74-with Trane",
      "22725-with Lenno

In [16]:
# The date column must be stored as datetime64.
result = ashp.expect_column_values_to_be_of_type(column=field_map["date"], type_="datetime64")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Date Rebate Payment Approved by MassCEC",
      "type_": "datetime64",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_of_type__aggregate"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "datetime64"
  }
}


In [17]:
# Dates must fall between the earliest date in the column and the current time.
result = ashp.expect_column_values_to_be_between(
    column=field_map["date"],
    min_value=ashp[field_map["date"]].min(),
    max_value=pd.Timestamp.now(),
    parse_strings_as_datetimes=True,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 19964,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [18]:
## TODO: Figure out how to validate date ranges

# ashp.set_evaluation_parameter("start", ashp[field_map["date"]].min().to_pydatetime())
# ashp.set_evaluation_parameter("end", datetime.datetime.now())
# result = ashp.expect_column_values_to_be_between(field_map["date"], 
#                                         min_value={"$PARAMETER": "start"},
#                                         max_value={"$PARAMETER": "end"}
#                                        )
# assert result.success
# print(result)

In [19]:
# Write the raw ASHP suite under the shared expectations directory
# instead of repeating the directory as a string literal.
ashp.save_expectation_suite(os.path.join(EXPECTATIONS_DIR, "ashp_raw_expectations.json"))

In [20]:
# Validate the raw ASHP data against the suite file in the shared
# expectations directory; catch_exceptions=False lets errors propagate.
validation_results = ashp.validate(
    expectation_suite=os.path.join(EXPECTATIONS_DIR, "ashp_raw_expectations.json"),
    catch_exceptions=False,
)

### Clean data expectations

In [21]:
# Load the pipeline's cleaned "Air-source Heat Pumps" data and wrap it as a
# Great Expectations pandas dataset.
ashp_cleaned = ge.from_pandas(rp.clean_data_load("Air-source Heat Pumps"))

In [22]:
# Ratio of cleaned row count to raw row count.
ashp_cleaned.shape[0] / ashp.shape[0]

0.9445001001803246

In [23]:
# Cleaning may discard at most 7% of the raw rows and can never add rows.
result = ashp_cleaned.expect_table_row_count_to_be_between(
    min_value=int(0.93*ashp.shape[0]),
    max_value=ashp.shape[0],
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "min_value": 18566,
      "max_value": 19964,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_be_between"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 18856
  }
}


In [24]:
# The cleaned table must have a town_valid column.
result = ashp_cleaned.expect_column_to_exist(column="town_valid")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "town_valid",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}


In [25]:
# The cleaned table must have a rebate column.
result = ashp_cleaned.expect_column_to_exist(column="rebate")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "rebate",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}


In [26]:
# The cleaned table must have a cost column.
result = ashp_cleaned.expect_column_to_exist(column="cost")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "cost",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}


In [27]:
# Every row in the cleaned table must have town_valid == True.
result = ashp_cleaned.expect_column_values_to_be_in_set(column="town_valid", value_set=[True])
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "town_valid",
      "value_set": [
        true
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_in_set"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 18856,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}


In [28]:
# The cleaned cost column must be float-typed.
result = ashp_cleaned.expect_column_values_to_be_of_type(column="cost", type_="float")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "cost",
      "type_": "float",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_of_type__aggregate"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  }
}


In [29]:
# The cleaned rebate column must be float-typed.
result = ashp_cleaned.expect_column_values_to_be_of_type(column="rebate", type_="float")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "rebate",
      "type_": "float",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_of_type__aggregate"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  }
}


In [30]:
# Write the cleaned ASHP suite under the shared expectations directory
# instead of repeating the directory as a string literal.
ashp_cleaned.save_expectation_suite(os.path.join(EXPECTATIONS_DIR, "ashp_clean_expectations.json"))


## Ground-source Heat Pumps (GSHP)

In [31]:
# Load the GSHP table through the pipeline loader and wrap it as a
# Great Expectations pandas dataset so expect_* methods are available.
gshp = ge.from_pandas(rp.load_gshp())

In [32]:
# Later files should only add rows: require at least 538 and at most 1000.
result = gshp.expect_table_row_count_to_be_between(min_value=538, max_value=1000)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "min_value": 538,
      "max_value": 1000,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_be_between"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 538
  }
}


In [33]:
# Every raw column named in the GSHP field map must be present in the table.
# Note: this rebinds field_map, which the ASHP cells above also use.
field_map = rp.FIELDS['Ground-source Heat Pumps']
for key_col, raw_column in field_map.items():
    result = gshp.expect_column_to_exist(column=raw_column)
    assert result.success
    print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Rebate Amount",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}
{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Total System Cost",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}
{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Site City/Town",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "excepti

In [34]:
# At least 99% of rebate values must be numeric (int or float).
result = gshp.expect_column_values_to_be_in_type_list(
    column=field_map['rebate'],
    type_list=['int', 'float'],
    mostly=0.99,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Rebate Amount",
      "type_list": [
        "int",
        "float"
      ],
      "mostly": 0.99,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__map"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 538,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 2,
    "unexpected_percent": 0.37174721189591076,
    "unexpected_percent_total": 0.37174721189591076,
    "unexpected_percent_nonmissing": 0.37174721189591076,
    "partial_unexpected_list": [
      "$10000.00",
      "$10000.00"
    ]
  }
}


In [35]:
# At least 99% of system cost values must be numeric (int or float).
result = gshp.expect_column_values_to_be_in_type_list(
    column=field_map['cost'],
    type_list=['int', 'float'],
    mostly=0.99,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "Total System Cost",
      "type_list": [
        "int",
        "float"
      ],
      "mostly": 0.99,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__map"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 538,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 3,
    "unexpected_percent": 0.5576208178438662,
    "unexpected_percent_total": 0.5576208178438662,
    "unexpected_percent_nonmissing": 0.5576208178438662,
    "partial_unexpected_list": [
      "$117910",
      "$81355",
      "$40445"
    ]
  }
}


In [36]:
# Dates must fall between the earliest date in the column and the current time.
# Capture and assert the result, as the ASHP date check does, so that a failed
# expectation stops the notebook instead of only being displayed.
result = gshp.expect_column_values_to_be_between(
    field_map["date"],
    gshp[field_map["date"]].min(),
    pd.Timestamp.now(),
    parse_strings_as_datetimes=True,
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 538,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [37]:
## TODO: Figure out how to get datetime expectations working
# gshp.set_evaluation_parameter("start", gshp[field_map["date"]].min())
# gshp.set_evaluation_parameter("end", pd.Timestamp.now())

# result = gshp.expect_column_values_to_be_between(field_map["date"], 
#                                         min_value={"$PARAMETER": "start"},
#                                         max_value={"$PARAMETER": "end"}
#                                        )
# assert result.success
# print(result)

In [38]:
# Write the raw GSHP suite under the shared expectations directory
# instead of repeating the directory as a string literal.
gshp.save_expectation_suite(os.path.join(EXPECTATIONS_DIR, "gshp_raw_expectations.json"))

### Clean data expectations

In [39]:
# Load the pipeline's cleaned "Ground-source Heat Pumps" data and wrap it as a
# Great Expectations pandas dataset.
gshp_cleaned = ge.from_pandas(rp.clean_data_load("Ground-source Heat Pumps"))

# Ratio of cleaned row count to raw row count.
gshp_cleaned.shape[0] / gshp.shape[0]

0.9609665427509294

In [40]:
# Cleaning may discard at most 5% of the raw rows and can never add rows.
result = gshp_cleaned.expect_table_row_count_to_be_between(
    min_value=int(0.95*gshp.shape[0]),
    max_value=gshp.shape[0],
)
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "min_value": 511,
      "max_value": 538,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_be_between"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 517
  }
}


In [41]:
# The cleaned table must have a town_valid column.
result = gshp_cleaned.expect_column_to_exist(column="town_valid")
assert result.success
print(result)

{
  "success": true,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "town_valid",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}


In [43]:
# The cleaned table must have a rebate column. The result is printed before
# the assert so it stays visible on failure, and the assert message lists the
# columns that are actually present, since a bare assert reports nothing.
result = gshp_cleaned.expect_column_to_exist("rebate")
print(result)
assert result.success, (
    "cleaned GSHP data has no 'rebate' column; "
    f"available columns: {list(gshp_cleaned.columns)}"
)


{
  "success": false,
  "meta": {},
  "expectation_config": {
    "kwargs": {
      "column": "rebate",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}


AssertionError: 

In [None]:
# The cleaned table must have a cost column.
result = gshp_cleaned.expect_column_to_exist(column="cost")
assert result.success
print(result)

In [None]:
# Every row in the cleaned table must have town_valid == True.
result = gshp_cleaned.expect_column_values_to_be_in_set(column="town_valid", value_set=[True])
assert result.success
print(result)

In [None]:
# The cleaned cost column must be float-typed.
result = gshp_cleaned.expect_column_values_to_be_of_type(column="cost", type_="float")
assert result.success
print(result)

In [None]:
# The cleaned rebate column must be float-typed.
result = gshp_cleaned.expect_column_values_to_be_of_type(column="rebate", type_="float")
assert result.success
print(result)

In [None]:
# Write the cleaned GSHP suite under the shared expectations directory
# instead of repeating the directory as a string literal.
gshp_cleaned.save_expectation_suite(os.path.join(EXPECTATIONS_DIR, "gshp_clean_expectations.json"))

## Solar

## Electric Vehicles (EVs)