In [1]:
import rad_pipeline.zipcodes as zc
import rad_pipeline.rad_pipeline as rp
import great_expectations as ge
import datacompy

In [2]:
# Load the raw ground-source heat pump data and run it through the cleaning step.
gshp = rp.load_gshp()
gshp_cleaned = rp.data_clean(gshp, "Ground-source Heat Pumps")

# Pass the cleaned frame through the pipeline checkpoint and keep its return value.
# NOTE(review): presumably this persists the frame so clean_data_load() can read it
# back later -- confirm against rad_pipeline.data_checkpoint.
gshp_cleaned = rp.data_checkpoint(gshp_cleaned, "Ground-source Heat Pumps")

In [3]:
# Load the cleaned dataset via the pipeline's loader so it can be compared with the
# in-memory frame. NOTE(review): presumably reads what data_checkpoint wrote -- confirm.
gshp_from_disk = rp.clean_data_load("Ground-source Heat Pumps")

In [4]:
# Compare the in-memory cleaned frame with the one loaded from disk, row-aligned on
# the index. abs_tol / rel_tol are optional (both default to 0); df1_name / df2_name
# are optional labels used in the report (defaults 'df1' / 'df2').
compare = datacompy.Compare(
    gshp_cleaned,
    gshp_from_disk,
    on_index=True,
    abs_tol=1e-5,
    rel_tol=1e-5,
    df1_name="In Memory",
    df2_name="From Disk",
)

# Displayed as the cell output; extra columns on either side are not ignored.
compare.matches(ignore_extra_columns=False)

True

In [5]:
# report() builds a human-readable summary of the comparison (frame, column and row
# summaries plus mismatch counts); print() it so the text renders with line breaks.
print(compare.report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

   DataFrame  Columns  Rows
0  In Memory       43   517
1  From Disk       43   517

Column Summary
--------------

Number of columns in common: 43
Number of columns in In Memory but not in From Disk: 0
Number of columns in From Disk but not in In Memory: 0

Row Summary
-----------

Matched on: index
Any duplicates on match values: No
Absolute Tolerance: 1e-05
Relative Tolerance: 1e-05
Number of rows in common: 517
Number of rows in In Memory but not in From Disk: 0
Number of rows in From Disk but not in In Memory: 0

Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 517

Column Comparison
-----------------

Number of columns compared with some values unequal: 0
Number of columns compared with all values equal: 43
Total number of values which compare unequal: 0




In [None]:
# List the dtype of every column in the frame loaded from disk.
gshp_from_disk.dtypes

In [None]:
# dtype of the zip_cleaned column in the in-memory cleaned frame.
gshp_cleaned.zip_cleaned.dtype

In [None]:
# Preview the first few zip_cleaned values of the in-memory cleaned frame.
gshp_cleaned.zip_cleaned.head()

In [None]:
# Distinct values present in zip_cleaned for the in-memory cleaned frame.
gshp_cleaned.zip_cleaned.unique()

In [None]:
# Show zip_cleaned as loaded from disk, for comparison with the in-memory column.
gshp_from_disk.zip_cleaned

In [None]:
from tempfile import TemporaryDirectory
import os
import pandas as pd

In [None]:

def test_io(df, abs_tol=1e-5, rel_tol=1e-5):
    """Round-trip ``df`` through a pickle file and compare it with the original.

    The frame is written to ``test.pkl`` inside a temporary directory, read
    back with ``pd.read_pickle``, and the two frames are handed to
    ``datacompy.Compare`` aligned on their index. The temporary directory is
    removed before the comparison object is built; the reloaded frame is
    already in memory at that point.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pickle and reload.
    abs_tol : float, optional
        Absolute tolerance forwarded to ``datacompy.Compare``. Defaults to
        ``1e-5``, the value that was previously hard-coded.
    rel_tol : float, optional
        Relative tolerance forwarded to ``datacompy.Compare``. Defaults to
        ``1e-5``, the value that was previously hard-coded.

    Returns
    -------
    datacompy.Compare
        Comparison of the original frame (labelled 'In Memory') against the
        reloaded one (labelled 'From Disk').
    """
    with TemporaryDirectory() as tempdir:
        pkl_file = os.path.join(tempdir, "test.pkl")
        df.to_pickle(pkl_file)
        # Only ever reads the file written on the line above, so unpickling is safe here.
        df_from_disk = pd.read_pickle(pkl_file)

    return datacompy.Compare(
        df,
        df_from_disk,
        on_index=True,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        df1_name='In Memory',
        df2_name='From Disk',
    )

In [None]:
# Run the pickle round-trip check on the in-memory cleaned frame.
# NOTE: this rebinds `compare`, replacing the comparison object built earlier in the notebook.
compare = test_io(gshp_cleaned)

In [None]:
# Whether the two frames match; columns present in only one frame are not ignored.
compare.matches(ignore_extra_columns=False)

In [None]:
# Print the textual comparison report for the current `compare` object.
print(compare.report())