# Data preparation
* In this stage, data is explored, cleaned, and prepared for modeling.
* GX is used to create and apply data quality definitions to the input data.
* This demo uses the sample [Heart Disease datasets](https://archive.ics.uci.edu/dataset/45/heart+disease) available from the UCI ML Repository.

In [1]:
import collections
import os
import pathlib

from typing import Union

import altair as alt
import great_expectations as gx
import great_expectations.expectations as gxe
import pandas as pd
import sqlalchemy

import demo_code as demo

In [2]:
import warnings

# Suppress known DeprecationWarnings and FutureWarnings for demo.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

## Collect data

### Download raw patient data

Download heart disease data from https://archive.ics.uci.edu/dataset/45/heart+disease.

In [3]:
# Root data directory used by the demo.
DATA_DIR = pathlib.Path("/notebooks/data")

# Raw and cleaned data live in sibling subdirectories of DATA_DIR.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CLEANED_DATA_DIR = DATA_DIR.joinpath("cleaned")

In [4]:
# Set to True to (re)download the raw UCI files into RAW_DATA_DIR.
# Off by default; presumably the raw files are already present in RAW_DATA_DIR - confirm.
DOWNLOAD_RAW_DATA = False

if DOWNLOAD_RAW_DATA:
    demo.data.download_uci_heart_disease_data(RAW_DATA_DIR)

### Ingest raw patient data into dataframe

In [5]:
# Column names mapped to their descriptions, in file order, taken from the data
# dictionary in the heart_disease.names file.
COL2DESCRIPTION = collections.OrderedDict(
    [
        ("age", "Age in years"),
        ("sex", "sex; 1: male, 0: female"),
        (
            "cp",
            "Chest pain type; 1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic",
        ),
        ("trestbps", "Resting blood pressure in mm Hg on admission to the hospital"),
        ("chol", "Serum cholesterol in mg/dl"),
        ("fbs", "Fasting blood sugar > 120 mg/dl; 1: true, 0: false"),
        (
            "restecg",
            "Resting electrocardiographic results; 0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy",
        ),
        ("thalach", "Maximum heart rate achieved"),
        ("exang", "Exercise induced angina; 1: yes, 0: no"),
        ("oldpeak", "ST depression induced by exercise relative to rest"),
        (
            "slope",
            "Slope of the peak exercise ST segment; 1: upsloping, 2: flat, 3: downsloping",
        ),
        ("ca", "Major vessels (0-3) colored by flourosopy"),
        ("thal", "Heart defect; 3 = normal; 6 = fixed defect; 7 = reversable defect"),
        ("num", "Diagnosis of heart disease"),
    ]
)

# Ordered column names, used as the header when reading the raw files.
COLUMNS = list(COL2DESCRIPTION)

In [6]:
# The sample is spread over four files, one per health center.
DATASET_NAME2FILE = dict(
    va="processed.va.data",
    hungarian="processed.hungarian.data",
    switzerland="processed.switzerland.data",
    cleveland="processed.cleveland.data",
)

# Read each headerless file with the shared column names and tag every row with
# the name of the dataset it came from.
samples = [
    pd.read_csv(RAW_DATA_DIR / file_name, names=COLUMNS).assign(dataset=site_name)
    for site_name, file_name in DATASET_NAME2FILE.items()
]

# Stack the per-site frames; each frame keeps its own row index.
df_heart_disease = pd.concat(samples)

### Display sample of raw patient data

In [7]:
# Show the first and last rows of the combined raw data.
display(df_heart_disease.head(), df_heart_disease.tail())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,dataset
0,63.0,1.0,4.0,140,260,0,1,112,1,3.0,2,?,?,2,va
1,44.0,1.0,4.0,130,209,0,1,127,0,0.0,?,?,?,0,va
2,60.0,1.0,4.0,132,218,0,1,140,1,1.5,3,?,?,2,va
3,55.0,1.0,4.0,142,228,0,1,149,1,2.5,1,?,?,1,va
4,66.0,1.0,3.0,110,213,1,2,99,1,1.3,2,?,?,0,va


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,dataset
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,cleveland
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,cleveland
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,cleveland
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,cleveland
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0,cleveland


## Examine raw patient data using GX and Expectations

### Define GX data validation workflow

In [8]:
# Get a Data Context in ephemeral mode; the rest of the notebook builds its GX
# workflow (Data Sources, Suites, Checkpoints) on this object.
context = gx.get_context(mode="ephemeral")

In [9]:
# Configuration for the containerized Data Docs site used by the demo: a site
# builder that writes to a filesystem directory.
data_docs_site_config = {
    "class_name": "SiteBuilder",
    "show_how_to_buttons": False,
    "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "/gx/gx_volume/data_docs",
    },
    "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
}

context.add_data_docs_site(
    site_name="GX in the ML pipeline demo",
    site_config=data_docs_site_config,
)

In [10]:
# Register the dataframe with GX: Data Source -> Data Asset -> Batch Definition.
pandas_data_source = context.data_sources.add_pandas("pandas")
pandas_data_asset = pandas_data_source.add_dataframe_asset(name="Raw heart disease data")
pandas_batch_definition = pandas_data_asset.add_batch_definition_whole_dataframe(
    "batch definition"
)

# Get the Batch for the raw dataframe.
batch = pandas_batch_definition.get_batch(
    batch_parameters=dict(dataframe=df_heart_disease)
)

# Create the exploratory Expectation Suite and add it to the Data Context.
exploratory_suite = gx.ExpectationSuite(name="Heart disease data: exploratory")
suite = context.suites.add(exploratory_suite)

In [11]:
# Expectations derived from the provided data definitions, in the order they
# are added to the Suite.
exploratory_expectations = [
    gxe.ExpectColumnValuesToBeBetween(column="age", min_value=0, max_value=99),
    gxe.ExpectColumnValuesToBeInSet(column="sex", value_set=[0, 1]),
    gxe.ExpectColumnValuesToBeInSet(column="cp", value_set=[1, 2, 3, 4]),
    gxe.ExpectColumnValuesToBeOfType(column="trestbps", type_="int"),
]

for exploratory_expectation in exploratory_expectations:
    suite.add_expectation(exploratory_expectation)

# Validate the raw-data Batch against the Suite.
validation_result = batch.validate(suite)

Calculating Metrics: 100%|██████████| 34/34 [00:00<00:00, 734.74it/s]


### Explore Validation Results

In [12]:
# Running the Validation returns an ExpectationSuiteValidationResult object.
type(validation_result)

great_expectations.core.expectation_validation_result.ExpectationSuiteValidationResult

In [13]:
# The success key indicates whether or not all Expectations passed.
# It is False for this run because one of the four Expectations failed.
validation_result["success"]

False

In [14]:
# The statistics key contains summary information on how many Expectations were
# evaluated, how many passed or failed, and the resulting success percentage.
validation_result["statistics"]

{'evaluated_expectations': 4,
 'successful_expectations': 3,
 'unsuccessful_expectations': 1,
 'success_percent': 75.0}

In [15]:
# The results key provides a list with one result per Expectation that was run.

# Examine results from a passing Expectation (index 0: the age range check).
validation_result["results"][0]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_between",
    "kwargs": {
      "batch_id": "pandas-Raw heart disease data",
      "column": "age",
      "min_value": 0.0,
      "max_value": 99.0
    },
    "meta": {},
    "id": "7b08b70e-a9c7-4995-96ea-185378d9cd9d"
  },
  "result": {
    "element_count": 920,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# Examine results from a failing Expectation (index 3: the trestbps type check).
validation_result["results"][3]

{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "batch_id": "pandas-Raw heart disease data",
      "column": "trestbps",
      "type_": "int"
    },
    "meta": {},
    "id": "8d810bb3-3df1-4e46-bedf-bffbecb5afdb"
  },
  "result": {
    "element_count": 920,
    "unexpected_count": 920,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "140",
      "130",
      "132",
      "142",
      "110",
      "120",
      "150",
      "180",
      "120",
      "160",
      "126",
      "140",
      "110",
      "?",
      "128",
      "120",
      "170",
      "110",
      "126",
      "152"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0,
    "partial_unexpected_counts": [
      {
        "value": "110",
        "count": 3
      },
      {
        "value": "120",
        "count": 3
      },
      {
        "val

In [17]:
# Parse the Validation Result object into one summary row per Expectation.
expectation_results = []

for expectation_result in validation_result.describe_dict()["expectations"]:
    kwargs = expectation_result["kwargs"]
    result = expectation_result["result"]

    # Everything except the batch/column identifiers is reported as the
    # Expectation's parameters. Filtering (rather than `del`) keeps this working
    # for Expectations whose kwargs have no "column" or "batch_id" entry.
    params = {
        key: value
        for key, value in kwargs.items()
        if key not in ("batch_id", "column")
    }

    expectation_results.append(
        {
            "expectation": expectation_result["expectation_type"],
            # None when the Expectation is not tied to a single column.
            "column": kwargs.get("column"),
            "params": params,
            "success": expectation_result["success"],
            # Result metrics are looked up leniently: a result that lacks one of
            # these keys yields a missing value instead of a KeyError.
            "element_count": result.get("element_count"),
            "unexpected_count": result.get("unexpected_count"),
            "unexpected_percent": result.get("unexpected_percent"),
            "missing_count": result.get("missing_count"),
            "missing_percent": result.get("missing_percent"),
        }
    )

df_expectation_results = pd.DataFrame(expectation_results)
df_expectation_results

Unnamed: 0,expectation,column,params,success,element_count,unexpected_count,unexpected_percent,missing_count,missing_percent
0,expect_column_values_to_be_between,age,"{'min_value': 0.0, 'max_value': 99.0}",True,920,0,0.0,0,0.0
1,expect_column_values_to_be_in_set,sex,"{'value_set': [0, 1]}",True,920,0,0.0,0,0.0
2,expect_column_values_to_be_in_set,cp,"{'value_set': [1, 2, 3, 4]}",True,920,0,0.0,0,0.0
3,expect_column_values_to_be_of_type,trestbps,{'type_': 'int'},False,920,920,100.0,0,0.0


### Explore Validation Results in Data Docs

In [18]:
# A ValidationDefinition pairs a Batch Definition with the Expectation Suite to
# run against it.
validation_definition = gx.ValidationDefinition(
    suite=suite,
    data=pandas_batch_definition,
    name="demo validation definition",
)

# Run it against the raw dataframe.
results = validation_definition.run(
    batch_parameters=dict(dataframe=df_heart_disease)
)

Calculating Metrics: 100%|██████████| 34/34 [00:00<00:00, 144.21it/s]


In [19]:
# Wrap the ValidationDefinition in a Checkpoint whose action updates Data Docs,
# then run the Checkpoint against the raw dataframe.
update_data_docs_action = gx.checkpoint.actions.UpdateDataDocsAction(
    name="update_data_docs"
)
exploratory_checkpoint = gx.Checkpoint(
    name="checkpoint",
    validation_definitions=[validation_definition],
    actions=[update_data_docs_action],
)
checkpoint = context.checkpoints.add(exploratory_checkpoint)

results = checkpoint.run(batch_parameters=dict(dataframe=df_heart_disease))

Calculating Metrics: 100%|██████████| 34/34 [00:00<00:00, 806.55it/s]


### View Expectation Suite and Validation Results in [Data Docs](http://localhost:3000)

## Clean data and preserve data quality standards in an Expectation Suite

In [20]:
# Clean a deep copy so the raw dataframe stays untouched.
df_cleaned = df_heart_disease.copy(deep=True)

### Create Expectation Suite for schema and validity requirements

In [21]:
# New Suite that codifies the schema and validity standards for the data used
# for modeling; Expectations are added to it as each column is cleaned.
schema_and_validity_suite = gx.ExpectationSuite(name="Heart disease data: schema and validity")


def clean_question_mark_for_float_columns(x: str) -> Union[float, None]:
    """Convert a raw cell to a float, mapping the "?" placeholder to None.

    The value is stringified and stripped of surrounding whitespace before it is
    compared with "?" or parsed, so padded and non-string inputs are accepted.
    """
    text = str(x).strip()
    return None if text == "?" else float(text)


def clean_question_mark_for_int_columns(x: str) -> Union[int, None]:
    """Convert a raw cell to an int, mapping missing values to None.

    The value is stringified and stripped of surrounding whitespace first. A "?"
    placeholder or a NaN (for example a blank CSV field read by pandas) returns
    None; any other value is parsed as a float and truncated to an int.
    """
    text = str(x).strip()
    if text == "?":
        return None

    number = float(text)
    # NaN is the only float that is not equal to itself; int(nan) would raise
    # ValueError, so treat it as missing like "?".
    if number != number:
        return None

    return int(number)


# Default minimum fraction of non-null values required for a column.
MOSTLY = 0.9
# Accepted type names per logical type; presumably the pandas name and the
# Postgres name respectively - TODO confirm.
INTEGER_TYPES = ["int", "BIGINT"]
FLOAT_TYPES = ["float", "DOUBLE_PRECISION"]

# Columns kept in the prepared dataset: the documented columns plus the
# "dataset" column that records which source file each row came from.
CLEANED_COLUMNS = [
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "dataset",
    "num",
]


# Dataset schema expectations.
# The prepared data keeps the "dataset" column, so the expected column set is
# CLEANED_COLUMNS; matching against COLUMNS alone would report "dataset" as an
# unexpected column.
schema_and_validity_suite.add_expectation(
    gxe.ExpectTableColumnsToMatchSet(column_set=CLEANED_COLUMNS)
)

# Age
df_cleaned["age"] = df_cleaned["age"].astype(int)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="age", type_list=INTEGER_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeBetween(column="age", min_value=0)
)

# Sex
df_cleaned["sex"] = df_cleaned["sex"].astype(int)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="sex", type_list=INTEGER_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="sex", value_set=[0, 1])
)

# Chest pain (cp)
df_cleaned["cp"] = df_cleaned["cp"].astype(int)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="cp", type_list=INTEGER_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="cp", value_set=[1, 2, 3, 4])
)

# Resting blood pressure (trestbps)
# From here on, columns that can contain "?" are converted to floats with "?"
# replaced by a missing value, and are therefore checked against FLOAT_TYPES.
df_cleaned["trestbps"] = df_cleaned["trestbps"].apply(
    clean_question_mark_for_float_columns
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="trestbps", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="trestbps", mostly=MOSTLY)
)

# Cholesterol (chol)
df_cleaned["chol"] = df_cleaned["chol"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="chol", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="chol", mostly=MOSTLY)
)

# Fasting blood sugar (fbs)
df_cleaned["fbs"] = df_cleaned["fbs"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="fbs", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="fbs", value_set=[0, 1])
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="fbs", mostly=MOSTLY)
)

# Resting electrocardiographic results (restecg)
df_cleaned["restecg"] = df_cleaned["restecg"].apply(
    clean_question_mark_for_float_columns
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="restecg", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="restecg", value_set=[0, 1, 2])
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="restecg", mostly=MOSTLY)
)

# Maximum heart rate achieved (thalach)
df_cleaned["thalach"] = df_cleaned["thalach"].apply(
    clean_question_mark_for_float_columns
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="thalach", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeBetween(column="thalach", min_value=0)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="thalach", mostly=MOSTLY)
)

# Exercise induced angina (exang)
df_cleaned["exang"] = df_cleaned["exang"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="exang", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="exang", value_set=[0, 1])
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="exang", mostly=MOSTLY)
)

# ST depression induced by exercise relative to rest (oldpeak)
df_cleaned["oldpeak"] = df_cleaned["oldpeak"].apply(
    clean_question_mark_for_float_columns
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="oldpeak", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="oldpeak", mostly=MOSTLY)
)

# Slope of the peak exercise ST segment (slope)
df_cleaned["slope"] = df_cleaned["slope"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="slope", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="slope", value_set=[1, 2, 3])
)
# Lower completeness threshold than MOSTLY; presumably chosen because "?" is
# more frequent in this column - TODO confirm against the data.
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="slope", mostly=0.6)
)

# Major vessels (0-3) colored by fluoroscopy (ca)
df_cleaned["ca"] = df_cleaned["ca"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="ca", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="ca", value_set=[0, 1, 2, 3])
)
# Lower completeness threshold than MOSTLY; presumably chosen because "?" is
# frequent in this column - TODO confirm against the data.
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="ca", mostly=0.3)
)

# Heart defect (thal)
df_cleaned["thal"] = df_cleaned["thal"].apply(clean_question_mark_for_float_columns)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInTypeList(column="thal", type_list=FLOAT_TYPES)
)
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="thal", value_set=[3, 6, 7])
)
# Lower completeness threshold than MOSTLY; presumably chosen because "?" is
# frequent in this column - TODO confirm against the data.
schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToNotBeNull(column="thal", mostly=0.4)
)

# Diagnosis of heart disease (num)
# Collapse the diagnosis to a binary label: 0 stays 0, any other value becomes 1.
df_cleaned["num"] = (df_cleaned["num"].astype(int) != 0).astype(int)

schema_and_validity_suite.add_expectation(
    gxe.ExpectColumnValuesToBeInSet(column="num", value_set=[0, 1])
)

# Keep only the prepared columns, in CLEANED_COLUMNS order.
df_cleaned = df_cleaned[CLEANED_COLUMNS]

### Create Expectation Suite for distribution requirements

In [22]:
# Generate 5-year age bins over [min_age, max_age) and, for each bin, the share
# of rows whose age falls in it (rounded to three decimals).
min_age, max_age = 25, 80
bin_starts = list(range(min_age, max_age, 5))

ages = df_cleaned["age"]
row_count = len(df_cleaned)

weights = [
    round(int((ages.ge(start) & ages.lt(start + 5)).sum()) / row_count, 3)
    for start in bin_starts
]

# Bin edges are every bin start plus the closing upper edge.
bins = bin_starts + [max_age]

display(bins)
display(weights)

[25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

[0.004, 0.018, 0.064, 0.107, 0.124, 0.196, 0.212, 0.163, 0.078, 0.026, 0.008]

In [23]:
# Bar chart of row counts per 5-year age bin over the 25-80 extent.
age_histogram = alt.Chart(df_cleaned).mark_bar().encode(
    x=alt.X("age", bin=alt.Bin(extent=[25, 80], step=5)),
    y=alt.Y("count()"),
    tooltip=["age", "count()"],
)
age_histogram

In [24]:
# Allowed value range for each listed quantile of the age column.
age_quantile_ranges = {
    "quantiles": [0, 0.25, 0.5, 0.75, 1],
    "value_ranges": [[25, 30], [45, 50], [50, 55], [58, 62], [75, 80]],
}

# Reference age distribution: 5-year bin edges and the weight of each bin.
age_partition = {
    "bins": [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
    "weights": [
        0.004,
        0.018,
        0.064,
        0.107,
        0.124,
        0.196,
        0.212,
        0.163,
        0.078,
        0.026,
        0.008,
    ],
}

distribution_suite = gx.ExpectationSuite(name="Heart disease data: distribution")

distribution_suite.add_expectation(
    gxe.ExpectColumnQuantileValuesToBeBetween(
        column="age", quantile_ranges=age_quantile_ranges
    )
)

# Kept as the final expression so the added Expectation is shown as cell output.
distribution_suite.add_expectation(
    gxe.ExpectColumnKLDivergenceToBeLessThan(
        column="age", partition_object=age_partition, threshold=0.1
    )
)

ExpectColumnKLDivergenceToBeLessThan(id=None, meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=False, rendered_content=None, windows=None, batch_id=None, column='age', row_condition=None, condition_parser=None, partition_object={'bins': [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80], 'weights': [0.004, 0.018, 0.064, 0.107, 0.124, 0.196, 0.212, 0.163, 0.078, 0.026, 0.008]}, threshold=0.1, internal_weight_holdout=0, tail_weight_holdout=0, bucketize_data=True, min_value=None, max_value=None)

### Persist prepared data to Postgres

In [25]:
# Connection string for the demo Postgres database. It is read from the
# POSTGRES_CONNECTION_STRING environment variable when set, so credentials do not
# have to live in the notebook; the fallback is the demo's original value.
POSTGRES_CONNECTION_STRING = os.environ.get(
    "POSTGRES_CONNECTION_STRING",
    "postgresql://gx_user:gx_user_password@postgres:5432/demo",
)
engine = sqlalchemy.create_engine(POSTGRES_CONNECTION_STRING)

# Write the prepared data to the heart_disease table, replacing any existing
# table of that name; the dataframe index is not written.
df_cleaned.to_sql("heart_disease", engine, if_exists="replace", index=False)

920

### Run Expectation Suites against prepared data

In [26]:
# Register the prepared table with GX: Data Source -> Table Asset -> whole-table
# Batch Definition.
postgres_data_source = context.data_sources.add_postgres(
    "postgres", connection_string=POSTGRES_CONNECTION_STRING
)
postgres_data_asset = postgres_data_source.add_table_asset(
    name="Prepared heart disease data", table_name="heart_disease"
)
postgres_batch_definition = postgres_data_asset.add_batch_definition_whole_table(
    "prepared data batch definition"
)

# Add both prepared-data Suites to the Data Context.
for prepared_suite in (schema_and_validity_suite, distribution_suite):
    context.suites.add(prepared_suite)

# One ValidationDefinition per Suite, both against the same Batch Definition.
schema_and_validity_validation_definition = gx.ValidationDefinition(
    name="validation definition for schema and validity",
    data=postgres_batch_definition,
    suite=schema_and_validity_suite,
)
distribution_validation_definition = gx.ValidationDefinition(
    name="validation definition for distribution",
    data=postgres_batch_definition,
    suite=distribution_suite,
)

prepared_validation_definitions = [
    schema_and_validity_validation_definition,
    distribution_validation_definition,
]
for prepared_validation_definition in prepared_validation_definitions:
    context.validation_definitions.add(prepared_validation_definition)

# Checkpoint that runs both ValidationDefinitions and updates Data Docs.
checkpoint = context.checkpoints.add(
    gx.Checkpoint(
        name="cleaned data checkpoint",
        validation_definitions=prepared_validation_definitions,
        actions=[gx.checkpoint.actions.UpdateDataDocsAction(name="update_data_docs")],
    )
)

results = checkpoint.run()

Calculating Metrics: 100%|██████████| 164/164 [00:00<00:00, 647.97it/s]
Calculating Metrics: 100%|██████████| 12/12 [00:00<00:00, 340.95it/s]


### View Expectation Suite and Validation Results in [Data Docs](http://localhost:3000)

## Store Expectation Suites and Validation Results in GX Cloud

### Clean GX Cloud sandbox for demo

In [27]:
def remove_demo_data_from_cloud(context: gx.data_context.CloudDataContext) -> None:
    """Delete the demo's Checkpoint, Expectation Suites, and Data Source from GX Cloud.

    The function refuses to run unless the context's organization_id equals the
    expected sandbox ID. Each entity is deleted only if one with that name
    currently exists, and a message is printed for every deletion.

    Args:
        context: Data Context for the GX Cloud organization to clean.

    Raises:
        Exception: If the context's organization is not the sandbox organization.
    """
    sandbox_organization_id = "24017178-6ab4-46f6-93ae-dc09cacf0db4"
    if context.get_cloud_config()["organization_id"] != sandbox_organization_id:
        raise Exception("Not the sandbox!")

    # Checkpoint.
    checkpoint_name = "Prepared heart disease data checkpoint"
    existing_checkpoint_names = {
        cloud_checkpoint.name for cloud_checkpoint in context.checkpoints.all()
    }
    if checkpoint_name in existing_checkpoint_names:
        context.checkpoints.delete(name=checkpoint_name)
        print(f"Removed Checkpoint: {checkpoint_name}")

    # Expectation Suites. The existing names are collected once, before any deletion.
    suite_names = (
        "Heart disease data: schema and validity",
        "Heart disease data: distribution",
    )
    existing_suite_names = {cloud_suite["name"] for cloud_suite in context.suites.all()}
    for suite_name in suite_names:
        if suite_name in existing_suite_names:
            context.suites.delete(name=suite_name)
            print(f"Removed Expectation Suite: {suite_name}")

    # Validation Definitions: removal is currently disabled. The disabled code is
    # kept for reference.
    # cloud_validation_definitions = [x.name for x in context.validation_definitions.all()]
    # for x in ["schema and validity validation definition", "distribution validation definition"]:
    #     if x in cloud_validation_definitions:
    #         context.validation_definitions.delete(name=x)
    #         print(f"Removed Validation Definition: {x}")

    # Data Source.
    data_source_name = "demo database"
    if data_source_name in list(context.data_sources.all().keys()):
        context.data_sources.delete(name=data_source_name)
        print(f"Removed Data Source: {data_source_name}")


# remove_demo_data_from_cloud(gx.get_context(mode="cloud"))

### Save Expectation Suites to GX Cloud and run data validation

In [28]:
# Data Context for the GX Cloud organization.
cloud_context = gx.get_context(mode="cloud")

# Register the prepared table in GX Cloud: Data Source -> Table Asset ->
# whole-table Batch Definition.
cloud_data_source = cloud_context.data_sources.add_postgres(
    "demo database", connection_string=POSTGRES_CONNECTION_STRING
)
cloud_data_asset = cloud_data_source.add_table_asset(
    name="Prepared heart disease data", table_name="heart_disease"
)
cloud_batch_definition = cloud_data_asset.add_batch_definition_whole_table(
    "prepared data batch definition"
)

# Add the previously defined Expectation Suites to the cloud context.
for defined_suite in (schema_and_validity_suite, distribution_suite):
    cloud_context.suites.add(defined_suite)

# One ValidationDefinition per Suite, both against the cloud Batch Definition.
cloud_schema_and_validity_validation_definition = gx.ValidationDefinition(
    name="schema and validity validation definition",
    data=cloud_batch_definition,
    suite=schema_and_validity_suite,
)
cloud_distribution_validation_definition = gx.ValidationDefinition(
    name="distribution validation definition",
    data=cloud_batch_definition,
    suite=distribution_suite,
)

cloud_validation_definitions = [
    cloud_schema_and_validity_validation_definition,
    cloud_distribution_validation_definition,
]
for cloud_validation_definition in cloud_validation_definitions:
    cloud_context.validation_definitions.add(cloud_validation_definition)

# Create the Checkpoint over both ValidationDefinitions.
cloud_checkpoint = cloud_context.checkpoints.add(
    gx.Checkpoint(
        name="Prepared heart disease data checkpoint",
        validation_definitions=cloud_validation_definitions,
    )
)

# Run Checkpoint (and persist Validation Results in GX Cloud).
results = cloud_checkpoint.run()

Calculating Metrics: 100%|██████████| 12/12 [00:00<00:00, 225.68it/s]
Calculating Metrics: 100%|██████████| 164/164 [00:00<00:00, 706.86it/s]


### View Expectation Suites and Validation Results in [GX Cloud](https://app.greatexpectations.io/organizations/gx-rachel-sandbox/data-assets)