In [None]:
import collections
import io
import shutil
import zipfile

from typing import Union

import great_expectations as gx
import great_expectations.expectations as gxe
import pandas as pd
import pathlib
import requests
import ucimlrepo

## Fetch data

Download heart disease data from https://archive.ics.uci.edu/dataset/45/heart+disease.

In [None]:
# Directory the raw dataset files are extracted into and later read from.
DATA_DIR = pathlib.Path("/notebooks", "data")

In [None]:
# Flip to True to (re)download the archive. Left False so that a plain
# "Run All" reuses whatever is already extracted under DATA_DIR.
FETCH_DATA = False
DATA_URL = "https://archive.ics.uci.edu/static/public/45/heart+disease.zip"

if FETCH_DATA:
    print("Fetching data...")
    # Only hit the network when a download was actually requested, and bound
    # the wait so a stalled connection cannot hang the notebook.
    response = requests.get(DATA_URL, timeout=60)
    if response.status_code == 200:
        # Start from an empty directory; ignore_errors covers the first run,
        # when DATA_DIR does not exist yet.
        shutil.rmtree(DATA_DIR, ignore_errors=True)
        DATA_DIR.mkdir(parents=True, exist_ok=True)

        # The archive is unpacked straight from memory; nothing is written
        # to disk other than the extracted members.
        with zipfile.ZipFile(io.BytesIO(response.content)) as fh:
            fh.extractall(DATA_DIR)
        print("Downloaded data.")
    else:
        print(f"Failed to download data: {response.status_code}")
else:
    print("Not attempting to download data.")

## Ingest data

In [None]:
# Column name -> human-readable description, in the order the columns appear
# in the raw data files (the files have no header row, so this order is what
# names the columns at load time).
COL2DESCRIPTION = collections.OrderedDict(
    [
        ("age", "Age in years"),
        ("sex", "sex; 1: male, 0: female"),
        ("cp", "Chest pain type; 1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic"),
        ("trestbps", "Resting blood pressure in mm Hg on admission to the hospital"),
        ("chol", "Serum cholesterol in mg/dl"),
        ("fbs", "Fasting blood sugar > 120 mg/dl; 1: true, 0: false"),
        ("restecg", "Resting electrocardiographic results; 0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy"),
        ("thalach", "Maximum heart rate achieved"),
        ("exang", "Exercise induced angina; 1: yes, 0: no"),
        ("oldpeak", "ST depression induced by exercise relative to rest"),
        ("slope", "Slope of the peak exercise ST segment; 1: upsloping, 2: flat, 3: downsloping"),
        ("ca", "Major vessels (0-3) colored by flourosopy"),
        ("thal", "Heart defect; 3 = normal; 6 = fixed defect; 7 = reversable defect"),
        ("num", "Diagnosis of heart disease"),
    ]
)

# Ordered list of column names, used as the header when reading the raw files.
COLUMNS = list(COL2DESCRIPTION)

In [None]:
# Source name -> file name (inside DATA_DIR) of each per-site data file.
DATASET_NAME2FILE = {
    "va" : "processed.va.data",
    "hungarian" : "processed.hungarian.data",
    "switzerland" : "processed.switzerland.data",
    "cleveland" : "processed.cleveland.data",
}

samples = []

for dataset_name, dataset_file in DATASET_NAME2FILE.items():
    # The files carry no header row, so column names are supplied explicitly.
    df_dataset = pd.read_csv(DATA_DIR / dataset_file, names=COLUMNS)
    # Remember which source each row came from.
    df_dataset["dataset"] = dataset_name
    samples.append(df_dataset)

# Each file is read with its own 0-based index; ignore_index=True gives the
# combined frame one unique row label per row instead of repeating labels
# across the four sources.
df_heart_disease = pd.concat(samples, ignore_index=True)

In [None]:
# Preview the first and last rows of the combined raw frame.
display(df_heart_disease.head(), df_heart_disease.tail())

In [None]:
# Column dtypes exactly as inferred by read_csv; no casting has been applied yet.
df_heart_disease.dtypes

## Examine raw data relative to Expectations

In [None]:
# Get the GX Data Context. Every later registration in this notebook (Data Docs
# site, Data Source, Expectation Suites, Checkpoints) goes through this object.
context = gx.get_context()

In [None]:
# Create dockerized Data Docs site for demo.
# The site is rendered to a filesystem directory (base_directory below).
data_docs_store_backend = {
    "class_name": "TupleFilesystemStoreBackend",
    "base_directory": "/gx/gx_volume/data_docs",
}

data_docs_site_config = {
    "class_name": "SiteBuilder",
    "show_how_to_buttons": False,
    "store_backend": data_docs_store_backend,
    "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
}

context.add_data_docs_site(
    site_config=data_docs_site_config,
    site_name="GX in the ML pipeline demo",
)

In [None]:
# Wire the raw dataframe into GX: Data Source -> Data Asset -> Batch Definition -> Batch.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="heart disease data")
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")

# The dataframe itself is only supplied here, as a batch parameter.
raw_batch_parameters = {"dataframe": df_heart_disease}
batch = batch_definition.get_batch(batch_parameters=raw_batch_parameters)

# Create an (initially empty) Expectation Suite and register it with the context.
raw_suite = gx.core.expectation_suite.ExpectationSuite(name="expectations")
suite = context.suites.add(raw_suite)

In [None]:
# Expectations derived from the provided data definitions, applied to the raw data.
raw_expectations = [
    gxe.ExpectColumnValuesToBeBetween(column="age", min_value=0, max_value=99),
    gxe.ExpectColumnValuesToBeInSet(column="sex", value_set=[0, 1]),
    gxe.ExpectColumnValuesToBeInSet(column="cp", value_set=[1, 2, 3, 4]),
    gxe.ExpectColumnValuesToBeOfType(column="trestbps", type_="int"),
]

# Add them to the Suite in the order listed.
for raw_expectation in raw_expectations:
    suite.add_expectation(raw_expectation)

# Run Expectations against data.
validation_result = batch.validate(suite)

## Explore Validation Results

In [None]:
# Show the Python type of the object returned by batch.validate().
type(validation_result)

In [None]:
# Top-level keys of the validation result's dictionary description.
validation_result.describe_dict().keys()

In [None]:
# The "success" entry of the description (suite-level outcome).
validation_result.describe_dict()["success"]

In [None]:
# The "statistics" entry of the description.
validation_result.describe_dict()["statistics"]

In [None]:
# Print the "result_url" entry of the description.
print(validation_result.describe_dict()["result_url"])

In [None]:
# First entry of the per-expectation results, shown to illustrate their structure
# (the cell that tabulates results reads the same keys).
validation_result.describe_dict()["expectations"][0]

In [None]:
def _summarize_expectation_result(expectation_result: dict) -> dict:
    """Flatten one entry of describe_dict()["expectations"] into a table row.

    Args:
        expectation_result: Dict with "expectation_type", "success", "kwargs"
            and "result" entries, as produced by describe_dict().

    Returns:
        A flat dict with the expectation name, target column, remaining
        parameters, success flag and the element/unexpected/missing statistics.
        A statistic (or the column) that is absent from the input is reported
        as None instead of raising KeyError, so expectations without a column
        or without per-row statistics can be tabulated as well.
    """
    kwargs = expectation_result.get("kwargs", {})
    result = expectation_result.get("result", {})

    # "batch_id" and "column" are reported separately (or not at all); whatever
    # is left over is the expectation's own parameters.
    params = {
        key: value
        for key, value in kwargs.items()
        if key not in ("batch_id", "column")
    }

    row = {
        "expectation" : expectation_result["expectation_type"],
        "column" : kwargs.get("column"),
        "params" : params,
        "success" : expectation_result["success"],
    }
    for statistic in (
        "element_count",
        "unexpected_count",
        "unexpected_percent",
        "missing_count",
        "missing_percent",
    ):
        row[statistic] = result.get(statistic)
    return row


# Build the description once, then turn every expectation entry into one row.
expectation_results = [
    _summarize_expectation_result(expectation_result)
    for expectation_result in validation_result.describe_dict()["expectations"]
]

df_expectation_results = pd.DataFrame(expectation_results)
df_expectation_results

## Explore Validation Results in Data Docs

In [None]:
# A Validation Definition pairs a Batch Definition with an Expectation Suite.
# Register it with the context, the same way the cleaned-data Validation
# Definition is registered later in this notebook, rather than relying on it
# being picked up implicitly when it is run or attached to a Checkpoint.
validation_definition = context.validation_definitions.add(
    gx.ValidationDefinition(
        name="demo validation definition",
        data=batch_definition,
        suite=suite,
    )
)

# Validate the raw dataframe; it is supplied at run time as a batch parameter.
results = validation_definition.run(batch_parameters={"dataframe": df_heart_disease})

In [None]:
# Show the Data Docs sites the context reports
# (presumably including the demo site added earlier — verify in the output).
context.list_data_docs_sites()

In [None]:
# Checkpoint = the Validation Definition(s) to run plus the actions to perform
# afterwards; here the only action is to update Data Docs.
update_data_docs_action = gx.checkpoint.actions.UpdateDataDocsAction(name="update_data_docs")

demo_checkpoint = gx.Checkpoint(
    name="checkpoint",
    validation_definitions=[validation_definition],
    actions=[update_data_docs_action],
)
checkpoint = context.checkpoints.add(demo_checkpoint)

# Run against the raw dataframe.
results = checkpoint.run(batch_parameters={"dataframe": df_heart_disease})

<mark>**Check out results in Data Docs.**</mark>

## Clean data and establish governing Expectation Suite

In [None]:
# Clean a deep copy so the raw frame stays untouched and this cell can be re-run.
df_cleaned = df_heart_disease.copy(deep=True)

# Suite that collects the expectations governing the cleaned data.
final_suite = gx.core.expectation_suite.ExpectationSuite(
    name="Heart disease data expectations",
)

def clean_question_mark_for_float_columns(x: str) -> Union[float, None]:
    """Convert a raw cell value to float, mapping the "?" marker to None.

    The value is stringified and stripped of surrounding whitespace first, so
    numeric inputs and padded strings are accepted as well.
    """
    text = str(x).strip()
    return None if text == "?" else float(text)

def clean_question_mark_for_int_columns(x: str) -> Union[int, None]:
    """Convert a raw cell value to int, mapping the "?" marker to None.

    The value is stringified and stripped first, then parsed as a float before
    truncation to int, so inputs such as "3.0" are accepted.
    """
    text = str(x).strip()
    if text != "?":
        return int(float(text))
    return None

# Minimum fraction of non-null values required in columns that may contain "?".
MOSTLY = 0.9

# Dataset schema expectations.
# NOTE(review): df_cleaned is copied from df_heart_disease, which also carries
# the "dataset" column added at ingest — confirm that matching against COLUMNS
# alone is what is intended here.
final_suite.add_expectation(gxe.ExpectTableColumnsToMatchSet(column_set=COLUMNS))

# One rule per column: (column, kind, domain expectation or None, add type expectation?).
#   kind "int"   -> column is cast with astype(int).
#   kind "float" -> "?" markers are nullified, values parsed as float, and a
#                   mostly-not-null expectation is added.
column_rules = (
    # Age
    ("age", "int", gxe.ExpectColumnValuesToBeBetween(column="age", min_value=0), True),
    # Sex
    ("sex", "int", gxe.ExpectColumnValuesToBeInSet(column="sex", value_set=[0, 1]), True),
    # Chest pain (cp)
    ("cp", "int", gxe.ExpectColumnValuesToBeInSet(column="cp", value_set=[1, 2, 3, 4]), True),
    # Resting blood pressure (trestbps)
    ("trestbps", "float", None, True),
    # Cholesterol (chol)
    ("chol", "float", None, True),
    # Fasting blood sugar (fbs)
    ("fbs", "float", gxe.ExpectColumnValuesToBeInSet(column="fbs", value_set=[0, 1]), True),
    # Resting electrocardiographic results (restecg)
    ("restecg", "float", gxe.ExpectColumnValuesToBeInSet(column="restecg", value_set=[0, 1, 2]), True),
    # Maximum heart rate achieved (thalach)
    ("thalach", "float", gxe.ExpectColumnValuesToBeBetween(column="thalach", min_value=0), True),
    # Exercise induced angina (exang)
    ("exang", "float", gxe.ExpectColumnValuesToBeInSet(column="exang", value_set=[0, 1]), True),
    # ST depression induced by exercise relative to rest (oldpeak)
    ("oldpeak", "float", None, True),
    # Slope of the peak exercise ST segment (slope)
    ("slope", "float", gxe.ExpectColumnValuesToBeInSet(column="slope", value_set=[1, 2, 3]), True),
    # Major vessels (0-3) colored by fluoroscopy (ca)
    ("ca", "float", gxe.ExpectColumnValuesToBeInSet(column="ca", value_set=[0, 1, 2, 3]), True),
    # Heart defect (thal)
    ("thal", "float", gxe.ExpectColumnValuesToBeInSet(column="thal", value_set=[3, 6, 7]), True),
    # Diagnosis of heart disease (num): cast only, no type expectation.
    ("num", "int", gxe.ExpectColumnValuesToBeInSet(column="num", value_set=[0, 1]), False),
)

for column, kind, domain_expectation, expect_type in column_rules:
    # 1. Clean the column in df_cleaned.
    if kind == "int":
        df_cleaned[column] = df_cleaned[column].astype(int)
    else:
        df_cleaned[column] = df_cleaned[column].apply(clean_question_mark_for_float_columns)

    # 2. Record the expectations, always in the order: type, domain, not-null.
    if expect_type:
        final_suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=column, type_=kind))
    if domain_expectation is not None:
        final_suite.add_expectation(domain_expectation)
    if kind == "float":
        final_suite.add_expectation(gxe.ExpectColumnValuesToNotBeNull(column=column, mostly=MOSTLY))

In [None]:
# Check the cleaned data against the final Expectation Suite.

# Create Data Asset and Batch Definition for the cleaned dataframe
# (the pandas Data Source created for the raw data is reused).
data_asset_cleaned = data_source.add_dataframe_asset(name="Cleaned heart disease data")

batch_definition_cleaned = data_asset_cleaned.add_batch_definition_whole_dataframe("Cleaned data batch definition")

validation_definition_cleaned = gx.ValidationDefinition(
    name="Cleaned data validation definition",
    data=batch_definition_cleaned,
    suite=final_suite,
)

# Register the suite and the Validation Definition with the context.
context.suites.add(final_suite)

context.validation_definitions.add(validation_definition_cleaned)

# Run the *cleaned-data* Validation Definition. Running `validation_definition`
# here would apply the raw-data suite and batch definition instead of final_suite.
results = validation_definition_cleaned.run(batch_parameters={"dataframe": df_cleaned})

# Same validation via a Checkpoint, so that Data Docs are updated as well.
checkpoint_cleaned = context.checkpoints.add(gx.Checkpoint(
    name="cleaned data checkpoint",
    validation_definitions=[validation_definition_cleaned],
    actions=[gx.checkpoint.actions.UpdateDataDocsAction(name="update_data_docs")]
))

results = checkpoint_cleaned.run(batch_parameters={"dataframe": df_cleaned})

In [None]:
# Shape of the subset of cleaned rows whose trestbps is null;
# the first element is the number of such rows.
df_cleaned[df_cleaned["trestbps"].isna()].shape

In [None]:
# Column dtypes after cleaning, for comparison with the raw frame's dtypes.
df_cleaned.dtypes

## UCI ML data

In [None]:
# Fetch the same dataset (id 45) through the ucimlrepo package.
heart_disease = ucimlrepo.fetch_ucirepo(id=45)

# Features and targets as exposed by the fetched dataset object.
X, y = heart_disease.data.features, heart_disease.data.targets

# Size of the feature table, then its first rows.
display(X.shape, X.head())