# Model development and deployment
* In this stage, the prepared data is used for feature engineering and then model development.
* The original definition of data quality for the `age` column is used to verify a consistent distribution between the train and test sets.
* The model is registered and served with a containerized MLflow server.

In [None]:
import great_expectations as gx

import json
import mlflow
import pandas as pd
import pathlib
import requests
import sklearn
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier

import demo_code as demo

Use MLflow autologging for feature engineering and model development.

In [None]:
# Enable MLflow autologging for the feature engineering and modeling steps that follow.
mlflow.autolog()

## Load prepared data

Load the cleaned and curated data from the data preparation phase.

In [None]:
# Open a SQLAlchemy engine using the connection string provided by the demo package.
prepared_data_engine = sqlalchemy.create_engine(demo.data.POSTGRES_CONNECTION_STRING)

# Read every row and column of the heart_disease table into a DataFrame.
df_prepared_data = pd.read_sql_query(
    "select * from heart_disease", con=prepared_data_engine
)

Display a sample of the prepared data.

In [None]:
# Show the first 10 rows of the prepared data.
df_prepared_data.head(n=10)

## Develop features for modeling

Define a transformation function to turn prepared data into features.

In [None]:
def featurize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Transform prepared data into model features for training or inference.

    The numeric columns ``age``, ``trestbps``, ``chol``, ``thalach`` and
    ``oldpeak`` are standardized, ``cp`` is one hot encoded into the columns
    ``cp_0`` .. ``cp_3``, and the result is restricted to
    ``demo.data.FEATURE_COLUMNS``.

    NOTE(review): the scaler and the encoder are fit on ``df`` itself on every
    call, so frames that are featurized separately (for example a train and a
    test set) are each transformed with their own statistics. Confirm that
    this is acceptable for the intended use.

    Args:
        df: Prepared data. It must contain the columns named above plus any
            further columns listed in ``demo.data.FEATURE_COLUMNS``. The
            input frame is not modified.

    Returns:
        A new DataFrame, indexed from zero, holding only the feature columns.

    Raises:
        ValueError: If ``cp`` does not one hot encode into exactly four
            columns.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.preprocessing`` has been loaded.
    import sklearn.preprocessing

    numeric_columns = ["age", "trestbps", "chol", "thalach", "oldpeak"]
    cp_columns = ["cp_0", "cp_1", "cp_2", "cp_3"]

    # Work on a copy with a fresh 0..n-1 index so the one hot encoded frame
    # built below lines up row for row when concatenated.
    df_features = df.copy().reset_index(drop=True)

    # Scale (standardize) numeric features, one column at a time.
    standard_scaler = sklearn.preprocessing.StandardScaler()

    for col in numeric_columns:
        df_features[col] = standard_scaler.fit_transform(df_features[[col]])

    # One hot encode cp. ``toarray`` yields a plain ndarray rather than the
    # legacy ``np.matrix`` returned by ``todense``.
    cp_encoded = (
        sklearn.preprocessing.OneHotEncoder()
        .fit_transform(df_features[["cp"]])
        .toarray()
    )

    # Fail with a clear message instead of an opaque pandas length mismatch
    # when the data does not produce one column per expected cp label.
    if cp_encoded.shape[1] != len(cp_columns):
        raise ValueError(
            f"Expected {len(cp_columns)} distinct cp categories, "
            f"found {cp_encoded.shape[1]}."
        )

    cp_ohe = pd.DataFrame(cp_encoded, columns=cp_columns)

    df_features = pd.concat([df_features, cp_ohe], axis=1)

    # Limit to desired feature columns.
    return df_features[demo.data.FEATURE_COLUMNS]

Preview feature transformation on prepared data.

In [None]:
# Apply the feature transformation to the full prepared dataset and show the first rows.
featurize_data(df_prepared_data).head()

## Split data into train/test sets

Divide the prepared data into train and test sets.

The `num` column is the last column in the dataset and contains the label.

In [None]:
# Split by position: every column except the last is an input, the last column is the label.
X, y = df_prepared_data.iloc[:, :-1], df_prepared_data.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = sklearn.model_selection.train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Renumber the rows of every part from zero, discarding the original index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in split_parts
)

# Report the resulting set sizes.
print(f"Train size:\t{X_train.shape[0]}")
print(f"Test size:\t{X_test.shape[0]}")
print(f"Total size:\t{X_train.shape[0] + X_test.shape[0]}")

## Verify consistent `age` distributions in train and test set

Retrieve the distribution Expectation Suite from GX Cloud that was saved during the data preparation phase.

In [None]:
# Open a GX context in cloud mode.
cloud_context = gx.get_context(mode="cloud")

# Look up the distribution Expectation Suite by its name.
distribution_suite_name = "Heart disease data: distribution"
distribution_suite = cloud_context.suites.get(name=distribution_suite_name)

Create a local context and Data Source to apply data definitions at runtime in notebook exploration.

In [None]:
# Create a local GX context in ephemeral mode for use within this notebook.
local_context = gx.get_context(mode="ephemeral")

# Add a pandas Data Source named "pandas"; the dataframe assets below are attached to it.
data_source = local_context.data_sources.add_pandas("pandas")

Create the training set Data Asset, Batch Definition, and Batch.

In [None]:
# Register a dataframe asset for the training set on the pandas Data Source.
train_set_data_asset = data_source.add_dataframe_asset(name="train set features")

# Define a batch that covers the whole dataframe.
train_set_batch_definition = train_set_data_asset.add_batch_definition_whole_dataframe(
    "train set batch definition"
)

# Supply the training frame at runtime to obtain the batch to validate.
train_set_batch_parameters = {"dataframe": X_train}
train_set_batch = train_set_batch_definition.get_batch(
    batch_parameters=train_set_batch_parameters
)

Create the test set Data Asset, Batch Definition, and Batch.

In [None]:
# Register a dataframe asset for the test set on the pandas Data Source.
test_set_data_asset = data_source.add_dataframe_asset(name="test set features")

# Define a batch that covers the whole dataframe.
test_set_batch_definition = test_set_data_asset.add_batch_definition_whole_dataframe(
    "test set batch definition"
)

# Supply the test frame at runtime to obtain the batch to validate.
test_set_batch_parameters = {"dataframe": X_test}
test_set_batch = test_set_batch_definition.get_batch(
    batch_parameters=test_set_batch_parameters
)

Assert that the train and test sets both match the expected distribution of patient ages.

In [None]:
# Validate the train batch first, then the test batch, against the
# distribution suite; stop at the first one that does not pass.
for batch in (train_set_batch, test_set_batch):
    assert batch.validate(distribution_suite)["success"] is True

## Featurize the data

Transform the prepared data into features for modeling.

In [None]:
# Show the first training rows before they are featurized.
X_train.head()

In [None]:
# Replace both prepared frames with their featurized versions; each set is
# transformed by its own call to featurize_data.
X_train, X_test = featurize_data(X_train), featurize_data(X_test)

In [None]:
# Show the first training rows after featurization.
X_train.head()

## Train the model

Train a simple model with sklearn. The performance of the model does not matter for this demo.

In [None]:
# Shallow forest with a fixed seed so repeated runs give the same model.
forest_hyperparameters = {"max_depth": 4, "random_state": 0}
demo_model = RandomForestClassifier(**forest_hyperparameters)

# Fit on the featurized training set.
demo_model.fit(X_train, y_train)

# Predict class labels for the featurized test set.
prediction = demo_model.predict(X_test)

View the trained model scores.

In [None]:
# Accuracy of the predicted class labels on the test set.
accuracy_score = sklearn.metrics.accuracy_score(y_test, prediction)
print(f"Accuracy:\t\t{round(accuracy_score, 2)}")

# Mean ROC AUC over 3 repeats of stratified 10-fold cross validation on the
# training set.
cross_validation_splitter = sklearn.model_selection.RepeatedStratifiedKFold(
    n_splits=10, n_repeats=3, random_state=1
)
xval_score = sklearn.model_selection.cross_val_score(
    demo_model,
    X_train,
    y_train,
    cv=cross_validation_splitter,
    scoring="roc_auc",
).mean()
print(f"Cross validation score:\t{round(xval_score,2)}")

# ROC AUC measures how well the scores rank the examples, so it needs the
# predicted probability of the positive class rather than hard class labels.
# This also matches what the "roc_auc" scorer uses in the cross validation.
# Assumes a binary label whose positive class is the second entry of
# demo_model.classes_ -- TODO confirm.
positive_class_probability = demo_model.predict_proba(X_test)[:, 1]
roc_auc_score = sklearn.metrics.roc_auc_score(y_test, positive_class_probability)
print(f"ROC_AUC score:\t\t{round(roc_auc_score,2)}")

## Deploy the model

### Log the model to MLflow Tracking server

In [None]:
# Log the model.
model_info = mlflow.sklearn.log_model(
    sk_model=demo_model,
    artifact_path="sklearn-model-test",
    input_example=X_train,
    registered_model_name="demo-model",
)

### Serve the model

Run the generated command in your host terminal to serve the model within the demo MLflow Docker container.

In [None]:
# MLflow CLI call that serves the logged model on port 5555, bound to all interfaces.
model_serve_command = (
    f"mlflow models serve -m {model_info.model_uri} "
    "--env-manager virtualenv -p 5555 -h 0.0.0.0"
)

# Wrap the CLI call so it is executed inside the demo MLflow Docker container.
docker_exec_wrapper = "docker exec gx-in-the-ml-pipeline-mlflow bash -c " + (
    f"'{model_serve_command}'"
)

print("Run the following command to serve the model:\n")
print(docker_exec_wrapper)

## Use deployed model for inference

### Verify that deployed model is available and running

Before scoring new data, ping the model to check that it is running.

In [None]:
# Ping the served model's health endpoint. The timeout keeps the notebook from
# hanging indefinitely if the model server has not been started.
requests.get(url="http://mlflow:5555/ping", timeout=10)

### Run inference on sample data point

Generate a new data point to score with the deployed model.

In [None]:
# One sample record in the featurized column layout, keyed by column name.
sample_row = {
    "age": 1.0073855590677647,
    "sex": 1,
    "cp_0": 0.0,
    "cp_1": 0.0,
    "cp_2": 0.0,
    "cp_3": 1.0,
    "trestbps": 0.412888910628608,
    "chol": 0.5497692967873232,
    "fbs": 0.0,
    "restecg": 1.0,
    "thalach": -0.9858895025925402,
    "exang": 1.0,
    "oldpeak": 1.9450130929477942,
    "slope": 2.0,
    "ca": 0,
    "thal": 0,
}

# Split the record into parallel "columns" and "data" lists; "data" holds a
# single row whose values follow the column order.
sample_input = {
    "columns": list(sample_row),
    "data": [list(sample_row.values())],
}

Use an HTTP request to send the data to the deployed model.

In [None]:
# Serialize the sample under the "dataframe_split" key (parallel columns/data lists).
request_payload = json.dumps({"dataframe_split": sample_input})

# Post the payload to the served model. The timeout prevents the notebook from
# hanging if the server is unreachable.
response = requests.post(
    url="http://mlflow:5555/invocations",
    headers={"Content-Type": "application/json"},
    data=request_payload,
    timeout=30,
)

# Surface HTTP errors here rather than as a confusing failure when the
# response body is parsed later.
response.raise_for_status()

Parse the response to get the model prediction.

In [None]:
# Human-readable meaning of each predicted class label.
prediction_to_description = {
    0: "no heart disease present",
    1: "heart disease present",
}

# The response body carries a list of predictions; one row was sent, so take the first.
response_body = response.json()
prediction = response_body["predictions"][0]

print(f"Model prediction: {prediction_to_description[prediction]}")