# Model development and deployment
* In this stage, the prepared data is used for feature engineering and then model development.
* The model is registered and served with a containerized MLFlow server.

In [1]:
import pandas as pd
import mlflow
import sklearn
import pathlib

import demo_code as demo
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier
import json
import requests

In [2]:
mlflow.autolog()

## Load prepared data

In [3]:
POSTGRES_CONNECTION_STRING = "postgresql://gx_user:gx_user_password@postgres:5432/demo"
engine = sqlalchemy.create_engine(POSTGRES_CONNECTION_STRING)

In [4]:
df_features = pd.read_sql_query("select * from heart_disease", con=engine)

In [5]:
df_features.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,dataset,num
0,63,1,4,140.0,260.0,0.0,1.0,112.0,1.0,3.0,2.0,,,va,1
1,44,1,4,130.0,209.0,0.0,1.0,127.0,0.0,0.0,,,,va,0
2,60,1,4,132.0,218.0,0.0,1.0,140.0,1.0,1.5,3.0,,,va,1
3,55,1,4,142.0,228.0,0.0,1.0,149.0,1.0,2.5,1.0,,,va,1
4,66,1,3,110.0,213.0,1.0,2.0,99.0,1.0,1.3,2.0,,,va,0


## Engineer features

### Transform data to develop features for modeling

In [6]:
def featurize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Transform data to features for training or inference."""

    # Drop dataset annotation.
    df = df.drop(["dataset"], axis=1)

    # Scale (standardize) numeric features.
    standard_scalar = sklearn.preprocessing.StandardScaler()

    for col in ["age", "trestbps", "chol", "thalach", "oldpeak"]:
        df[col] = standard_scalar.fit_transform(df[[col]])

    # One hot encode cp.
    cp_ohe = pd.DataFrame(
        sklearn.preprocessing.OneHotEncoder().fit_transform(df[["cp"]]).todense()
    )
    cp_ohe.columns = ["cp_0", "cp_1", "cp_2", "cp_3"]
    df = pd.concat([df, cp_ohe], axis=1)

    # Limit to desired feature columns.
    df = df[demo.data.FEATURE_COLUMNS]

    return df


df_features = featurize_data(df_features)

### Examine feature data

In [7]:
df_features.head()

Unnamed: 0,age,sex,cp_0,cp_1,cp_2,cp_3,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,1.007386,1,0.0,0.0,0.0,1.0,0.412889,0.549769,0.0,1.0,-0.98589,1.0,1.945013,2.0,,,1
1,-1.009693,1,0.0,0.0,0.0,1.0,-0.111908,0.089142,0.0,1.0,-0.406991,0.0,-0.805791,,,,0
2,0.688899,1,0.0,0.0,0.0,1.0,-0.006949,0.170429,0.0,1.0,0.094721,1.0,0.569611,3.0,,,1
3,0.158089,1,0.0,0.0,0.0,1.0,0.517848,0.260748,0.0,1.0,0.44206,1.0,1.486546,1.0,,,1
4,1.325872,1,0.0,0.0,1.0,0.0,-1.161501,0.12527,1.0,2.0,-1.487601,1.0,0.386224,2.0,,,0


## Split features into train/test sets

In [8]:
# The num column is the last column in the dataset and contains the label.
X = df_features.iloc[:, :-1]
y = df_features.iloc[:, -1]

In [9]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

## Train the model

### Train a simple demo model with sklearn

In [10]:
demo_model = RandomForestClassifier(max_depth=4, random_state=0)

demo_model.fit(X_train, y_train)

prediction = demo_model.predict(X_test)

### View demo model scores

In [11]:
accuracy_score = sklearn.metrics.accuracy_score(y_test, prediction)
print(f"Accuracy:\t\t{round(accuracy_score, 2)}")

xval_score = sklearn.model_selection.cross_val_score(
    demo_model,
    X_train,
    y_train,
    cv=sklearn.model_selection.RepeatedStratifiedKFold(
        n_splits=10, n_repeats=3, random_state=1
    ),
    scoring="roc_auc",
).mean()
print(f"Cross validation score:\t{round(xval_score,2)}")

roc_auc_score = sklearn.metrics.roc_auc_score(y_test, prediction)
print(f"ROC_AUC score:\t\t{round(roc_auc_score,2)}")

Accuracy:		0.85
Cross validation score:	0.89
ROC_AUC score:		0.85


## Deploy the model

### Log the model to MLflow Tracking server

In [12]:
# Log the model.
model_info = mlflow.sklearn.log_model(
    sk_model=demo_model,
    artifact_path="sklearn-model-test",
    input_example=X_train,
    registered_model_name="demo-model",
)

Successfully registered model 'demo-model'.
Created version '1' of model 'demo-model'.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 284.71it/s]  


### Serve the model

In [13]:
model_serve_command = f"mlflow models serve -m {model_info.model_uri} --env-manager virtualenv -p 5555 -h 0.0.0.0"
docker_exec_wrapper = (
    f"docker exec gx-in-the-ml-pipeline-mlflow bash -c '{model_serve_command}'"
)

print("Run the following command to serve the model:\n")
print(docker_exec_wrapper)

Run the following command to serve the model:

docker exec gx-in-the-ml-pipeline-mlflow bash -c 'mlflow models serve -m runs:/5c81de5d08724f01875b569d685f1156/sklearn-model-test --env-manager virtualenv -p 5555 -h 0.0.0.0'


## Use deployed model for inference

### Check that model is up

In [14]:
requests.get(url=f"http://mlflow:5555/ping")

<Response [200]>

### Run inference on sample data point

In [15]:
sample_input = {
    "columns": [
        "age",
        "sex",
        "cp_0",
        "cp_1",
        "cp_2",
        "cp_3",
        "trestbps",
        "chol",
        "fbs",
        "restecg",
        "thalach",
        "exang",
        "oldpeak",
        "slope",
        "ca",
        "thal",
    ],
    "data": [
        [
            1.0073855590677647,
            1,
            0.0,
            0.0,
            0.0,
            1.0,
            0.412888910628608,
            0.5497692967873232,
            0.0,
            1.0,
            -0.9858895025925402,
            1.0,
            1.9450130929477942,
            2.0,
            None,
            None,
        ]
    ],
}

In [16]:
request_payload = json.dumps(
    {
        "dataframe_split": sample_input,
    }
)

response = requests.post(
    url=f"http://mlflow:5555/invocations",
    headers={"Content-Type": "application/json"},
    data=request_payload,
)

print(response.json())

{'predictions': [1]}
