In [2]:
import kfp
from kfp.components import InputPath, OutputPath
from kfp import dsl
from typing import List, Tuple

In [39]:
# Container image passed as `base_image` when the functions below are
# wrapped into pipeline components with `create_component_from_func`.
BASE_IMAGE = "quay.io/ntlawrence/explain-demo:0.0.0"

In [None]:
def load_df_from_db2(table_name: str,
                     data_frame_pkl: OutputPath):
    """Read a Db2 table into a pandas DataFrame and pickle it.

    Connection settings are read from the environment variables ``db2_host``,
    ``db2_port``, ``db2_user`` and ``db2_pwd``. The column layout is read from
    the JSON document stored in the ``COLUMNS`` environment variable, which
    must provide:

    * ``"columns"``: the column names to select, and
    * ``"label_columns"``: a mapping of column name -> list of category levels.

    Args:
        table_name: Name of the table to read. Double quotes are removed from
            the name before it is placed into the SQL statement.
        data_frame_pkl: Path the pickled DataFrame is written to.

    Raises:
        KeyError: If one of the required environment variables is not set.
    """
    # Imports live inside the function because it is packaged as a
    # self-contained component via `create_component_from_func`.
    import warnings
    import ibm_db
    import ibm_db_dbi
    import os
    import json
    import pandas as pd
    from typing import Dict, Any

    def assign_categories_to_df(df: pd.DataFrame, column_info: Dict[str, Any]) -> None:
        """Cast every known label column of ``df`` (in place) to a categorical dtype."""
        for col_name, levels in column_info["label_columns"].items():
            if col_name in df.columns:
                ctype = pd.CategoricalDtype(categories=levels, ordered=False)
                df[col_name] = df[col_name].astype(ctype)

    def df_from_sql(
        name: str,
        conn: ibm_db.IBM_DBConnection,
        column_info: Dict[str, Any],
    ) -> pd.DataFrame:
        """Select the configured columns of table ``name`` into a DataFrame."""
        # Identifiers are double-quoted in the statement, so strip any quote
        # characters from the table name to keep the statement well formed.
        sql_safe_name = name.replace('"', "")

        rStmtColsSql = ",".join([f'"{col}"' for col in column_info["columns"]])
        rSql = f'SELECT {rStmtColsSql} FROM "{sql_safe_name}"'

        read_conn = ibm_db_dbi.Connection(conn)
        with warnings.catch_warnings():
            # pandas warns when handed a DB-API connection that is not SQLAlchemy.
            warnings.filterwarnings("ignore", message="pandas only support SQLAlchemy")
            df = pd.read_sql(rSql, read_conn)

        assign_categories_to_df(df, column_info)
        return df

    conn_str = (
        "DRIVER={IBM DB2 ODBC DRIVER};"
        f"DATABASE=BLUDB;HOSTNAME={os.environ['db2_host']};PORT={os.environ['db2_port']};PROTOCOL=TCPIP;UID={os.environ['db2_user']};Pwd={os.environ['db2_pwd']};SECURITY=SSL;"
    )
    # Parse the configuration before connecting so a bad/missing COLUMNS value
    # does not leave an open connection behind.
    column_info = json.loads(os.environ["COLUMNS"])

    conn = ibm_db.connect(conn_str, "", "")
    try:
        df = df_from_sql(table_name, conn, column_info)
    finally:
        # Always release the database connection, even if the query fails.
        ibm_db.close(conn)

    df.to_pickle(data_frame_pkl)


load_df_from_db2_comp = kfp.components.create_component_from_func(
    func=load_df_from_db2, base_image=BASE_IMAGE
)

In [44]:
def fit_preprocessor(
    training_df: InputPath,
    preprocessor_pkl: OutputPath,
    features: List[str],
):
    """Fit the feature preprocessing pipeline and save it with joblib.

    The column layout is read from the JSON document in the ``COLUMNS``
    environment variable, which must provide ``"label_columns"`` (mapping of
    column name -> list of category levels) and ``"int_columns"`` (list of
    column names). Label columns are one-hot encoded using the configured
    levels, integer columns are passed through unchanged, and every column
    that is not listed in ``features`` is dropped.

    Args:
        training_df: Path of the pickled training DataFrame.
        preprocessor_pkl: Path the fitted pipeline is written to.
        features: Names of the columns the model is allowed to use.
    """
    import pandas as pd
    import json
    import joblib
    import os

    # ColumnTransformer is defined in sklearn.compose, not sklearn.preprocessing.
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline

    feature_set = set(features)
    column_info = json.loads(os.environ["COLUMNS"])

    ohe_labels = [
        (
            "ohe_" + label,
            OneHotEncoder(
                handle_unknown="ignore",
                sparse_output=False,
                # `categories` takes one list of levels per encoded column.
                categories=[levels],
            ),
            # A list selector hands the encoder a 2-D frame; a bare string
            # would hand it a 1-D column, which OneHotEncoder rejects.
            [label],
        )
        for label, levels in column_info["label_columns"].items()
        if label in feature_set
    ]

    int_cols = [
        (
            "passthrough",
            "passthrough",
            [col for col in column_info["int_columns"] if col in feature_set],
        )
    ]

    pipe = Pipeline(
        steps=[
            ("Preprocess", ColumnTransformer(ohe_labels + int_cols, remainder="drop")),
        ]
    )

    train = pd.read_pickle(training_df)
    pipe.fit(train)
    joblib.dump(pipe, preprocessor_pkl)


fit_preprocessor_comp = kfp.components.create_component_from_func(
    func=fit_preprocessor, base_image=BASE_IMAGE
)

In [42]:
def train(
    training_df: InputPath,
    preprocessor: InputPath,
    model: OutputPath,
    target_processing_config: OutputPath,
    target_col: str = "Risk"
):
    """Train the binary classifier and choose its decision threshold.

    The training frame is transformed with the fitted preprocessor, a small
    dense network is trained on it, and the probability threshold with the
    highest F1 score on the training data is stored next to the model.

    Args:
        training_df: Path of the pickled training DataFrame.
        preprocessor: Path of the joblib-saved, fitted preprocessing pipeline.
        model: Path the trained Keras model is saved to.
        target_processing_config: Path of the JSON file that receives the
            chosen ``"threshold"`` and the ``"target_names"`` mapping.
        target_col: Name of the column holding the label. Rows whose value
            equals the positive target name ("Risk") are encoded as 1, all
            other rows as 0.
    """
    import pandas as pd
    import json
    import joblib
    import tensorflow as tf
    from keras import Sequential
    from keras.layers import Dense, Dropout, BatchNormalization, Input
    from keras.callbacks import EarlyStopping, ReduceLROnPlateau
    from sklearn.metrics import precision_recall_curve
    import numpy as np
    # Needed for the annotations of the nested helper below.
    from typing import List, Tuple

    # NOTE: json.dump writes the integer keys of "target_names" as the strings
    # "0" and "1"; readers of the file have to convert them back.
    target_processing_config_dict = {
        "threshold" : 0.5,
        "target_names" : {0: "No Risk", 1: "Risk"}
    }
    positive_label = target_processing_config_dict["target_names"][1]

    def get_tf_model(num_features: int) -> Tuple[tf.keras.Model, List[tf.keras.callbacks.Callback]]:
        """Build the compiled network and the callbacks used while fitting it."""

        tf_model = Sequential(
            [
                Input(shape=(num_features,)),
                BatchNormalization(),
                Dense(30, activation="relu", name="layer1"),
                Dropout(0.3, name="dropout1"),
                Dense(30, activation="relu", name="layer2"),
                Dropout(0.3, name="dropout2"),
                Dense(30, activation="relu", name="layer3"),
                Dropout(0.3, name="dropout3"),
                Dense(
                    1,
                    activation="sigmoid",
                    name="output",
                ),
            ]
        )

        tf_model.compile(optimizer="adam", loss="binary_crossentropy")

        callbacks = [
            EarlyStopping(
                monitor="val_loss",
                patience=50,
                verbose=0,
                mode="min",
                restore_best_weights=True,
            ),
            ReduceLROnPlateau(
                monitor="val_loss",
                factor=0.1,
                patience=7,
                verbose=1,
                min_delta=0.0001,
                mode="min",
            ),
        ]

        return tf_model, callbacks

    # Distinct local names: do not shadow the function name or the path argument.
    train_df = pd.read_pickle(training_df)
    fitted_preprocessor = joblib.load(preprocessor)

    X = tf.convert_to_tensor(fitted_preprocessor.transform(train_df))
    y = tf.convert_to_tensor(
        train_df.loc[:, target_col].apply(lambda v: 1 if v == positive_label else 0)
    )

    tf_model, callbacks = get_tf_model(num_features=X.shape[1])
    tf_model.fit(
        X,
        y,
        validation_split=0.2,
        epochs=500,
        callbacks=callbacks,
        class_weight={0: 1, 1: 2},
    )

    # Calculate the threshold with the highest F1 score.
    # `model` is only the output *path*; predictions come from the fitted network.
    predictions = tf_model.predict(X)
    # Positional arguments: the name of the score keyword differs between
    # scikit-learn releases.
    precision, recall, thresholds = precision_recall_curve(
        y.numpy(), predictions.flatten()
    )
    # The last precision/recall pair has no matching threshold, so drop it to
    # keep the three arrays aligned.
    precision, recall = precision[:-1], recall[:-1]
    pr_sum = precision + recall
    # Define F1 as 0 where precision + recall is 0 instead of producing NaN,
    # which would otherwise win the argmax.
    f1s = np.divide(
        2 * precision * recall,
        pr_sum,
        out=np.zeros_like(pr_sum, dtype=float),
        where=pr_sum > 0,
    )
    # Convert to a builtin float: NumPy float32 scalars are not JSON serializable.
    target_processing_config_dict["threshold"] = float(thresholds[np.argmax(f1s)])

    # Save model and threshold config
    tf_model.save(model)
    with open(target_processing_config, "w") as f:
        json.dump(target_processing_config_dict, f)


train_comp = kfp.components.create_component_from_func(
    func=train, base_image=BASE_IMAGE
)

In [3]:
def evaluate(
    df: InputPath,
    preprocessor: InputPath,
    model: InputPath,
    target_processing_config: InputPath,
    output_report: OutputPath(str),
    mlpipeline_ui_metadata_path: OutputPath(),
    target="Risk",
):
    """Score a data set with the trained model and build a classification report.

    Adds a ``Prediction`` and an ``Actual`` column (both 0/1) to the loaded
    frame, runs evidently's ``ClassificationPreset`` on it, saves the report
    as HTML and writes UI metadata that embeds the same HTML inline.

    Args:
        df: Path of the pickled DataFrame to evaluate.
        preprocessor: Path of the joblib-saved, fitted preprocessing pipeline.
        model: Path of the saved Keras model.
        target_processing_config: Path of the JSON file holding ``"threshold"``
            and ``"target_names"``.
        output_report: Path the HTML report is written to.
        mlpipeline_ui_metadata_path: Path the UI metadata JSON is written to.
        target: Name of the column holding the true label.
    """
    import pandas as pd
    import joblib
    import tensorflow as tf
    import json
    from evidently.metric_preset import ClassificationPreset
    from evidently.report import Report
    from evidently import ColumnMapping
    import os
    from pathlib import Path

    dataset = pd.read_pickle(df)
    fitted_preprocessor = joblib.load(preprocessor)
    tf_model = tf.keras.models.load_model(model)
    with open(target_processing_config, "r") as f:
        target_processing_config_dict = json.load(f)

    # JSON object keys are always strings, so restore the integer class ids.
    target_names = {
        int(class_id): class_name
        for class_id, class_name in target_processing_config_dict["target_names"].items()
    }
    threshold = target_processing_config_dict["threshold"]

    column_info = json.loads(os.environ["COLUMNS"])

    X = fitted_preprocessor.transform(dataset)
    # The single-unit output layer yields an (n, 1) array; flatten to one
    # probability per row.
    y_prob = tf_model.predict(X).flatten()

    # Plain arrays are assigned positionally, independent of the frame's index.
    dataset["Prediction"] = (y_prob > threshold).astype(int)
    # Labels come from the evaluated frame itself.
    dataset["Actual"] = (dataset[target] == target_names[1]).astype(int)

    input_features = set(fitted_preprocessor.feature_names_in_)

    column_mapping = ColumnMapping()
    column_mapping.target_names = target_names
    column_mapping.target = "Actual"
    column_mapping.prediction = "Prediction"
    column_mapping.task = "classification"
    column_mapping.numerical_features = [
        c for c in column_info["int_columns"] if c in input_features
    ]
    column_mapping.categorical_features = [
        c for c in column_info["label_columns"] if c in input_features
    ]

    report = Report(
        metrics=[
            ClassificationPreset(),
        ]
    )

    report.run(
        reference_data=None,
        current_data=dataset,
        column_mapping=column_mapping,
    )

    Path(output_report).parent.mkdir(parents=True, exist_ok=True)
    report.save_html(output_report)
    # Read the report back so it can be embedded inline in the UI metadata.
    with open(output_report, "r") as report_file:
        html_content = report_file.read()

    metadata = {
        "outputs": [
            {
                "type": "web-app",
                "storage": "inline",
                "source": html_content,
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, "w") as f:
        json.dump(metadata, f)

In [None]:
@dsl.pipeline(
    name="Credit Risk",
    description="An example pipeline that builds and deploys a credit risk model",
)
def credit_model_pipeline():
    """Wire the components together: load -> fit preprocessor -> train -> evaluate.

    Tasks must be created from the component factories (``*_comp``); calling
    the plain Python functions would execute them locally instead of adding
    steps to the pipeline.
    """
    # NOTE(review): the components read `db2_*` and `COLUMNS` from the
    # environment; nothing here sets them, so they are presumably injected
    # into the task containers elsewhere - confirm.

    # `evaluate` has no module-level component factory, so build one here.
    evaluate_comp = kfp.components.create_component_from_func(
        func=evaluate, base_image=BASE_IMAGE
    )

    load_training_data_task = load_df_from_db2_comp("TRAIN")
    load_test_data_task = load_df_from_db2_comp("TEST")

    fit_preprocessor_task = fit_preprocessor_comp(
        load_training_data_task.outputs["data_frame_pkl"],
        features=[
            "CheckingStatus",
            "LoanDuration",
            "CreditHistory",
            "LoanPurpose",
            "LoanAmount",
            "ExistingSavings",
            "EmploymentDuration",
            "InstallmentPercent",
            "Sex",
            "OthersOnLoan",
            "CurrentResidenceDuration",
            "OwnsProperty",
            "Age",
            "InstallmentPlans",
            "Housing",
            "ExistingCreditsCount",
            "Job",
            "Dependents",
            "Telephone",
            "ForeignWorker",
        ],
    )

    train_model_task = train_comp(
        training_df=load_training_data_task.outputs["data_frame_pkl"],
        preprocessor=fit_preprocessor_task.outputs["preprocessor_pkl"],
    )

    evaluate_model_task = evaluate_comp(
        load_test_data_task.outputs["data_frame_pkl"],
        preprocessor=fit_preprocessor_task.outputs["preprocessor_pkl"],
        model=train_model_task.outputs["model"],
        target_processing_config=train_model_task.outputs["target_processing_config"]
    )

In [4]:
def delete_pipeline(pipeline_name: str):
    """Delete every listed pipeline whose name equals ``pipeline_name``.

    Only the pipelines returned by a single ``list_pipelines(page_size=999)``
    call are considered. Nothing happens when no pipeline matches.
    """
    client = kfp.Client()

    # The listing's `pipelines` attribute may be empty/None; treat that as "no pipelines".
    listed = client.list_pipelines(page_size=999).pipelines or []

    # Collect all matching ids first, then delete them one by one.
    doomed_ids = [entry.id for entry in listed if entry.name == pipeline_name]
    for pipeline_id in doomed_ids:
        client.delete_pipeline(pipeline_id)


In [5]:
def get_experiment_id(experiment_name: str) -> str:
    """Return the id of the experiment named ``experiment_name``.

    The first match among the experiments returned by
    ``list_experiments(page_size=999)`` wins; when there is no match a new
    experiment with that name is created and its id is returned.
    """
    client = kfp.Client()

    # The listing's `experiments` attribute may be empty/None; treat that as "none".
    known = client.list_experiments(page_size=999).experiments or []

    for experiment in known:
        if experiment.name == experiment_name:
            return experiment.id

    # Nothing matched: create the experiment on demand.
    return client.create_experiment(experiment_name).id


In [None]:
PIPELINE_NAME = "Build_Credit_Risk_Model"
package_path = f"{PIPELINE_NAME}.yaml"

# Uploading fails if the name is already taken, so remove any pipeline
# that currently uses it before uploading a fresh copy.
delete_pipeline(PIPELINE_NAME)

# Compile the pipeline function into a package file ...
kfp.compiler.Compiler().compile(
    pipeline_func=credit_model_pipeline, package_path=package_path
)

# ... and upload that package under the pipeline name.
client = kfp.Client()
uploaded_pipeline = client.upload_pipeline(package_path, PIPELINE_NAME)


In [None]:
# Start a run of the uploaded pipeline. The experiment and job are named after
# this credit-risk pipeline (the previous "monkey-classification" names were
# left over from a different example).
run = client.run_pipeline(
    experiment_id=get_experiment_id("credit-risk-exp"),
    job_name="credit-risk-pipeline",
    pipeline_id=uploaded_pipeline.id,
)

In [None]:
# Wait for the run to complete, using TWENTY_MIN as the timeout, then display
# a short summary as the cell's output.
TWENTY_MIN = 20 * 60
result = client.wait_for_run_completion(run.id, timeout=TWENTY_MIN)
run_detail = result.run
dict(
    status=run_detail.status,
    error=run_detail.error,
    # Elapsed time between creation and completion of the run.
    time=str(run_detail.finished_at - run_detail.created_at),
    metrics=run_detail.metrics,
)
