In [1]:
import kfp
from kfp.components import InputPath, OutputPath
from kfp import dsl
from typing import List, Tuple
from kfp.dsl import ContainerOp
from kubernetes.client.models import V1EnvVar,V1EnvVarSource, V1SecretKeySelector,V1ConfigMapKeySelector
from typing import NamedTuple

In [2]:
# Container image, pinned by sha256 digest, that is passed as `base_image`
# to every component created in this notebook.
BASE_IMAGE = "quay.io/ntlawrence/demo-workflow@sha256:e0b071e361a147d1cc957b96a19ae6144d792ff994ac8daf0ba887a5bd3652f5"

In [3]:
def load_df_from_db2(table_name: str,
                     data_frame_pkl: OutputPath(str),
                     target_column: str = 'Risk',
                     predictions_column: str = ''):
    """Read a DB2 table into a pandas DataFrame and pickle it for later steps.

    Connection settings are read from the ``db2_host``, ``db2_port``,
    ``db2_user`` and ``db2_pwd`` environment variables. The column layout is
    read from the JSON document in the ``COLUMNS`` environment variable, which
    must provide a ``columns`` list and a ``label_columns`` mapping of column
    name to its category levels.

    Args:
        table_name: Name of the table to select from.
        data_frame_pkl: Path the pickled DataFrame is written to.
        target_column: Column whose dtype is applied to the predictions column.
        predictions_column: Optional extra column to select in addition to the
            configured columns; an empty string selects no extra column.
    """
    import warnings
    import ibm_db
    import ibm_db_dbi
    import os
    import json
    import pandas as pd
    from typing import Dict, Any

    def assign_categories_to_df(df: pd.DataFrame, column_info: Dict[str, Any]) -> None:
        """Cast each label column that is present in ``df`` to its declared categories."""
        for col_name, levels in column_info["label_columns"].items():
            if col_name in df.columns:
                ctype = pd.CategoricalDtype(categories=levels, ordered=False)
                df[col_name] = df[col_name].astype(ctype)

    def df_from_sql(
        name: str,
        conn: ibm_db.IBM_DBConnection,
        column_info: Dict[str, Any],
        target_col: str = 'Risk',
        predictions_col: str = ''
    ) -> pd.DataFrame:
        """Select the configured columns from table ``name`` and type them."""
        # Double quotes are stripped so the name cannot break out of the
        # quoted identifier in the statement below.
        sql_safe_name = name.replace('"', "")

        column_list = column_info["columns"] + ([] if not predictions_col else [predictions_col])
        rStmtColsSql = ",".join([f'"{col}"' for col in column_list])
        rSql = f'SELECT {rStmtColsSql} FROM "{sql_safe_name}"'

        read_conn = ibm_db_dbi.Connection(conn)
        with warnings.catch_warnings():
            # pandas warns about non-SQLAlchemy DBAPI connections; silence only that message.
            warnings.filterwarnings("ignore", message="pandas only support SQLAlchemy")
            df = pd.read_sql(rSql, read_conn)

        assign_categories_to_df(df, column_info)
        if predictions_col:
            # Give predictions the same dtype as the target column.
            df[predictions_col] = df[predictions_col].astype(df[target_col].dtype)

        return df

    conn_str = (
    "DRIVER={IBM DB2 ODBC DRIVER};"
    f"DATABASE=BLUDB;HOSTNAME={os.environ['db2_host']};PORT={os.environ['db2_port']};PROTOCOL=TCPIP;UID={os.environ['db2_user']};Pwd={os.environ['db2_pwd']};SECURITY=SSL;"
    )

    # Parse the column configuration before connecting so a malformed
    # configuration fails without opening a database connection.
    column_info = json.loads(os.environ["COLUMNS"])

    conn = ibm_db.connect(conn_str, "", "")
    try:
        df = df_from_sql(table_name, conn, column_info, target_column, predictions_column)
    finally:
        # Always release the connection, including when the query fails.
        ibm_db.close(conn)

    df.to_pickle(data_frame_pkl)


load_df_from_db2_comp = kfp.components.create_component_from_func(
    func=load_df_from_db2, base_image=BASE_IMAGE
)

In [4]:
def evidently_report(reference_df: InputPath(str),
                      production_df: InputPath(str),
                      mlpipeline_ui_metadata_path: OutputPath(str),
                      output_report: OutputPath(str),
                      output_json: OutputPath(str),
                      target: str = 'Risk',
                      report_type: str = 'drift'
                     ):
    """Build an Evidently drift report comparing reference and production data.

    The column roles are read from the JSON document in the ``COLUMNS``
    environment variable: ``int_columns`` become numerical features and
    ``label_columns`` become categorical features, excluding ``target``.

    Args:
        reference_df: Path to the pickled reference DataFrame.
        production_df: Path to the pickled production DataFrame.
        mlpipeline_ui_metadata_path: Path for the UI metadata JSON, which
            embeds the HTML report inline as a ``web-app`` output.
        output_report: Path the HTML report is saved to.
        output_json: Path the JSON form of the report is written to.
        target: Name of the target column.
        report_type: ``"drift"`` for a data drift report or ``"target_drift"``
            for a target drift report (case-insensitive).

    Raises:
        NotImplementedError: If ``report_type`` is not a supported value.
    """
    from evidently.metric_preset import (
        DataDriftPreset,
        TargetDriftPreset,
    )
    from evidently.report import Report
    from evidently import ColumnMapping
    import pandas as pd
    import os
    from pathlib import Path
    import json

    # Validate the requested report before doing any I/O.
    presets = {
        "drift": DataDriftPreset,
        "target_drift": TargetDriftPreset,
    }
    report_kind = report_type.lower()
    if report_kind not in presets:
        raise NotImplementedError(
            f"Unsupported report_type {report_type!r}; expected one of {sorted(presets)}"
        )

    # NOTE: unpickling executes arbitrary code; these paths must only ever
    # point at artifacts produced by an earlier step of this pipeline.
    reference_dataset = pd.read_pickle(reference_df)
    production_dataset = pd.read_pickle(production_df)
    column_info = json.loads(os.environ["COLUMNS"])

    column_mapping = ColumnMapping()
    column_mapping.target = target
    column_mapping.task = "classification"
    column_mapping.numerical_features = [
        c
        for c in column_info["int_columns"]
        if c != target
    ]
    column_mapping.categorical_features = [
        c
        for c in column_info["label_columns"]
        if c != target
    ]

    report = Report(metrics=[presets[report_kind]()])
    report.run(
        reference_data=reference_dataset,
        current_data=production_dataset,
        column_mapping=column_mapping,
    )

    report_path = Path(output_report)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report.save_html(output_report)

    # Read the report back with an explicit encoding; read_text also closes
    # the file handle, which a bare open(...).read() does not guarantee.
    html_content = report_path.read_text(encoding="utf-8")
    metadata = {
        "outputs": [
            {
                "type": "web-app",
                "storage": "inline",
                "source": html_content,
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, "w") as f:
        json.dump(metadata, f)

    with open(output_json, "w") as json_f:
        json_f.write(report.json())

evidently_report_comp = kfp.components.create_component_from_func(
    func=evidently_report, base_image=BASE_IMAGE
)

In [5]:
def evidently_classification_report(production_df: InputPath(str),
                      mlpipeline_ui_metadata_path: OutputPath(str),
                      output_report: OutputPath(str),
                      output_json: OutputPath(str),
                      target: str = 'Risk',
                      predictions: str = 'PredictedRisk',
                      pos_class: str = 'Risk'
                     ):
    """Build an Evidently classification quality report for production data.

    Rows with a missing target or prediction are dropped before the report is
    run. The column roles are read from the JSON document in the ``COLUMNS``
    environment variable: ``int_columns`` become numerical features and
    ``label_columns`` become categorical features, excluding ``target``.

    Args:
        production_df: Path to the pickled production DataFrame.
        mlpipeline_ui_metadata_path: Path for the UI metadata JSON, which
            embeds the HTML report inline as a ``web-app`` output.
        output_report: Path the HTML report is saved to.
        output_json: Path the JSON form of the report is written to.
        target: Name of the column holding the actual label.
        predictions: Name of the column holding the predicted label.
        pos_class: Label value treated as the positive class.
    """
    from evidently.metric_preset import (
        ClassificationPreset,
    )
    from evidently.report import Report
    from evidently import ColumnMapping
    import pandas as pd
    import os
    from pathlib import Path
    import json

    # NOTE: unpickling executes arbitrary code; this path must only ever
    # point at an artifact produced by an earlier step of this pipeline.
    production_dataset = pd.read_pickle(production_df)
    production_dataset = production_dataset.dropna(subset=[target, predictions])
    column_info = json.loads(os.environ["COLUMNS"])

    # Honour the caller-supplied column names and positive label instead of
    # hard-coding the defaults, so non-default arguments take effect.
    column_mapping = ColumnMapping()
    column_mapping.target = target
    column_mapping.prediction = predictions
    column_mapping.pos_label = pos_class
    column_mapping.task = "classification"
    column_mapping.numerical_features = [
        c
        for c in column_info["int_columns"]
        if c != target
    ]
    column_mapping.categorical_features = [
        c
        for c in column_info["label_columns"]
        if c != target
    ]

    report = Report(
        metrics=[
            ClassificationPreset()
        ]
    )

    # No reference data: the report describes the production data on its own.
    report.run(
        reference_data=None,
        current_data=production_dataset,
        column_mapping=column_mapping,
    )

    report_path = Path(output_report)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report.save_html(output_report)

    # Read the report back with an explicit encoding; read_text also closes
    # the file handle, which a bare open(...).read() does not guarantee.
    html_content = report_path.read_text(encoding="utf-8")
    metadata = {
        "outputs": [
            {
                "type": "web-app",
                "storage": "inline",
                "source": html_content,
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, "w") as f:
        json.dump(metadata, f)

    with open(output_json, "w") as json_f:
        json_f.write(report.json())

evidently_classification_report_comp = kfp.components.create_component_from_func(
    func=evidently_classification_report, base_image=BASE_IMAGE
)

In [6]:
from typing import NamedTuple

def check_metrics(classification_report: InputPath(str),
                  data_drift_report: InputPath(str),
                  target_drift_report: InputPath(str)) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Publish the f1 score from the classification report as a pipeline metric.

    Args:
        classification_report: Path to the JSON classification report; its
            ``metrics`` list must contain a ``ClassificationQualityMetric``
            entry.
        data_drift_report: Path to the JSON data drift report. Accepted but
            not read by this function.
        target_drift_report: Path to the JSON target drift report. Accepted
            but not read by this function.

    Returns:
        A named tuple whose ``mlpipeline_metrics`` field is a JSON string
        holding the ``f1`` value of the current data in ``RAW`` format.

    Raises:
        ValueError: If the report has no ``ClassificationQualityMetric`` entry.
    """
    import json
    from collections import namedtuple

    with open(classification_report) as class_f:
        classification = json.load(class_f)

    # Supply a default so a missing metric produces a clear error rather than
    # a bare StopIteration escaping from next().
    quality_metric = next(
        (m for m in classification["metrics"] if m["metric"] == "ClassificationQualityMetric"),
        None,
    )
    if quality_metric is None:
        raise ValueError(
            "Classification report does not contain a ClassificationQualityMetric entry"
        )

    metrics = {
        "metrics": [
            {"name": "f1", "numberValue": quality_metric["result"]["current"]["f1"], "format": "RAW"}
        ]
    }

    out_tuple = namedtuple("EvaluationOutput", ["mlpipeline_metrics"])
    return out_tuple(json.dumps(metrics))

check_metrics_comp = kfp.components.create_component_from_func(
    func=check_metrics, base_image=BASE_IMAGE
)

In [7]:
from kubernetes.client import ( V1PersistentVolumeClaimVolumeSource, V1Volume, V1VolumeMount)
@dsl.pipeline(
    name="Monitor Credit Risk AI",
    description="An example pipeline that monitors the behavior of the AI model within the application",
)
def monitor_credit_model_pipeline():
    # Pipeline layout: load the reference (TRAIN) and production (CLIENT_DATA)
    # tables, produce data drift, target drift and classification reports,
    # then hand the three JSON reports to the metrics check.

    def env_var_from_secret(env_var_name: str, secret_name: str, secret_key: str) -> V1EnvVar:
        """Build an env var whose value is taken from one key of a secret."""
        key_selector = V1SecretKeySelector(name=secret_name, key=secret_key)
        return V1EnvVar(
            name=env_var_name,
            value_from=V1EnvVarSource(secret_key_ref=key_selector),
        )

    def add_db2_connection_secrets(pipeline_task) -> None:
        """Expose the db2-credentials secret to the task as db2_* env vars."""
        env_name_to_secret_key = (
            ("db2_host", "host"),
            ("db2_user", "username"),
            ("db2_pwd", "password"),
            ("db2_port", "port"),
        )
        for env_name, secret_key in env_name_to_secret_key:
            pipeline_task.container.add_env_variable(
                env_var_from_secret(env_name, "db2-credentials", secret_key)
            )

    # --- data loading -----------------------------------------------------
    reference_loader = load_df_from_db2_comp(table_name="TRAIN")
    reference_loader.set_display_name("Load_Reference_Data_From_DB2")
    add_db2_connection_secrets(reference_loader)

    production_loader = load_df_from_db2_comp(
        table_name="CLIENT_DATA",
        predictions_column='PredictedRisk',
    )
    production_loader.set_display_name("Load_Production_Data_From_DB2")
    add_db2_connection_secrets(production_loader)

    reference_pkl = reference_loader.outputs["data_frame_pkl"]
    production_pkl = production_loader.outputs["data_frame_pkl"]

    # --- reports ----------------------------------------------------------
    data_drift = evidently_report_comp(
        reference_df=reference_pkl,
        production_df=production_pkl,
        report_type="drift",
    )
    data_drift.set_display_name("Produce Data Drift Report")

    target_drift = evidently_report_comp(
        reference_df=reference_pkl,
        production_df=production_pkl,
        report_type="target_drift",
    )
    target_drift.set_display_name("Produce Target Drift Report")

    classification = evidently_classification_report_comp(
        production_df=production_pkl,
    )
    classification.set_display_name("Produce classification Report")

    # --- metrics ----------------------------------------------------------
    check_metrics_comp(
        classification_report=classification.outputs["output_json"],
        data_drift_report=data_drift.outputs["output_json"],
        target_drift_report=target_drift.outputs["output_json"],
    )

In [8]:
def delete_pipeline(pipeline_name: str):
    """Delete every uploaded pipeline whose name equals ``pipeline_name``.

    A single listing request with a page size of 999 is made; nothing is
    deleted when the listing is empty or no name matches.
    """
    kfp_client = kfp.Client()
    listed = kfp_client.list_pipelines(page_size=999).pipelines

    # Collect the matching ids first, then delete them in listing order.
    doomed_ids = []
    for pipeline in listed or []:
        if pipeline.name == pipeline_name:
            doomed_ids.append(pipeline.id)

    for pipeline_id in doomed_ids:
        kfp_client.delete_pipeline(pipeline_id)

In [9]:
def get_experiment_id(experiment_name: str) -> str:
    """Look up an experiment id by name, creating the experiment when absent.

    A single listing request with a page size of 999 is made; the id of the
    first experiment with a matching name is returned.
    """
    kfp_client = kfp.Client()
    listed = kfp_client.list_experiments(page_size=999).experiments

    for experiment in listed or []:
        if experiment.name == experiment_name:
            return experiment.id

    # No experiment with that name yet: create one and report its id.
    return kfp_client.create_experiment(experiment_name).id

In [10]:
def provide_column_info_transformer(op: dsl.ContainerOp):
    """Add the COLUMNS env var, sourced from a config map, to a container op.

    The value comes from the ``columns`` key of the ``credit-risk-columns``
    config map. Ops that are not ``dsl.ContainerOp`` instances are left
    untouched.
    """
    if not isinstance(op, dsl.ContainerOp):
        return

    columns_key = V1ConfigMapKeySelector(name="credit-risk-columns", key="columns")
    columns_env = V1EnvVar(
        name="COLUMNS",
        value_from=V1EnvVarSource(config_map_key_ref=columns_key),
    )
    op.container.add_env_variable(columns_env)


# Pipeline-wide configuration: the transformer above is registered here and
# the configuration is handed to the compiler when the pipeline is compiled.
pipeline_conf = kfp.dsl.PipelineConf()
pipeline_conf.add_op_transformer(provide_column_info_transformer)

In [11]:
PIPELINE_NAME = "Monitor_Credit_Risk_AI"
PIPELINE_PACKAGE_PATH = f"{PIPELINE_NAME}.yaml"

# Compile before touching the server: if compilation fails, the previously
# uploaded pipeline is still in place instead of having been deleted.
kfp.compiler.Compiler().compile(
    pipeline_func=monitor_credit_model_pipeline,
    package_path=PIPELINE_PACKAGE_PATH,
    pipeline_conf=pipeline_conf,
)

# Pipeline names need to be unique, so before we upload,
# check for and delete any pipeline with the same name
delete_pipeline(PIPELINE_NAME)

# upload
client = kfp.Client()
uploaded_pipeline = client.upload_pipeline(PIPELINE_PACKAGE_PATH, PIPELINE_NAME)

In [16]:
# Show the record returned by the upload (id, name, default version, ...).
print(uploaded_pipeline)

{'created_at': datetime.datetime(2023, 10, 20, 20, 45, 48, tzinfo=tzlocal()),
 'default_version': {'code_source_url': None,
                     'created_at': datetime.datetime(2023, 10, 20, 20, 45, 48, tzinfo=tzlocal()),
                     'description': None,
                     'id': 'e7aa99a1-7b71-463c-a4c4-a411b3d09597',
                     'name': 'Monitor_Credit_Risk_AI',
                     'package_url': None,
                     'parameters': None,
                     'resource_references': [{'key': {'id': 'e7aa99a1-7b71-463c-a4c4-a411b3d09597',
                                                      'type': 'PIPELINE'},
                                              'name': None,
                                              'relationship': 'OWNER'}]},
 'description': None,
 'error': None,
 'id': 'e7aa99a1-7b71-463c-a4c4-a411b3d09597',
 'name': 'Monitor_Credit_Risk_AI',
 'parameters': None,
 'resource_references': None,
 'url': None}


In [23]:
# https://github.com/kubeflow/pipelines/blob/1.7.1/sdk/python/kfp/_client.py
# https://pkg.go.dev/github.com/robfig/cron#hdr-CRON_Expression_Format
#
# Per the cron format linked above, the six fields are
# second, minute, hour, day-of-month, month, day-of-week, so the expression
# below fires at minute 0 of every hour, Monday through Friday.
monitor_experiment_id = get_experiment_id("monitor-production-credit")
_ = client.create_recurring_run(
    experiment_id=monitor_experiment_id,
    job_name="monitor_credit_risk_api_performance",
    description="Tests for data drift and f1 performance",
    cron_expression="0 0 0-23 ? JAN-DEC MON-FRI",
    pipeline_id=uploaded_pipeline.id,
)

{'created_at': datetime.datetime(2023, 10, 20, 21, 18, 22, tzinfo=tzlocal()),
 'description': 'Tests for data drift and f1 performance',
 'enabled': True,
 'error': None,
 'id': 'ef33c733-422f-4c3d-9bb6-340835a95882',
 'max_concurrency': '1',
 'mode': None,
 'name': 'monitor_credit_risk_api_performance',
 'no_catchup': None,
 'pipeline_spec': {'parameters': None,
                   'pipeline_id': 'e7aa99a1-7b71-463c-a4c4-a411b3d09597',
                   'pipeline_manifest': None,
                   'pipeline_name': 'Monitor_Credit_Risk_AI',
                   'runtime_config': None,
                   'workflow_manifest': '{"kind":"Workflow","apiVersion":"argoproj.io/v1alpha1","metadata":{"generateName":"monitor-credit-risk-ai-","creationTimestamp":null,"labels":{"pipelines.kubeflow.org/kfp_sdk_version":"1.8.18"},"annotations":{"pipelines.kubeflow.org/kfp_sdk_version":"1.8.18","pipelines.kubeflow.org/pipeline_compilation_time":"2023-10-20T20:45:48.628040","pipelines.kubeflow.org/pipel

In [22]:
# List the scheduledworkflows resources in the current namespace via the oc CLI.
!oc get scheduledworkflows 

No resources found in ntl-us-ibm-com namespace.


In [12]:
#run = client.run_pipeline(
#    experiment_id=get_experiment_id("monitor-credit-risk"),
#    job_name="monitor-credit-risk",
#    pipeline_id=uploaded_pipeline.id,
#)

In [14]:
#TWENTY_MIN = 20 * 60
#result = client.wait_for_run_completion(run.id, timeout=TWENTY_MIN)
#{
#    "status": result.run.status,
#    "error": result.run.error,
#    "time": str(result.run.finished_at - result.run.created_at),
#    "metrics": result.run.metrics,
#}
