In [1]:
import kfp
from kfp import dsl

# 1) Preprocess: load Iris and write pickled train/test splits as artifacts
@dsl.component(base_image="quay.io/jupyter/scipy-notebook:lab-4.4.3")
def preprocess_op(
    train_path: dsl.Output[dsl.Artifact],
    test_path: dsl.Output[dsl.Artifact],
):
    """Split the Iris dataset and pickle each split to an output artifact.

    Args:
        train_path: Output artifact that receives the pickled
            ``(X_train, y_train)`` tuple.
        test_path: Output artifact that receives the pickled
            ``(X_test, y_test)`` tuple.
    """
    # Imports stay inside the function body: the component runs in its own
    # container and only the function source is shipped there.
    import pickle

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    features, labels = load_iris(return_X_y=True)

    # 80/20 split with a fixed seed so reruns yield the same partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=1337
    )

    # Each split is stored as an (X, y) tuple at its artifact's path.
    splits = (
        (train_path, (X_train, y_train)),
        (test_path, (X_test, y_test)),
    )
    for artifact, payload in splits:
        with open(artifact.path, "wb") as fh:
            pickle.dump(payload, fh)

In [2]:
@dsl.component(base_image="quay.io/jupyter/scipy-notebook:lab-4.4.3")
def train_op(
    train_data:  dsl.Input[dsl.Artifact],
    penalty:     str,
    tol:         float,
    C:           float,
    max_iter:    int,
    model:       dsl.Output[dsl.Model],
):
    """Fit a logistic-regression classifier and save it with joblib.

    Args:
        train_data: Input artifact holding a pickled ``(X_train, y_train)``
            tuple.
        penalty: Regularisation penalty, forwarded to ``LogisticRegression``.
        tol: Stopping tolerance, forwarded to ``LogisticRegression``.
        C: Inverse regularisation strength, forwarded to
            ``LogisticRegression``.
        max_iter: Maximum solver iterations, forwarded to
            ``LogisticRegression``.
        model: Output model artifact; the fitted estimator is written to
            ``model.path`` via ``joblib.dump``.
    """
    # Imports stay inside the function body: the component runs in its own
    # container and only the function source is shipped there.
    import os
    import pickle

    import joblib
    from sklearn.linear_model import LogisticRegression

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file. This presumes train_data is produced by a trusted upstream step
    # of the same pipeline -- confirm before wiring in external artifacts.
    with open(train_data.path, "rb") as f:
        X_train, y_train = pickle.load(f)

    # `multi_class` is intentionally not passed: it was deprecated in
    # scikit-learn 1.5 and is slated for removal. Passing "auto" explicitly
    # emits a FutureWarning on multiclass data and will fail once the
    # parameter is gone; the default behaves the same as "auto".
    clf = LogisticRegression(
        penalty=penalty,
        tol=tol,
        C=C,
        max_iter=max_iter,
        random_state=1337,
        solver="lbfgs",
    )
    clf.fit(X_train, y_train)

    # Make sure the artifact's parent directory exists before writing to it.
    os.makedirs(os.path.dirname(model.path), exist_ok=True)
    joblib.dump(clf, model.path)
    print(f"Model written to: {model.path}")


In [3]:
@dsl.component(base_image="quay.io/jupyter/scipy-notebook:lab-4.4.3")
def eval_op(
    test_data: dsl.Input[dsl.Artifact],
    model_input: dsl.Input[dsl.Model],
):
    """Score a joblib-serialised classifier on the pickled test split.

    Args:
        test_data: Input artifact holding a pickled ``(X_test, y_test)``
            tuple.
        model_input: Input model artifact whose path is read with
            ``joblib.load``.

    The accuracy is printed to stdout; nothing is returned.
    """
    # Imports stay inside the function body: the component runs in its own
    # container and only the function source is shipped there.
    import pickle

    import joblib
    from sklearn.metrics import accuracy_score

    with open(test_data.path, "rb") as fh:
        features, labels = pickle.load(fh)

    # The model was written with joblib, so it is read back the same way.
    classifier = joblib.load(model_input.path)
    predictions = classifier.predict(features)

    acc = accuracy_score(labels, predictions)
    print(f"Model accuracy: {acc:.4f}")


In [4]:
@dsl.pipeline(name="iris-logreg-with-joblib")
def iris_pipeline(
    penalty: str = "l2",
    tol: float = 0.001,
    C: float = 1,
    max_iter: int = 10,
):
    """Preprocess Iris, train a logistic regression, then evaluate it.

    Args:
        penalty: Regularisation penalty passed to the training step.
        tol: Stopping tolerance passed to the training step.
        C: Inverse regularisation strength passed to the training step.
        max_iter: Maximum solver iterations passed to the training step.
    """
    # Step 1: produce the train/test split artifacts.
    split_task = preprocess_op()

    # Step 2: fit the model on the training split with the given
    # hyperparameters; the fitted estimator lands in the "model" output.
    train_task = train_op(
        train_data=split_task.outputs["train_path"],
        penalty=penalty,
        tol=tol,
        C=C,
        max_iter=max_iter,
    )

    # Step 3: score the trained model on the held-out split. The task handle
    # is not needed afterwards, so it is not bound to a name.
    eval_op(
        test_data=split_task.outputs["test_path"],
        model_input=train_task.outputs["model"],
    )


if __name__ == "__main__":
    # Compile the pipeline definition into a YAML package on disk. The
    # compiler is imported here because it is only needed for this step.
    from kfp import compiler

    compiler.Compiler().compile(
        package_path="iris_pipeline_with_joblib.yaml",
        pipeline_func=iris_pipeline,
    )