In [1]:
from kfp.v2.dsl import (
    component, 
    pipeline, 
    Input, 
    Output, 
    Dataset, 
    Model, 
    Metrics
)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json

  from kfp.v2.dsl import (


In [2]:
from kfp.v2.dsl import component, Output, Dataset

@component(
    packages_to_install=["pandas", "pyarrow"],  # pyarrow supplies Parquet support
    base_image="python:3.9-slim",
)
def load_data_op(
    csv_path: str,
    data: Output[Dataset],
):
    """Convert a CSV file into a Parquet-backed Dataset artifact.

    Args:
        csv_path: Location of the CSV file, passed as-is to ``pandas.read_csv``.
        data: Output artifact; the table is written to ``data.path`` in
            Parquet format without the DataFrame index.
    """
    import pandas as pd

    # Read and immediately re-serialise; no transformation happens here.
    pd.read_csv(csv_path).to_parquet(data.path, index=False)

In [3]:
from kfp.v2.dsl import (
    component,
    Input,
    Output,
    Dataset
)

@component(
    base_image="python:3.9-slim",
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
)
def preprocess_and_split_op(
    data: Input[Dataset],
    X_train: Output[Dataset],
    X_test: Output[Dataset],
    y_train: Output[Dataset],
    y_test: Output[Dataset],
    test_size: float = 0.2,
    random_state: int = 0,
):
    """Separate the ``Outcome`` label from the features and create train/test splits.

    Args:
        data: Input artifact holding the full table in Parquet format.
        X_train: Output artifact receiving the training features (Parquet).
        X_test: Output artifact receiving the test features (Parquet).
        y_train: Output artifact receiving the training labels as a
            one-column ``Outcome`` table (Parquet).
        y_test: Output artifact receiving the test labels as a one-column
            ``Outcome`` table (Parquet).
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    label_column = "Outcome"

    # Load the table and peel the label column off the feature matrix.
    frame = pd.read_parquet(data.path)
    features = frame.drop(columns=label_column)
    labels = frame[label_column]

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Feature splits are already DataFrames and can be written directly.
    for split_features, artifact in (
        (train_features, X_train),
        (test_features, X_test),
    ):
        split_features.to_parquet(artifact.path, index=False)

    # Label splits are Series; wrap each in a one-column DataFrame for Parquet.
    for split_labels, artifact in (
        (train_labels, y_train),
        (test_labels, y_test),
    ):
        pd.DataFrame({label_column: split_labels}).to_parquet(
            artifact.path, index=False
        )


In [4]:
from kfp.v2.dsl import component, Input, Output, Dataset, Model

@component(
    base_image="python:3.9-slim",
    packages_to_install=["pandas", "scikit-learn", "joblib", "pyarrow"]
)
def train_model_op(
    X_train: Input[Dataset],
    y_train: Input[Dataset],
    model: Output[Model],
    penalty: str = "l2",
    C: float = 1.0,
    max_iter: int = 300
):
    """Train a LogisticRegression model and save it with joblib.

    Args:
        X_train: Input artifact with the training features (Parquet).
        y_train: Input artifact with the training labels (Parquet); the
            first column of the table is used as the target.
        model: Output artifact; the fitted estimator is dumped to
            ``model.path`` with ``joblib.dump``.
        penalty: Forwarded to ``LogisticRegression(penalty=...)``.
        C: Forwarded to ``LogisticRegression(C=...)``.
        max_iter: Forwarded to ``LogisticRegression(max_iter=...)``.
    """
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    # Load the train splits (Parquet)
    Xtr = pd.read_parquet(X_train.path)
    # The labels are stored as a one-column table. Select that column as a 1-D
    # Series: passing the (n, 1) DataFrame to ``fit`` makes scikit-learn emit a
    # DataConversionWarning and ravel the target implicitly.
    ytr = pd.read_parquet(y_train.path).iloc[:, 0]

    # Train
    clf = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter)
    clf.fit(Xtr, ytr)

    # Persist model
    joblib.dump(clf, model.path)


In [5]:
from kfp.v2.dsl import component, Input, Output, Model, Dataset, Metrics

@component(
    base_image="python:3.9-slim",
    packages_to_install=["pandas", "scikit-learn", "joblib", "pyarrow"]
)
def evaluate_model_op(
    model: Input[Model],
    X_test: Input[Dataset],
    y_test: Input[Dataset],
    report: Output[Metrics]
):
    """Score the model on the test split and publish a classification report.

    The report dict produced by ``classification_report(..., output_dict=True)``
    is written as JSON to ``report.path`` and, in addition, every numeric entry
    is logged on the ``Metrics`` artifact via ``log_metric``.

    Args:
        model: Input artifact holding the joblib-serialised estimator.
        X_test: Input artifact with the test features (Parquet).
        y_test: Input artifact with the test labels (Parquet); the first
            column of the table is used as the ground truth.
        report: Output ``Metrics`` artifact receiving the JSON file and the
            logged scalar metrics.
    """
    import joblib
    import json
    import pandas as pd
    from sklearn.metrics import classification_report

    # Load model
    # NOTE(review): joblib.load unpickles the file; only use with trusted artifacts.
    clf = joblib.load(model.path)

    # Read test splits (Parquet); take the label column as a 1-D Series
    Xte = pd.read_parquet(X_test.path)
    yte = pd.read_parquet(y_test.path).iloc[:, 0]

    # Predict and build report
    preds = clf.predict(Xte)
    rpt = classification_report(yte, preds, output_dict=True)

    # Log scalar metrics on the artifact itself. Writing a file alone leaves the
    # artifact's metadata empty. Nested entries (per-class and averaged scores)
    # are flattened to "<label>_<score>" keys with spaces replaced.
    for label, scores in rpt.items():
        if isinstance(scores, dict):
            for score_name, value in scores.items():
                key = f"{label}_{score_name}".replace(" ", "_")
                report.log_metric(key, float(value))
        else:
            report.log_metric(str(label).replace(" ", "_"), float(scores))

    # Write out JSON metrics (kept for consumers that read the file directly)
    with open(report.path, "w") as f:
        json.dump(rpt, f)

In [6]:
from typing import Dict, List
from kfp.v2.dsl import component, Input, Output, Model, Dataset

@component(
    base_image="python:3.9-slim",
    packages_to_install=["pandas", "joblib", "scikit-learn"],
)
def predict_samples_op(
    model: Input[Model],
    samples: Dict[str, List[Dict]],
    predictions: Output[Dataset],
):
    """Predict labels for ad-hoc sample records and store them as a JSON list.

    Args:
        model: Input artifact holding the joblib-serialised estimator.
        samples: Mapping whose ``"records"`` entry is a list of per-sample
            dicts; the list is turned into a DataFrame.
        predictions: Output artifact; a JSON array with one human-readable
            label per sample is written to ``predictions.path``.
    """
    import json

    import joblib
    import pandas as pd

    # Index = predicted class value (0 -> "No diabetes", 1 -> "Diabetes").
    label_names = ("No diabetes", "Diabetes")

    estimator = joblib.load(model.path)
    frame = pd.DataFrame(samples["records"])

    # When the estimator recorded its training columns, select them in that order.
    if hasattr(estimator, "feature_names_in_"):
        frame = frame[estimator.feature_names_in_]

    labelled = []
    for raw_prediction in estimator.predict(frame):
        labelled.append(label_names[int(raw_prediction)])

    with open(predictions.path, "w") as out_file:
        json.dump(labelled, out_file)


In [7]:
@pipeline(name="diabetes-logreg-pipeline", description="Logistic regression on diabetes")
def diabetes_pipeline(
    csv_path: str = "/mnt/data/diabetes.csv",
    test_size: float = 0.2,
    random_state: int = 0,
    penalty: str = "l2",
    C: float = 1.0,
    max_iter: int = 300,
):
    """Wire the load, split, train, evaluate and sample-prediction steps together."""
    # Step 1: CSV -> Parquet dataset
    load_task = load_data_op(csv_path=csv_path)

    # Step 2: feature/label separation and train/test split
    split_task = preprocess_and_split_op(
        data=load_task.output,
        test_size=test_size,
        random_state=random_state,
    )
    split_outputs = split_task.outputs

    # Step 3: fit the classifier on the training split
    train_task = train_model_op(
        X_train=split_outputs["X_train"],
        y_train=split_outputs["y_train"],
        penalty=penalty,
        C=C,
        max_iter=max_iter,
    )
    trained_model = train_task.output

    # Step 4: score the classifier on the held-out split
    evaluate_model_op(
        model=trained_model,
        X_test=split_outputs["X_test"],
        y_test=split_outputs["y_test"],
    )

    # Step 5: run the classifier on two hand-written sample cases
    feature_columns = (
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    )
    sample_rows = (
        (6.0, 110.0, 65.0, 15.0, 1.0, 45.7, 0.627, 50),
        (0, 88.0, 60.0, 35.0, 1.0, 45.7, 0.27, 20),
    )
    sample_records = {
        "records": [dict(zip(feature_columns, row)) for row in sample_rows]
    }
    predict_samples_op(
        model=trained_model,
        samples=sample_records,
    )

In [8]:
if __name__ == "__main__":
    # Compile the pipeline function into a package file; the relative path is
    # resolved against the current working directory.
    from kfp.v2 import compiler as kfp_compiler

    pipeline_compiler = kfp_compiler.Compiler()
    pipeline_compiler.compile(
        package_path="diabetes_pipeline.yaml",
        pipeline_func=diabetes_pipeline,
    )
