In [1]:
import kfp
from kfp import dsl


In [2]:
@dsl.component(base_image="quay.io/jupyter/pytorch-notebook:lab-4.4.3")
def preprocess_op(
    train_path: dsl.Output[dsl.Artifact],
    test_path:  dsl.Output[dsl.Artifact],
    data_path: str = 'data/diabetes.csv',
    test_size: float = 0.2,
    random_state: int = 0,
):
    """Split the diabetes CSV into train/test sets and pickle each split.

    Args:
        train_path: Output artifact that receives the pickled
            ``(X_train, y_train)`` tuple.
        test_path: Output artifact that receives the pickled
            ``(X_test, y_test)`` tuple.
        data_path: Location of the CSV file. It is resolved where this
            function executes, so the file must be reachable from there.
            Defaults to the previously hard-coded ``'data/diabetes.csv'``.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        KeyError: If the CSV has no ``'Outcome'`` column.
    """
    # Imports live inside the function body, as in every component of this file.
    import pickle

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Load the raw table; 'Outcome' is the label column, everything else is a feature.
    data = pd.read_csv(data_path)
    X = data.drop('Outcome', axis=1).values
    y = data['Outcome'].values

    # Split into train/test with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Serialize each split as a (features, labels) tuple into its artifact.
    with open(train_path.path, 'wb') as f:
        pickle.dump((X_train, y_train), f)
    with open(test_path.path, 'wb') as f:
        pickle.dump((X_test, y_test), f)


In [3]:
@dsl.component(base_image="quay.io/jupyter/pytorch-notebook:lab-4.4.3")
def train_op(
    train_data:  dsl.Input[dsl.Artifact],
    model_output: dsl.Output[dsl.Model],
    epochs: int = 100,
    learning_rate: float = 0.01,
    seed: int = 0,
):
    """Train a small MLP classifier on the pickled training split.

    Args:
        train_data: Input artifact holding a pickled ``(X_train, y_train)``
            tuple.
        model_output: Output artifact that receives the trained model's
            ``state_dict`` (written with ``torch.save``).
        epochs: Number of full-batch optimisation steps. Defaults to the
            previously hard-coded 100.
        learning_rate: Adam learning rate. Defaults to the previously
            hard-coded 0.01.
        seed: Seed for PyTorch's RNG so weight initialisation, and therefore
            the trained model, is repeatable between runs.
    """
    import pickle
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.nn.functional as F

    # Seed before the model is constructed: layer weights are drawn at init time.
    torch.manual_seed(seed)

    # Load training split.
    # NOTE: pickle.load executes arbitrary code from the file; only feed this
    # component artifacts produced by a trusted upstream step.
    with open(train_data.path, 'rb') as f:
        X_train, y_train = pickle.load(f)
    X_train = torch.tensor(X_train, dtype=torch.float32)
    # CrossEntropyLoss expects integer class indices as targets.
    y_train = torch.tensor(y_train, dtype=torch.long)

    # Define a simple MLP: in_features -> 20 -> 10 -> 2 logits.
    # The same architecture is re-declared in eval_op; the two must stay in
    # sync for load_state_dict to succeed.
    class DiabetesModel(nn.Module):
        def __init__(self, in_features):
            super().__init__()
            self.fc1 = nn.Linear(in_features, 20)
            self.fc2 = nn.Linear(20, 10)
            self.out = nn.Linear(10, 2)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            return self.out(x)

    # Instantiate, loss, optimizer
    model = DiabetesModel(X_train.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop: full-batch gradient descent, one step per epoch.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    # Save only the state dict; the consumer rebuilds the architecture itself.
    torch.save(model.state_dict(), model_output.path)


In [4]:
@dsl.component(base_image="quay.io/jupyter/pytorch-notebook:lab-4.4.3")
def eval_op(
    test_data:   dsl.Input[dsl.Artifact],
    model_input: dsl.Input[dsl.Model]
):
    """Evaluate the trained MLP on the pickled test split and print accuracy.

    Args:
        test_data: Input artifact holding a pickled ``(X_test, y_test)`` tuple.
        model_input: Input artifact holding the model ``state_dict`` written
            by ``torch.save``.
    """
    import pickle
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from sklearn.metrics import accuracy_score

    # Load test split.
    # NOTE: pickle.load executes arbitrary code from the file; only feed this
    # component artifacts produced by a trusted upstream step.
    with open(test_data.path, 'rb') as f:
        X_test, y_test = pickle.load(f)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.long)

    # Rebuild model architecture. It must match the one declared in train_op,
    # otherwise load_state_dict below fails on mismatched keys/shapes.
    class DiabetesModel(nn.Module):
        def __init__(self, in_features):
            super().__init__()
            self.fc1 = nn.Linear(in_features, 20)
            self.fc2 = nn.Linear(20, 10)
            self.out = nn.Linear(10, 2)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            return self.out(x)

    model = DiabetesModel(X_test.shape[1])
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict holds; map_location keeps loading working on a
    # CPU-only evaluation container.
    state_dict = torch.load(
        model_input.path, map_location="cpu", weights_only=True
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Predict and compute accuracy; no_grad skips autograd bookkeeping.
    with torch.no_grad():
        logits = model(X_test)
        # Highest logit along the class dimension is the predicted label.
        preds = torch.argmax(logits, dim=1)
    acc = accuracy_score(y_test.numpy(), preds.numpy())
    print(f"Model accuracy: {acc:.4f}")


In [5]:
@dsl.pipeline(name="diabetes-pytorch-pipeline")
def diabetes_pipeline():
    # Three-stage workflow: preprocess -> train -> evaluate. Stages are linked
    # by passing one task's output artifacts as the next task's inputs.

    # Stage 1: produce the train/test split artifacts.
    split_task = preprocess_op()

    # Stage 2: fit the model on the training artifact.
    fit_task = train_op(train_data=split_task.outputs['train_path'])

    # Stage 3: score the fitted model on the held-out artifact. The resulting
    # task is not consumed by anything, so it is not bound to a local name.
    eval_op(
        test_data=split_task.outputs['test_path'],
        model_input=fit_task.outputs['model_output'],
    )


In [6]:
from kfp import compiler
# Compile the pipeline function and write the result to a YAML package file.
compiler.Compiler().compile(
    diabetes_pipeline, package_path="diabetes_pipeline_v1.yaml"
)