In [14]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Question Answering Model

This notebook serves a question answering model with Kubeflow Pipelines. \
The dataset and model are taken from Huggingface, which uses PyTorch. \
Each step is represented by a Kubeflow component:
- Loading the dataset
- Preprocessing the data
- Training + saving the model to ONNX + saving Tensorboard logs
- Deploy the model to MMA
- Deploy the model to No-MMA

## Preliminary: import libraries and define constants

In [15]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
import os

# MinIO endpoint, credentials and bucket name.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment or a Kubernetes secret before sharing this notebook.
MINIO_URL = "minio-service.kubeflow:9000"
MINIO_USER = "minio"
MINIO_PASS = "minio123"
BUCKET_NAME = "tensorboardlogs"

# Container images. `base_image` is the image passed to
# `create_component_from_func` for the Python components defined below.
base_image = "quay.io/jeremie_ch/transformers-component:gpu"
minio_image = "quay.io/jeremie_ch/minio-component:1.0"

# Namespace of the pod running this notebook, read from the mounted
# service-account file. Strip surrounding whitespace so a trailing newline
# in the file cannot end up in the namespace passed to the KFP client.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()

# Folder holding the reusable component definitions loaded later on.
COMPONENT_CATALOG_FOLDER = f"{os.getenv('HOME')}/components"

## 1. Loading the dataset

In [16]:
def load_dataset(dataset_dir: comp.OutputPath(str)):
    """Download the SQuAD dataset and save it to ``dataset_dir``.

    Args:
        dataset_dir: Output path (declared as a KFP ``OutputPath``) of the
            directory the dataset is written to with ``save_to_disk``.
    """
    # Imports are kept inside the function body; presumably required because
    # the function is packaged as a standalone component (see the
    # `create_component_from_func` call below).
    import os

    from datasets import load_dataset

    squad = load_dataset("squad")
    # `exist_ok=True` replaces the check-then-create pattern and matches the
    # way `train` creates its directories.
    os.makedirs(dataset_dir, exist_ok=True)
    squad.save_to_disk(dataset_dir)


# Turn `load_dataset` into a pipeline component that runs in `base_image`;
# "dataset.yaml" is passed as the output component file.
load_dataset_comp = comp.create_component_from_func(
    func=load_dataset,
    output_component_file="dataset.yaml",
    base_image=base_image,
)

## 2. Pre-processing the data

In [17]:
def preprocess(dataset_dir: comp.InputPath(str), preprocess_dir: comp.OutputPath(str)):
    """Tokenize the saved SQuAD dataset and label the answer token spans.

    The dataset is loaded from ``dataset_dir``, every question/context pair is
    tokenized with the ``distilbert-base-uncased`` tokenizer (``max_length=384``,
    ``truncation="only_second"``, padded to ``max_length``), and
    ``start_positions`` / ``end_positions`` are added for the first answer of
    each example. The original columns are removed and the result is saved to
    ``preprocess_dir``.

    Args:
        dataset_dir: Input path of the dataset written by ``save_to_disk``.
        preprocess_dir: Output path of the directory receiving the tokenized
            dataset.
    """
    import os

    from datasets.load import load_from_disk
    from transformers import AutoTokenizer

    print("dataset_dir:", dataset_dir)
    print("preproccess_dir:", preprocess_dir)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    squad = load_from_disk(dataset_dir)

    def preprocess_function(examples):
        """Tokenize one batch and compute answer start/end token indices."""
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        # The offsets are only needed to compute the labels, so they are
        # removed from the returned features.
        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            # Only the first listed answer of each example is used.
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context (tokens whose sequence id is 1)
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0).
            # The context must start at or before the answer start AND end at
            # or after the answer end. The previous test compared the opposite
            # bounds, so an answer cut off by truncation fell through to the
            # branch below and got an end position one past the context.
            if (
                offset[context_start][0] > start_char
                or offset[context_end][1] < end_char
            ):
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    tokenized_squad = squad.map(
        preprocess_function, batched=True, remove_columns=squad["train"].column_names
    )

    # `exist_ok=True` replaces the check-then-create pattern and matches the
    # way `train` creates its directories.
    os.makedirs(preprocess_dir, exist_ok=True)

    tokenized_squad.save_to_disk(preprocess_dir)


# Turn `preprocess` into a pipeline component that runs in `base_image`;
# "preprocess.yaml" is passed as the output component file.
preprocess_comp = comp.create_component_from_func(
    func=preprocess,
    output_component_file="preprocess.yaml",
    base_image=base_image,
)

## 3. Training

Training fine-tunes a DistilBERT model for question answering. Training logs are written to a directory on a persistent volume so that TensorBoard can read and display them. \
At the end of training, the model is saved and converted to ONNX.

In [18]:
def train(
    preprocess_dir: comp.InputPath(str),
    onnx_model: comp.OutputPath(str),
    checkpoint_dir: comp.OutputPath(str),
):
    """Fine-tune DistilBERT for question answering and export it to ONNX.

    The tokenized dataset is loaded from ``preprocess_dir`` and a
    ``distilbert-base-uncased`` question-answering model is trained on the
    first 1000 training examples, evaluated each epoch on the first 1000
    validation examples. Logs go to a timestamped directory under
    ``/mnt/logs/tb/``. The trained model is saved under
    ``/mnt/models/question-answering/pytorch/``, converted with the
    ``transformers.onnx`` module, and the resulting ``model.onnx`` is copied
    to ``onnx_model``.

    Args:
        preprocess_dir: Input path of the tokenized dataset.
        onnx_model: Output file path receiving the exported ONNX model.
        checkpoint_dir: Output directory used as the trainer's ``output_dir``.
    """
    import datetime
    import os
    import runpy
    import shutil
    import sys

    from datasets import load_from_disk
    from transformers import (
        AutoModelForQuestionAnswering,
        AutoTokenizer,
        DefaultDataCollator,
        Trainer,
        TrainingArguments,
    )

    tokenized_squad = load_from_disk(preprocess_dir)

    data_collator = DefaultDataCollator()

    tokenizer = AutoTokenizer.from_pretrained(
        "distilbert-base-uncased", torchscript=True
    )

    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    os.makedirs(checkpoint_dir, exist_ok=True)

    # One log directory per run, named after the start time.
    logging_dir = "/mnt/logs/tb/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    os.makedirs(logging_dir, exist_ok=True)
    print("logging_dir:", logging_dir)

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=20,
        weight_decay=0.01,
        logging_dir=logging_dir,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_squad["train"].select(range(1000)),
        eval_dataset=tokenized_squad["validation"].select(range(1000)),
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Save the trained model, then convert the saved model to ONNX.
    model_dir = "/mnt/models/question-answering/pytorch/"
    onnx_dir = "/mnt/models/question-answering/onnx/"
    os.makedirs(model_dir, exist_ok=True)
    trainer.save_model(model_dir)

    # The exporter is a command-line module, so it is driven through
    # `sys.argv`. The previous value is restored afterwards so the rest of
    # the process does not see the fake arguments.
    previous_argv = sys.argv
    sys.argv = [
        "dummy_sysargs.py",
        f"--model={model_dir}",
        "--feature=question-answering",
        onnx_dir,
    ]
    try:
        runpy.run_module("transformers.onnx", run_name="__main__")
    finally:
        sys.argv = previous_argv

    # Copy the export from the directory handed to the exporter. The previous
    # code copied from ".../question-answering/1/model.onnx", a path this
    # function never writes to.
    os.makedirs(os.path.dirname(onnx_model), exist_ok=True)
    shutil.copyfile(os.path.join(onnx_dir, "model.onnx"), onnx_model)


# Turn `train` into a pipeline component that runs in `base_image`;
# "train.yaml" is passed as the output component file.
train_comp = comp.create_component_from_func(
    func=train,
    output_component_file="train.yaml",
    base_image=base_image,
)

In [19]:
# Model-upload component definition, located inside the component catalog folder.
UPLOAD_MODEL_COMPONENT = (
    COMPONENT_CATALOG_FOLDER + "/model-building/upload-model/component.yaml"
)
upload_model_comp = comp.load_component_from_file(UPLOAD_MODEL_COMPONENT)

# Triton inference-service deployment component definition, located in the
# kubeflow-ppc64le-examples folder under $HOME.
DEPLOY_MODEL_COMPONENT = (
    f"{os.getenv('HOME')}"
    "/kubeflow-ppc64le-examples"
    "/deploy_triton_inference_service_component"
    "/deploy_triton_inference_service_component.yaml"
)
deploy_model_comp = comp.load_component_from_file(DEPLOY_MODEL_COMPONENT)

## 4. Define the pipeline and run it

In [20]:
def pipeline(
    dataset_dir: str,
    preprocess_dir: str,
    model_dir: str,
    checkpoint_dir: str,
    logs_dir: str,
    pt_path: str,
    model_name: str,
    size: str = "1Gi",
    minio_url="minio-service.kubeflow:9000",
):
    # Pipeline graph: create/apply the "models-volume" PVC, load the dataset,
    # preprocess it, train with one GPU and the PVC mounted at /mnt, upload
    # the ONNX model, then deploy the inference service after the upload.
    #
    # NOTE(review): dataset_dir, preprocess_dir, model_dir, checkpoint_dir,
    # logs_dir, pt_path and model_name are accepted but never read in this
    # body; only `size` and `minio_url` are used.

    # Volume of the requested size, mounted into the training step below.
    models_pvc = dsl.VolumeOp(
        name="create-pvc",
        resource_name="models-volume",
        size=size,
        modes=dsl.VOLUME_MODE_RWO,
        action="apply",
        generate_unique_name=False,
    )

    dataset_task = load_dataset_comp()
    tokenize_task = preprocess_comp(dataset_dir=dataset_task.output)

    training_task = train_comp(preprocess_dir=tokenize_task.output)
    training_task = training_task.set_gpu_limit(1).add_pvolumes(
        {"/mnt": models_pvc.volume}
    )

    # Upload ONNX model
    onnx_upload_task = upload_model_comp(
        training_task.outputs["onnx_model"],
        minio_url=minio_url,
        export_bucket="{{workflow.namespace}}-bee",
        model_format="onnx",
        model_name="qa",
        model_version=1,
    )

    # Deploy Inference Service
    # NOTE(review): the bucket suffix in `storage_uri` ("-qa") differs from
    # the `export_bucket` suffix used for the upload ("-bee") - confirm that
    # the service is meant to read from a different bucket.
    serving_task = deploy_model_comp(
        name="qa",
        rm_existing=True,
        storage_uri="s3://{{workflow.namespace}}-qa/onnx",
        minio_url=minio_url,
        predictor_protocol="v2",
    )
    serving_task.after(onnx_upload_task)

In [None]:
# Values for the pipeline parameters that have no default.
arguments = {
    "dataset_dir": "/dataset_dir",
    "preprocess_dir": "/preprocess_dir",
    "model_dir": "/model_dir",
    "checkpoint_dir": "/checkpoint_dir",
    # `pipeline` declares `logs_dir` without a default, so a value has to be
    # supplied here like for the other required parameters.
    "logs_dir": "/logs_dir",
    "pt_path": "/pt_path/traced_distilbert.pth",
    "model_name": "question-answering",
}

# Submit a run of `pipeline` in the notebook's own namespace.
client = kfp.Client()
client.create_run_from_pipeline_func(
    pipeline, arguments=arguments, namespace=NAMESPACE
)