# Question Answering Model

This notebook serves a question answering model with Kubeflow Pipelines. \
The dataset and model are taken from Huggingface, which uses PyTorch. \
Each step is represented by a Kubeflow component:
- Loading the dataset
- Preprocessing the data
- Training + saving the model to ONNX + saving Tensorboard logs
- Deploy the model to MMA
- Deploy the model to No-MMA

## Preliminary: import libraries and define constants

In [None]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl

# MinIO connection settings.
# NOTE(review): credentials are hard-coded in the notebook; consider reading
# them from a Kubernetes secret instead.
MINIO_URL = "minio-service.kubeflow:9000"
MINIO_USER = "minio"
MINIO_PASS = "minio123"
BUCKET_NAME = "tensorboardlogs"

# Container images: `base_image` is passed to the component factories in this
# notebook; `minio_image` is only referenced by the commented-out MinIO upload
# component.
base_image = "quay.io/jeremie_ch/transformers-component:gpu"
minio_image = "quay.io/jeremie_ch/minio-component:1.0"

# Read the namespace from the service-account mount. Surrounding whitespace is
# stripped so that a trailing newline cannot end up in the namespace that is
# later handed to the KFP client.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()
NAMESPACE

## 1. Loading the dataset

In [None]:
def load_dataset(dataset_dir: comp.OutputPath(str)):
    """Download the SQuAD dataset and save it to ``dataset_dir``.

    Args:
        dataset_dir: Output path where the dataset is written with
            ``save_to_disk``.
    """
    # Imports are local because this function is turned into a standalone
    # pipeline component via `create_component_from_func`.
    import os

    # Aliased so the import does not shadow this function's own name.
    from datasets import load_dataset as hf_load_dataset

    squad = hf_load_dataset("squad")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dataset_dir, exist_ok=True)
    squad.save_to_disk(dataset_dir)

        
# Build the dataset-loading component from `load_dataset`; the generated
# component spec is also written to dataset.yaml.
load_dataset_comp = comp.create_component_from_func(
    func=load_dataset,
    output_component_file="dataset.yaml",
    base_image=base_image,
)

## 2. Pre-processing the data

In [None]:
def preprocess(dataset_dir: comp.InputPath(str),
               preprocess_dir: comp.OutputPath(str)):
    """Tokenize the saved dataset and label each example's answer token span.

    Args:
        dataset_dir: Path of a dataset previously written with ``save_to_disk``.
        preprocess_dir: Output path where the tokenized dataset is saved.
    """
    from transformers import AutoTokenizer
    from datasets.load import load_from_disk
    import os

    print("dataset_dir:", dataset_dir)
    print("preproccess_dir:", preprocess_dir)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    squad = load_from_disk(dataset_dir)

    def preprocess_function(examples):
        """Tokenize a batch and compute start/end token indices of the answers."""
        questions = [q.strip() for q in examples["question"]]
        # Question first, context second; only the context may be truncated.
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            # Only the first listed answer is used for labelling.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0).
            # Comparing the context start with start_char and the context end
            # with end_char also catches answers that are cut off by
            # truncation; otherwise their end label would fall one token past
            # the context.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    # Drop the raw columns so only the tokenizer outputs and labels remain.
    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(preprocess_dir, exist_ok=True)

    tokenized_squad.save_to_disk(preprocess_dir)


# Build the preprocessing component from `preprocess`; the generated
# component spec is also written to preprocess.yaml.
preprocess_comp = comp.create_component_from_func(
    func=preprocess,
    output_component_file="preprocess.yaml",
    base_image=base_image,
)

## 3. Training

Training fine-tunes a DistilBERT model for question answering. Training logs are written to a directory on a mounted persistent volume so that they can be read and displayed by TensorBoard. \
Then, at the end of the training, the model is saved and converted to ONNX.

In [None]:
def train(preprocess_dir: comp.InputPath(str),
          model_dir: comp.OutputPath(str),
          checkpoint_dir: comp.OutputPath(str)):
    """Fine-tune DistilBERT for question answering, save it and export to ONNX.

    Args:
        preprocess_dir: Path of the tokenized dataset written with
            ``save_to_disk``.
        model_dir: Output path where the trained model is saved.
        checkpoint_dir: Output path used as the Trainer's ``output_dir``.

    Side effects:
        Writes logs under ``/mnt/logs/tb/<timestamp>`` and the ONNX export to
        ``/mnt/models/question-answering/1/``.
    """
    import os
    from datasets import load_from_disk
    from transformers import AutoTokenizer, DefaultDataCollator, \
        AutoModelForQuestionAnswering, TrainingArguments, Trainer
    import datetime
    import glob
    import runpy
    import sys

    # Destination of the ONNX export, defined once instead of repeated inline.
    onnx_dir = "/mnt/models/question-answering/1/"

    tokenized_squad = load_from_disk(preprocess_dir)

    data_collator = DefaultDataCollator()

    # NOTE(review): `torchscript` looks like a model option rather than a
    # tokenizer option; kept as-is - confirm whether it is needed here.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", torchscript=True)

    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    os.makedirs(checkpoint_dir, exist_ok=True)

    # One timestamped log directory per run.
    logging_dir = "/mnt/logs/tb/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    os.makedirs(logging_dir, exist_ok=True)
    print("logging_dir:", logging_dir)

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=20,
        weight_decay=0.01,
        logging_dir=logging_dir
    )

    # Only the first 1000 examples of each split are used.
    trainer = Trainer(
        model=model,
        args=training_args,
        # train_dataset=tokenized_squad["train"],
        train_dataset=tokenized_squad["train"].select(range(1000)),
        eval_dataset=tokenized_squad["validation"].select(range(1000)),
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_dir, exist_ok=True)

    trainer.save_model(model_dir)

    # Convert saved model to ONNX by running the `transformers.onnx` module as
    # if it were invoked from the command line. The caller's argv is restored
    # afterwards so the override does not leak past the export.
    os.makedirs(onnx_dir, exist_ok=True)
    saved_argv = sys.argv
    sys.argv = ["dummy_sysargs.py", f"--model={model_dir}", "--feature=question-answering", onnx_dir]
    try:
        runpy.run_module("transformers.onnx", run_name="__main__")
    finally:
        sys.argv = saved_argv
    # Show what the export produced (the listing used to be computed and dropped).
    print(os.listdir(onnx_dir))

    print(glob.glob(logging_dir + '/**', recursive=True))

    
# Build the training component from `train`; the generated component spec is
# also written to train.yaml.
train_comp = comp.create_component_from_func(
    func=train,
    output_component_file="train.yaml",
    base_image=base_image,
)

## 4. Save to torchserve

In [None]:
# def save_torchserve_model_comp(model_dir: comp.InputPath(str),
#                                pt_path: comp.OutputPath(str)):
#     from transformers import AutoModelForQuestionAnswering, AutoTokenizer
#     import torch
#     print(model_dir)
#     model = AutoModelForQuestionAnswering.from_pretrained(model_dir)

#     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", torchscript=True)
#     context = "Architecturally, the school has a Catholic character. Atop the Main Building\"s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."
#     question = "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?"
#     model.eval()
#     model.to("cpu")
#     inputs = tuple(dict(tokenizer(question, context, return_tensors="pt")).values())
#     traced_model = torch.jit.trace(model, inputs, strict=False)
#     torch.jit.save(traced_model, pt_path)
    

# save_torchserve_model_comp = comp.create_component_from_func(save_torchserve_model_comp, 
#                                                         "save_torchserve_model_comp.yaml", 
#                                                         base_image)

## 5. Upload to Minio

In [None]:
# def upload_to_minio(local_dir: comp.InputPath(str),
#                     bucket_name="nlp",
#                     minio_url="minio-service.kubeflow:9000"):
#     import glob
#     import os
#     from minio import Minio
    
#     client = Minio("minio-service.kubeflow:9000", "minio", "minio123", secure=False)
    
#     if client.bucket_exists(bucket_name):
#         client.remove_bucket(bucket_name)
#     client.make_bucket(bucket_name)
#     # if not client.bucket_exists(bucket_name):
#     #     client.make_bucket(bucket_name)
    
#     assert os.path.isdir(local_dir)

#     for local_file in glob.glob(local_dir + '/**'):
#         local_file = local_file.replace(os.sep, "/") # Replace \ with / on Windows
#         if not os.path.isfile(local_file):
#             upload_to_minio(
#                 local_file, bucket_name, minio_url + "/" + os.path.basename(local_file))
#         else:
#             remote_path = os.path.join(
#                 minio_url, local_file[1 + len(local_dir):])
#             remote_path = remote_path.replace(
#                 os.sep, "/")  # Replace \ with / on Windows
#             client.fput_object(bucket_name, remote_path, local_file)
        

# upload_to_minio_comp = comp.create_component_from_func(upload_to_minio, 
#                                                        base_image=minio_image)

## 6. Deploy the model to MMA and No-MMA

In [None]:
# Inference: load the deployment component from its YAML definition.
# The MMA variant is currently disabled:
# deploy_model_mma_comp = kfp.components.load_component_from_file("deploy-mma.yaml")
deploy_model_no_mma_comp = comp.load_component_from_file("deploy-no-mma.yaml")

## 7. Define the pipeline and run it

In [None]:
def pipeline(dataset_dir: str,
             preprocess_dir: str,
             model_dir: str,
             checkpoint_dir: str,
             logs_dir: str,
             pt_path: str,
             model_name: str,
             size: str="1Gi"):
    """Chain dataset loading, preprocessing, training and model deployment.

    Args:
        dataset_dir: Not referenced in the pipeline body.
        preprocess_dir: Not referenced in the pipeline body.
        model_dir: Not referenced in the pipeline body.
        checkpoint_dir: Not referenced in the pipeline body.
        logs_dir: Not referenced in the pipeline body.
        pt_path: Not referenced in the pipeline body.
        model_name: Forwarded to the deployment component.
        size: Requested size of the "models-volume" volume.
    """
    # Volume that the training step mounts at /mnt.
    models_pvc = dsl.VolumeOp(
        name="create-pvc",
        resource_name="models-volume",
        size=size,
        modes=dsl.VOLUME_MODE_RWO,
        action='apply',
        generate_unique_name=False,
    )

    dataset_step = load_dataset_comp()
    tokenize_step = preprocess_comp(dataset_dir=dataset_step.output)

    # Training requests one GPU and gets the volume mounted at /mnt.
    train_step = train_comp(preprocess_dir=tokenize_step.output)
    train_step.set_gpu_limit(1)
    train_step.add_pvolumes({"/mnt": models_pvc.volume})

    # Disabled steps, kept for reference:
    # save_torchserve_model_task = save_torchserve_model_comp(train_task.outputs["model_dir"])
    # upload_to_minio_task = upload_to_minio_comp(local_dir=train_task.outputs["logs_dir"])
    # deploy_model_mma_task = deploy_model_mma_comp(model_name=model_name)
    # deploy_model_mma_task.after(train_task)

    # Deployment consumes no training output, so the ordering is explicit.
    serve_step = deploy_model_no_mma_comp(model_name=model_name)
    serve_step.after(train_step)

In [None]:
# Values for the pipeline parameters. `pipeline` declares `logs_dir` without a
# default, so it is supplied here as well; otherwise the run is submitted with
# an unset parameter.
# NOTE(review): "/logs_dir" follows the pattern of the other entries; the
# pipeline body does not read it, so adjust the value if it becomes used.
arguments = {"dataset_dir": "/dataset_dir",
             "preprocess_dir": "/preprocess_dir",
             "model_dir": "/model_dir",
             "checkpoint_dir": "/checkpoint_dir",
             "logs_dir": "/logs_dir",
             "pt_path": "/pt_path/traced_distilbert.pth",
             "model_name": "question-answering"}

# Submit the pipeline as a run in the namespace read at the top of the notebook.
client = kfp.Client()
client.create_run_from_pipeline_func(
    pipeline,
    arguments=arguments,
    namespace=NAMESPACE
)

In [None]:
!curl http://question-answering-mma-predictor-default-marving-de-ibm-com.apps.ruby.edu.ihost.com/v2/models/question-answering

In [None]:
!curl http://question-answering-no-mma-predictor-default-marving-de-ibm-com.apps.ruby.edu.ihost.com/v2/models/question-answering