In [None]:
# Pin the Kubeflow Pipelines SDK release to install. %env exports the value as an
# environment variable so the shell command below can expand ${RELEASE_VERSION}.
%env RELEASE_VERSION=0.1.18
# Install (or upgrade) the kfp package for that release into the user site-packages (--user).
!pip install https://storage.googleapis.com/ml-pipeline/release/${RELEASE_VERSION}/kfp.tar.gz --upgrade --user

In [1]:
import kfp
import kfp.dsl as dsl
import kfp.compiler as compiler
from kubernetes import client as k8s_client

W0408 05:50:36.929502 139839822395200 deprecation_wrapper.py:119] From /home/dkube/.ipython/extensions/myextension.py:9: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.



Create a Dkube pipeline experiment

In [2]:
# Build a Kubeflow Pipelines client using its default connection settings.
client = kfp.Client()
# Create the experiment named 'human-par-pipeline'; its id is used later when the run is submitted.
parabricks_experiment = client.create_experiment(name='human-par-pipeline')

In [9]:
import kfp.dsl as dsl
from kfp import components
from kubernetes import client as k8s_client

import os
import json
from random import randint

# Load the training component definition from a YAML file given by a relative path.
# The resulting factory is called inside the pipeline function to add the training step.
dkube_training_op           = components.load_component_from_file("training-component.yaml")

# Human-readable pipeline description, passed to the @dsl.pipeline decorator below.
description = """Given one or more pairs of fastq files, you can run the human_par
pipeline workflow to generate output including BAM, recal, and variants called with
proper pseudoautosomal region ploidy values.
"""
@dsl.pipeline(
    name='human-par-pipeline',
    description=description
)
def human_par_pipeline(
    auth_token  = os.getenv("DKUBE_USER_ACCESS_TOKEN"),
    container = json.dumps({'image':'docker.io/ocdr/parabricks:v2.5.0', 'username':'', 'password': ''}),
    workspace = "parabricks",
    datasets = json.dumps(["parabricks-sample"]),
    ref = "Ref/Homo_sapiens_assembly38.fasta",
    in_fq_1 = "Data/sample_1.fq.gz",
    in_fq_2 = "Data/sample_2.fq.gz",
    knownSites = "Ref/Homo_sapiens_assembly38.known_indels.vcf.gz",
    range_male = "1-10",
    range_female = "150-250",
    sample_sex = "male",
    # Number of GPUs to request; 0 means no GPU.
    num_gpus=2):
    # Single-step pipeline: assemble a `pbrun human_par` command line and hand it to
    # the training component as its run script.
    #
    # Parameters (all become pipeline parameters with the defaults shown above):
    #   auth_token   - defaults to DKUBE_USER_ACCESS_TOKEN, read from the environment
    #                  once, when this function is defined.
    #   container    - JSON string with the image name and (empty) registry credentials.
    #   workspace    - passed to the component as `program`.
    #   datasets     - JSON-encoded list of dataset names passed to the component.
    #   ref, in_fq_1, in_fq_2, knownSites
    #                - paths appended to $DKUBE_INPUT_DATASETS/ on the command line.
    #   range_male, range_female, sample_sex
    #                - forwarded verbatim to the matching pbrun options.
    #   num_gpus     - used both for --num-gpus and for the component's `ngpus`.

    # Values substituted, in order, into the `{}` slots of the argument template.
    substitutions = (
        ref, in_fq_1, in_fq_2, knownSites,
        range_male, range_female, sample_sex, num_gpus,
    )

    # Arguments for the script run inside the training container. The trailing
    # backslashes join the lines, so this is one long line (indentation included).
    pbrun_args = """--ref $DKUBE_INPUT_DATASETS/{} --in-fq $DKUBE_INPUT_DATASETS/{} $DKUBE_INPUT_DATASETS/{} \
            --knownSites $DKUBE_INPUT_DATASETS/{} --range-male {} --range-female {} --sample-sex {} \
            --out-bam $DKUBE_INPUT_DATASETS/output.bam --out-variants $DKUBE_INPUT_DATASETS/output.vcf \
            --out-recal-file $DKUBE_INPUT_DATASETS/report.txt --num-gpus {}""".format(*substitutions)

    # Path to the NVIDIA Parabricks license. Upload it to the workspace.
    license_path = "license.bin"

    # The "|| true" suffix makes the shell command report success even when pbrun
    # exits with an error.
    command = "".join([
        "/INSTALL/pbrun human_par ",
        pbrun_args,
        " --license-file ",
        license_path,
        "|| true",
    ])

    # Calling the component factory adds the training step to the pipeline.
    dkube_training_op(
        auth_token,
        container,
        program=workspace,
        run_script=command,
        datasets=datasets,
        ngpus=num_gpus,
    )

Compile the pipeline and generate the tarball

In [10]:
# Compile the pipeline function into a package written to 'human_par_pl.tar.gz' (relative path).
compiler.Compiler().compile(human_par_pipeline, 'human_par_pl.tar.gz')

Create and run the pipeline

In [11]:
# Submit the compiled package as a run named 'human-par-pipeline' under the experiment created
# earlier. params={} supplies no overrides, so the defaults from the pipeline signature apply.
run = client.run_pipeline(parabricks_experiment.id, 'human-par-pipeline', 'human_par_pl.tar.gz', params={})