In [1]:
from mlrun import new_function, NewTask, get_run_db, mlconf, mount_v3io, new_model_server, builder
import kfp
from kfp import dsl
import os
from os.path import isfile, join

In [2]:
# Point MLRun at its API service. Honor an MLRUN_DBPATH value supplied by the
# environment and fall back to the in-cluster default otherwise, so the notebook
# does not silently override a deployment-specific setting.
mlconf.dbpath = os.getenv('MLRUN_DBPATH') or 'http://mlrun-api:8080'

In [3]:
# Environment vars to be set by Nuclio
PYTHON_SCRIPT = os.getenv('PYTHON_SCRIPT', '/kv-to-parquet.py')

# Resolve the script directory lazily: the fallback is only computed when
# V3IO_SCRIPT_PATH is not provided, and a missing V3IO_HOME no longer raises
# TypeError from concatenating a str with None.
V3IO_SCRIPT_PATH = os.getenv('V3IO_SCRIPT_PATH')
if V3IO_SCRIPT_PATH is None:
    _cwd = os.getcwd()
    _v3io_home = os.getenv('V3IO_HOME')
    if _v3io_home is not None:
        # Rewrite the '/User' part of the working directory to '/v3io/<V3IO_HOME>'.
        V3IO_SCRIPT_PATH = _cwd.replace('/User', '/v3io/' + _v3io_home)
    else:
        # Without V3IO_HOME the rewrite is not possible; use the working directory as-is.
        V3IO_SCRIPT_PATH = _cwd

SPARK_JOB_NAME = os.getenv('SPARK_JOB_NAME', 'my-spark-job')
SPARK_SPEC_MEM = os.getenv('SPARK_SPEC_MEM', '2g')
# Environment values arrive as strings, so the numeric settings are converted explicitly.
SPARK_SPEC_CPU = int(os.getenv('SPARK_SPEC_CPU', '1'))
SPARK_SPEC_REPLICAS = int(os.getenv('SPARK_SPEC_REPLICAS', '1'))

In [4]:
# Set the pyspark script path by appending the script name to its directory.
# The guard keeps the cell idempotent: re-running it must not append the
# script name a second time.
if not V3IO_SCRIPT_PATH.endswith(PYTHON_SCRIPT):
    V3IO_SCRIPT_PATH = V3IO_SCRIPT_PATH + PYTHON_SCRIPT

In [5]:
# Collect the dependency jars: regular files in V3IO_JARS_PATH whose names
# start with 'v3io-' and end with '.jar', stored as full paths.
V3IO_JARS_PATH = '/igz/java/libs/'
DEPS_JARS_LIST = []
for jar_name in os.listdir(V3IO_JARS_PATH):
    jar_path = join(V3IO_JARS_PATH, jar_name)
    if isfile(jar_path) and jar_name.startswith('v3io-') and jar_name.endswith('.jar'):
        DEPS_JARS_LIST.append(jar_path)


In [6]:
# Create the MLRun 'spark' function that runs the PySpark script on the
# Kubernetes cluster, then apply the v3io mount (mounted at /User) using the
# access key and username taken from the environment.
serverless_spark_fn = new_function(
    kind='spark',
    image='urihoenig/spark-app:2.4.4-2.9.0-0.0.3',
    command=f'local://{V3IO_SCRIPT_PATH}',
    name=SPARK_JOB_NAME,
).apply(
    mount_v3io(
        name='v3io',
        remote='~/',
        mount_path='/User',
        access_key=os.getenv('V3IO_ACCESS_KEY'),
        user=os.getenv('V3IO_USERNAME'),
    )
)

In [7]:
# Forward the notebook's V3IO_HOME_URL value to the function's environment list.
home_url_env = {'name': 'V3IO_HOME_URL', 'value': os.getenv("V3IO_HOME_URL")}
serverless_spark_fn.spec.env.append(home_url_env)

In [8]:
# Resource settings for the Spark function, driven by the SPARK_SPEC_* values
# read from the environment earlier in the notebook.
serverless_spark_fn.with_limits(mem=SPARK_SPEC_MEM)
serverless_spark_fn.with_requests(cpu=SPARK_SPEC_CPU)
# NOTE(review): the igz_version build string is hard-coded; presumably it has to
# match the platform build this notebook runs against — confirm before reuse.
serverless_spark_fn.with_igz_spark(igz_version='2.8_b3506_20191217042239')
#Set number of executors
serverless_spark_fn.spec.replicas = SPARK_SPEC_REPLICAS

In [9]:
@dsl.pipeline(
    name='Kubeflow pipeline with Spark jobs',
    description='Run SparkK8s as part of pipeline'
)
def example_pipeline(
   p1 = [1,2,3,4,5,6],
   p2 = [9,8,6,5,4,3]
):
    """Pipeline with a single step that runs the Spark function.

    The step is built from ``serverless_spark_fn`` with an empty ``NewTask``
    and has the v3io mount applied to it.

    Args:
        p1: Declared pipeline parameter; not consumed by the step.
        p2: Declared pipeline parameter; not consumed by the step.
    """
    spark_step = serverless_spark_fn.as_step(NewTask(), name='Sparkstep1').apply(
        mount_v3io(
            name='v3io',
            remote='~/',
            mount_path='/User',
            access_key=os.getenv('V3IO_ACCESS_KEY'),
            user=os.getenv('V3IO_USERNAME'),
        )
    )
    

In [10]:
# Kubeflow Pipelines client for the 'default-tenant' namespace.
client = kfp.Client(namespace='default-tenant')

# Sample parameter values. They are not sent with the run because `arguments`
# is left empty; to pass them use: arguments = {'p1': p1, 'p2': p2}
p1 = [1, 2, 3, 4, 5, 6]
p2 = [9, 8, 6, 5, 4, 3]
arguments = {}

# Record pipeline deployment in KV

In [11]:
import uuid
import v3io.dataplane

In [15]:
# Data-plane client used by record_pipeline_id to write run records.
# NOTE(review): `context` is not defined in any visible cell of this notebook, so
# this line raises NameError on a fresh kernel unless the runtime supplies a
# `context` object at module level — confirm how this cell is meant to run.
v3io_client = v3io.dataplane.Client(context.logger, max_connections=1)

In [18]:
def record_pipeline_id(run_id):
    """Write a 'started' record for a pipeline run through ``v3io_client``.

    The target container is read from MONITOR_CONTAINER (default 'bigdata')
    and the table from MONITOR_TABLE (default 'kubeflow_runs'). The item is
    stored under ``<table>/<run_id>``.

    Args:
        run_id: Identifier of the run; converted to ``str`` before use.
    """
    record_id = str(run_id)
    container = os.getenv('MONITOR_CONTAINER', 'bigdata')
    table = os.getenv('MONITOR_TABLE', 'kubeflow_runs')
    item_path = os.path.join(table, record_id)
    item_attributes = {'id': record_id, 'status': 'started'}
    v3io_client.put_item(container=container, path=item_path, attributes=item_attributes)

# get the Kubeflow run_id
Note: This notebook was written to invoke the deployment as an mlrun function.

In [21]:
def handler(context, event):
    """Start a run of ``example_pipeline`` and record its run id.

    A fresh UUID is used only to make the run name unique. The id that gets
    recorded and returned comes from the run result itself.

    Args:
        context: Not read by this function.
        event: Not read by this function.

    Returns:
        The object returned by ``client.create_run_from_pipeline_func``.
    """
    unique_suffix = str(uuid.uuid4())
    run_name = 'SparkPipe-' + unique_suffix
    run_result = client.create_run_from_pipeline_func(
        example_pipeline,
        arguments,
        run_name=run_name,
        experiment_name='SparkPipeline',
    )
    record_pipeline_id(run_result.run_id)
    return run_result

In [24]:
# Invoke the handler directly from the notebook; 'a' and 'b' are placeholders
# for the context and event arguments, which the handler does not read.
runobj = handler('a', 'b')

In [25]:
runobj.run_id

'44955143-38da-4079-a291-989d6afe205e'