# Submitting Spark

In [45]:
#!pip install mlrun==0.4.7

In [46]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [47]:
%nuclio config kind = "job"
%nuclio config spec.image = "aviaigz/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'aviaigz/mlrun'


## Function

In [48]:
from mlrun import get_or_create_ctx
from kubernetes import config, client
from kubernetes.stream import stream
from mlrun.k8s_utils import get_k8s_helper

In [49]:
def handler(context, v3io_access_key, name=None, class_param=None, jars=None,
            packages=None, spark_options=''):
    """Build a spark-submit command line, log it, and execute it.

    The command string is produced by ``spark_command_builder`` and handed,
    together with the access key, to ``exec_shell_cmd`` on the helper returned
    by ``get_k8s_helper()``.

    Args:
        context: MLRun run context; only ``context.logger.info`` is used here.
        v3io_access_key: forwarded as the second argument of ``exec_shell_cmd``.
        name: optional value for spark-submit ``--name``.
        class_param: optional value for spark-submit ``--class``.
        jars: optional value for spark-submit ``--jars``.
        packages: optional value for spark-submit ``--packages``.
        spark_options: free-form text appended after the flags above
            (the test task in this notebook passes the application jar and
            its arguments here).
    """
    submit_cmd = spark_command_builder(name, class_param, jars, packages, spark_options)
    # Log the full command before running it so it shows up in the run log.
    context.logger.info("submiting :" + submit_cmd)
    # NOTE(review): where/how exec_shell_cmd runs the command is not visible
    # in this notebook - confirm against the helper's implementation.
    get_k8s_helper().exec_shell_cmd(submit_cmd, v3io_access_key)
    
    
def spark_command_builder(name,class_param,jars,packages,spark_options):
    """Assemble a ``spark-submit`` command line as a single string.

    Each optional argument is added only when it carries a value. ``None``
    and the empty string are both treated as "not provided", so the command
    never contains a flag without a value (which would make spark-submit
    consume the following token as that value) and never ends in a dangling
    space when ``spark_options`` is ``''``.

    Args:
        name: value for ``--name``, or None/'' to omit the flag.
        class_param: value for ``--class``, or None/'' to omit the flag.
        jars: value for ``--jars``, or None/'' to omit the flag.
        packages: value for ``--packages``, or None/'' to omit the flag.
        spark_options: free-form text appended verbatim after the flags,
            or None/'' to append nothing.

    Returns:
        The command string, starting with ``spark-submit``, with the parts
        separated by single spaces.
    """
    parts = ['spark-submit']

    # Flags are emitted in a fixed order: name, class, jars, packages.
    flagged_values = (
        ('--name', name),
        ('--class', class_param),
        ('--jars', jars),
        ('--packages', packages),
    )
    for flag, value in flagged_values:
        # Truthiness check skips both None and '' so no flag is left valueless.
        if value:
            parts.extend((flag, value))

    # Appended as-is: the caller may pass several space-separated tokens here.
    if spark_options:
        parts.append(spark_options)

    return ' '.join(parts)

In [50]:
# nuclio: end-code

## Test
> This test uses the metrics data created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations).  
To test it yourself, generate this dataset or use any of your available CSV/Parquet datasets.

In [51]:
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already-configured MLRun DB path; if it is empty/unset, fall back
# to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'


### Define the execute test task

In [52]:
import os

# Test task for the spark-submit handler: runs the SparkPi example class from
# the Spark examples jar (jar path and its argument go in `spark_options`).
# The V3IO access key is read from the environment instead of being hardcoded,
# so the credential is not stored in the notebook source. A missing variable
# raises KeyError here, before anything is submitted.
execute_task = NewTask(name='spark-submit',
                       project='submit-proj',
                       params={'spark_options': "/spark/examples/jars/spark-examples_2.11-2.4.4.jar 10",
                               'v3io_access_key': os.environ['V3IO_ACCESS_KEY'],
                               'class_param': 'org.apache.spark.examples.SparkPi'},
                       handler=handler)

In [53]:
#submit_run = run_local(execute_task)

### Test on cluster

Convert the code to an MLRun function

In [54]:
# Convert this notebook's code into an MLRun function named 'submit' whose
# entry point is `handler`.
fn = code_to_function('submit', handler='handler')
# Run the function under the 'mlrun-api' service account.
# NOTE(review): presumably required for the permissions the k8s helper needs - confirm.
fn.spec.service_account='mlrun-api'
# Apply the v3io mount modifier to the function.
fn.apply(mount_v3io())
# Save the function spec to function.yaml.
fn.export('function.yaml')

[mlrun] 2020-06-17 10:02:33,837 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f1e3b067630>

In [None]:
# Run the test task with the exported function and keep the returned run
# object for inspection.
execute_run = fn.run(execute_task)

[mlrun] 2020-06-17 10:02:33,845 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-06-17 10:02:33,853 starting run spark-submit uid=5c43be949e29421c95939fe2e4e011cb  -> http://10.199.81.213:8080
[mlrun] 2020-06-17 10:02:33,949 Job is running in the background, pod: spark-submit-rps2k
[mlrun] 2020-06-17 10:02:41,007 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-06-17 10:02:41,030 starting local run: main.py # handler


### Show results