# Submiting spark

In [131]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [132]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


## Function

In [133]:
from mlrun import get_or_create_ctx
from kubernetes import config, client
from kubernetes.stream import stream

In [134]:
def handler(context,spark_options='',shell_pod_name = 'shell'):        
    cmd = 'spark-submit ' + spark_options
    context.logger.info("submiting :" + cmd)
    cli = K8SClient(context.logger)
    cli.exec_shell_cmd(cmd, shell_pod_name)

class K8SClient(object):
    def __init__(self, logger, namespace='default-tenant', config_file=None):
        self.namespace = namespace
        self.logger = logger
        self._init_k8s_config(config_file)
        self.v1api = client.CoreV1Api()

    def _init_k8s_config(self, config_file):
        try:
            config.load_incluster_config()
            self.logger.info('using in-cluster config.')
        except Exception:
            try:
                config.load_kube_config(config_file)
                self.logger.info('using local kubernetes config.')
            except Exception:
                raise RuntimeError(
                    'cannot find local kubernetes config file,'
                    ' place it in ~/.kube/config or specify it in '
                    'KUBECONFIG env var')

    def get_shell_pod_name(self, shell_pod_name):
        shell_pod = self.v1api.list_namespaced_pod(namespace=self.namespace)
        for i in shell_pod.items:            
            if shell_pod_name in i.metadata.name:
                self.logger.info("%s\t%s\t%s" % (i.status.pod_ip, i.metadata.namespace, i.metadata.name))
                shell_name = i.metadata.name
                break
        return shell_name

    def exec_shell_cmd(self, cmd, shell_pod_name):
        shell_name = self.get_shell_pod_name(shell_pod_name)
        # Calling exec and waiting for response
        exec_command = [
            '/bin/bash',
            '-c',
            cmd]
        resp = stream(self.v1api.connect_get_namespaced_pod_exec,
                      shell_name,
                      self.namespace,
                      command=exec_command,
                      stderr=True, stdin=False,
                      stdout=True, tty=False)
        self.logger.info("Response: " + resp)    
    

In [135]:
# nuclio: end-code

## Test
> This test uses the metrics data, created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations)  
To test it yourself, please generate this dataset or use any of your available csv/parquet datasets.

In [136]:
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'


### Define the execute test task

In [137]:
execute_task = NewTask(name='spark-submit',
                         project='submit-proj',
                         params={'spark_options':"--class org.apache.spark.examples.SparkPi --jars /spark/examples/jars/spark-examples_2.11-2.4.4.jar 10",'shell_pod_name':'shell'},                                                  
                         handler=handler)

In [138]:
#submit_run = run_local(submit_task)

### Test on cluster

Convert the code to an MLRun function

In [139]:
fn = code_to_function('submit', handler='handler')
fn.spec.service_account='mlrun-api'
fn.apply(mount_v3io())
fn.export('function.yaml')

[mlrun] 2020-06-11 19:13:35,044 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f4258e73da0>

In [140]:
execute_run = fn.run(execute_task)

[mlrun] 2020-06-11 19:13:35,066 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-06-11 19:13:35,091 starting run spark-submit uid=c74e20f21cd64bb0919de44de0cab3f5  -> http://10.192.248.103:8080
[mlrun] 2020-06-11 19:13:35,265 Job is running in the background, pod: spark-submit-dmw4m
[mlrun] 2020-06-11 19:13:41,015 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-06-11 19:13:41,075 submiting :spark-submit --class org.apache.spark.examples.SparkPi --jars /spark/examples/jars/spark-examples_2.11-2.4.4.jar 10
[mlrun] 2020-06-11 19:13:41,076 using in-cluster config.
[mlrun] 2020-06-11 19:13:41,382 10.200.0.6	default-tenant	avi-shell-6d9c467597-89lss
[mlrun] 2020-06-11 19:13:50,612 Response: 20/06/11 19:13:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/06/11 19:13:44 WARN deploy.DependencyUtils: Local jar /User/10

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
submit-proj,...e0cab3f5,0,Jun 11 19:13:41,completed,spark-submit,host=spark-submit-dmw4mkind=jobowner=iguaziov3io_user=iguazio,,shell_pod_name=shellspark_options=--class org.apache.spark.examples.SparkPi --jars /spark/examples/jars/spark-examples_2.11-2.4.4.jar 10,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run c74e20f21cd64bb0919de44de0cab3f5 --project submit-proj , !mlrun logs c74e20f21cd64bb0919de44de0cab3f5 --project submit-proj
[mlrun] 2020-06-11 19:13:57,258 run executed, status=completed


### Show results