# Submitting Spark

In [11]:
#!pip install mlrun

In [12]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [13]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


## Function

In [14]:
from mlrun import get_or_create_ctx
from kubernetes import config, client
from kubernetes.stream import stream


In [15]:
class K8SClient:
    """Small wrapper around the Kubernetes ``CoreV1Api``.

    It is used to locate a pod by (partial) name inside a namespace and to
    execute a bash command inside that pod.
    """

    def __init__(self, logger, namespace='default-tenant', config_file=None):
        """Load the Kubernetes configuration and create the API client.

        :param logger:      object exposing an ``info(message)`` method.
        :param namespace:   namespace in which pods are listed and commands
                            are executed.
        :param config_file: path handed to ``config.load_kube_config`` when
                            the in-cluster configuration cannot be loaded.
        :raises RuntimeError: if neither configuration can be loaded.
        """
        self.namespace = namespace
        self.logger = logger
        self._init_k8s_config(config_file)
        self.v1api = client.CoreV1Api()

    def _init_k8s_config(self, config_file):
        """Try the in-cluster configuration first, then a local kube config.

        :param config_file: path handed to ``config.load_kube_config``.
        :raises RuntimeError: if the local kube config cannot be loaded either;
                              the underlying error is chained as the cause.
        """
        try:
            config.load_incluster_config()
            self.logger.info('using in-cluster config.')
        except Exception:
            try:
                config.load_kube_config(config_file)
                self.logger.info('using local kubernetes config.')
            except Exception as exc:
                raise RuntimeError(
                    'cannot find local kubernetes config file,'
                    ' place it in ~/.kube/config or specify it in '
                    'KUBECONFIG env var') from exc

    def get_shell_pod_name(self, pod_name='shell'):
        """Return the name of the first pod whose name contains ``pod_name``.

        The pod IP, namespace and name of the match are logged.

        :param pod_name: substring to look for in the pod names.
        :returns: the full name of the first matching pod.
        :raises RuntimeError: if no pod in the namespace matches.
        """
        pods = self.v1api.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items:
            if pod_name in pod.metadata.name:
                self.logger.info("%s\t%s\t%s" % (pod.status.pod_ip, pod.metadata.namespace, pod.metadata.name))
                return pod.metadata.name
        # Fail with an explicit message instead of an unbound-variable error.
        raise RuntimeError(
            "no pod whose name contains '%s' was found in namespace '%s'"
            % (pod_name, self.namespace))

    def exec_shell_cmd(self, cmd, shell_pod_name='shell'):
        """Run ``cmd`` through ``/bin/bash -c`` inside the matching pod.

        :param cmd:            command line to execute.
        :param shell_pod_name: substring used to select the target pod
                               (see ``get_shell_pod_name``).
        :returns: the response returned by the exec stream call.
        :raises RuntimeError: if no matching pod is found.
        """
        shell_name = self.get_shell_pod_name(shell_pod_name)
        # Calling exec and waiting for response
        exec_command = ['/bin/bash', '-c', cmd]
        resp = stream(self.v1api.connect_get_namespaced_pod_exec,
                      shell_name,
                      self.namespace,
                      command=exec_command,
                      stderr=True, stdin=False,
                      stdout=True, tty=False)
        # NOTE(review): the remote command's exit status is not inspected
        # here, so a failing command is only visible in the logged output.
        self.logger.info("Response: " + str(resp))
        return resp


def spark_command_builder(name, class_name, jars, packages, spark_options):
    """Assemble a ``spark-submit`` command line.

    Each argument is optional: a falsy value (``None`` or ``''``) leaves the
    corresponding part out of the command.

    :param name:          value for ``--name``.
    :param class_name:    value for ``--class``.
    :param jars:          value for ``--jars``.
    :param packages:      value for ``--packages``.
    :param spark_options: free-form text appended verbatim at the end.
    :returns: the command line as a single string.
    """
    # Flags are emitted in this fixed order, each followed by its value.
    flagged_values = (
        ('--name', name),
        ('--class', class_name),
        ('--jars', jars),
        ('--packages', packages),
    )

    parts = ['spark-submit']
    for flag, value in flagged_values:
        if value:
            parts.append(flag + ' ' + value)

    if spark_options:
        parts.append(spark_options)

    return ' '.join(parts)


def spark_submit(context, v3io_access_key, name=None, class_name=None, jars=None, packages=None, spark_options=''):
    """Submit a Spark application by running ``spark-submit`` in the shell pod.

    The command line is built from the arguments, logged, and then executed
    through a ``K8SClient`` created with the context's logger.

    :param context:         function context; its ``logger`` is used for logging.
    :param v3io_access_key: accepted as part of the signature; not read by
                            this function.
    :param name:            A name of your application.
    :param class_name:      Your application's main class (for Java / Scala apps).
    :param jars:            Comma-separated list of jars to include on the driver
                            and executor classpaths.
    :param packages:        Comma-separated list of maven coordinates of jars to
                            include on the driver and executor classpaths. Will
                            search the local maven repo, then maven central and
                            any additional remote repositories given by
                            --repositories. The format for the coordinates
                            should be groupId:artifactId:version.
    :param spark_options:   Spark parameters that are not covered by the other
                            arguments; appended verbatim to the command line.
    """
    submit_command = spark_command_builder(
        name=name,
        class_name=class_name,
        jars=jars,
        packages=packages,
        spark_options=spark_options,
    )
    context.logger.info("submiting :" + submit_command)

    # The client is created only after the command has been logged.
    K8SClient(context.logger).exec_shell_cmd(submit_command)


In [16]:
# nuclio: end-code

## Test
> This test uses the metrics data, created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations)  
To test it yourself, please generate this dataset or use any of your available csv/parquet datasets.

In [17]:
import os
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already-configured DB path; otherwise use the default endpoint below.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Keep an already-configured artifact path; otherwise use $HOME/artifacts.
# The HOME lookup only happens in the fallback case and raises KeyError if HOME is unset.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### Define the execute test task

In [18]:
# Task that runs the SparkPi example class; spark_options carries the examples
# jar path followed by the argument (10) handed to the application.
execute_task = NewTask(
    name='spark-submit',
    project='submit-proj',
    params={
        'spark_options': "/spark/examples/jars/spark-examples_2.11-2.4.4.jar 10",
        'class_name': 'org.apache.spark.examples.SparkPi',
    },
    handler=spark_submit,
)

In [19]:
#submit_run = run_local(execute_task)

### Test on cluster

Convert the code to an MLRun function

In [20]:
# Convert this notebook's code into an MLRun function whose handler is spark_submit.
fn = code_to_function('spark-submit', handler='spark_submit')
# NOTE(review): presumably this service account grants the permissions needed
# to list pods and exec into them (used by K8SClient) — confirm.
fn.spec.service_account='mlrun-api'
# Apply the mount_v3io() modifier to the function (presumably mounts v3io — confirm).
fn.apply(mount_v3io())
# Save the function spec to function.yaml.
fn.export('function.yaml')

[mlrun] 2020-06-17 14:47:58,078 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f69a4117c50>

In [21]:
# Run the test task with the function and keep the returned run object.
execute_run = fn.run(execute_task)

[mlrun] 2020-06-17 14:47:58,101 starting run spark-submit uid=27391b5c221e4409baf4ae896169768b  -> http://mlrun-api:8080
[mlrun] 2020-06-17 14:47:58,204 Job is running in the background, pod: spark-submit-wrc48
[mlrun] 2020-06-17 14:48:02,407 starting local run: main.py # spark_submit
[mlrun] 2020-06-17 14:48:02,423 submiting :spark-submit --class org.apache.spark.examples.SparkPi /spark/examples/jars/spark-examples_2.11-2.4.4.jar 10
[mlrun] 2020-06-17 14:48:02,423 using in-cluster config.
[mlrun] 2020-06-17 14:48:02,516 10.200.0.53	default-tenant	shell-658dd9bb8f-8q5sr
[mlrun] 2020-06-17 14:48:08,739 Response: 20/06/17 14:48:03 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/06/17 14:48:04 INFO spark.SparkContext: Running Spark version 2.4.4
20/06/17 14:48:04 INFO spark.SparkContext: Submitted application: Spark Pi
20/06/17 14:48:04 INFO spark.SecurityManager: Changing view acls to: iguazio
20/06/17 1

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
submit-proj,...6169768b,0,Jun 17 14:48:02,completed,spark-submit,host=spark-submit-wrc48kind=jobowner=adminv3io_user=admin,,class_name=org.apache.spark.examples.SparkPispark_options=/spark/examples/jars/spark-examples_2.11-2.4.4.jar 10,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 27391b5c221e4409baf4ae896169768b --project submit-proj , !mlrun logs 27391b5c221e4409baf4ae896169768b --project submit-proj
[mlrun] 2020-06-17 14:48:10,433 run executed, status=completed


### Show results