# Submitting Spark

In [1]:
#!pip install mlrun

In [2]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [3]:
%nuclio config kind = "job"
#%nuclio config spec.image = "docker-registry.default-tenant.app.iguazio.padsquad.com:80/mlrun/mlrun"

%nuclio: setting kind to 'job'


In [4]:
%run set_env.ipynb

## Function

In [5]:
from mlrun import get_or_create_ctx
from kubernetes import config, client
from kubernetes.stream import stream


In [73]:
class K8SClient(object):
    """Small helper around the Kubernetes CoreV1 API for running shell commands in a pod.

    :param logger:      logger object exposing ``info()``; used for progress messages.
    :param namespace:   namespace in which pods are listed and commands are executed.
    :param config_file: optional kubeconfig path, used only when in-cluster
                        configuration cannot be loaded.
    """

    def __init__(self, logger, namespace='default-tenant', config_file=None):
        self.namespace = namespace
        self.logger = logger
        self._init_k8s_config(config_file)
        self.v1api = client.CoreV1Api()

    def _init_k8s_config(self, config_file):
        """Load the in-cluster configuration, falling back to a local kubeconfig.

        :raises RuntimeError: when neither configuration source can be loaded.
        """
        try:
            config.load_incluster_config()
            self.logger.info('using in-cluster config.')
        except Exception:
            try:
                config.load_kube_config(config_file)
                self.logger.info('using local kubernetes config.')
            except Exception as exc:
                # Chain the underlying error so the real cause stays visible.
                raise RuntimeError(
                    'cannot find local kubernetes config file,'
                    ' place it in ~/.kube/config or specify it in '
                    'KUBECONFIG env var') from exc

    def get_shell_pod_name(self, pod_name='shell'):
        """Return the name of the first pod whose name contains ``pod_name``.

        Matching is a plain substring test over the pods of ``self.namespace``,
        in the order the API returns them.

        :raises RuntimeError: when no pod name in the namespace contains ``pod_name``.
        """
        pods = self.v1api.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items or []:
            if pod_name in pod.metadata.name:
                self.logger.info("%s\t%s\t%s" % (pod.status.pod_ip,
                                                 pod.metadata.namespace,
                                                 pod.metadata.name))
                return pod.metadata.name
        # Fail with an explicit message instead of an UnboundLocalError.
        raise RuntimeError(
            "no pod with '%s' in its name found in namespace '%s'"
            % (pod_name, self.namespace))

    def exec_shell_cmd(self, cmd, shell_pod_name = 'shell'):
        """Run ``cmd`` through ``/bin/bash -c`` inside the matching pod.

        :param cmd:            command line handed to bash as a single string.
        :param shell_pod_name: substring used to locate the target pod.
        :returns: the response obtained from the exec stream, which is also logged.
        """
        shell_name = self.get_shell_pod_name(shell_pod_name)
        # Calling exec and waiting for response
        exec_command = [
            '/bin/bash',
            '-c',
            cmd]
        resp = stream(self.v1api.connect_get_namespaced_pod_exec,
                      shell_name,
                      self.namespace,
                      command=exec_command,
                      stderr=True, stdin=False,
                      stdout=True, tty=False)
        self.logger.info("Response: " + str(resp))
        return resp


def spark_command_builder(name, class_name, jars, packages, spark_options):
    """Assemble a ``spark-submit`` command line as a single string.

    Each of ``name``, ``class_name``, ``jars`` and ``packages`` is added with its
    flag only when truthy, and is shell-quoted so that values containing spaces
    or shell metacharacters reach ``spark-submit`` as one argument (the command
    is later executed through ``bash -c``).

    :param name:          value for ``--name``.
    :param class_name:    value for ``--class``.
    :param jars:          value for ``--jars``.
    :param packages:      value for ``--packages``.
    :param spark_options: free-form tail (extra options and the application
                          file); appended verbatim, so the caller is
                          responsible for quoting inside it.
    :returns: the command string, starting with ``spark-submit``.
    """
    # Local import keeps this function self-contained when the code is exported.
    from shlex import quote

    parts = ['spark-submit']
    flagged_values = (
        ('--name', name),
        ('--class', class_name),
        ('--jars', jars),
        ('--packages', packages),
    )
    for flag, value in flagged_values:
        if value:
            parts.extend((flag, quote(str(value))))

    if spark_options:
        # Intentionally not quoted: this may hold several space-separated tokens.
        parts.append(spark_options)

    return ' '.join(parts)


def spark_submit(context, v3io_access_key=None, name=None, class_name=None, jars=None, packages=None, spark_options=''):
    """Submit a Spark application by running ``spark-submit`` inside a shell pod.

    The command line is built with ``spark_command_builder`` and executed via
    ``K8SClient.exec_shell_cmd`` with that client's default namespace and
    shell-pod name.

    :param context:         MLRun context; its ``logger`` is used for log output.
    :param v3io_access_key: not used by this function; kept (now optional) for
                            backward compatibility with existing callers.
    :param name:            A name of your application.
    :param class_name:      Your application's main class (for Java / Scala apps).
    :param jars:            Comma-separated list of jars to include on the driver
                            and executor classpaths.
    :param packages:        Comma-separated list of maven coordinates of jars to include
                            on the driver and executor classpaths. Will search the local
                            maven repo, then maven central and any additional remote
                            repositories given by --repositories. The format for the
                            coordinates should be groupId:artifactId:version.
    :param spark_options:   Spark parameters that are not covered by the arguments
                            above (including the application file), appended to
                            the command as-is.
    """
    cmd = spark_command_builder(name, class_name, jars, packages, spark_options)
    context.logger.info("submiting :" + cmd)
    cli = K8SClient(context.logger)
    cli.exec_shell_cmd(cmd)
    #cli.exec_shell_cmd("pip uninstall -y pandas")
    

In [74]:
# nuclio: end-code

## Test
> This test uses the metrics data, created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations)  
To test it yourself, please generate this dataset or use any of your available csv/parquet datasets.  
Note that the task defined below submits the PySpark script passed in `spark_options`; point it at a script that exists in your own environment.

In [75]:
import os
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already-configured DB path; otherwise fall back to the `mlrun-api` service URL.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Keep an already-configured artifact path; otherwise use `$HOME/artifacts`
# (os.environ["HOME"] raises KeyError when HOME is not set).
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

In [76]:
pwd

'/User/sparkworkflows'

### Define the execute test task

In [77]:
# Task definition for the `spark_submit` handler; the script to run is
# handed over through the `spark_options` parameter.
execute_task = NewTask(
    name='spark-submit',
    project='submit-proj',
    params={'spark_options': "/v3io/users/admin/sparkworkflows/pi-Copy1.py"},
    handler=spark_submit,
)

In [78]:
#submit_run = run_local(submit_task)

In [79]:
# execute_task = NewTask(name='spark-submit',
#                          project='submit-proj',
#                          params={'spark_options':""},                          
#                          handler=spark_submit)

### Test on cluster

Convert the code to an MLRun function

In [80]:
# Build an MLRun function named 'submit' whose entry point is `spark_submit`.
fn = code_to_function('submit', handler='spark_submit')
# fn.spec.image = docker_registry + "/mlrunjob/padsquad_d"
# Service account for the job; presumably it must be allowed to list pods and
# exec into them, since K8SClient does both — verify the RBAC setup.
fn.spec.service_account='mlrun-api'
# NOTE(review): the image has no version tag; consider pinning one for reproducible runs.
fn.spec.image = "mlrun/mlrun"
# Apply the v3io mount modifier (default arguments) to the function.
fn.apply(mount_v3io())
# Save the function spec to function.yaml.
fn.export('function.yaml')

[mlrun] 2020-06-29 19:58:36,168 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb9a9ec73c8>

In [82]:
# Run the task with the function built above and keep the returned run object.
execute_run = fn.run(execute_task)

[mlrun] 2020-06-29 20:02:50,158 starting run spark-submit uid=bbda3487f86a4c32b06dba09c77bf473  -> http://mlrun-api:8080
[mlrun] 2020-06-29 20:02:50,433 Job is running in the background, pod: spark-submit-nm5cj
[mlrun] 2020-06-29 20:02:57,022 submiting :spark-submit /v3io/users/admin/sparkworkflows/pi-Copy1.py
[mlrun] 2020-06-29 20:02:57,022 using in-cluster config.
[mlrun] 2020-06-29 20:02:57,215 10.200.0.59	default-tenant	bill-shell-66c5fcb65-jdwzv
[mlrun] 2020-06-29 20:03:07,023 Response: 20/06/29 20:02:58 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/06/29 20:03:00 INFO spark.SparkContext: Running Spark version 2.4.4
20/06/29 20:03:00 INFO spark.SparkContext: Submitted application: PythonPi
20/06/29 20:03:01 INFO spark.SecurityManager: Changing view acls to: iguazio
20/06/29 20:03:01 INFO spark.SecurityManager: Changing modify acls to: iguazio
20/06/29 20:03:01 INFO spark.SecurityManager: Changin

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
submit-proj,...c77bf473,0,Jun 29 20:02:56,completed,spark-submit,v3io_user=adminkind=jobowner=adminhost=spark-submit-nm5cj,,spark_options=/v3io/users/admin/sparkworkflows/pi-Copy1.py,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run bbda3487f86a4c32b06dba09c77bf473 --project submit-proj , !mlrun logs bbda3487f86a4c32b06dba09c77bf473 --project submit-proj
[mlrun] 2020-06-29 20:03:09,679 run executed, status=completed


### Show results