# Spark with MLRun example

This example notebook demonstrates how to execute a Spark job with MLRun.

Our Spark job is a generic ETL job that pulls data from user-defined data sources, applies a SQL query on top of them, and writes the result to a user-defined destination.

The input sources should be defined according to the `DataFrameReader` API: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader

The output destination should be defined according to the `DataFrameWriter` API:
https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter

In [None]:
import os
from os.path import isfile, join
from mlrun import new_function, NewTask, mlconf, mount_v3io

# Set the mlrun database/api.
# Keep an address that is already configured (for example through the
# MLRUN_DBPATH environment variable) and fall back to the in-cluster default
# only when none is set, instead of unconditionally overwriting it.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [None]:
# Environment vars to be set by Nuclio; each one has a default for interactive use.

# Name of the pyspark script, appended to V3IO_SCRIPT_PATH later on
# (the default starts with '/').
PYTHON_SCRIPT = os.getenv('PYTHON_SCRIPT', '/kv-to-parquet.py')

# Directory that holds the script. The default is derived from the current
# working directory only when V3IO_SCRIPT_PATH is not provided, so an unset
# V3IO_HOME can no longer break a run that supplies the path explicitly.
V3IO_SCRIPT_PATH = os.getenv('V3IO_SCRIPT_PATH')
if V3IO_SCRIPT_PATH is None:
    V3IO_SCRIPT_PATH = os.getcwd()
    v3io_home = os.getenv('V3IO_HOME')
    if v3io_home is not None:
        # Rewrite the '/User' prefix to the matching '/v3io/<V3IO_HOME>' location.
        V3IO_SCRIPT_PATH = V3IO_SCRIPT_PATH.replace('/User', '/v3io/' + v3io_home)
    # Without V3IO_HOME the mapping cannot be built (concatenating None used to
    # raise TypeError), so the working directory is kept unchanged.

SPARK_JOB_NAME = os.getenv('SPARK_JOB_NAME', 'my-spark-job')
SPARK_SPEC_MEM = os.getenv('SPARK_SPEC_MEM', '2g')
# NOTE(review): this is a str when read from the environment and the int 1
# otherwise; left untouched because a CPU quantity may not be a plain integer.
SPARK_SPEC_CPU = os.getenv('SPARK_SPEC_CPU', 1)
# os.getenv returns a str whenever the variable is set; convert so the replica
# count is always an int, matching the default.
SPARK_SPEC_REPLICAS = int(os.getenv('SPARK_SPEC_REPLICAS', '1'))

In [None]:
#Set the pyspark script path
# Append the script name only when it is not already there, so that re-running
# this cell does not keep growing the path with repeated script names.
if not V3IO_SCRIPT_PATH.endswith(PYTHON_SCRIPT):
    V3IO_SCRIPT_PATH = V3IO_SCRIPT_PATH + PYTHON_SCRIPT

In [None]:
# Display the resolved script path (directory + script name) for a visual check
V3IO_SCRIPT_PATH

## Define a task (job parameters)

In [None]:
# Job parameters for the task; empty here, add entries as needed.
PARAMS = dict()
# Task object that is later handed to the function's run() call.
SPARK_TASK = NewTask(params=PARAMS)

In [None]:
# Collect the full paths of the v3io dependency jars
V3IO_JARS_PATH = '/igz/java/libs/'
DEPS_JARS_LIST = []
for jar_name in os.listdir(V3IO_JARS_PATH):
    # Only entries named 'v3io-*.jar' are of interest
    if not (jar_name.startswith('v3io-') and jar_name.endswith('.jar')):
        continue
    jar_path = join(V3IO_JARS_PATH, jar_name)
    # Skip directories and anything else that is not a regular file
    if isfile(jar_path):
        DEPS_JARS_LIST.append(jar_path)


## Run as a job on the Kubernetes cluster

In [None]:
# Define the MLRun function (kind 'spark') that runs the script on the
# Kubernetes cluster, then apply the v3io mount to it.
serverless_spark_fn = new_function(
    name=SPARK_JOB_NAME,
    kind='spark',
    image='urihoenig/spark-app:2.4.4-2.9.0-0.0.3',
    command='local://{}'.format(V3IO_SCRIPT_PATH),
)
serverless_spark_fn = serverless_spark_fn.apply(mount_v3io())

In [None]:
# Print the function definition as YAML for inspection
print(serverless_spark_fn.to_yaml())

In [None]:
# Pass V3IO_HOME_URL from the notebook environment on to the function.
# NOTE(review): every run of this cell appends another entry, and the value is
# None when V3IO_HOME_URL is not set in the notebook environment.
v3io_home_url_entry = {'name': 'V3IO_HOME_URL', 'value': os.getenv("V3IO_HOME_URL")}
serverless_spark_fn.spec.env.append(v3io_home_url_entry)

In [None]:
# Display the function's environment variable list
serverless_spark_fn.spec.env

In [None]:
# Resource settings taken from the configuration values defined earlier:
# memory limit from SPARK_SPEC_MEM, CPU request from SPARK_SPEC_CPU.
serverless_spark_fn.with_limits(mem=SPARK_SPEC_MEM)
serverless_spark_fn.with_requests(cpu=SPARK_SPEC_CPU)
# NOTE(review): the igz_version string is hard-coded; presumably it has to match
# the platform version of the target cluster -- confirm before running elsewhere.
serverless_spark_fn.with_igz_spark(igz_version='2.8_b3506_20191217042239')
#Set number of executors
serverless_spark_fn.spec.replicas = SPARK_SPEC_REPLICAS

# Run the function with the task defined earlier (SPARK_TASK)
serverless_spark_fn.run(SPARK_TASK)