### Import and Config

In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "iguazio/shell:3.0_b5565_20201026062233_wsdf" # container image for the job; per the original note, also available at idan707/spark_shell

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'iguazio/shell:3.0_b5565_20201026062233_wsdf'


In [3]:
#!/usr/local/bin/python

import mlrun
from mlrun.platforms.iguazio import mount_v3io, mount_v3iod
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

import os
#import spark_df_profiling
from subprocess import run

from pyspark.sql import SparkSession
import pyspark.sql.functions as f


### Build a Simple CSV-Reading Function Using Spark

In [4]:
#!/usr/local/bin/python

# Executed at module load, before the handler below is defined.
# NOTE(review): presumably this script prepares the Spark/v3io environment
# for the job - confirm. The exit status is not checked, so a failing script
# does not stop the module from loading.
spark_operator_cmd = ["/bin/bash", "/etc/config/v3io/v3io-spark-operator.sh"]
run(spark_operator_cmd)

def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path):
    """Read a CSV dataset with Spark and log a random sample of it.

    The CSV is loaded with a header row and schema inference, its first five
    rows are printed, and a sample (fraction 0.1, without replacement) is
    converted to pandas and logged as the ``df_sample`` dataset artifact in
    CSV format under ``context.artifact_subpath('data')``.

    The sample is taken without a seed, so its contents may differ between
    runs.

    :param context:       MLRun execution context used to log the sample.
    :param dataset:       input data item that points at the CSV file.
    :param artifact_path: not used by this handler; kept so the signature
                          stays compatible with existing callers.
    """
    # get file location
    location = dataset.local()

    # build spark session
    spark = SparkSession.builder.appName("Spark job").getOrCreate()

    # Stop the session even when reading, sampling or logging fails, so a
    # failed run does not leave the Spark session running.
    try:
        # read csv
        df = spark.read.csv(location, header=True, inferSchema=True)

        # show
        df.show(5)

        # sample for logging
        df_to_log = df.sample(False, 0.1).toPandas()

        # log final report
        context.log_dataset("df_sample",
                            df=df_to_log,
                            format="csv", index=False,
                            artifact_path=context.artifact_subpath('data'))
    finally:
        spark.stop()


In [5]:
# nuclio: end-code

### Save and Config

In [6]:
# Build the MLRun function object whose entry point is the handler defined above.
handler_name = "describe_spark"
fn = mlrun.code_to_function(handler=handler_name)

In [7]:
# Apply the v3io mount modifier first, then the v3iod one (same order as before).
v3io_mount = mount_v3io()
fn.apply(v3io_mount)

v3iod_mount = mount_v3iod(
    namespace="default-tenant",
    v3io_config_configmap="spark-operator-v3io-config",
)
fn.apply(v3iod_mount)

# Set the image pull policy on the function spec.
fn.spec.image_pull_policy = "IfNotPresent"

In [8]:
# Point MLRun at the API service and use the notebook's working directory
# (resolved to an absolute path) as the artifact location.
notebook_dir = os.path.abspath('./')
artifact_path = mlrun.set_environment(
    api_path='http://mlrun-api:8080',
    artifact_path=notebook_dir,
)



In [9]:
# Submit the job with the iris CSV as the `dataset` input; the call stays the
# last expression so the returned run object is displayed by the cell.
run_inputs = {"dataset": "iris_dataset.csv"}
fn.run(inputs=run_inputs, artifact_path=artifact_path)

> 2020-10-28 11:26:16,536 [info] starting run test-describe_spark uid=c90bf87b5c9641ca9bc17940e068ab38  -> http://mlrun-api:8080
> 2020-10-28 11:26:16,680 [info] Job is running in the background, pod: test-describe-spark-xlzj7


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...e068ab38,0,Oct 28 11:26:16,running,test-describe_spark,v3io_user=adminkind=jobowner=admin,dataset,,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run c90bf87b5c9641ca9bc17940e068ab38 --project default , !mlrun logs c90bf87b5c9641ca9bc17940e068ab38 --project default
> 2020-10-28 11:26:16,762 [info] run executed, status=running


<mlrun.model.RunObject at 0x7fcfdd6b8b10>