# MLRun Spark on K8s Example

#### Prerequisites

In [None]:
# using awscli==1.35.2 due to ML-9923

In [None]:
# Ask the MLRun API (client-spec endpoint) which version the server reports, then
# install the matching mlrun[complete] client together with the pinned awscli.
!version=`curl -s ${IGZ_MLRUN_API_ENDPOINT}/api/v1/client-spec | python3 -c "import sys, json; print(json.load(sys.stdin)['version'])"`; pip install awscli==1.35.2 mlrun[complete]==$version

#### Create or load a project

In [None]:
# Get the "mlrun-spark-k8s" project (or create it), using the current
# directory as the project context.
import mlrun
import os

project_name = "mlrun-spark-k8s"
project = mlrun.get_or_create_project(name=project_name, context="./")


#### Credentials & Parameters

In [None]:
import uuid

# A fresh UUID per run keeps this run's objects under their own S3 prefix.
work_dir = str(uuid.uuid4())

# The S3 credentials are read from the notebook's environment.
secrets = {key: os.getenv(key)
           for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')}

# Fail fast when a credential is unset: os.getenv() returns None in that case and
# the None would otherwise be handed to set_secrets() below without any error here.
if not all(secrets.values()):
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(key for key, value in secrets.items() if not value)
    )

# Bucket holding the input dataset and the job output (overridable via S3_BUCKET)
S3_BUCKET = os.environ.get('S3_BUCKET', 'testbucket-igz-temp')

# Bucket-relative locations; the URL scheme (s3:// or s3a://) is added at the point of use
source_path = f"{S3_BUCKET}/{work_dir}/dataset.csv"
target_path = f"{S3_BUCKET}/{work_dir}/target-pq"

# Create project secrets with S3 credentials
project.set_secrets(secrets=secrets, provider="kubernetes")

## Prepare the dataset

In [None]:
#upload dataset to S3
!aws s3 cp dataset.csv s3://$source_path

# cleanup from previous runs
!aws s3 rm --recursive s3://$target_path



#### Deploy default spark image

In [None]:
# Deploy the default image of the Spark3 runtime before the function is defined.
# NOTE(review): presumably this is what provides '.spark-job-default-image:latest',
# the image the Spark function in this notebook is configured with — confirm.
from mlrun.runtimes import Spark3Runtime
Spark3Runtime.deploy_default_image()

#### Initiate an MLRun function of kind Spark

In [None]:
# Register simple-spark-etl.py in the project as a function of kind "spark"
# that uses the default Spark job image.
fn = project.set_function(
    func="simple-spark-etl.py",
    name="my-spark-function",
    kind="spark",
    image='.spark-job-default-image:latest',
    tag="latest",
)

In [None]:
# Driver resources
# NOTE(review): the mem values look like Spark/JVM-style sizes ("m" = mebibytes),
# not Kubernetes milli-units — confirm against the Spark runtime documentation.
fn.with_driver_limits(cpu="1300m")
fn.with_driver_requests(cpu=1, mem="1300m")

# Executor resources
fn.with_executor_limits(cpu="1400m")
fn.with_executor_requests(cpu=1, mem="512m")

# Two replicas, and always pull the image
fn.spec.replicas = 2
fn.spec.image_pull_policy = 'Always'

# Command-line arguments for the script: input and output locations as s3a:// URLs
fn.spec.args = [
    '--source_path', f"s3a://{source_path}",
    '--target_path', f"s3a://{target_path}",
]


### Run

In [None]:
# Run the configured Spark function through the project
project.run_function(fn)

### Check S3

In [None]:
count = !aws s3 ls s3://$target_path/ | wc -l
assert count == ['21'] or count == ['22']  # newer hadoop / aws jar can add extra empty "0" file

In [None]:
# Display the captured `wc -l` output that the assertion checked
count

In [None]:
!aws s3 ls s3://$target_path/ 

### Cleanup

In [None]:
!aws s3 rm --recursive s3://$target_path
!aws s3 rm --recursive s3://$source_path

mlrun.get_run_db().delete_project(name=project.name, deletion_strategy='cascade')