In [1]:
import os
import json
import requests
import cml2cde.cdeconnection as cde

In [2]:
### Required Env Variables ###
# 1. JOBS API URL of the CDE Virtual Cluster you ship the PySpark Job to (you can copy this from the CDE Cluster Service page)
# 2. CDE Resource Name (this is something you can make up at will)
# 3. CDE Job Script (this is a PySpark script you can store anywhere in your local CML Project - an example is provided as /home/cdsw/example_spark_jobs/jobs/sql.py)

#### Connecting to the CDE Virtual Cluster

In [3]:
# Jobs API URL of the target CDE Virtual Cluster (copy it from the CDE Cluster Service page).
# A JOBS_API_URL environment variable, when set, takes precedence so the notebook can be
# pointed at a different Virtual Cluster without editing this cell; the URL that was
# previously hard-coded here remains the fallback, so existing runs behave as before.
JOBS_API_URL = os.environ.get(
    "JOBS_API_URL",
    "https://z4xgdztf.cde-6fr6l74r.go01-dem.ylcu-atmi.cloudera.site/dex/api/v1",
)
# Workload user name, read from the environment; raises KeyError if WORKLOAD_USER is not set.
WORKLOAD_USER = os.environ["WORKLOAD_USER"]

In [4]:
# Instantiate CdeConnection with the Jobs API URL and the workload user.
# Every later CDE call in this notebook (token, resource, upload, job create/run) goes through this object.
cde_connection = cde.CdeConnection(JOBS_API_URL, WORKLOAD_USER)

In [5]:
# The Workload Password should be set automatically for each user.
# If this cell fails (a missing WORKLOAD_PASSWORD raises KeyError), set the variable directly
# in the notebook or as a CML Project Environment Variable.
# The value returned by set_cde_token() is kept in TOKEN and passed to every later call in this notebook.
TOKEN = cde_connection.set_cde_token(os.environ["WORKLOAD_PASSWORD"]) 

#### Creating a CDE Resource

In [6]:
# Name of the CDE Resource used by the cells below (free to choose).
CDE_RESOURCE_NAME = "example-spark-jobs"

In [7]:
# Create the CDE Resource that the script is uploaded to further down.
# NOTE: the output recorded with this cell is a 409 response ("resource with name already exists"),
# i.e. a resource with this name was already present on the cluster when the cell was last run.
cde_connection.create_cde_resource(TOKEN, CDE_RESOURCE_NAME)

Started Creating Resource example-spark-jobs
409
{"status":"error","message":"resource with name already exists"}


In [8]:
# Local directory (inside the CML Project) that contains the example PySpark script.
PYSPARK_EXAMPLE_LOCAL_PATH = "/home/cdsw/example_spark_jobs/jobs/"
# File name of the script inside that directory.
PYSPARK_EXAMPLE_SCRIPT_NAME = "sql.py"
# Misspelled alias kept on purpose: the remaining cells of this notebook still refer to it.
PYSPARK_EXAMPE_SCRIPT_NAME = PYSPARK_EXAMPLE_SCRIPT_NAME

#### Uploading the Example PySpark script to the CDE Resource

In [9]:
# Caution (author's note): this method directly overwrites any existing file with the same
# name as PYSPARK_EXAMPE_SCRIPT_NAME within the CDE Resource, if present.
# Arguments, in order: target resource name, local directory, script file name, token.

cde_connection.upload_file(CDE_RESOURCE_NAME, PYSPARK_EXAMPLE_LOCAL_PATH, PYSPARK_EXAMPE_SCRIPT_NAME, TOKEN)

Uploading File sql.py to CDE Resource example-spark-jobs
Response Status Code 201
Uploading File sql.py to CDE Resource example-spark-jobs has Succeeded


#### Creating (Declaring but not Executing) the CDE Job from the CDE Resource Script

In [10]:
# The CDE Job name as it will appear in the CDE Jobs UI.
# Not to be confused with the script file name stored in PYSPARK_EXAMPE_SCRIPT_NAME above.
CDE_JOB_NAME = "simple_spark_sql" 

In [11]:
# Declare (but do not run) the CDE Spark Job from the script uploaded to the CDE Resource.
# NOTE: the output recorded with this cell is a 500 response ("job with name already exists"),
# i.e. a job named CDE_JOB_NAME was already defined on the cluster when the cell was last run.
cde_connection.create_spark_job_from_resource(TOKEN, CDE_JOB_NAME, CDE_RESOURCE_NAME, PYSPARK_EXAMPE_SCRIPT_NAME)

Started Creating CDE Spark Job simple_spark_sql with Script sql.py
500
{"status":"error","message":"job with name already exists"}


##### Spark Configurations can be set at CDE Spark Job declaration and are not mandatory.
##### The create_spark_job_from_resource() method used above allows you to pass an optional argument "spark_confs"
##### e.g. 
        spark_confs_example = { 
                  "spark.dynamicAllocation.maxExecutors": "6",
                  "spark.dynamicAllocation.minExecutors": "2",
                  "spark.executor.extraJavaOptions": "-Dsun.security.krb5.debug=true -Dsun.security.spnego.debug=true",
                  "spark.hadoop.fs.s3a.metadatastore.impl": "org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore",
                  "spark.kubernetes.memoryOverheadFactor": "0.2",
                  "spark.pyspark.python": "python3",
                  "spark.rpc.askTimeout": "600",
                  "spark.sql.shuffle.partitions": "48",
                  "spark.yarn.access.hadoopFileSystems": "s3a://your_data_lake_here"
                }                 
cde_connection.create_spark_job_from_resource(TOKEN, CDE_JOB_NAME, CDE_RESOURCE_NAME, PYSPARK_EXAMPE_SCRIPT_NAME, **spark_confs=spark_confs_example**)

#### Running the CDE Job 

In [12]:
# The Spark resource arguments default to the following values (as stated by the notebook author):
# "driverCores": 2, "driverMemory": "4g", "executorCores": 4, "executorMemory": "4g", "numExecutors": 4

# You can customize resources by passing keyword arguments, e.g.
# run_spark_job(TOKEN, CDE_JOB_NAME, driver_cores = 4, driver_memory = "8g", executor_cores = 4, executor_memory = "12g", num_executors = 10)
# Note that the defaults above are listed as camelCase keys, while the overrides are snake_case keyword arguments.

# Submit the job with the default resources. As the recorded output points out, a successful
# submission does not mean the job itself succeeded - check the CDE Job Runs UI for the run status.
cde_connection.run_spark_job(TOKEN, CDE_JOB_NAME)

Started to Submit Spark Job simple_spark_sql
Submitting CDE Spark Job simple_spark_sql has Succeeded
This doesn't necessarily mean that the CDE Spark Job has Succeeded
Please visit the CDE Job Runs UI to check on CDE Job Status
