## Migrating Oozie Spark Actions to Spark CDE Jobs

In [1]:
import numpy as np
import pandas as pd
import os
import json
import sys
import re
import requests
from requests_toolbelt import MultipartEncoder
import xmltodict as xd

In [2]:
# Connection settings read by the helper functions below:
#   WORKLOAD_USER - user name sent when requesting the CDE access token
#   JOBS_API_URL  - base URL of the CDE virtual cluster Jobs API
os.environ.update({
    "WORKLOAD_USER": "pauldefusco",
    "JOBS_API_URL": "https://tk5p4pn9.cde-6fr6l74r.go01-dem.ylcu-atmi.cloudera.site/dex/api/v1",
})

#### We reuse the Python methods from Notebook 4 to construct requests to CDE

In [3]:
# Set user token to interact with CDE Service remotely
def set_cde_token():
    """Request a CDE access token and export it through the environment.

    The token endpoint is derived from ``JOBS_API_URL``: the first label of
    the host name is swapped for ``service`` and the ``dex/api/v1`` path is
    swapped for the Knox token path. The derived URL is stored in
    ``GET_TOKEN_URL`` and the token itself in ``ACCESS_TOKEN``.

    The request is issued with ``requests`` and HTTP basic authentication
    instead of shelling out to ``curl``. This avoids depending on the exact
    number of lines curl prints (the token used to be read from a fixed line
    index of the captured output) and keeps the password out of a shell
    command line.

    Returns:
        str: The ``access_token`` field of the token endpoint's JSON reply.

    Raises:
        KeyError: If ``JOBS_API_URL``, ``WORKLOAD_USER`` or
            ``WORKLOAD_PASSWORD`` is not set in the environment, or the
            reply has no ``access_token`` field.
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    jobs_api_url = os.environ["JOBS_API_URL"]

    # First label of the Jobs API host name; the token endpoint uses "service" in its place.
    rep = jobs_api_url.split("/")[2].split(".")[0]
    os.environ["GET_TOKEN_URL"] = jobs_api_url.replace(rep, "service").replace("dex/api/v1", "gateway/authtkn/knoxtoken/api/v1/token")

    response = requests.get(
        os.environ["GET_TOKEN_URL"],
        auth=(os.environ["WORKLOAD_USER"], os.environ["WORKLOAD_PASSWORD"]),
    )
    # Fail loudly on bad credentials / wrong URL instead of on JSON parsing.
    response.raise_for_status()

    access_token = response.json()["access_token"]
    os.environ["ACCESS_TOKEN"] = access_token

    return access_token

#### This time we upload one file at a time. The method is almost identical to "put_files" in notebook 4

In [4]:
#Upload Spark CDE Job file to CDE Resource
def put_file(resource_name, job_path, tok):
    """Upload a single local file into a CDE resource and print the outcome.

    The file is sent as a multipart form to
    ``{JOBS_API_URL}/resources/{resource_name}/{file_name}`` where
    ``file_name`` is the last ``/``-separated component of ``job_path``.

    Args:
        resource_name: Name of the target CDE resource.
        job_path: Local path of the file to upload.
        tok: Bearer token placed in the ``Authorization`` header.

    Returns:
        None. The response status code and body are printed.

    Raises:
        OSError: If ``job_path`` cannot be opened for reading.
        KeyError: If ``JOBS_API_URL`` is not set in the environment.
    """
    file_name = job_path.split("/")[-1]
    print("Working on Job: {}".format(file_name.split(".")[0]))

    upload_url = '{jobs_api_url}/resources/{resource_name}/{file_name}'.format(
        jobs_api_url=os.environ["JOBS_API_URL"],
        resource_name=resource_name,
        file_name=file_name,
    )

    # The encoder reads from the file while the request is being sent, so the
    # PUT has to happen inside the `with` block; the handle is then closed
    # even if the request raises.
    with open(job_path, 'rb') as job_file:
        m = MultipartEncoder(
            fields={
                'file': ('filename', job_file, 'text/plain')}
        )
        x = requests.put(upload_url, data=m, headers={'Authorization': f"Bearer {tok}", 'Content-Type': m.content_type})

    print("Response Status Code {}".format(x.status_code))
    print(x.text)

#### Similarly, we create only one Spark CDE Job at a time.

In [5]:
def create_job_from_resource(job_path, tok, cde_payload):
    """Create a CDE job by POSTing a payload to ``{JOBS_API_URL}/jobs``.

    Args:
        job_path: Path of the job file; only used to print the job's base name.
        tok: Bearer token placed in the ``Authorization`` header.
        cde_payload: JSON-serialisable job definition sent as the request body.

    Returns:
        None. The response status code and body are printed.
    """
    job_label = job_path.split("/")[-1].split(".")[0]
    print("Working on Job: {}".format(job_label))

    request_headers = {
        'Authorization': f"Bearer {tok}",
        'accept': 'application/json',
        'Content-Type': 'application/json',
    }
    jobs_endpoint = '{}/jobs'.format(os.environ["JOBS_API_URL"])
    body = json.dumps(cde_payload)

    response = requests.post(jobs_endpoint, headers=request_headers, data=body)

    print("Response Status Code {}".format(response.status_code))
    print(response.text)
    print("\n")

#### The Spark Oozie Action is an XML file. It can be easily converted into a Python dictionary.

In [6]:
def ooziexml_to_dict(ooziexml_path):
    """Read an Oozie workflow XML file and return it parsed by ``xmltodict``.

    Args:
        ooziexml_path: Path of the workflow XML file.

    Returns:
        The object produced by ``xd.parse`` for the file's contents.
    """
    # Binary mode: the bytes are handed to the parser as-is.
    with open(ooziexml_path, 'rb') as workflow_file:
        return xd.parse(workflow_file)

In [7]:
# Parse the sample Oozie workflow; `d` is displayed here and read again when the CDE payload is built.
d = ooziexml_to_dict('oozie_workflows/Workflow.xml')
d

{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.5',
  '@name': 'SparkWordCount',
  'start': {'@to': 'spark-node'},
  'action': {'@name': 'spark-node',
   'spark': {'@xmlns': 'uri:oozie:spark-action:0.1',
    'job-tracker': '${jobTracker}',
    'name-node': '${nameNode}',
    'prepare': {'delete': {'@path': '${nameNode}/user/${wf:user()}/${examplesRoot}/output-data'}},
    'master': '${master}',
    'name': 'SparkPi',
    'class': 'org.apache.spark.examples.SparkPi',
    'jar': 'example_spark_jobs/jobs/pi.scala',
    'spark-opts': '--executor-memory 2G --num-executors 5',
    'arg': 'value=10'},
   'ok': {'@to': 'end'},
   'error': {'@to': 'fail'}},
  'kill': {'@name': 'fail',
   'message': 'Workflow failed, error\n            message[${wf:errorMessage(wf:lastErrorNode())}]'},
  'end': {'@name': 'end'}}}

#### This method is used to create Payloads for Spark CDE Jobs from the Spark Oozie Action

In [9]:
def _parse_spark_opts(opts):
    """Parse a spark-submit style option string into a ``{flag: value}`` dict."""
    parsed = {}
    tokens = opts.split() if isinstance(opts, str) else []
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        idx += 1
        if not token.startswith("--"):
            # A value with no flag in front of it; there is nothing to attach it to.
            continue
        if "=" in token:
            # "--flag=value" form
            flag, value = token.split("=", 1)
        elif idx < len(tokens) and not tokens[idx].startswith("--"):
            # "--flag value" form
            flag, value = token, tokens[idx]
            idx += 1
        else:
            # Flag without a value
            flag, value = token, ""
        parsed[flag] = value
    return parsed


def oozie_to_cde(cde_resource, d):
    """Build a Spark CDE job payload from a parsed Oozie workflow.

    Only the ``spark`` action found at ``d["workflow-app"]["action"]`` is
    converted. Its ``name`` becomes the job name, the last path component of
    its ``jar`` becomes the job file, and the following ``spark-opts`` flags
    are copied into the payload's ``spark`` section when present with a value:
    ``--driver-cores``, ``--executor-cores``, ``--driver-memory``,
    ``--executor-memory`` and ``--num-executors``. The action's ``class``
    element is not mapped.

    ``spark-opts`` may be missing, empty, contain arbitrary whitespace, and
    use either ``--flag value`` or ``--flag=value``.

    Args:
        cde_resource: Name of the CDE resource mounted by the job.
        d: Workflow dictionary as returned by ``ooziexml_to_dict``.

    Returns:
        dict: The CDE job payload. If the action is not a Spark action an
        error is printed and the unfilled template payload is returned.

    Raises:
        KeyError: If the workflow lacks the ``workflow-app``/``action`` keys,
            or the Spark action lacks ``name`` or ``jar``.
        ValueError: If a core/executor count is not an integer.
    """
    cde_payload = { "name": "job_name",
                "type": "spark",
                "retentionPolicy": "keep_indefinitely",
                "mounts": [ { "dirPrefix": "/", "resourceName": "resource_name" } ],
                "spark": { "file": "file_name",
                          "conf": { "spark.pyspark.python": "python3" }},
                "schedule": { "enabled": False}
               }

    action = d["workflow-app"]["action"]

    if "spark" not in action.keys():
        # Do not report a successful conversion for something that was not converted.
        print("Error. This is not a Spark Oozie Action")
        return cde_payload

    spark_action = action["spark"]

    cde_payload["name"] = spark_action["name"]
    cde_payload["mounts"][0]["resourceName"] = cde_resource
    cde_payload["spark"]["file"] = spark_action["jar"].split("/")[-1]

    spark_job_opts = _parse_spark_opts(spark_action.get("spark-opts"))

    # (spark-submit flag, CDE payload field, type expected in the payload)
    option_fields = (
        ("--driver-cores", "driverCores", int),
        ("--executor-cores", "executorCores", int),
        ("--driver-memory", "driverMemory", str),
        ("--executor-memory", "executorMemory", str),
        ("--num-executors", "numExecutors", int),
    )
    for flag, field, cast in option_fields:
        value = spark_job_opts.get(flag)
        if value:
            cde_payload["spark"][field] = cast(value)

    print("Working on Spark CDE Job: {}".format(cde_payload["name"]))
    print("Converted Spark Oozie Action into Spark CDE Payload")
    return cde_payload

In [11]:
# Convert the parsed workflow `d` into a CDE job payload that mounts the "python2cde" resource.
cde_payload = oozie_to_cde("python2cde", d)
cde_payload

Working on Spark CDE Job: SparkPi
Converted Spark Oozie Action into Spark CDE Payload


{'name': 'SparkPi',
 'type': 'spark',
 'retentionPolicy': 'keep_indefinitely',
 'mounts': [{'dirPrefix': '/', 'resourceName': 'python2cde'}],
 'spark': {'file': 'pi.scala',
  'conf': {'spark.pyspark.python': 'python3'},
  'executorMemory': '2G',
  'numExecutors': 5},
 'schedule': {'enabled': False}}

#### With the converted Payload we can upload to a CDE Resource and create a CDE Job

In [12]:
# Fetch the access token that is passed to the upload and job-creation calls.
tok = set_cde_token()

In [13]:
# Upload the job file into the "python2cde" CDE resource.
put_file("python2cde", "example_spark_jobs/jobs/pi.scala", tok)

Working on Job: pi
Response Status Code 201



In [15]:
# create_job_from_resource takes (job_path, tok, cde_payload); the resource
# name is not an argument because it is already in cde_payload["mounts"].
create_job_from_resource("example_spark_jobs/jobs/pi.scala", tok, cde_payload)

Working on Job: pi
Response Status Code 201





#### Finally, navigate to the CDE Jobs Page and verify that the job has been created

![alt text](images/oozie2cde.png)