## Migrating Oozie Workflows to Airflow CDE DAGs

In [197]:
import numpy as np
import pandas as pd
import os
from os.path import exists
import json
import sys
import re
import requests
from requests_toolbelt import MultipartEncoder
import xmltodict as xd
import pyparsing

#### Converting Shell Oozie Actions to Airflow Bash Operators

In [192]:
def ooziexml_to_dict(oozie_workflow_path):

    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one workflow xml file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".xml" in file:
                print("Oozie workflow file {} found".format(file))
                with open(oozie_workflow_path+"/"+file,'rb') as f:
                    d = xd.parse(f)
                    return d
                
    elif len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 0:
        
        print("No Oozie workflow file found.\n")
        print("If Oozie workflow file is expected, please ensure it is in the workflow directory.\n")
        print("If Oozie workflow file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one Oozie workflow file per workflow directory expected.\n")
        print("Please remove the Oozie workflow file that is not associated with this project.\n")

In [193]:
def parse_workflow_properties(oozie_workflow_path):
    
    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one properties file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".properties" in file:
                print("Properties file {} found".format(file))
                with open(oozie_workflow_path+"/"+file) as f:
                    properties_file = f.read()
                    properties_dict = dict([tuple(i.split("=")) for i in properties_file.split("\n") if len(tuple(i.split("="))) > 1])
                    properties_dict = {x.replace(' ', ''): v.replace(' ','') for x, v in properties_dict.items()}
                    
                    return properties_dict
                    
    elif len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 0:
        
        print("No properties file found.\n")
        print("If properties file is expected, please ensure it is in the workflow directory.\n")
        print("If properties file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one properties file per workflow directory expected.\n")
        print("Please remove the properties file that is not associated with this workflow.\n")

In [194]:
d = ooziexml_to_dict("oozie_workflows/oozie_hive_workflow_with_properties")
d

Oozie workflow file hive_properties_workflow.xml found


{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_external}'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_orc}'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_copy}',
     'param': '${database}'},
    'ok': {'@to': 'end'},
    'error': {'@to': 'kill_job'}}],
  'kill': {'@name': 'kill_job'

In [195]:
props = parse_workflow_properties("oozie_workflows/oozie_hive_workflow_with_properties")
props

Properties file job1.properties found


{'nameNode': 'hdfs://rootname',
 'jobTracker': 'xyz.com:8088',
 'script_name_external': 'hdfs_path_of_script/external.hive',
 'script_name_orc': 'hdfs_path_of_script/orc.hive',
 'script_name_copy': 'hdfs_path_of_script/Copydata.hive',
 'database': 'database_name'}

In [85]:
s = ooziexml_to_dict("oozie_workflows")

Error. Only one Oozie workflow file per workflow directory expected.

Please remove the Oozie workflow file that is not associated with this project.



In [214]:
json.dumps(d)

'{"workflow-app": {"@xmlns": "uri:oozie:workflow:0.4", "@name": "simple-Workflow", "start": {"@to": "Create_External_Table"}, "action": [{"@name": "Create_External_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${jobTracker}", "name-node": "${nameNode}", "script": "${script_name_external}"}, "ok": {"@to": "Create_orc_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Create_orc_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${jobTracker}", "name-node": "${nameNode}", "script": "${script_name_orc}"}, "ok": {"@to": "Insert_into_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Insert_into_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${jobTracker}", "name-node": "${nameNode}", "script": "${script_name_copy}", "param": "${database}"}, "ok": {"@to": "end"}, "error": {"@to": "kill_job"}}], "kill": {"@name": "kill_job", "message": "Job failed"}, "end": {"@name": "end"}}}'

In [216]:
string = json.dumps(d)

for k, v in props.items():
    
    string = string.replace(k, v)
print(string)

{"workflow-app": {"@xmlns": "uri:oozie:workflow:0.4", "@name": "simple-Workflow", "start": {"@to": "Create_External_Table"}, "action": [{"@name": "Create_External_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/external.hive}"}, "ok": {"@to": "Create_orc_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Create_orc_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/orc.hive}"}, "ok": {"@to": "Insert_into_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Insert_into_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/Copydata.hive}", "param": "${database_name}"}, "ok": {"@to": "end"}, "error": {"@to": "kill_job"}}], "kill": {"@name": "kill_job", "message": "Job failed"

In [243]:
import re

print(re.sub('"[${*}]"', '', string))

{"workflow-app": {"@xmlns": "uri:oozie:workflow:0.4", "@name": "simple-Workflow", "start": {"@to": "Create_External_Table"}, "action": [{"@name": "Create_External_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/external.hive}"}, "ok": {"@to": "Create_orc_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Create_orc_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/orc.hive}"}, "ok": {"@to": "Insert_into_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Insert_into_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/Copydata.hive}", "param": "${database_name}"}, "ok": {"@to": "end"}, "error": {"@to": "kill_job"}}], "kill": {"@name": "kill_job", "message": "Job failed"

In [248]:
string

'{"workflow-app": {"@xmlns": "uri:oozie:workflow:0.4", "@name": "simple-Workflow", "start": {"@to": "Create_External_Table"}, "action": [{"@name": "Create_External_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/external.hive}"}, "ok": {"@to": "Create_orc_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Create_orc_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/orc.hive}"}, "ok": {"@to": "Insert_into_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Insert_into_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/Copydata.hive}", "param": "${database_name}"}, "ok": {"@to": "end"}, "error": {"@to": "kill_job"}}], "kill": {"@name": "kill_job", "message": "Job failed

In [None]:
import re
exp = '\$\d+(,|\.)?\d+'
s = '$1,000|hi,you|$45.43'
'|'.join(i.translate(None, '$,') if re.match(exp, i) else i for i in s.split('|'))

In [247]:
import re
exp = '\$\d+(,""\.)?\d+'
'""'.join(i.translate(None, '${') if re.match(exp, i) else i for i in string.split('""'))

'{"workflow-app": {"@xmlns": "uri:oozie:workflow:0.4", "@name": "simple-Workflow", "start": {"@to": "Create_External_Table"}, "action": [{"@name": "Create_External_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/external.hive}"}, "ok": {"@to": "Create_orc_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Create_orc_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/orc.hive}"}, "ok": {"@to": "Insert_into_Table"}, "error": {"@to": "kill_job"}}, {"@name": "Insert_into_Table", "hive": {"@xmlns": "uri:oozie:hive-action:0.4", "job-tracker": "${xyz.com:8088}", "name-node": "${hdfs://rootname}", "script": "${hdfs_path_of_script/Copydata.hive}", "param": "${database_name}"}, "ok": {"@to": "end"}, "error": {"@to": "kill_job"}}], "kill": {"@name": "kill_job", "message": "Job failed

In [None]:
rev_subs = { v:k for k,v in subs.iteritems()}
[rev_subs.get(item,item)  for item in my_lst]

In [212]:
import re
quoted = re.compile('"[^"]*"')
q = quoted.findall(json.dumps(d))
q

['"workflow-app"',
 '"@xmlns"',
 '"uri:oozie:workflow:0.4"',
 '"@name"',
 '"simple-Workflow"',
 '"start"',
 '"@to"',
 '"Create_External_Table"',
 '"action"',
 '"@name"',
 '"Create_External_Table"',
 '"hive"',
 '"@xmlns"',
 '"uri:oozie:hive-action:0.4"',
 '"job-tracker"',
 '"${jobTracker}"',
 '"name-node"',
 '"${nameNode}"',
 '"script"',
 '"${script_name_external}"',
 '"ok"',
 '"@to"',
 '"Create_orc_Table"',
 '"error"',
 '"@to"',
 '"kill_job"',
 '"@name"',
 '"Create_orc_Table"',
 '"hive"',
 '"@xmlns"',
 '"uri:oozie:hive-action:0.4"',
 '"job-tracker"',
 '"${jobTracker}"',
 '"name-node"',
 '"${nameNode}"',
 '"script"',
 '"${script_name_orc}"',
 '"ok"',
 '"@to"',
 '"Insert_into_Table"',
 '"error"',
 '"@to"',
 '"kill_job"',
 '"@name"',
 '"Insert_into_Table"',
 '"hive"',
 '"@xmlns"',
 '"uri:oozie:hive-action:0.4"',
 '"job-tracker"',
 '"${jobTracker}"',
 '"name-node"',
 '"${nameNode}"',
 '"script"',
 '"${script_name_copy}"',
 '"param"',
 '"${database}"',
 '"ok"',
 '"@to"',
 '"end"',
 '"error"

In [209]:
for i in q:
    if i == 

TypeError: translate() takes exactly one argument (2 given)

In [153]:
import flatdict

In [168]:
flatdict.FlatDict(d, delimiter='.')

<FlatDict id=140293236270032 {'workflow-app.@xmlns': 'uri:oozie:workflow:0.4', 'workflow-app.@name': 'simple-Workflow', 'workflow-app.start.@to': 'Create_External_Table', 'workflow-app.action': [{'@name': 'Create_External_Table', 'hive': {'@xmlns': 'uri:oozie:hive-action:0.4', 'job-tracker': '${jobTracker}', 'name-node': '${nameNode}', 'script': '${script_name_external}'}, 'ok': {'@to': 'Create_orc_Table'}, 'error': {'@to': 'kill_job'}}, {'@name': 'Create_orc_Table', 'hive': {'@xmlns': 'uri:oozie:hive-action:0.4', 'job-tracker': '${jobTracker}', 'name-node': '${nameNode}', 'script': '${script_name_orc}'}, 'ok': {'@to': 'Insert_into_Table'}, 'error': {'@to': 'kill_job'}}, {'@name': 'Insert_into_Table', 'hive': {'@xmlns': 'uri:oozie:hive-action:0.4', 'job-tracker': '${jobTracker}', 'name-node': '${nameNode}', 'script': '${script_name_copy}', 'param': '${database}'}, 'ok': {'@to': 'end'}, 'error': {'@to': 'kill_job'}}], 'workflow-app.kill.@name': 'kill_job', 'workflow-app.kill.message': '

In [185]:
flatdict.FlatterDict(d, delimiter='.').values()

['uri:oozie:workflow:0.4',
 'simple-Workflow',
 'Create_External_Table',
 'Create_External_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_external}',
 'Create_orc_Table',
 'kill_job',
 'Create_orc_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_orc}',
 'Insert_into_Table',
 'kill_job',
 'Insert_into_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_copy}',
 '${database}',
 'end',
 'kill_job',
 'kill_job',
 'Job failed',
 'end']

In [190]:
for key, val in flatdict.FlatterDict(d, delimiter='.').items():
    if val.replace('$','').replace('{','').replace('}','') in props.keys():
        key[val] = props[val]

KeyError: '${jobTracker}'

In [188]:
flatdict.FlatterDict(d, delimiter='.').values()

['uri:oozie:workflow:0.4',
 'simple-Workflow',
 'Create_External_Table',
 'Create_External_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_external}',
 'Create_orc_Table',
 'kill_job',
 'Create_orc_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_orc}',
 'Insert_into_Table',
 'kill_job',
 'Insert_into_Table',
 'uri:oozie:hive-action:0.4',
 '${jobTracker}',
 '${nameNode}',
 '${script_name_copy}',
 '${database}',
 'end',
 'kill_job',
 'kill_job',
 'Job failed',
 'end']

In [None]:
    
        print(tup[1])
        tup[1] = props[tup[1].replace('$','').replace('{','').replace('}','')]
        print(tup[1])

In [89]:
props = parse_workflow_properties("oozie_workflows/oozie_hive_workflow_with_properties")
props

Properties file job1.properties found


{'nameNode ': ' hdfs://rootname',
 'jobTracker ': ' xyz.com:8088',
 'script_name_external ': ' hdfs_path_of_script/external.hive',
 'script_name_orc': 'hdfs_path_of_script/orc.hive',
 'script_name_copy': 'hdfs_path_of_script/Copydata.hive',
 'database ': ' database_name'}

In [130]:
import re
quoted = re.compile('"[^"]*"')
for value in quoted.findall(json.dumps(d)):
    if "$" in value:
        if value in props.keys():
            print(value)

In [None]:
d = ooziexml_to_dict("oozie_workflows/hive_action_workflow.xml")
d

In [42]:
def initialize_dag():
    with open('airflow_dags/oozie2airflow.py', 'w') as f:
        f.write('# The new Airflow DAG')

In [43]:
def dag_imports():
    
    imports = """\nfrom dateutil import parser
    \nfrom datetime import datetime, timedelta
    \nfrom datetime import timezone
    \nfrom airflow import DAG
    \nfrom cloudera.cdp.airflow.operators.cde_operator import CDEJobRunOperator\n\n"""
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(imports)

In [44]:
def dag_declaration():
    
    declarations = """default_args = {
    'owner': 'your_username_here',
    'retry_delay': timedelta(seconds=5),
    'depends_on_past': False,
    'start_date': parser.isoparse('2021-05-25T07:33:37.393Z').replace(tzinfo=timezone.utc)
    }

dag = DAG(
    'airflow-pipeline-demo',
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
    is_paused_upon_creation=False
    )\n\n"""
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(declarations)

In [87]:
def append_cde_spark_operator(task_id, step_name, spark_cde_job_name):
    
    spark_operator = """{} = CDEJobRunOperator(
    task_id='{}',
    dag=dag,
    job_name='{}'
    )\n\n""".format(step_name, task_id, spark_cde_job_name)
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(spark_operator)

In [85]:
def parse_spark_oozie_action(d):
     
    if "spark" in d["workflow-app"]["action"].keys():

        task_id = d["workflow-app"]["action"]["spark"]["name"]
        step_name = task_id+"_Step" 
        spark_cde_job_name = task_id

    else:
        print("Error. This is not a Spark Oozie Action")
        
    print("Extracted Job Name: {}".format(task_id))
    return task_id, step_name, spark_cde_job_name

In [114]:
def append_cdw_operator(task_id, step_name, cdw_query):
    
    cdw_operator = '''cdw_query = """{}"""

{} = CDWOperator(
    task_id="{}",
    dag=dag,
    cli_conn_id="hive_conn",
    hql=cdw_query,
    schema='default',
    ### CDW related args ###
    use_proxy_user=False,
    query_isolation=True
)\n\n'''.format(cdw_query, step_name, task_id)
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(cdw_operator)

In [95]:
def parse_hive_oozie_action(d):
    
    #CDE Operator Task ID
    task_id = d['workflow-app']['action']['@name']
    
    #CDW Operator Name
    step_name = task_id+"_Step"
       
    #Parsing SQL from Hive file
    with open(d['workflow-app']['action']['hive']['script'], 'r') as f:
        hive_sql = f.read()
        cdw_query = hive_sql.replace("\n", "")
        
    return task_id, step_name, cdw_query

In [115]:
initialize_dag()

In [116]:
dag_imports()

In [117]:
dag_declaration()

In [118]:
task_id, step_name, spark_cde_job_name = parse_spark_oozie_action(s)

Extracted Job Name: SparkPi


In [119]:
append_cde_spark_operator(task_id, step_name, spark_cde_job_name)

In [120]:
task_id, step_name, cdw_query = parse_hive_oozie_action(d)

In [121]:
append_cdw_operator(task_id, step_name, cdw_query)