## Migrating Oozie Workflows to Airflow CDE DAGs

In [1]:
import numpy as np
import pandas as pd
import os
from os.path import exists
import json
import sys
import re
import requests
from requests_toolbelt import MultipartEncoder
import xmltodict as xd
import pyparsing

#### Converting Oozie Actions to Airflow Bash Operators

In [2]:
def ooziexml_to_dict(oozie_workflow_path):

    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one workflow xml file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".xml" in file:
                print("Oozie workflow file {} found".format(file))
                with open(oozie_workflow_path+"/"+file,'rb') as f:
                    d = xd.parse(f)
                    return d
                
    elif len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 0:
        
        print("No Oozie workflow file found.\n")
        print("If Oozie workflow file is expected, please ensure it is in the workflow directory.\n")
        print("If Oozie workflow file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one Oozie workflow file per workflow directory expected.\n")
        print("Please remove the Oozie workflow file that is not associated with this project.\n")

In [3]:
def parse_workflow_properties(oozie_workflow_path):
    
    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one properties file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".properties" in file:
                print("Properties file {} found".format(file))
                with open(oozie_workflow_path+"/"+file) as f:
                    properties_file = f.read()
                    properties_dict = dict([tuple(i.split("=")) for i in properties_file.split("\n") if len(tuple(i.split("="))) > 1])
                    properties_dict = {x.replace(' ', ''): v.replace(' ','') for x, v in properties_dict.items()}
                    
                    return properties_dict
                    
    elif len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 0:
        
        print("No properties file found.\n")
        print("If properties file is expected, please ensure it is in the workflow directory.\n")
        print("If properties file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one properties file per workflow directory expected.\n")
        print("Please remove the properties file that is not associated with this workflow.\n")

In [4]:
d = ooziexml_to_dict("oozie_workflows/oozie_hive_workflow_with_properties")
d

Oozie workflow file hive_properties_workflow.xml found


{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_external}'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_orc}'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_copy}',
     'param': '${database}'},
    'ok': {'@to': 'end'},
    'error': {'@to': 'kill_job'}}],
  'kill': {'@name': 'kill_job'

In [5]:
props = parse_workflow_properties("oozie_workflows/oozie_hive_workflow_with_properties")
props

Properties file job1.properties found


{'nameNode': 'hdfs://rootname',
 'jobTracker': 'xyz.com:8088',
 'script_name_external': 'oozie_workflows/oozie_hive_workflow_with_properties/external.hive',
 'script_name_orc': 'oozie_workflows/oozie_hive_workflow_with_properties/orc.hive',
 'script_name_copy': 'oozie_workflows/oozie_hive_workflow_with_properties/Copydata.hive',
 'database_name': 'default'}

In [6]:
def workflow_properties_lookup(d, props):
    
    #Property value lookup 
    string = json.dumps(d)

    for k, v in props.items():
        string = string.replace(k, v)

    #Removing unwanted characters
    subbed = re.sub(r'"\${(.*?)}"', r'"\1"', string)
    parsed_xml = json.loads(subbed)
    return parsed_xml

In [7]:
d = workflow_properties_lookup(d, props)
d

{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties/external.hive'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties/orc.hive'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties

In [1]:
def query_properties_lookup(hive_file_path, hive_file_name, props):
    
    with open(hive_file_path+"/"+hive_file_name, "r") as f:
        hive_sql = f.read()
        cdw_query = hive_sql.replace("\n", "")

    print("The input Hive query is: \n")
    print(cdw_query)

    for k, v in props.items():
        cdw_query = cdw_query.replace(k, v)

    #Removing unwanted characters
    subbed = re.sub(r'\${(.*?)}', r'"\1"', cdw_query)

    print("\nThe output Hive query is: \n")
    print(subbed)

    with open(hive_file_path+"/"+hive_file_name, "w") as f:
        f.write(subbed)

In [9]:
query_properties_lookup("oozie_workflows/oozie_hive_workflow_with_properties", "Copydata.hive", props)

The input Hive query is: 

use "default"; -- input from Oozieinsert into table orc_tableselectconcat(first_name,' ',last_name) as name,yearofbirth,year(from_unixtime) --yearofbirth as age,address,zipfrom external_table;

The output Hive query is: 

use "default"; -- input from Oozieinsert into table orc_tableselectconcat(first_name,' ',last_name) as name,yearofbirth,year(from_unixtime) --yearofbirth as age,address,zipfrom external_table;


In [2]:
def parse_oozie_workflow(dag_dir, dag_file_name, d):

    for action in d['workflow-app']['action']:

        print(action)

        print(parse_hive_oozie_action(action))
        #task_id, step_name, cdw_query = parse_hive_oozie_action(action)

        if 'hive' in action.keys():

            #Parsing Hive Oozie Action
            task_id, step_name, cdw_query = parse_hive_oozie_action(action)

            #Converting Hive Oozie Action to CDW Operator and Appending to CDE DAG
            append_cdw_operator(dag_dir, dag_file_name,task_id, step_name, cdw_query)

        elif 'spark' in action.keys():

            #Parsing Spark Oozie Action
            task_id, step_name, spark_cde_job_name = parse_spark_oozie_action(action)

            #Converting Spark Oozie Action to CDE Operator and Appending to CDE DAG
            append_cde_spark_operator(dag_dir, dag_file_name, task_id, step_name, spark_cde_job_name)

        elif 'email' in action.keys():

            #Parsing Email Oozie Action
            task_id, step_name, email_to, email_cc, email_subject, email_body = parse_email_oozie_action(action)

            #Converting Email Oozie Action to CDE Airflow Email Operator
            append_email_operator(dag_dir, dag_file_name, task_id, step_name, email_to, email_cc, email_subject, email_body)

        elif 'shell' in action.keys():

            #Parsing Shell Oozie Action
            task_id, step_name = parse_shell_oozie_action(a)

            #Converting Shell Oozie Action to CDE Airflow Bash Operator
            append_bash_operator(dag_dir, dag_file_name, task_id, step_name)

        else:
            #Converting Unsupported Oozie Action to CDE Airflow Python Operator
            append_python_operator(dag_dir, dag_file_name)

In [118]:
parse_oozie_workflow(d)

{'@name': 'Create_External_Table', 'hive': {'@xmlns': 'uri:oozie:hive-action:0.4', 'job-tracker': 'xyz.com:8088', 'name-node': 'hdfs://rootname', 'script': 'oozie_workflows/oozie_hive_workflow_with_properties/external.hive'}, 'ok': {'@to': 'Create_orc_Table'}, 'error': {'@to': 'kill_job'}}
('Create_External_Table', 'Create_External_Table_Step', "Create external table external_table(name string,age int,address string,zip int)row format delimitedfields terminated by ','stored as textfilelocation '/test/abc';")
{'@name': 'Create_orc_Table', 'hive': {'@xmlns': 'uri:oozie:hive-action:0.4', 'job-tracker': 'xyz.com:8088', 'name-node': 'hdfs://rootname', 'script': 'oozie_workflows/oozie_hive_workflow_with_properties/orc.hive'}, 'ok': {'@to': 'Insert_into_Table'}, 'error': {'@to': 'kill_job'}}
('Create_orc_Table', 'Create_orc_Table_Step', 'Create Table orc_table(name string, -- Concate value of first name and last name with space as seperatoryearofbirth int,age int, -- Current year minus year o

In [70]:
def initialize_dag(dag_dir, dag_file_name):
    with open(dag_dir+"/"+dag_file_name, 'w') as f:
        f.write('# The new Airflow DAG')

In [110]:
def dag_imports(dag_dir, dag_file_name):
    
    imports = """\nfrom dateutil import parser
    \nfrom datetime import datetime, timedelta
    \nfrom datetime import timezone
    \nfrom airflow import DAG
    \nfrom airflow.operators.email import EmailOperator
    \nfrom airflow.operators.python_operator import PythonOperator
    \nfrom cloudera.cdp.airflow.operators.cdw_operator import CDWOperator
    \nfrom cloudera.cdp.airflow.operators.cde_operator import CDEJobRunOperator\n\n"""
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(imports)

In [2]:
def dag_declaration(dag_dir, dag_file_name):
    
    declaration = """default_args = {
    'owner': 'your_username_here',
    'retry_delay': timedelta(seconds=5),
    'depends_on_past': False,
    'start_date': parser.isoparse('2021-05-25T07:33:37.393Z').replace(tzinfo=timezone.utc)
    }

dag = DAG(
    'airflow-pipeline-demo',
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
    is_paused_upon_creation=False
    )\n\n"""
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(declaration)

In [73]:
def append_cde_spark_operator(dag_dir, dag_file_name, task_id, step_name, spark_cde_job_name):
    
    spark_operator = """{} = CDEJobRunOperator(
    task_id='{}',
    dag=dag,
    job_name='{}'
    )\n\n""".format(step_name, task_id, spark_cde_job_name)
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(spark_operator)

In [74]:
def parse_spark_oozie_action(a):
     
    if "spark" in a.keys():

        task_id = a["spark"]["name"]
        step_name = task_id+"_Step" 
        step_name = step_name.replace('-', '')
        spark_cde_job_name = task_id
        
        print("Extracted Job Name: {}".format(task_id))
        
        return task_id, step_name, spark_cde_job_name

    else:
        print("Error. This is not a Spark Oozie Action")

In [75]:
def append_cdw_operator(dag_dir, dag_file_name, task_id, step_name, cdw_query):    
    
    cdw_operator = '''cdw_query = """{}"""

{} = CDWOperator(
    task_id="{}",
    dag=dag,
    cli_conn_id="hive_conn",
    hql=cdw_query,
    schema='default',
    ### CDW related args ###
    use_proxy_user=False,
    query_isolation=True
)\n\n'''.format(cdw_query, step_name, task_id)
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(cdw_operator)

In [76]:
def parse_hive_oozie_action(a):
    
    #Checking if this is a Hive Oozie Action
    if "hive" in a.keys():
    
        #CDE Operator Task ID
        task_id = a['@name']

        #CDW Operator Name
        step_name = task_id+"_Step"
        step_name = step_name.replace('-', '')

        #Parsing SQL from Hive file
        with open(a['hive']['script'], 'r') as f:
            hive_sql = f.read()
            cdw_query = hive_sql.replace("\n", "")
        
    return task_id, step_name, cdw_query

In [77]:
def append_email_operator(dag_dir, dag_file_name, task_id, step_name, email_to, email_cc, email_subject, email_body):
    
    email_operator ='''
{} = EmailOperator( 
task_id="{}", 
to="{}", 
cc="{}",
subject="{}", 
html_content="{}", 
dag=dag)
    '''.format(step_name, task_id, email_to, email_cc, email_subject, email_body)
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(email_operator)

In [78]:
def parse_email_oozie_action(a):
    
    if "email" in a.keys():
    
        #Task ID
        task_id = a['@name']

        #Operator Name
        step_name = task_id+"_Step"
        step_name = step_name.replace('-', '')
        
        #Extracting Email Fields
        
        action = a['email']
        
        if action.__contains__('to'):
            email_to = a['email']['to'] 
        if action.__contains__('cc'):
            email_cc = a['email']['cc']
        if action.__contains__('subject'):
            email_subject = a['email']['subject']
        if action.__contains__('body'):
            email_body = a['email']['body']
        
        return task_id, step_name, email_to, email_cc, email_subject, email_body    

In [101]:
def append_bash_operator(dag_dir, dag_file_name, task_id, step_name):
    
    bash_operator = '''{} = BashOperator(
    task_id="{}",
    bash_command="echo \'here is the message'")'''.format(task_id, step_name  )
    
    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(bash_operator)

In [102]:
def parse_shell_oozie_action(a):
    
    if "shell" in a.keys():
        
        #CDE Operator Task ID
        task_id = a['@name']

        #CDW Operator Name
        step_name = task_id+"_Step"
        step_name = step_name.replace('-', '')
        
        return task_id, step_name   

In [1]:
def append_python_operator(dag_dir, dag_file_name):
    
    print("Action not Found. Replacing Action with Airflow Python Operator Stub")

    task_id = "PythonOperator"
    step_name = "StepStub"

    python_operator = """\ndef my_func():\n\tpass\n 
    {} = PythonOperator(task_id='{}', python_callable=my_func)""".format(step_name, task_id)

    with open(dag_dir+"/"+dag_file_name, 'a') as f:
        f.write(python_operator)

In [103]:
oozie_action = ooziexml_to_dict("oozie_workflows/shell_oozie_action")

Oozie workflow file shell_action_workflow.xml found


In [104]:
action = oozie_action['workflow-app']['action']

In [105]:
task_id, step_name = parse_shell_oozie_action(action)

In [106]:
append_bash_operator(task_id, step_name)

In [79]:
oozie_action = ooziexml_to_dict("oozie_workflows/email_oozie_action")

Oozie workflow file email_action_workflow.xml found


In [80]:
action = oozie_action['workflow-app']['action']

In [82]:
append_email_operator(task_id, step_name, email_to, email_cc, email_subject, email_body)

In [108]:
print("Action not Found. Replacing Action with Airflow Python Operator Stub")

task_id = "PythonOperator"
step_name = "Step Stub"

python_operator = """def my_func():\n\t pass\n 
{} = PythonOperator(task_id='{}', python_callable=my_func)""".format(step_name, task_id)

with open('airflow_dags/oozie2airflow.py', 'a') as f:
    f.write(python_operator)

Action not Found. Replacing Action with Airflow Python Operator Stub


In [112]:
initialize_dag()

In [113]:
dag_imports()

In [114]:
dag_declaration()

In [115]:
task_id, step_name, spark_cde_job_name = parse_spark_oozie_action(s)

NameError: name 's' is not defined

In [312]:
s = ooziexml_to_dict("oozie_workflows")

Error. Only one Oozie workflow file per workflow directory expected.

Please remove the Oozie workflow file that is not associated with this project.



In [325]:
append_cde_spark_operator(task_id, step_name, spark_cde_job_name)

NameError: name 'task_id' is not defined

In [355]:
task_id, step_name, cdw_query = parse_hive_oozie_action(a)

NameError: name 'a' is not defined

In [121]:
append_cdw_operator(task_id, step_name, cdw_query)

In [None]:
task_id, step_name, email_to, email_cc, email_subject, email_body = parse_email_oozie_action(action)

In [None]:
append_email_operator(task_id, step_name, email_to, email_cc, email_subject, email_body)