## Migrating Oozie Workflows to Airflow CDE DAGs

In [339]:
import numpy as np
import pandas as pd
import os
from os.path import exists
import json
import sys
import re
import requests
from requests_toolbelt import MultipartEncoder
import xmltodict as xd
import pyparsing

#### Converting Oozie Actions to Airflow Bash Operators

In [340]:
def ooziexml_to_dict(oozie_workflow_path):

    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one workflow xml file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".xml" in file:
                print("Oozie workflow file {} found".format(file))
                with open(oozie_workflow_path+"/"+file,'rb') as f:
                    d = xd.parse(f)
                    return d
                
    elif len([file for file in os.listdir(oozie_workflow_path) if ".xml" in file]) == 0:
        
        print("No Oozie workflow file found.\n")
        print("If Oozie workflow file is expected, please ensure it is in the workflow directory.\n")
        print("If Oozie workflow file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one Oozie workflow file per workflow directory expected.\n")
        print("Please remove the Oozie workflow file that is not associated with this project.\n")

In [341]:
def parse_workflow_properties(oozie_workflow_path):
    
    #"oozie_workflows/oozie_hive_workflow_with_properties"
    
    #Ensuring there is only one properties file in the dir
    if len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 1:
    
        #Parsing properties file
        for file in os.listdir(oozie_workflow_path):
            if ".properties" in file:
                print("Properties file {} found".format(file))
                with open(oozie_workflow_path+"/"+file) as f:
                    properties_file = f.read()
                    properties_dict = dict([tuple(i.split("=")) for i in properties_file.split("\n") if len(tuple(i.split("="))) > 1])
                    properties_dict = {x.replace(' ', ''): v.replace(' ','') for x, v in properties_dict.items()}
                    
                    return properties_dict
                    
    elif len([file for file in os.listdir(oozie_workflow_path) if ".properties" in file]) == 0:
        
        print("No properties file found.\n")
        print("If properties file is expected, please ensure it is in the workflow directory.\n")
        print("If properties file is not expected, please ignore this message.\n")
        
    else:
              
        print("Error. Only one properties file per workflow directory expected.\n")
        print("Please remove the properties file that is not associated with this workflow.\n")

In [343]:
d = ooziexml_to_dict("oozie_workflows/oozie_hive_workflow_with_properties")
d

Oozie workflow file hive_properties_workflow.xml found


{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_external}'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_orc}'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': '${jobTracker}',
     'name-node': '${nameNode}',
     'script': '${script_name_copy}',
     'param': '${database}'},
    'ok': {'@to': 'end'},
    'error': {'@to': 'kill_job'}}],
  'kill': {'@name': 'kill_job'

In [344]:
props = parse_workflow_properties("oozie_workflows/oozie_hive_workflow_with_properties")
props

Properties file job1.properties found


{'nameNode': 'hdfs://rootname',
 'jobTracker': 'xyz.com:8088',
 'script_name_external': 'oozie_workflows/oozie_hive_workflow_with_properties/external.hive',
 'script_name_orc': 'oozie_workflows/oozie_hive_workflow_with_properties/orc.hive',
 'script_name_copy': 'oozie_workflows/oozie_hive_workflow_with_properties/Copydata.hive',
 'database': 'database_name'}

In [345]:
def workflow_properties_lookup(d, props):
    
    #Property value lookup 
    string = json.dumps(d)

    for k, v in props.items():
        string = string.replace(k, v)

    #Removing unwanted characters
    subbed = re.sub(r'"\${(.*?)}"', r'"\1"', string)
    parsed_xml = json.loads(subbed)
    return parsed_xml

In [346]:
d = workflow_properties_lookup(d, props)
d

{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties/external.hive'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties/orc.hive'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'oozie_workflows/oozie_hive_workflow_with_properties

In [311]:
for action in d['workflow-app']['action']:
    #print(action)
    if "param" in action["hive"].keys():
        print(action["hive"]["param"])
        

database_name


In [362]:
tasks = []
steps = []
queries = []

for action in d['workflow-app']['action']:
    print(parse_hive_oozie_action(action))
    task_id, step_name, cdw_query = parse_hive_oozie_action(action)
    
    
    
    if cdw_query
    
    tasks.append(task_id) 
    steps.append(step_name) 
    queries.append(cdw_query)

('Create_External_Table', 'Create_External_Table_Step', "Create external table external_table(name string,age int,address string,zip int)row format delimitedfields terminated by ','stored as textfilelocation '/test/abc';")
('Create_orc_Table', 'Create_orc_Table_Step', 'Create Table orc_table(name string, -- Concate value of first name and last name with space as seperatoryearofbirth int,age int, -- Current year minus year of birthaddress string,zip int)STORED AS ORC;')
('Insert_into_Table', 'Insert_into_Table_Step', "use ${database_name}; -- input from Oozieinsert into table orc_tableselectconcat(first_name,' ',last_name) as name,yearofbirth,year(from_unixtime) --yearofbirth as age,address,zipfrom external_table;")


In [363]:
tasks

['Create_External_Table', 'Create_orc_Table', 'Insert_into_Table']

In [312]:
s = ooziexml_to_dict("oozie_workflows")

Error. Only one Oozie workflow file per workflow directory expected.

Please remove the Oozie workflow file that is not associated with this project.



In [313]:
def initialize_dag():
    with open('airflow_dags/oozie2airflow.py', 'w') as f:
        f.write('# The new Airflow DAG')

In [314]:
def dag_imports():
    
    imports = """\nfrom dateutil import parser
    \nfrom datetime import datetime, timedelta
    \nfrom datetime import timezone
    \nfrom airflow import DAG
    \nfrom cloudera.cdp.airflow.operators.cde_operator import CDEJobRunOperator\n\n"""
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(imports)

In [315]:
def dag_declaration():
    
    declarations = """default_args = {
    'owner': 'your_username_here',
    'retry_delay': timedelta(seconds=5),
    'depends_on_past': False,
    'start_date': parser.isoparse('2021-05-25T07:33:37.393Z').replace(tzinfo=timezone.utc)
    }

dag = DAG(
    'airflow-pipeline-demo',
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
    is_paused_upon_creation=False
    )\n\n"""
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(declarations)

In [316]:
def append_cde_spark_operator(task_id, step_name, spark_cde_job_name):
    
    spark_operator = """{} = CDEJobRunOperator(
    task_id='{}',
    dag=dag,
    job_name='{}'
    )\n\n""".format(step_name, task_id, spark_cde_job_name)
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(spark_operator)

In [317]:
def parse_spark_oozie_action(d):
     
    if "spark" in d["workflow-app"]["action"].keys():

        task_id = d["workflow-app"]["action"]["spark"]["name"]
        step_name = task_id+"_Step" 
        spark_cde_job_name = task_id

    else:
        print("Error. This is not a Spark Oozie Action")
        
    print("Extracted Job Name: {}".format(task_id))
    return task_id, step_name, spark_cde_job_name

In [319]:
def append_cdw_operator(task_id, step_name, cdw_query):    
    
    cdw_operator = '''cdw_query = """{}"""

{} = CDWOperator(
    task_id="{}",
    dag=dag,
    cli_conn_id="hive_conn",
    hql=cdw_query,
    schema='default',
    ### CDW related args ###
    use_proxy_user=False,
    query_isolation=True
)\n\n'''.format(cdw_query, step_name, task_id)
    
    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(cdw_operator)

In [353]:
def parse_hive_oozie_action(a):
    
    #CDE Operator Task ID
    task_id = a['@name']
    
    #CDW Operator Name
    step_name = task_id+"_Step"
       
    #Parsing SQL from Hive file
    with open(a['hive']['script'], 'r') as f:
        hive_sql = f.read()
        cdw_query = hive_sql.replace("\n", "")
        
    return task_id, step_name, cdw_query

In [331]:
with open(d['workflow-app']['action']['hive']['script'], 'r') as f:
        hive_sql = f.read()
        cdw_query = hive_sql.replace("\n", "")

TypeError: list indices must be integers or slices, not str

In [333]:
d['workflow-app']['action']['hive']

TypeError: list indices must be integers or slices, not str

In [334]:
d

{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.4',
  '@name': 'simple-Workflow',
  'start': {'@to': 'Create_External_Table'},
  'action': [{'@name': 'Create_External_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'hdfs_path_of_script/external.hive'},
    'ok': {'@to': 'Create_orc_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Create_orc_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'hdfs_path_of_script/orc.hive'},
    'ok': {'@to': 'Insert_into_Table'},
    'error': {'@to': 'kill_job'}},
   {'@name': 'Insert_into_Table',
    'hive': {'@xmlns': 'uri:oozie:hive-action:0.4',
     'job-tracker': 'xyz.com:8088',
     'name-node': 'hdfs://rootname',
     'script': 'hdfs_path_of_script/Copydata.hive',
     'param': 'database_name'},
    'ok': {'@to': 'end'},
    'error': {'@to': 

In [327]:
initialize_dag()

In [328]:
dag_imports()

In [329]:
dag_declaration()

In [324]:
task_id, step_name, spark_cde_job_name = parse_spark_oozie_action(s)

TypeError: 'NoneType' object is not subscriptable

In [325]:
append_cde_spark_operator(task_id, step_name, spark_cde_job_name)

NameError: name 'task_id' is not defined

In [355]:
task_id, step_name, cdw_query = parse_hive_oozie_action(a)

NameError: name 'a' is not defined

In [121]:
append_cdw_operator(task_id, step_name, cdw_query)