## Migrating Oozie Workflows to Airflow CDE DAGs

In [1]:
import numpy as np
import pandas as pd
import os
import json
import sys
import re
import requests
from requests_toolbelt import MultipartEncoder
import xmltodict as xd

#### Converting Spark Oozie Actions to Airflow CDE Job Run Operators

In [2]:
def ooziexml_to_dict(ooziexml_path):
    """Parse the Oozie workflow XML file at ``ooziexml_path``.

    The file is opened in binary mode and handed to ``xd.parse``; the parsed
    result is returned to the caller unchanged.
    """
    with open(ooziexml_path, mode='rb') as xml_file:
        parsed = xd.parse(xml_file)
    return parsed

In [6]:
# Parse the sample workflow definition; the bare `d` echoes the result as the cell output.
d = ooziexml_to_dict("oozie_workflows/Workflow.xml")
d

{'workflow-app': {'@xmlns': 'uri:oozie:workflow:0.5',
  '@name': 'SparkWordCount',
  'start': {'@to': 'spark-node'},
  'action': {'@name': 'spark-node',
   'spark': {'@xmlns': 'uri:oozie:spark-action:0.1',
    'job-tracker': '${jobTracker}',
    'name-node': '${nameNode}',
    'prepare': {'delete': {'@path': '${nameNode}/user/${wf:user()}/${examplesRoot}/output-data'}},
    'master': '${master}',
    'name': 'SparkPi',
    'class': 'org.apache.spark.examples.SparkPi',
    'jar': 'example_spark_jobs/jobs/pi.scala',
    'spark-opts': '--executor-memory 2G --num-executors 5',
    'arg': 'value=10'},
   'ok': {'@to': 'end'},
   'error': {'@to': 'fail'}},
  'kill': {'@name': 'fail',
   'message': 'Workflow failed, error\n            message[${wf:errorMessage(wf:lastErrorNode())}]'},
  'end': {'@name': 'end'}}}

In [36]:
def initialize_dag():
    """Create the generated DAG file, replacing any existing content.

    The file is opened in write mode and receives a single header comment
    (without a trailing newline).
    """
    header = '# The new Airflow DAG'
    with open('airflow_dags/oozie2airflow.py', mode='w') as dag_file:
        dag_file.write(header)

In [37]:
def dag_imports():
    """Append the import statements used by the generated DAG file."""
    import_lines = (
        'from dateutil import parser',
        'from datetime import datetime, timedelta',
        'from datetime import timezone',
        'from airflow import DAG',
        'from cloudera.cdp.airflow.operators.cde_operator import CDEJobRunOperator',
    )
    # Statements are separated by a whitespace-only line ("    "); the block
    # starts with one newline and ends with two.
    imports = '\n' + '\n    \n'.join(import_lines) + '\n\n'

    with open('airflow_dags/oozie2airflow.py', mode='a') as dag_file:
        dag_file.write(imports)

In [55]:
def dag_declaration():
    """Append the ``default_args`` dictionary and the ``DAG`` object to the DAG file."""
    declaration_lines = (
        "default_args = {",
        "    'owner': 'your_username_here',",
        "    'retry_delay': timedelta(seconds=5),",
        "    'depends_on_past': False,",
        "    'start_date': parser.isoparse('2021-05-25T07:33:37.393Z').replace(tzinfo=timezone.utc)",
        "    }",
        "",
        "dag = DAG(",
        "    'airflow-pipeline-demo',",
        "    default_args=default_args,",
        "    schedule_interval='@daily',",
        "    catchup=False,",
        "    is_paused_upon_creation=False",
        "    )",
    )
    # Two trailing newlines leave a blank line before whatever is appended next.
    declarations = "\n".join(declaration_lines) + "\n\n"

    with open('airflow_dags/oozie2airflow.py', mode='a') as dag_file:
        dag_file.write(declarations)

In [61]:
def append_spark_operator(step_name, task_id, spark_cde_job_name):
    """Append a ``CDEJobRunOperator`` definition to the generated DAG file.

    Args:
        step_name: Python variable name the operator is assigned to in the
            generated DAG (written verbatim, unquoted).
        task_id: Airflow task id; written as a quoted string literal.
        spark_cde_job_name: Name of the CDE job to run; written as a quoted
            string literal.
    """
    # ``!r`` renders each value as a valid Python string literal. The task id
    # used to be written bare (``task_id=my_task``), which made the generated
    # DAG reference an undefined name and fail on load.
    spark_operator = """{} = CDEJobRunOperator(
    task_id={!r},
    dag=dag,
    job_name={!r}
    )\n\n""".format(step_name, str(task_id), str(spark_cde_job_name))

    with open('airflow_dags/oozie2airflow.py', 'a') as f:
        f.write(spark_operator)
    

In [66]:
def spark_oozie_action_name(d):
    """Return the Spark job name declared in a parsed Oozie workflow.

    Args:
        d: Workflow dictionary in which ``d["workflow-app"]["action"]`` holds
            either a single action mapping or a list of action mappings.

    Returns:
        The ``name`` value of the first action that contains a ``spark``
        element.

    Raises:
        ValueError: If no action in the workflow is a Spark action.
    """
    actions = d["workflow-app"]["action"]
    # xmltodict yields a list (not a mapping) when <action> appears more than
    # once, so normalise to a list before scanning.
    if not isinstance(actions, list):
        actions = [actions]

    for action in actions:
        if "spark" in action:
            name = action["spark"]["name"]
            print("Extracted Job Name: {}".format(name))
            return name

    # Fail explicitly instead of falling through to an unbound ``name``.
    raise ValueError("Error. This is not a Spark Oozie Action")

In [62]:
# Start the generated DAG file from scratch (it is opened in 'w' mode, so existing content is replaced).
initialize_dag()

In [63]:
# Append the import statements to the generated DAG file.
dag_imports()

In [64]:
# Append the default_args dictionary and the DAG object definition.
dag_declaration()

In [67]:
# Pull the Spark job name out of the parsed workflow dictionary `d`.
job_name = spark_oozie_action_name(d)

Extracted Job Name: SparkPi


In [68]:
# Append a CDEJobRunOperator assigned to `step1` that runs the extracted job name.
append_spark_operator("step1", "task_id", job_name)