In [1]:
import datetime
import json

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import datetime
from airflow.utils.dates import timedelta
PATH_TO_DBT_PROJECT = "/Users/valentin/Github/data-gcp/orchestration/dags/data_gcp_dbt"


In [2]:
def load_manifest():
    local_filepath = PATH_TO_DBT_PROJECT  + "/target/manifest.json"
    with open(local_filepath) as f:
        data = json.load(f)

    return data

In [3]:
def make_dbt_task(node, dbt_verb,dag):
    """Returns an Airflow operator either run and test an individual model"""
    DBT_DIR = PATH_TO_DBT_PROJECT
    GLOBAL_CLI_FLAGS = "--no-write-json"
    model = node.split(".")[-1]

    if dbt_verb == "run":
        dbt_task = BashOperator(
            task_id=model if node.split(".")[0]=="model" else node.split(".")[0]+'.'+model,
            bash_command=f"""
            dbt {GLOBAL_CLI_FLAGS} {dbt_verb} --target dev --models {model}
            """,
            cwd=PATH_TO_DBT_PROJECT, 
            dag=dag
        )

    elif dbt_verb == "test":
        node_test = node.replace("model", "test")
        dbt_task = BashOperator(
            task_id=node_test,
            bash_command=f"""
            dbt {GLOBAL_CLI_FLAGS} {dbt_verb} --target dev --models {model}
            """,
            cwd=PATH_TO_DBT_PROJECT,
            dag=dag
        )

    return dbt_task

In [61]:
data = load_manifest()
data['nodes'].keys()





dict_keys(['model.data_gcp_dbt.typeform_adage', 'model.data_gcp_dbt.adage', 'model.data_gcp_dbt.adage_involved_institution', 'model.data_gcp_dbt.firebase_app_experiments', 'model.data_gcp_dbt.firebase_events', 'model.data_gcp_dbt.venues', 'model.data_gcp_dbt.offerers', 'model.data_gcp_dbt.bank_information', 'model.data_gcp_dbt.dms_pro', 'model.data_gcp_dbt.dms_jeunes', 'model.data_gcp_dbt.downstream_adage_test', 'model.data_gcp_dbt.firebase_events_bis', 'model.data_gcp_dbt.firebase_visits', 'model.data_gcp_dbt.firebase_venue_data', 'model.data_gcp_dbt.firebase_bookings', 'model.data_gcp_dbt.firebase_aggregated_offers', 'model.data_gcp_dbt.firebase_app_experiments_bis', 'model.data_gcp_dbt.firebase_aggregated_users', 'model.data_gcp_dbt.firebase_session_origin', 'model.data_gcp_dbt.firebase_pro_events', 'operation.data_gcp_dbt.data_gcp_dbt-on-run-start-0', 'operation.data_gcp_dbt.data_gcp_dbt-on-run-start-1', 'model.elementary.snapshot_run_results', 'model.elementary.job_run_results', '

In [68]:
default_args = {
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 23),
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}
dag = DAG(
    'dbt_dag',
    default_args=default_args,
    description='A dbt wrapper for airflow',
    schedule=timedelta(days=1),
)

In [69]:
from airflow.exceptions import DuplicateTaskIdFound
dbt_tasks = {}
for node in data["nodes"].keys():
    if "elementary" not in node.split("."):
        if node.split(".")[0] == "model":
            node_test = node.replace("model", "test")
            dbt_tasks[node] = make_dbt_task(node, "run",dag)
            try :
                dbt_tasks[node] = make_dbt_task(node, "run",dag)
            except DuplicateTaskIdFound:
                print(node)
                # print(dbt_tasks[node])
                pass
            try :
                dbt_tasks[node_test] = make_dbt_task(node, "test",dag)
            except DuplicateTaskIdFound:
                # print(node)
                # print(dbt_tasks[node])
                pass

model.data_gcp_dbt.typeform_adage
model.data_gcp_dbt.adage
model.data_gcp_dbt.adage_involved_institution
model.data_gcp_dbt.firebase_app_experiments
model.data_gcp_dbt.firebase_events
model.data_gcp_dbt.venues
model.data_gcp_dbt.offerers
model.data_gcp_dbt.bank_information
model.data_gcp_dbt.dms_pro
model.data_gcp_dbt.dms_jeunes
model.data_gcp_dbt.downstream_adage_test
model.data_gcp_dbt.firebase_events_bis
model.data_gcp_dbt.firebase_visits
model.data_gcp_dbt.firebase_venue_data
model.data_gcp_dbt.firebase_bookings
model.data_gcp_dbt.firebase_aggregated_offers
model.data_gcp_dbt.firebase_app_experiments_bis
model.data_gcp_dbt.firebase_aggregated_users
model.data_gcp_dbt.firebase_session_origin
model.data_gcp_dbt.firebase_pro_events


In [70]:
data["nodes"]['model.data_gcp_dbt.typeform_adage']['depends_on']


{'macros': [], 'nodes': ['source.data_gcp_dbt.clean.gsheet_eac_webinar']}

In [71]:

for node in data["nodes"].keys():
    if "elementary" not in node.split("."):
        if node.split(".")[0] == "model":
        
            # Set dependency to run tests on a model after model runs finishes
            # node_test = node.replace("model", "test")
            # dbt_tasks[node] >> dbt_tasks[node_test]
    
            # Set all model -> model dependencies
            for upstream_node in data["nodes"][node]["depends_on"]["nodes"]:
            
                upstream_node_type = upstream_node.split(".")[0]
                if upstream_node_type == "model":
                    dbt_tasks[upstream_node] >> dbt_tasks[node]

In [49]:
print(dag)


<DAG: dbt_dag>
