In [1]:
import datetime
import json

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import datetime
from airflow.utils.dates import timedelta
PATH_TO_DBT_PROJECT = "/Users/valentin/Github/data-gcp/orchestration/dags/data_gcp_dbt"


In [2]:
def load_manifest():
    local_filepath = PATH_TO_DBT_PROJECT  + "/target/manifest.json"
    with open(local_filepath) as f:
        data = json.load(f)

    return data

In [3]:
def make_dbt_run_single_task(node, dag,DBT_DIR = PATH_TO_DBT_PROJECT,full_refresh =False,GLOBAL_CLI_FLAGS = "--no-write-json"):
    """Returns an Airflow operator either run and test an individual model"""
    node_alias = node['alias']
    dbt_task = BashOperator(
        task_id=node_alias,
        bash_command=f"""
        dbt {GLOBAL_CLI_FLAGS} run --target dev --select {node_alias} --no-compile
        """ if not full_refresh else f"""
        dbt {GLOBAL_CLI_FLAGS} run --target dev --select {node_alias} --no-compile --full-refresh
        """,
        cwd=PATH_TO_DBT_PROJECT, 
        dag=dag
    )

    return dbt_task

In [4]:
data = load_manifest()
data['nodes'].keys()





dict_keys(['model.data_gcp_dbt.isbn_rayon_editor', 'model.data_gcp_dbt.bookable_collective_offer', 'model.data_gcp_dbt.available_stock_information', 'model.data_gcp_dbt.offer_item_ids', 'model.data_gcp_dbt.booking', 'model.data_gcp_dbt.cleaned_stock', 'model.data_gcp_dbt.offer_extracted_data', 'model.data_gcp_dbt.bookable_offer', 'model.data_gcp_dbt.agg_partner_cultural_sector', 'model.data_gcp_dbt.linked_offers', 'model.data_gcp_dbt.siren_data', 'model.data_gcp_dbt.subcategories', 'model.data_gcp_dbt.siren_data_labels', 'model.data_gcp_dbt.region_department', 'model.data_gcp_dbt.applicative_database_offerer_tag_category', 'model.data_gcp_dbt.applicative_database_offerer_tag_category_mapping', 'model.data_gcp_dbt.applicative_database_collective_booking', 'model.data_gcp_dbt.applicative_database_favorite', 'model.data_gcp_dbt.applicative_database_venue_criterion', 'model.data_gcp_dbt.applicative_database_offerer_tag', 'model.data_gcp_dbt.applicative_database_offerer', 'model.data_gcp_db

In [5]:
standard_tests = ["not_null","unique","accepted_values","relationships"]
for i,node in enumerate(data["nodes"].keys()):
    if "test" in node.split("."): 
        print(node)
        
        
        

test.data_gcp_dbt.dummy_test
test.data_gcp_dbt.not_null_siren_data_siren.565b43275a
test.data_gcp_dbt.unique_siren_data_siren.0fdebc9526
test.data_gcp_dbt.relationships_siren_data_categorieEntreprise__dep_name__ref_region_department_.8efee564d5
test.data_gcp_dbt.accepted_values_siren_data_categorieEntreprise__PME__GE.92736e9208


In [6]:
data["nodes"]["test.data_gcp_dbt.dummy_test"]

{'database': 'passculture-data-ehp',
 'schema': 'tmp_dev',
 'name': 'dummy_test',
 'resource_type': 'test',
 'package_name': 'data_gcp_dbt',
 'path': 'dummy_test.sql',
 'original_file_path': 'tests/dummy_test.sql',
 'unique_id': 'test.data_gcp_dbt.dummy_test',
 'fqn': ['data_gcp_dbt', 'dummy_test'],
 'alias': 'dummy_test',
 'checksum': {'name': 'sha256',
  'checksum': '5f7096d753dfb36157ef9c0bf043418589833d113d25b04d79f61a03ca45efe4'},
 'config': {'enabled': True,
  'alias': None,
  'schema': 'dbt_test__audit',
  'database': None,
  'tags': [],
  'meta': {},
  'group': None,
  'materialized': 'test',
  'severity': 'ERROR',
  'store_failures': True,
  'store_failures_as': 'table',
  'where': None,
  'limit': None,
  'fail_calc': 'count(*)',
  'warn_if': '!= 0',
  'error_if': '>=10',
  'docs': {'show': True, 'tags': 'test'}},
 'tags': [],
 'description': '',
 'columns': {},
 'meta': {},
 'group': None,
 'docs': {'show': True, 'node_color': None},
 'patch_path': None,
 'build_path': None,

In [7]:
data["nodes"]['model.data_gcp_dbt.siren_data']

{'database': 'passculture-data-ehp',
 'schema': 'tmp_dev',
 'name': 'siren_data',
 'resource_type': 'model',
 'package_name': 'data_gcp_dbt',
 'path': 'raw/siren_data.sql',
 'original_file_path': 'models/raw/siren_data.sql',
 'unique_id': 'model.data_gcp_dbt.siren_data',
 'fqn': ['data_gcp_dbt', 'raw', 'siren_data'],
 'alias': 'siren_data',
 'checksum': {'name': 'sha256',
  'checksum': '752b3768f2dd123c8c49c3a53dcd1eb8164db9cd6577d16996cdfe78c2fb00f4'},
 'config': {'enabled': True,
  'alias': None,
  'schema': None,
  'database': None,
  'tags': ['pro'],
  'meta': {},
  'group': None,
  'materialized': 'view',
  'incremental_strategy': None,
  'persist_docs': {},
  'post-hook': [],
  'pre-hook': [],
  'quoting': {},
  'column_types': {},
  'full_refresh': None,
  'unique_key': None,
  'on_schema_change': 'ignore',
  'on_configuration_change': 'apply',
  'grants': {},
  'packages': [],
  'docs': {'show': True, 'node_color': '#0320fc'},
  'contract': {'enforced': False, 'alias_types': Tr

In [8]:
data["nodes"]["test.data_gcp_dbt.not_null_siren_data_siren.565b43275a"]

{'test_metadata': {'name': 'not_null',
  'kwargs': {'column_name': 'siren',
   'model': "{{ get_where_subquery(ref('siren_data')) }}"},
  'namespace': None},
 'database': 'passculture-data-ehp',
 'schema': 'tmp_dev',
 'name': 'not_null_siren_data_siren',
 'resource_type': 'test',
 'package_name': 'data_gcp_dbt',
 'path': 'not_null_siren_data_siren.sql',
 'original_file_path': 'models/raw/schema.yml',
 'unique_id': 'test.data_gcp_dbt.not_null_siren_data_siren.565b43275a',
 'fqn': ['data_gcp_dbt', 'raw', 'not_null_siren_data_siren'],
 'alias': 'not_null_siren_data_siren',
 'checksum': {'name': 'none', 'checksum': ''},
 'config': {'enabled': True,
  'alias': None,
  'schema': 'dbt_test__audit',
  'database': None,
  'tags': [],
  'meta': {},
  'group': None,
  'materialized': 'test',
  'severity': 'warn',
  'store_failures': True,
  'store_failures_as': 'table',
  'where': None,
  'limit': None,
  'fail_calc': 'count(*)',
  'warn_if': '!= 0',
  'error_if': '!= 0',
  'docs': {'show': True,

In [13]:
children_tests = {}
for node in data["nodes"].keys():
    test_alias = data["nodes"][node].split('.')[-2]
    if data["nodes"][node]["resource_type"] == "test":
        parents = data["nodes"][node]["depends_on"]["nodes"]
        for p_node in parents:
            p_alias  =  data["nodes"][p_node]["alias"]
            if children_tests.get(p_alias,None) is None:
               children_tests[p_alias] = [test_alias] 
            else:
                children_tests[p_alias] += [test_alias] 
        
        # node_test = node.replace("model", "test")
        # dbt_tasks[alias] = make_dbt_run_single_task(alias, "run")
        # try :
        #     dbt_tasks[alias] = make_dbt_run_single_task(alias, "run",dag)
        # except DuplicateTaskIdFound:
        #     print( data["nodes"][node]['alias'])
        #     # print(dbt_tasks[node])
        #     pass
    

In [12]:
children_tests 

{'siren_data': ['test.data_gcp_dbt.dummy_test',
  'test.data_gcp_dbt.not_null_siren_data_siren.565b43275a',
  'test.data_gcp_dbt.unique_siren_data_siren.0fdebc9526',
  'test.data_gcp_dbt.relationships_siren_data_categorieEntreprise__dep_name__ref_region_department_.8efee564d5',
  'test.data_gcp_dbt.accepted_values_siren_data_categorieEntreprise__PME__GE.92736e9208'],
 'region_department': ['test.data_gcp_dbt.relationships_siren_data_categorieEntreprise__dep_name__ref_region_department_.8efee564d5']}

In [43]:
from airflow.exceptions import DuplicateTaskIdFound


default_args = {
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 23),
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}
dag = DAG(
    'dbt_dag',
    default_args=default_args,
    description='A dbt wrapper for airflow',
    schedule=timedelta(days=1),
)


dbt_tasks = {}
dbt_tests = {}

for node in data["nodes"].keys():
    alias = data["nodes"][node]["alias"]
    if data["nodes"][node]["resource_type"] == "model":
        
        # node_test = node.replace("model", "test")
        dbt_tasks[alias] = make_dbt_run_single_task(alias, "run",dag)
        # try :
        #     dbt_tasks[alias] = make_dbt_run_single_task(alias, "run",dag)
        # except DuplicateTaskIdFound:
        #     print( data["nodes"][node]['alias'])
        #     # print(dbt_tasks[node])
        #     pass
    if data["nodes"][node]["resource_type"] == "test":
            
        # node_test = node.replace("model", "test")
        dbt_tests[alias] = make_dbt_run_single_task(alias, "run",dag)

        # try :
        #     dbt_tasks[node_test] = make_dbt_task(node, "test",dag)
        # except DuplicateTaskIdFound:
        #     # print(node)
        #     # print(dbt_tasks[node])
        #     pass
    if 'elementary' in node.split('.'):
        print(node)

AttributeError: 'str' object has no attribute 'task_group'

In [70]:
data["nodes"]['model.data_gcp_dbt.typeform_adage']['depends_on']


{'macros': [], 'nodes': ['source.data_gcp_dbt.clean.gsheet_eac_webinar']}

In [71]:

for node in data["nodes"].keys():
    if "elementary" not in node.split("."):
        if node.split(".")[0] == "model":
        
            # Set dependency to run tests on a model after model runs finishes
            # node_test = node.replace("model", "test")
            # dbt_tasks[node] >> dbt_tasks[node_test]
    
            # Set all model -> model dependencies
            for upstream_node in data["nodes"][node]["depends_on"]["nodes"]:
            
                upstream_node_type = upstream_node.split(".")[0]
                if upstream_node_type == "model":
                    dbt_tasks[upstream_node] >> dbt_tasks[node]

In [49]:
print(dag)


<DAG: dbt_dag>
