In [1]:
import json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

In [2]:
# Relative location of the dbt project's `target` directory that this notebook reads from.
DBT_DOCS_DIR = Path("..", "jaffle_shop_duckdb-duckdb", "target")

In [3]:
# The two JSON artifacts sit side by side inside DBT_DOCS_DIR.
catalog_json_file_path = DBT_DOCS_DIR.joinpath("catalog.json")
manifest_json_file_path = DBT_DOCS_DIR.joinpath("manifest.json")

In [34]:
@dataclass
class ColumnMetadata:
    """Documentation and typing details for a single column of a dbt model."""

    # Column name, as keyed in the manifest node's ``columns`` mapping.
    name: str
    # Free-text description taken from the manifest.
    description: str
    # Data type reported by the catalog; ``None`` when the catalog has no
    # entry for this column (the lookup that fills it uses ``.get('type')``).
    type: Optional[str]
    # Values listed by an ``accepted_values`` test on this column; empty when
    # no such test exists.
    accepted_values: List[str]

@dataclass
class TableMetadata:
    """Location, description and column metadata of one dbt model."""

    # Database the model belongs to (the manifest node's ``database`` value).
    database: str
    # Schema the model belongs to (the manifest node's ``schema`` value).
    schema: str
    # Model name (the manifest node's ``name`` value).
    table: str
    # Free-text description from the manifest; an empty string when undocumented.
    description: str
    # One entry per column declared on the manifest node.
    columns: List[ColumnMetadata]

@dataclass
class DatabaseMetadata:
    """Container for the metadata of every table collected from a dbt project."""

    # One entry per manifest node whose resource type is ``model``.
    tables: List[TableMetadata]

In [5]:
def _read_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's locale encoding (the default for ``open``).

    Args:
        path: Location of the JSON file (``str`` or ``Path``).

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


dbt_catalog = _read_json(catalog_json_file_path)
dbt_manifest = _read_json(manifest_json_file_path)

In [35]:
def _collect_accepted_values(manifest):
    """Index the values of every ``accepted_values`` test in a dbt manifest.

    Args:
        manifest: Parsed ``manifest.json``; only its ``nodes`` mapping is read.

    Returns:
        A nested dict ``{attached node id: {column name: accepted values}}``.
        If several tests target the same column, the last one encountered wins.
    """
    accepted_by_model = {}
    for node in manifest['nodes'].values():
        test_metadata = node.get('test_metadata')
        # Only test nodes carry `test_metadata`; skip everything else.
        if not test_metadata or test_metadata['name'] != 'accepted_values':
            continue
        kwargs = test_metadata['kwargs']
        accepted_values = kwargs['values']
        column_name = kwargs['column_name']
        model_id = node['attached_node']
        accepted_by_model.setdefault(model_id, {})[column_name] = accepted_values
    return accepted_by_model


def _build_tables_metadata(manifest, catalog, accepted_values_tests):
    """Build one ``TableMetadata`` per model node of a dbt manifest.

    Args:
        manifest: Parsed ``manifest.json``.
        catalog: Parsed ``catalog.json``; supplies the column data types.
        accepted_values_tests: Mapping as returned by ``_collect_accepted_values``.

    Returns:
        A list of ``TableMetadata`` in manifest iteration order.
    """
    tables = []
    for node_id, node in manifest['nodes'].items():
        if node['resource_type'] != 'model':
            continue

        # A model can be listed in the manifest without having a catalog entry
        # (for example when it was never materialized in the warehouse). Fall
        # back to "no type information" instead of raising KeyError.
        catalog_columns = catalog['nodes'].get(node_id, {}).get('columns', {})
        model_accepted_values = accepted_values_tests.get(node_id, {})

        columns_metadata = [
            ColumnMetadata(
                name=column_name,
                description=column['description'],
                # None when the catalog does not know this column.
                type=catalog_columns.get(column_name, {}).get('type'),
                accepted_values=model_accepted_values.get(column_name, []),
            )
            for column_name, column in node['columns'].items()
        ]

        tables.append(
            TableMetadata(
                database=node['database'],
                schema=node['schema'],
                table=node['name'],
                description=node['description'],
                columns=columns_metadata,
            )
        )
    return tables


all_accepted_values_tests = _collect_accepted_values(dbt_manifest)
tables_metadata = _build_tables_metadata(dbt_manifest, dbt_catalog, all_accepted_values_tests)
database_metadata = DatabaseMetadata(tables_metadata)

In [36]:
# Display the assembled metadata (dataclass repr) for every collected model.
database_metadata

DatabaseMetadata(tables=[TableMetadata(database='jaffle_shop', schema='main', table='customers', description="This table has basic information about a customer, as well as some derived facts based on a customer's orders", columns=[ColumnMetadata(name='customer_id', description='This is a unique identifier for a customer', type='INTEGER', accepted_values=[]), ColumnMetadata(name='first_name', description="Customer's first name. PII.", type='VARCHAR', accepted_values=[]), ColumnMetadata(name='last_name', description="Customer's last name. PII.", type='VARCHAR', accepted_values=[]), ColumnMetadata(name='first_order', description="Date (UTC) of a customer's first order", type='DATE', accepted_values=[]), ColumnMetadata(name='most_recent_order', description="Date (UTC) of a customer's most recent order", type='DATE', accepted_values=[]), ColumnMetadata(name='number_of_orders', description='Count of the number of orders a customer has placed', type='BIGINT', accepted_values=[]), ColumnMetada

In [50]:
# Print one `Document(...)` constructor snippet per table: the text is the
# schema-qualified table name, the extra info holds the table description and
# a comma-separated list of its column names.
for table in database_metadata.tables:
    qualified_name = ".".join((table.schema, table.table))
    extra_info = {
        "description": table.description,
        "columns": ','.join(column.name for column in table.columns),
    }
    print('Document(text="' + qualified_name + '", extra_info=', extra_info, '),')

Document(text="main.customers", extra_info= {'description': "This table has basic information about a customer, as well as some derived facts based on a customer's orders", 'columns': 'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount'} ),
Document(text="main.orders", extra_info= {'description': 'This table has basic information about orders, as well as some derived facts based on payments', 'columns': 'order_id,customer_id,order_date,status,amount,credit_card_amount,coupon_amount,bank_transfer_amount,gift_card_amount'} ),
Document(text="main.stg_customers", extra_info= {'description': '', 'columns': 'customer_id'} ),
Document(text="main.stg_orders", extra_info= {'description': '', 'columns': 'order_id,status'} ),
Document(text="main.stg_payments", extra_info= {'description': '', 'columns': 'payment_id,payment_method'} ),


In [8]:
# List the top-level sections of the parsed catalog.json.
dbt_catalog.keys()

dict_keys(['metadata', 'nodes', 'sources', 'errors'])

In [13]:
# Spot-check: the catalog's data type for the `customer_id` column of the customers model.
dbt_catalog['nodes']['model.jaffle_shop.customers']['columns']['customer_id']['type']

'INTEGER'

In [32]:
# Build {attached node id: {column name: accepted values}} from every
# `accepted_values` test node in the manifest, then display the mapping.
all_accepted_values_tests = {}
for test_node in dbt_manifest['nodes'].values():
    if 'test_metadata' not in test_node:
        continue
    metadata = test_node['test_metadata']
    if metadata['name'] != 'accepted_values':
        continue
    test_kwargs = metadata['kwargs']
    values = test_kwargs['values']
    column = test_kwargs['column_name']
    model = test_node['attached_node']
    # Create the per-model dict on first sight, then record this column's values.
    all_accepted_values_tests.setdefault(model, {})[column] = values

all_accepted_values_tests

{'model.jaffle_shop.orders': {'status': ['placed',
   'shipped',
   'completed',
   'return_pending',
   'returned']},
 'model.jaffle_shop.stg_orders': {'status': ['placed',
   'shipped',
   'completed',
   'return_pending',
   'returned']},
 'model.jaffle_shop.stg_payments': {'payment_method': ['credit_card',
   'coupon',
   'bank_transfer',
   'gift_card']}}