In [64]:
import os
import sys
from typing import Any, Dict, List, Union
import json

from neo4j import GraphDatabase

In [2]:
# Connection settings come from the environment so that no credentials are
# stored in the notebook. A variable that is not set yields None here.
uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
dbname = "neo4j"  # database name handed to driver.session() further down


# Single driver instance shared by every query in this notebook.
# NOTE(review): max_transaction_retry_time is presumably in seconds --
# confirm against the Neo4j Python driver configuration docs.
driver = GraphDatabase.driver(
    uri,
    auth=(user, password),
    max_transaction_retry_time=180,
)

In [50]:
# Cypher statement executed by get_neo4j_data_counts(): it calls the
# procedure apoc.meta.stats and keeps two of its columns, in this order:
# `labels` first, `relTypesCount` second.
query = """
call apoc.meta.stats
yield labels, relTypesCount
"""

In [51]:
def get_neo4j_data_counts() -> List[Dict[str, Any]]:
    """Run the module-level ``query`` and return the values of its first row.

    Opens a session on the module-level ``driver`` against database
    ``dbname``, executes ``query`` and materialises all result rows.

    Returns:
        The column values of the first result row, in the order the query
        yields them.

    Raises:
        IndexError: If the query produced no rows.
    """
    with driver.session(database=dbname) as session:
        # .values() pulls every record while the session is still open.
        rows = session.run(query).values()
        first_row = rows[0]
    return first_row

In [62]:
def get_jsonl_node_file_size(node_label: str) -> int:
    """Count the lines in the exported JSONL file of a node label.

    Reads ``exports/nodes/{node_label}.jsonl`` (relative to the current
    working directory) as UTF-8 text. Every line is counted, blank lines
    included.

    Args:
        node_label: Node label whose export file should be measured.

    Returns:
        The number of lines in the file; 0 for an empty file.

    Raises:
        FileNotFoundError: If the export file does not exist.
    """
    file_name: str = f"exports/nodes/{node_label}.jsonl"
    with open(file_name, 'r', encoding='utf-8') as f:
        # Stream the file and only count: the line contents are never
        # needed, so nothing is accumulated in memory.
        return sum(1 for _ in f)

def get_jsonl_rel_file_size(rel_type: str) -> int:
    """Count the lines in the exported JSONL file of a relationship type.

    Reads ``exports/relationships/{rel_type}.jsonl`` (relative to the
    current working directory) as UTF-8 text. Every line is counted, blank
    lines included.

    Args:
        rel_type: Relationship type whose export file should be measured.

    Returns:
        The number of lines in the file; 0 for an empty file.

    Raises:
        FileNotFoundError: If the export file does not exist.
    """
    file_name: str = f"exports/relationships/{rel_type}.jsonl"
    with open(file_name, 'r', encoding='utf-8') as f:
        # Stream the file and only count: the line contents are never
        # needed, so nothing is accumulated in memory.
        return sum(1 for _ in f)

In [67]:
# Node labels to validate; each one is looked up in the database counts and
# in exports/nodes/<label>.jsonl.
labels = [
    "source",
    "ingested_table",
    "dbt_table_or_consumption_view",
    "powerbi_workspace",
    "powerbi_report",
    "powerbi_dashboard",
    "powerbi_dashboard_tile",
    "powerbi_dataset",
    "powerbi_table",
    "powerbi_datamart",
    "tableau_table",
    "tableau_dashboard",
    "tableau_workbook",
    "tableau_sheet",
]

# Relationship types to validate; each one is looked up in the database
# counts and in exports/relationships/<type>.jsonl.
rels = [
    "ingested_as",
    "has_view",
    "referenced_by",
    "used_in_powerbi_table",
    "present_in",
    "in_dashboard",
    "used_in_tile",
    "used_in",
    "in_workspace",
    "contained_in",
    "visualized_as",
]

In [54]:
# Query the database once; get_rel_total() and get_label_total() read this
# module-level value, so it must be assigned before they are called.
data_counts = get_neo4j_data_counts()

In [75]:
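# NOTE: disabled scratch check, kept for reference. It prints and sums the
# entries of data_counts[1] whose key, after skipping its first two
# characters, starts with `label`.
# label = "dbt_table_or_consumption_view"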
# s = 0
# for rel in [{k:v} for k, v in data_counts[1].items() if k[2:].startswith(label)]:
#     print(rel)
#     s += list(rel.values())[0]
# print(s)

In [55]:
def get_rel_total(rel_type: str) -> int:
    """Look up the database-side count stored for a relationship type.

    Reads the second element of the module-level ``data_counts``
    (presumably the ``relTypesCount`` column of the stats query -- verify
    against ``query``) and indexes it with ``rel_type``.

    Args:
        rel_type: Key to look up.

    Returns:
        The value stored under ``rel_type``.

    Raises:
        KeyError: If ``rel_type`` is not present in that mapping.
    """
    rel_counts = data_counts[1]
    return rel_counts[rel_type]

In [56]:
def get_label_total(label: str) -> int:
    """Look up the database-side count stored for a node label.

    Reads the first element of the module-level ``data_counts``
    (presumably the ``labels`` column of the stats query -- verify against
    ``query``) and indexes it with ``label``.

    Args:
        label: Key to look up.

    Returns:
        The value stored under ``label``.

    Raises:
        KeyError: If ``label`` is not present in that mapping.
    """
    label_counts = data_counts[0]
    return label_counts[label]

In [71]:
def validate_data_exports(labels: List[str], relationships: List[str]) -> Dict[str, Union[bool, Dict[str, Any]]]:
    """Compare database counts with export-file line counts.

    For every node label and then every relationship type, the count from
    the database lookup is compared with the line count of the matching
    export file. Each mismatch is reported with ``print``.

    Args:
        labels: Node labels to check.
        relationships: Relationship types to check.

    Returns:
        A dict with two keys:
        ``"data"`` maps each checked name to
        ``{"database": <count>, "file": <count>}``;
        ``"valid"`` is True only if every pair of counts matched.
    """
    counts_by_name: Dict[str, Any] = {}
    all_match = True

    # (names, database-side counter, file-side counter); nodes are handled
    # first, relationships second.
    checks = (
        (labels, get_label_total, get_jsonl_node_file_size),
        (relationships, get_rel_total, get_jsonl_rel_file_size),
    )

    for names, count_in_database, count_in_file in checks:
        for name in names:
            db_count = count_in_database(name)
            file_count = count_in_file(name)
            counts_by_name[name] = {"database": db_count, "file": file_count}
            if db_count == file_count:
                continue
            all_match = False
            print(f"{name} is not exported properly. database: {db_count} | file: {file_count}")

    return {"data": counts_by_name, "valid": all_match}

In [73]:
# Run the check for every configured node label and relationship type;
# mismatches, if any, are printed by validate_data_exports().
validation = validate_data_exports(labels=labels, relationships=rels)

In [76]:
# Per-name counts: {"database": ..., "file": ...} for each label / relationship type.
validation["data"]

{'source': {'database': 10700, 'file': 10700},
 'ingested_table': {'database': 10701, 'file': 10701},
 'dbt_table_or_consumption_view': {'database': 25611, 'file': 25611},
 'powerbi_workspace': {'database': 111, 'file': 111},
 'powerbi_report': {'database': 1596, 'file': 1596},
 'powerbi_dashboard': {'database': 206, 'file': 206},
 'powerbi_dashboard_tile': {'database': 574, 'file': 574},
 'powerbi_dataset': {'database': 1030, 'file': 1030},
 'powerbi_table': {'database': 10889, 'file': 10889},
 'powerbi_datamart': {'database': 5, 'file': 5},
 'tableau_table': {'database': 7188, 'file': 7188},
 'tableau_dashboard': {'database': 11632, 'file': 11632},
 'tableau_workbook': {'database': 2536, 'file': 2536},
 'tableau_sheet': {'database': 72227, 'file': 72227},
 'ingested_as': {'database': 10701, 'file': 10701},
 'has_view': {'database': 10685, 'file': 10685},
 'referenced_by': {'database': 44177, 'file': 44177},
 'used_in_powerbi_table': {'database': 6749, 'file': 6749},
 'present_in': {'

In [74]:
# Overall verdict: True only when every database count equals its file count.
print(validation["valid"])

True
