In [12]:
import os
import sys
from typing import Any, Dict, List
# from neo4j.debug import watch
import json

# uncomment below line(s) to get debug logging
# watch("neo4j", out=sys.stdout) #Output debug to stdout
# watch("neo4j", out=open('debugLogs.txt', 'w')) #Output debug to logfile


from neo4j import GraphDatabase

In [2]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
uri = os.environ.get("NEO4J_URI")
user = os.environ.get("NEO4J_USERNAME")
password = os.environ.get("NEO4J_PASSWORD")
dbname = 'neo4j'

# Fail early with a readable message: os.environ.get() returns None for an
# unset variable, and a None URI would otherwise only surface as a less
# obvious error from inside the driver.
if not uri:
    raise RuntimeError(
        "NEO4J_URI is not set; export NEO4J_URI (and NEO4J_USERNAME / "
        "NEO4J_PASSWORD) before running this notebook."
    )

# Single shared driver used by every session opened further down.
driver = GraphDatabase.driver(
    uri,
    auth=(user, password),
    # Retry budget for managed transactions (seconds, per the neo4j driver docs).
    max_transaction_retry_time=180,
)

In [3]:
# ex_query = (
# 'CALL apoc.export.json.all(null, {stream:true}) '
# 'YIELD file, nodes, relationships, properties, data '
# 'RETURN file, nodes, relationships, properties, data'
# )


# with driver.session(database=dbname) as session:
#     response = list(session.run(ex_query))
#     # Loop through results and do something with them
#     for p in response:
#         print(p)

In [49]:
# Cypher template exporting nodes of one label together with their outgoing
# relationships through apoc.export.json.data (streamed back as `data`).
# The template is rendered with str.format(node_label=...), which is why every
# literal Cypher brace is doubled; `node_label` is the only placeholder.
# NOTE(review): the inner `collect(n)` runs inside the per-row subquery, so it
# presumably holds a single node per call rather than a 200-row batch - confirm.
query = """
match (n:{node_label})
call {{
  with n
  with n, collect {{match (n)-[r]->() return r}} as relList
  with  collect(n) as nodeList, relList
  call apoc.export.json.data(nodeList, relList, null, {{stream:true}})
  yield data
  return data
}} in 4 concurrent transactions of 200 rows
return data
"""

In [59]:
# Cypher template exporting only the nodes of one label: the relationship list
# handed to apoc.export.json.data is always empty.
# Rendered with str.format(node_label=...); literal Cypher braces are doubled
# so that format() leaves them in place.
node_query = """
match (n:{node_label})
call {{
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList, [], null, {{stream:true}})
  yield data
  return data
}} in 4 concurrent transactions of 200 rows
return data
"""

In [68]:
# Cypher template exporting only the relationships of one type: the node list
# handed to apoc.export.json.data is always empty.
# Rendered with str.format(rel_type=...); literal Cypher braces are doubled
# so that format() leaves them in place.
rel_query = """
match ()-[r:{rel_type}]->()
call {{
  with r
  with  collect(r) as relList
  call apoc.export.json.data([], relList, null, {{stream:true}})
  yield data
  return data
}} in 4 concurrent transactions of 200 rows
return data
"""

In [69]:
# Render the template with a sample label to inspect the final Cypher text
# (the doubled braces in the template come out as single braces).
query.format(node_label="test")

'\nmatch (n:test)\ncall {\n  with n\n  with n, collect {match (n)-[r]->() return r} as relList\n  with  collect(n) as nodeList, relList\n  call apoc.export.json.data(nodeList, relList, null, {stream:true})\n  yield data\n  return data\n} in 4 concurrent transactions of 200 rows\nreturn data\n'

In [72]:
# with driver.session(database=dbname) as session:
#     response = session.run(query).values()

In [71]:
def _run_export(template: str, **placeholders: str) -> List[List[Any]]:
    """Render a Cypher template, echo it, run it and return all result rows.

    Args:
        template: One of the module-level str.format templates.
        **placeholders: Values substituted into the template. They are
            interpolated directly into the Cypher text, so only trusted
            identifiers may be passed.

    Returns:
        The rows produced by ``Result.values()``: one list per record. The
        templates in this notebook return a single ``data`` column, so each
        row has exactly one element.
    """
    # Render once so the printed text is exactly what gets executed.
    cypher = template.format(**placeholders)
    print(cypher)
    with driver.session(database=dbname) as session:
        # values() consumes the result while the session is still open.
        return session.run(cypher).values()


def get_data(node_label: str) -> List[List[Any]]:
    """Export nodes with ``node_label`` together with their outgoing relationships.

    Args:
        node_label: Label substituted into ``query``.

    Returns:
        One single-element row per record returned by the query.
    """
    return _run_export(query, node_label=node_label)


def get_node_data(node_label: str) -> List[List[Any]]:
    """Export only the nodes carrying ``node_label``.

    Args:
        node_label: Label substituted into ``node_query``.

    Returns:
        One single-element row per record returned by the query.
    """
    return _run_export(node_query, node_label=node_label)


def get_rel_data(rel_type: str) -> List[List[Any]]:
    """Export only the relationships of type ``rel_type``.

    Args:
        rel_type: Relationship type substituted into ``rel_query``.

    Returns:
        One single-element row per record returned by the query.
    """
    return _run_export(rel_query, rel_type=rel_type)

In [63]:
def save_data_as_json_lines_file(data: List[List[Any]], file_name: str = "data.json") -> None:
    """Write the first column of every row to ``file_name``, one JSON value per line.

    Args:
        data: Result rows; only element ``[0]`` of each row is written.
        file_name: Destination path. Missing parent directories are created.

    Each record is serialised without indentation so that it occupies exactly
    one line, as the JSON Lines format requires.
    """
    # NOTE(review): when row[0] is already a JSON string (as an APOC stream
    # export presumably returns), json.dumps encodes it again as a quoted JSON
    # string literal - confirm this is what downstream consumers expect.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        # open() does not create directories; make e.g. "exports/nodes/" on demand.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as f:
        for row in data:
            f.write(json.dumps(row[0]) + "\n")

In [43]:
# len(response)

58056

In [33]:
# save_data_as_json_lines_file(response, "test.json")

In [73]:
# Node labels handed to the node export loop, one output file per label.
labels = [
    "source",
    "ingested_table",
    "dbt_table_or_consumption_view",
    "powerbi_workspace",
    "powerbi_report",
    "powerbi_dashboard",
    "powerbi_dashboard_tile",
    "powerbi_dataset",
    "powerbi_table",
    "powerbi_datamart",
    "tableau_table",
    "tableau_dashboard",
    "tableau_workbook",
    "tableau_sheet",
]

# Relationship types handed to the relationship export loop, one output file per type.
rels = [
    "ingested_as",
    "has_view",
    "referenced_by",
    "used_in_powerbi_table",
    "present_in",
    "in_dashboard",
    "used_in_tile",
    "used_in",
    "in_workspace",
    "contained_in",
    "visualized_as",
]

In [74]:
# label = "powerbi_dataset"
# data = get_data(node_label=label)
# save_data_as_json_lines_file(data, label+".jsonl")

In [66]:
# Export each node label into its own JSON-lines file under exports/nodes/.
for label in labels:
    data = get_node_data(node_label=label)
    save_data_as_json_lines_file(data, file_name=f"exports/nodes/{label}.jsonl")


match (n:source)
call {
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList, [], null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match (n:ingested_table)
call {
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList, [], null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match (n:dbt_table_or_consumption_view)
call {
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList, [], null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match (n:powerbi_workspace)
call {
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList, [], null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match (n:powerbi_report)
call {
  with n
  with  collect(n) as nodeList
  call apoc.export.json.data(nodeList

In [75]:
# Export each relationship type into its own JSON-lines file under exports/relationships/.
for rel in rels:
    data = get_rel_data(rel_type=rel)
    save_data_as_json_lines_file(data, file_name=f"exports/relationships/{rel}.jsonl")


match ()-[r:ingested_as]->()
call {
  with r
  with  collect(r) as relList
  call apoc.export.json.data([], relList, null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match ()-[r:has_view]->()
call {
  with r
  with  collect(r) as relList
  call apoc.export.json.data([], relList, null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match ()-[r:referenced_by]->()
call {
  with r
  with  collect(r) as relList
  call apoc.export.json.data([], relList, null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match ()-[r:used_in_powerbi_table]->()
call {
  with r
  with  collect(r) as relList
  call apoc.export.json.data([], relList, null, {stream:true})
  yield data
  return data
} in 4 concurrent transactions of 200 rows
return data


match ()-[r:present_in]->()
call {
  with r
  with  collect(r) as relList
  call apoc.export.json.data