In [None]:
import pandas as pd

In [None]:
import pickle

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
from py2neo.bulk import create_relationships, merge_relationships

In [None]:
from py2neo.matching import *

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to be committed with the notebook;
# the fallbacks are the previous hardcoded local-development values.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "localadmin")
graph = Graph(uri, user=user, password=password)
# Matcher used further down to look up Act nodes by property.
nodes = NodeMatcher(graph)

## Consolidating nodes via batched relationship redirection

The APOC merge-nodes procedure does everything at once, so it maxes out memory. Instead, the simple routines below rewire relationships in batches until none are left on the node being merged, at which point that node can be removed.

In [None]:
# Load the record of merges completed in earlier sessions.
# NOTE(review): pickle.load can execute arbitrary code, so this file must only
# ever be one that this notebook wrote itself.
try:
    with open('consolidated_acts.pkl', 'rb') as fp:
        consolidated_acts = pickle.load(fp)
except FileNotFoundError:
    # First run: nothing has been saved yet, so start with an empty record
    # (later cells extend this list and write it back out).
    consolidated_acts = []
print('Done so far: ', consolidated_acts)

In [None]:
def count_rels(act_id):
    """Count the relationships pointing from Case nodes to one Act node.

    Args:
        act_id: value of the ``act_id`` property identifying the Act.

    Returns:
        The single value produced by the ``count(r)`` query.
    """
    # The id is sent as a query parameter rather than formatted into the
    # Cypher text, so the value is never parsed as part of the query.
    return graph.run(
        "MATCH (c:Case)-[r]->(a:Act {act_id: $act_id}) RETURN count(r)",
        act_id=act_id,
    ).evaluate()

In [None]:
def extract_batch_cases(act_id, batch_size=10):
    """Return the ``case_id`` of up to ``batch_size`` Case->Act relationships.

    Only relationships that start at a ``Case`` node are considered, which is
    the same pattern ``count_rels`` counts and the batch delete removes. The
    result has one entry per relationship, so a case with several
    relationships to the act can appear more than once.

    Args:
        act_id: value of the ``act_id`` property identifying the Act.
        batch_size: maximum number of relationships to fetch.

    Returns:
        A list of ``case_id`` values, at most ``batch_size`` long.
    """
    # Both values travel as query parameters instead of being formatted into
    # the Cypher text.
    rows = graph.run(
        """
        MATCH (c:Case)-[r]->(a:Act {act_id: $act_id})
        RETURN c.case_id AS case_id LIMIT $batch_size
        """,
        act_id=act_id,
        batch_size=batch_size,
    ).data()
    return [row['case_id'] for row in rows]

In [None]:
def merge_and_delete_batch(to_merge_act_id, merge_into_act_id, batch_size=10):
    """Rewire one batch of Case relationships from one Act to another.

    For up to ``batch_size`` relationships pointing at the act being merged,
    a ``USES_ACT`` relationship is created from the same case to the
    destination act, and the relationships from those cases to the old act
    are then deleted.

    Args:
        to_merge_act_id: ``act_id`` of the Act that is being emptied.
        merge_into_act_id: ``act_id`` of the Act that receives the relationships.
        batch_size: maximum number of relationships handled in this call.

    Returns:
        A tuple ``(left_to_merge, at_destination)`` holding the relationship
        counts of the old and the destination act after the batch.
    """
    print("Initiating batch of rewires, rels into to merge: ", count_rels(to_merge_act_id), " and dest: ", count_rels(merge_into_act_id))
    # get the cases
    cases_to_rewire = extract_batch_cases(to_merge_act_id, batch_size=batch_size)
    if cases_to_rewire:
        # wire them to the new act
        data = [(case_id, {"type": "USES_ACT"}, merge_into_act_id) for case_id in cases_to_rewire]
        create_relationships(graph.auto(), data, "USES_ACT",
                             start_node_key=("Case", "case_id"),
                             end_node_key=("Act", "act_id"))
        # delete them from the old act; ids are passed as parameters rather
        # than formatted into the query text.
        # NOTE(review): this removes every relationship from the listed cases
        # to the old act, including any that were not part of this batch.
        graph.run(
            """
            MATCH (c:Case)-[r]->(a:Act {act_id: $act_id})
            WHERE c.case_id IN $cases
            DELETE r
            """,
            act_id=to_merge_act_id,
            cases=cases_to_rewire,
        ).evaluate()
    # complete
    left_to_merge = count_rels(to_merge_act_id)
    at_destination = count_rels(merge_into_act_id)
    print("Completed batch, rels into to merge: ", left_to_merge, " and dest: ", at_destination)
    return left_to_merge, at_destination

In [None]:
def merge_two_nodes(to_merge_node, merge_into_node, batch_size=1000):
    """Move all Case relationships from one Act node to another, then delete it.

    Relationships are rewired in batches through ``merge_and_delete_batch``
    until ``count_rels`` reports none left on the node being merged; that
    node is then deleted.

    Args:
        to_merge_node: Act node to empty and remove (its ``act_id`` is read).
        merge_into_node: Act node that receives the relationships.
        batch_size: number of relationships rewired per batch.

    Raises:
        ValueError: if both nodes carry the same ``act_id``; rewiring a node
            onto itself would delete the relationships it had just created
            and then the node.
        RuntimeError: if a batch leaves the relationship count unchanged,
            which would otherwise make the loop run forever.
    """
    print("Initiating merger of nodes: ", to_merge_node, merge_into_node)
    to_merge_act_id = int(to_merge_node.get('act_id'))
    merge_into_act_id = int(merge_into_node.get('act_id'))
    if to_merge_act_id == merge_into_act_id:
        raise ValueError(f"Cannot merge Act {to_merge_act_id} into itself")

    to_merge_rels = count_rels(to_merge_act_id)
    while to_merge_rels > 0:
        remaining, _ = merge_and_delete_batch(to_merge_act_id, merge_into_act_id, batch_size=batch_size)
        if remaining >= to_merge_rels:
            # No relationship was removed by this batch, so repeating it
            # would never terminate.
            raise RuntimeError(
                f"No progress merging Act {to_merge_act_id}: {remaining} relationships remain"
            )
        to_merge_rels = remaining
    print("Completed copying across relationships, removing node")

    # Plain DELETE (not DETACH DELETE): this fails rather than silently
    # dropping relationships if the node still has any attached.
    graph.run("MATCH (a:Act {act_id: $act_id}) DELETE a", act_id=to_merge_act_id).evaluate()
    print("Completed merger")

In [None]:
# to_merge_act_id = 4942 # Criminal Procedure Act (big one) - also do 4943
# merge_into_act_id = 4925 # Criminal Procedure Act (even bigger one)

# first_act = nodes.match("Act", act_id=to_merge_act_id).first()
# first_act_name = first_act.get('act_sum')
# print(first_act)
# into_act = nodes.match("Act", act_id=merge_into_act_id).first()
# into_act_name = into_act.get('act_sum')
# print(into_act)

In [None]:
# Ids are paired by position: to_merge[i] is merged into merge_into[i].
to_merge = [14375, 10835, 4916, 11238]
merge_into = [14692, 10818, 4913, 11244]

if len(to_merge) != len(merge_into):
    raise ValueError("to_merge and merge_into must have the same length")

# Look each node up by its own id. Matching both lists with IN(...) and
# zipping the results relies on the database returning nodes in list order
# and on no node being missing; otherwise the wrong acts get paired.
to_merge_nodes = []
merge_into_nodes = []
for to_merge_id, merge_into_id in zip(to_merge, merge_into):
    to_merge_node = nodes.match("Act", act_id=to_merge_id).first()
    merge_into_node = nodes.match("Act", act_id=merge_into_id).first()
    if to_merge_node is None or merge_into_node is None:
        # One side no longer exists (e.g. already merged): skip the pair.
        print("Skipping pair, node not found: ", to_merge_id, merge_into_id)
        continue
    to_merge_nodes.append(to_merge_node)
    merge_into_nodes.append(merge_into_node)

for to_merge_node, merge_into_node in zip(to_merge_nodes, merge_into_nodes):
    merge_two_nodes(to_merge_node, merge_into_node)

In [None]:
# One record per merged pair: which act was removed and which act absorbed it.
newly_complete_acts = [{
    'removed_act_id': int(to_merge_node.get('act_id')),
    'removed_act_name': to_merge_node.get('act_sum'),
    'merged_into_act_id': int(merge_into_node.get('act_id')),
    'merged_into_act_name': merge_into_node.get('act_sum')
} for to_merge_node, merge_into_node in zip(to_merge_nodes, merge_into_nodes)]

# Append only records that are not saved yet, so re-running this cell does
# not duplicate entries in the list.
fresh_records = [record for record in newly_complete_acts if record not in consolidated_acts]
consolidated_acts += fresh_records

if fresh_records:
    print("Adding to saved list: ", fresh_records[-1])
else:
    # Nothing was added; indexing [-1] here could fail on an empty list.
    print("No new records to add to saved list")

In [None]:
# Show the full list of merge records currently held in memory.
print(consolidated_acts)

In [None]:
# Write the merge records back to disk, replacing the previous file, so the
# next session can load them again.
with open('consolidated_acts.pkl', 'wb') as records_file:
    pickle.dump(consolidated_acts, records_file)