In [43]:
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "Asdfasdf1!"
DB_NAME = "freiburg-2024-ris"

NODE_HIERARCHY = [
    "File",
    "Paper",
    "Consultation",
    "AgendaItem",
    "Meeting",
    "Person",
    "Organization",
    "Membership",
    "Location",
]


def markFilesFrom2024(tx):
    query = """
    MATCH (file:File)
    WHERE file.date STARTS WITH '2024-'
    SET file.shouldKeep = true
    RETURN count(file) as markedFiles
    """
    result = tx.run(query)
    return result.single()["markedFiles"]


def markRelatedNodes(tx, currentType, nextType):
    query = f"""
    MATCH (current:{currentType} {{shouldKeep: true}})-[relationship]-(next:{nextType})
    WHERE next.shouldKeep IS NULL
    SET next.shouldKeep = true
    RETURN count(next) as markedNodes, collect(distinct type(relationship)) as relationshipTypes
    """
    result = tx.run(query)
    markedNodes = 0
    relationshipTypes = set()
    for record in result:
        markedNodes += record["markedNodes"]
        relationshipTypes.update(record["relationshipTypes"])
    return markedNodes, list(relationshipTypes)


def processNodeType(session, currentType):
    for nextType in NODE_HIERARCHY[NODE_HIERARCHY.index(currentType) + 1 :]:
        markedNodes, relationshipTypes = session.execute_write(
            markRelatedNodes, currentType, nextType
        )
        print(f"  Marked {markedNodes} {nextType} nodes related to {currentType}")
        print(f"  Relationship types: {relationshipTypes}")


def markAllLegislativeTerms(tx):
    query = """
    MATCH (legislativeTerm:LegislativeTerm)
    WHERE legislativeTerm.shouldKeep IS NULL
    SET legislativeTerm.shouldKeep = true
    RETURN count(legislativeTerm) as markedTerms
    """
    result = tx.run(query)
    return result.single()["markedTerms"]


def countNodesByType(tx):
    query = """
    MATCH (node)
    RETURN labels(node) as nodeType, count(node) as count, 
           sum(CASE WHEN node.shouldKeep = true THEN 1 ELSE 0 END) as keptCount
    """
    result = tx.run(query)
    return list(result)


def deleteUnmarkedNodes(tx):
    query = """
    MATCH (node)
    WHERE node.shouldKeep IS NULL
    WITH count(node) as nodesToDelete
    CALL {
        WITH nodesToDelete
        MATCH (node)
        WHERE node.shouldKeep IS NULL
        DETACH DELETE node
    }
    RETURN nodesToDelete
    """
    result = tx.run(query)
    removeShouldKeepQuery = """
    MATCH (node)
    WHERE node.shouldKeep IS NOT NULL
    REMOVE node.shouldKeep
    """
    tx.run(removeShouldKeepQuery)
    return result.single()["nodesToDelete"]


def createSubsetGraph(driver):
    with driver.session(database=DB_NAME) as session:
        markedFiles = session.execute_write(markFilesFrom2024)
        print(f"Marked {markedFiles} files from 2024")

        print("\nInitial node counts:")
        initialCounts = session.execute_read(countNodesByType)
        for count in initialCounts:
            print(f"{count['nodeType']}: {count['count']} (Kept: {count['keptCount']})")

        for nodeType in NODE_HIERARCHY:
            print(f"\nProcessing {nodeType} nodes:")
            processNodeType(session, nodeType)

        print("\nMarking all LegislativeTerm nodes:")
        markedTerms = session.execute_write(markAllLegislativeTerms)
        print(f"Marked {markedTerms} LegislativeTerm nodes")

        print("\nFinal node counts before deletion:")
        finalCounts = session.execute_read(countNodesByType)
        for count in finalCounts:
            print(f"{count['nodeType']}: {count['count']} (Kept: {count['keptCount']})")

        deletedNodes = session.execute_write(deleteUnmarkedNodes)
        print(f"\nDeleted {deletedNodes} unmarked nodes")

        print("\nFinal node counts after deletion:")
        finalCounts = session.execute_read(countNodesByType)
        for count in finalCounts:
            print(f"{count['nodeType']}: {count['count']}")


def main():
    with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver:
        createSubsetGraph(driver)
    print("Subset graph created successfully.")


if __name__ == "__main__":
    main()

Marked 478 files from 2024

Initial node counts:
['Organization']: 24 (Kept: 0)
['Person']: 264 (Kept: 0)
['Membership']: 658 (Kept: 0)
['Meeting']: 102 (Kept: 0)
['AgendaItem']: 3 (Kept: 0)
['LegislativeTerm']: 34 (Kept: 0)
['Paper']: 111 (Kept: 0)
['Consultation']: 272 (Kept: 0)
['File']: 478 (Kept: 478)
['Location']: 102 (Kept: 0)

Processing File nodes:
  Marked 328 Paper nodes related to File
  Relationship types: ['PAPER_HAS_AUXILIARY_FILE']
  Marked 0 Consultation nodes related to File
  Relationship types: []
  Marked 0 AgendaItem nodes related to File
  Relationship types: []
  Marked 150 Meeting nodes related to File
  Relationship types: ['MEETING_HAS_AUXILIARY_FILE']
  Marked 0 Person nodes related to File
  Relationship types: []
  Marked 0 Organization nodes related to File
  Relationship types: []
  Marked 0 Membership nodes related to File
  Relationship types: []
  Marked 0 Location nodes related to File
  Relationship types: []

Processing Paper nodes:
  Marked 272 Co