In [1]:
import os
from typing import Dict, List, Union

from neo4j import GraphDatabase
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
driver = GraphDatabase.driver(uri=os.environ.get("NEO4J_URI"), auth=(os.environ.get("NEO4J_USERNAME"), os.environ.get("NEO4J_PASSWORD")))
driver.verify_authentication()
driver.verify_connectivity()

### Define Queries

These are modified queries to return only the IDs.

In [3]:
ravi_query: str = """
MATCH (n:dbt_table_or_consumption_view)
WHERE n.name ='prod_khc_sales.kroger.kroger_daily_point_of_sale_fact'
WITH n
MATCH p=()-[*{min_level}..{max_level}]->(n)
WITH n, nodes(p) AS nodes, relationships(p) AS rels
with n, {{ node: head(nodes) , rel: head(rels) }} AS root
WITH n, COLLECT(DISTINCT root) AS roots
MATCH p=(n)-[*{min_level}..{max_level}]->()
WITH n, roots, nodes(p) AS nodes, relationships(p) AS rels
with n, roots, {{ node: tail(nodes) , rel: CASE WHEN size(rels) > 1
THEN tail(rels) ELSE head(rels) END }} AS leaf
WITH {{node: n, rel: []}} as n1, roots, COLLECT(DISTINCT leaf)
AS leaves
WITH n1, roots, leaves
WITH id(n1['node']) AS sourceId, [n IN roots | id(n['node'])] AS rootIds, [n IN leaves WHERE isEmpty(n['node']) = false| n['node'][0]] AS leafNodeList
RETURN sourceId, rootIds, [n IN leafNodeList | id(n)] AS leafIds
"""

updated_query: str = """
MATCH (n:dbt_table_or_consumption_view{{name: 'prod_khc_sales.kroger.kroger_daily_point_of_sale_fact'}})
// Get Roots
CALL {{
    WITH n
    CALL apoc.path.expandConfig(n, {{
	    relationshipFilter: "<",
        minLevel: {min_level},
        maxLevel: {max_level},
        uniqueness: "RELATIONSHIP_GLOBAL"
    }})
    YIELD path AS p
    WITH nodes(p) AS nodes, 
         relationships(p) AS rels
    WITH {{ node: head(nodes) , rel: head(rels) }} AS root
    WITH COLLECT(DISTINCT root) AS roots
    RETURN roots
}}
WITH n, roots
// Get Leafs
CALL {{
    WITH n
    CALL apoc.path.expandConfig(n, {{
	    relationshipFilter: ">",
        minLevel: {min_level},
        maxLevel: {max_level},
        uniqueness: "RELATIONSHIP_GLOBAL"
    }})
    YIELD path AS p
    WITH n, nodes(p) AS nodes, relationships(p) AS rels
    WITH n, {{ node: tail(nodes) , rel: CASE WHEN size(rels) > 1
    THEN tail(rels) ELSE head(rels) END }} AS leaf
    RETURN {{node: n, rel: []}} as n1, COLLECT(DISTINCT leaf)
AS leaves
}}
WITH n1, roots, leaves
WITH id(n1['node']) AS sourceId, [n IN roots | id(n['node'])] AS rootIds, [n IN leaves WHERE isEmpty(n['node']) = false| n['node'][0]] AS leafNodeList
RETURN sourceId, rootIds, [n IN leafNodeList | id(n)] AS leafIds
"""

In [19]:
print(ravi_query.format(min_level=0, max_level=1000))



MATCH (n:dbt_table_or_consumption_view)
WHERE n.name ='prod_khc_sales.kroger.kroger_daily_point_of_sale_fact'
WITH n
MATCH p=()-[*0..1000]->(n)
WITH n, nodes(p) AS nodes, relationships(p) AS rels
with n, { node: head(nodes) , rel: head(rels) } AS root
WITH n, COLLECT(DISTINCT root) AS roots
MATCH p=(n)-[*0..1000]->()
WITH n, roots, nodes(p) AS nodes, relationships(p) AS rels
with n, roots, { node: tail(nodes) , rel: CASE WHEN size(rels) > 1
THEN tail(rels) ELSE head(rels) END } AS leaf
WITH {node: n, rel: []} as n1, roots, COLLECT(DISTINCT leaf)
AS leaves
WITH n1, roots, leaves
WITH id(n1['node']) AS sourceId, [n IN roots | id(n['node'])] AS rootIds, [n IN leaves WHERE isEmpty(n['node']) = false| n['node'][0]] AS leafNodeList
RETURN sourceId, rootIds, [n IN leafNodeList | id(n)] AS leafIds



### Define The Query Runner

In [5]:
def run_query(query_template: str, min_level: int, max_level: int) -> List[Union[List[str], str]]:
    formatted_query = query_template.format(min_level=min_level, max_level=max_level)
    with driver.session() as session:
        return session.run(formatted_query).values()[0]

In [17]:
run_query(query_template=ravi_query, min_level=0, max_level=1000)



[25506,
 [25506,
  25494,
  25903,
  60851,
  59254,
  25743,
  25493,
  50883,
  25516,
  25492,
  50887,
  18780,
  8399,
  50885,
  18787,
  8406,
  25903,
  25524,
  25494,
  25489,
  50884,
  18781,
  8400,
  25903,
  25516,
  25493,
  25525,
  25494,
  25489],
 [27712,
  25462,
  25510,
  25459,
  25463,
  25466,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  28948,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25791,
  25

### Construct The Dataset

This will be a Pandas DataFrame with the columns min_level, max_level, ravi_source, ravi_roots, ravi_leaves, updated_source, updated_roots, updated_leaves

In [7]:
columns = ["min_level", "max_level", "ravi_source", "ravi_roots", "ravi_leaves", "updated_source", "updated_roots", "updated_leaves"] 
data = pd.DataFrame(columns=columns)

In [8]:
def load_data_row(target_dataframe: pd.DataFrame, min_level: int, max_level: int, columns: List[str] = columns) -> pd.DataFrame:

    ravi_source, ravi_roots, ravi_leaves = run_query(query_template=ravi_query, min_level=min_level, max_level=max_level)
    updated_source, updated_roots, updated_leaves = run_query(query_template=updated_query, min_level=min_level, max_level=max_level)
    row = {"min_level": [min_level], "max_level": [max_level], 
           "ravi_source": [ravi_source], "ravi_roots": [ravi_roots], "ravi_leaves": [ravi_leaves], 
           "updated_source": [updated_source], "updated_roots": [updated_roots], "updated_leaves": [updated_leaves]}
    print(f"new row: {row}")

    return pd.concat([target_dataframe, pd.DataFrame(row, columns=columns)], ignore_index=True)

In [None]:
load_

In [9]:
def create_dataframe(path_range: List[int] = [0, 10], columns: List[str] = columns, use_cache: bool = True) -> pd.DataFrame:

    if use_cache:
        try:
            return pd.read_csv(f"data/{path_range[0]}_{path_range[1]}.csv")
        except Exception as e:
            print(e)

    data = pd.DataFrame(columns=columns)

    for i in range(1, path_range[1]+1):
        print("loading row: ", i)
        data = load_data_row(target_dataframe=data, min_level=0, max_level=i, columns=columns)
        
    data.to_csv(f"data/{path_range[0]}_to_{path_range[1]}.csv")
    return data

In [10]:
data = create_dataframe(path_range=[0, 15], use_cache=False)

loading row:  1




new row: {'min_level': [0], 'max_level': [1], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 25524, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 25791, 25464, 25905, 25446, 25445, 25845, 25819, 25474, 25448, 27714, 25857, 25732, 29623]], 'updated_source': [25506], 'updated_roots': [[25506, 25506, 25506, 25506, 25506]], 'updated_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 25791, 25464, 25905, 25446, 25445, 25845, 25819, 25474, 25448, 27714, 25857, 25732, 29623]]}
loading row:  2




new row: {'min_level': [0], 'max_level': [2], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 25493, 25516, 50885, 25903, 60851, 25524, 25494, 25489, 25525, 25489, 50884, 25903, 25516, 25493]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25445, 25845, 25845, 25845, 25819, 25474, 25448, 25448, 25448, 25448, 27714, 25857, 25857, 25732, 25732, 29623, 29623]], 'updated_source': [25506], 'updated_roots': [[25506, 25506, 25506, 25506, 25506]], 'updated_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 25791, 25464, 25905, 25446, 25445, 25845, 25819, 25474, 25448, 27714, 25857, 25732, 29623, 28948, 25791, 25791, 25905, 25446, 25845, 25845, 25448, 25448, 25448, 25857, 25732, 29623]]}
loading row:  3




new row: {'min_level': [0], 'max_level': [3], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 25493, 50883, 25516, 25492, 50885, 18787, 25903, 59254, 25524, 25494, 25489, 50884, 25903, 25516, 25493, 25525, 25494, 25489, 18781]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25445, 25845, 25845, 25845, 25845, 25845, 25819, 25474, 25448, 25448, 25448, 25448, 25448, 25448, 25448, 27714, 25857, 25857, 25857, 25857, 25857, 25857, 25732, 25732, 25732, 29623, 29623, 29623]], 'updated_source': [25506], 'updated_roots': [[25506, 25506, 25506, 25506, 25506]], 'updated_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 25791, 25464, 25905, 25446, 25445, 25845, 25819, 25474, 25448, 27714, 25857, 25732, 29623, 28948, 25791, 25791, 25905, 25446, 25845, 25845, 25448, 25448, 25448, 25857, 25732, 29623, 28948, 28948, 28948, 25791, 25446, 25446,



new row: {'min_level': [0], 'max_level': [4], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25493, 50883, 25516, 25492, 50887, 50885, 18787, 8406, 25903, 25743, 25524, 25494, 25489, 50884, 18781, 25903, 25516, 25493, 25525, 25494, 25489, 8400]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25445, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25819, 25474, 25448, 25448, 25448, 25448, 25448, 25448, 25448, 25448, 27714, 25857, 25857, 25857, 25857, 25857, 25857, 25857, 25857, 25857, 25857, 25732, 25732, 25732, 25732, 25732, 29623, 29623, 29623, 29623, 29623, 29623]], 'updated_source': [25506], 'updated_roots



new row: {'min_level': [0], 'max_level': [5], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25445, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 25845, 2



new row: {'min_level': [0], 'max_level': [6], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25



new row: {'min_level': [0], 'max_level': [7], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25



new row: {'min_level': [0], 'max_level': [8], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25464, 25905, 25905, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25446, 25



new row: {'min_level': [0], 'max_level': [9], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25



new row: {'min_level': [0], 'max_level': [10], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2



new row: {'min_level': [0], 'max_level': [11], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2



new row: {'min_level': [0], 'max_level': [12], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2



new row: {'min_level': [0], 'max_level': [13], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2



new row: {'min_level': [0], 'max_level': [14], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2



new row: {'min_level': [0], 'max_level': [15], 'ravi_source': [25506], 'ravi_roots': [[25506, 25494, 25903, 60851, 59254, 25743, 25493, 50883, 25516, 25492, 50887, 18780, 8399, 50885, 18787, 8406, 25903, 25524, 25494, 25489, 50884, 18781, 8400, 25903, 25516, 25493, 25525, 25494, 25489]], 'ravi_leaves': [[27712, 25462, 25510, 25459, 25463, 25466, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 28948, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 25791, 2

In [15]:
data

Unnamed: 0,min_level,max_level,ravi_source,ravi_roots,ravi_leaves,updated_source,updated_roots,updated_leaves
0,0,1,25506,"[25506, 25494, 25903, 25524, 25489]","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
1,0,2,25506,"[25506, 25494, 25903, 25493, 25516, 50885, 259...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
2,0,3,25506,"[25506, 25494, 25903, 60851, 25493, 50883, 255...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
3,0,4,25506,"[25506, 25494, 25903, 60851, 59254, 25493, 508...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
4,0,5,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
5,0,6,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
6,0,7,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
7,0,8,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
8,0,9,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."
9,0,10,25506,"[25506, 25494, 25903, 60851, 59254, 25743, 254...","[27712, 25462, 25510, 25459, 25463, 25466, 289...",25506,"[25506, 25506, 25506, 25506, 25506]","[27712, 25462, 25510, 25459, 25463, 25466, 289..."


### Create Evaluation Functions

In [12]:
import numpy as np

def evaluate_row(row: Dict[str, Union[List[str], str]]) -> Dict[str, str]:
    ravi_roots = np.array(row["ravi_roots"])
    ravi_leaves = np.array(row["ravi_leaves"])
    updated_roots = np.array(row["updated_roots"])
    updated_leaves = np.array(row["updated_leaves"])

    ravi_roots_unique = np.setdiff1d(ravi_roots, updated_roots)
    ravi_leaves_unique = np.setdiff1d(ravi_leaves, updated_leaves)
    updated_roots_unique = np.setdiff1d(updated_roots, ravi_roots)
    updated_leaves_unique = np.setdiff1d(updated_leaves, ravi_leaves)

    return {
        "ravi_roots_unique": ravi_roots_unique,
        "ravi_leaves_unique": ravi_leaves_unique,
        "updated_roots_unique": updated_roots_unique,
        "updated_leaves_unique": updated_leaves_unique
    }

In [13]:
evaluate_row(data.iloc[0])

{'ravi_roots_unique': array([25489, 25494, 25524, 25903]),
 'ravi_leaves_unique': array([], dtype=int64),
 'updated_roots_unique': array([], dtype=int64),
 'updated_leaves_unique': array([], dtype=int64)}

In [14]:
data.iloc[0]

min_level                                                         0
max_level                                                         1
ravi_source                                                   25506
ravi_roots                      [25506, 25494, 25903, 25524, 25489]
ravi_leaves       [27712, 25462, 25510, 25459, 25463, 25466, 289...
updated_source                                                25506
updated_roots                   [25506, 25506, 25506, 25506, 25506]
updated_leaves    [27712, 25462, 25510, 25459, 25463, 25466, 289...
Name: 0, dtype: object