In [32]:
import pandas as pd

In [38]:
import hashlib
from pymerkle import InmemoryTree, verify_inclusion

def hash_row(row_values):
    """Hash a row by concatenating its column values."""
    concatenated = ''.join(map(str, row_values))
    return hashlib.sha256(concatenated.encode()).digest()

# Step 1: Create hashed rows for Dataset A and Dataset B
dataset_a = [["Alice", 25], ["Bob", 30], ["Charlie", 35]]
dataset_b = [["Alice", 25], ["Bob", 30], ["Charlie", 40]]

hashed_a = [hash_row(row) for row in dataset_a]
hashed_b = [hash_row(row) for row in dataset_b]

# Step 2: Build Merkle Trees using InmemoryTree
tree_a = InmemoryTree(algorithm="sha256")
tree_b = InmemoryTree(algorithm="sha256")

# Append hashed rows as entries into the trees
for hash_value in hashed_a:
    tree_a.append_entry(hash_value)

for hash_value in hashed_b:
    tree_b.append_entry(hash_value)

# Step 3: Compare Merkle Roots
root_a = tree_a.get_state()
root_b = tree_b.get_state()

if root_a == root_b:
    print("Datasets are identical for the selected columns and rows.")
else:
    print("Datasets differ.")

# Step 4: Verify specific shared rows
specific_row = ["Alice", 25]
specific_hash = hash_row(specific_row)

# Check inclusion in Tree A
leaf_index = None
for i in range(tree_a.get_size()):
    if tree_a.get_leaf(i+1) == specific_hash:
        leaf_index = i+1
        break

if leaf_index is not None:
    proof = tree_a.prove_inclusion(leaf_index, tree_a.get_size())
    if verify_inclusion(specific_hash, root_a, proof):
        print(f"Row {specific_row} exists in Dataset A.")
    else:
        print(f"Row {specific_row} does not exist in Dataset A.")
else:
    print(f"Row {specific_row} does not exist in Dataset A.")


Datasets differ.
Row ['Alice', 25] does not exist in Dataset A.


In [41]:
import hashlib
from pymerkle import InmemoryTree, verify_inclusion

def hash_row(row_values):
    """Hash a row by concatenating its column values."""
    concatenated = ''.join(map(str, row_values))
    return hashlib.sha256(concatenated.encode()).digest()

# Step 1: Create hashed rows for Dataset A and Dataset B
dataset_a = [["Alice", 25], ["Bob", 30], ["Charlie", 35]]
dataset_b = [["Alice", 25], ["Bob", 30], ["Charlie", 40]]

hashed_a = [hash_row(row) for row in dataset_a]
hashed_b = [hash_row(row) for row in dataset_b]

# Step 2: Build Merkle Trees using InmemoryTree
tree_a = InmemoryTree(algorithm="sha256")
tree_b = InmemoryTree(algorithm="sha256")

# Append hashed rows as entries into the trees
for hash_value in hashed_a:
    tree_a.append_entry(hash_value)

for hash_value in hashed_b:
    tree_b.append_entry(hash_value)

# Step 3: Compare Merkle Roots
root_a = tree_a.get_state()
root_b = tree_b.get_state()

if root_a == root_b:
    print("Datasets are identical for the selected columns and rows.")
else:
    print("Datasets differ.")

# Step 4: Verify specific shared rows
specific_row = ["Alice", 25]
specific_hash = hash_row(specific_row)

# Debug: Display leaves in tree A for verification
print("\nTree A Leaves:")
for i in range(1, tree_a.get_size() + 1):
    print(f"Leaf {i}: {tree_a.get_leaf(i)}")

# Check inclusion in Tree A
leaf_index = None
for i in range(tree_a.get_size()):
    # Retrieve the hash directly from the tree and compare
    if tree_a.get_leaf(i + 1) == specific_hash:
        leaf_index = i + 1
        break

if leaf_index is not None:
    proof = tree_a.prove_inclusion(leaf_index, tree_a.get_size())
    if verify_inclusion(specific_hash, root_a, proof):
        print(f"Row {specific_row} exists in Dataset A.")
    else:
        print(f"Row {specific_row} does not exist in Dataset A.")
else:
    print(f"Row {specific_row} does not exist in Dataset A.")


Datasets differ.

Tree A Leaves:
Leaf 1: b'\x13\xed\xfc\xec\xb7\x16\x96\xdeU#G\xe4\x00\xb7}\x82\x03p\xeb\x1c\xe8z \xfd\x89\x9cV~M\xbb\xdd\xd7'
Leaf 2: b'\xe4\xc3\xa7\xe8\xbe\x99Ec\xfc\x1b\x94\xce\x8aAL\xadZJ\x94\xda\xc4c\xce\x13\x86\xe1i^\x1f\x97\xf0\xb2'
Leaf 3: b'\xe2i3Z(\xfc\xda\xddc\xa9\xd4\x9exke\xc9\x90\x0f\x1dV`\xc3\xbb\xb9\x0bB\xfe\xd5\r<`\x0b'
Row ['Alice', 25] does not exist in Dataset A.


In [39]:
pd.DataFrame(dataset_a)

Unnamed: 0,0,1
0,Alice,25
1,Bob,30
2,Charlie,35


In [40]:
pd.DataFrame(dataset_b)

Unnamed: 0,0,1
0,Alice,25
1,Bob,30
2,Charlie,40
