In [1]:
import sys
import os
import pandas as pd
import torch

# Put the parent of the current working directory on the import path so the
# `filter` package imported below can be resolved.
# NOTE(review): assumes the notebook is launched from a directory directly
# under the project root — confirm.
sys.path.append(os.path.abspath(os.pardir))

from filter.horizontal_reduction import (
    transform,
    provenance_index_matching,
    compare,
    provenance_by_hashing,
)

In [2]:
# Toy input: five rows and two integer columns on the default 0..4 index.
data = {
    "col1": [1, 2, 3, 4, 5],
    "col2": [8, 6, 9, 7, 10],
}
input_df = pd.DataFrame(data=data)
input_df

Unnamed: 0,col1,col2
0,1,8
1,2,6
2,3,9
3,4,7
4,5,10


In [3]:
# Row filter used by the cells below, written as a boolean expression over the
# column names `col1` and `col2` (presumably evaluated by `transform` in a
# pandas-query-like way — confirm against its implementation).
# NOTE(review): this name shadows Python's built-in `filter`. Later cells refer
# to it by this name, so any rename has to be applied to all of them together.
filter = "(col1 > 2) & (col2 > 7)"

In [4]:
# Apply the filter expression to the input frame. In the displayed result only
# the rows labelled 2 and 4 remain, and they keep their original index labels.
output_df = transform(input_df, filter)
output_df

Unnamed: 0,col1,col2
2,3,9
4,5,10


In [5]:
# Build the provenance tensor in sparse form. The printed result is a 5x2
# sparse COO int8 tensor with ones at (2, 0) and (4, 1) — by its size, one row
# per input row and one column per output row.
provenance_tensor = provenance_index_matching(input_df, output_df, sparse=True)

print(provenance_tensor)

tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)


In [6]:
# Provenance computation with sparse=False: the result prints as a dense 5x2
# int8 tensor whose only ones sit at (2, 0) and (4, 1).
# NOTE(review): this rebinds `provenance_tensor`, replacing any value a
# previous cell stored under that name.
provenance_tensor = provenance_index_matching(input_df, output_df, sparse=False)

print(provenance_tensor)

tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)


In [7]:
# Rebuild the five-row, two-column example frame from scratch so the cells
# that follow start from a known input.
data = {
    "col1": [1, 2, 3, 4, 5],
    "col2": [8, 6, 9, 7, 10],
}
input_df = pd.DataFrame(data=data)
input_df

Unnamed: 0,col1,col2
0,1,8
1,2,6
2,3,9
3,4,7
4,5,10


In [8]:
# Filter the input frame with the expression stored in `filter`; the displayed
# result contains the rows labelled 2 and 4.
output_df = transform(input_df, filter)
output_df

Unnamed: 0,col1,col2
2,3,9
4,5,10


In [9]:
# Side-by-side report for the small example. Per the printed output it has an
# "INDEX MATCHING" and a "BY HASHING" section, each showing the sparse and
# dense timings, both provenance tensors, and a "Results Consistent" flag.
# The timings are wall-clock measurements, so expect them to differ per run.
compare(input_df, filter)

INDEX MATCHING

Sparse Tensor Time: 0.000736s

Provenance Sparse Tensor :
tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)

Dense Tensor Time: 0.000541s

Provenance dense Tensor :
tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)

Results Consistent: True

 -------------------- 

BY HASHING

Sparse Tensor Time: 0.001248s

Provenance Sparse Tensor :
tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)

Dense Tensor Time: 0.000672s

Provenance dense Tensor :
tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)

Results Consistent: True


In [10]:
# Called without a third argument, the hashing variant prints a sparse COO
# tensor: 5x2, int8, ones at (2, 0) and (4, 1).
print(provenance_by_hashing(input_df, output_df))

tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)


In [11]:
# With False as third positional argument the result prints as a dense 5x2
# int8 tensor.
# NOTE(review): presumably this positional flag is the same `sparse` option
# that provenance_index_matching takes by keyword — confirm, and prefer passing
# it by keyword for readability.
print(provenance_by_hashing(input_df, output_df, False))

tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)


In [12]:
# Larger benchmark input: a 500x500 tensor of uniform random values in [0, 1).
# The draw comes from a dedicated, seeded generator so that the input (and
# therefore the number of rows passing the filter) is the same on every
# Restart & Run All, without touching the global torch RNG state.
# NOTE(review): despite its name, `input_df` holds a torch.Tensor here rather
# than a DataFrame; `compare` is assumed to accept it and to expose a `col1`
# column for the filter expression — confirm against its implementation.
input_df = torch.rand(500, 500, generator=torch.Generator().manual_seed(0))
compare(input_df, filter="col1 > 0.95")

INDEX MATCHING

Sparse Tensor Time: 0.200214s

Provenance Sparse Tensor :
tensor(indices=tensor([[  7,  16,  30,  57,  91, 117, 120, 140, 146, 167, 184,
                        213, 216, 246, 251, 271, 300, 307, 325, 346, 374, 401,
                        430, 438, 449, 452, 469],
                       [  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
                         11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,
                         22,  23,  24,  25,  26]]),
       values=tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 1, 1, 1]),
       size=(500, 27), nnz=27, dtype=torch.int8, layout=torch.sparse_coo)

Dense Tensor Time: 0.162740s

Provenance dense Tensor :
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)

Results Consi

In [13]:
# Plain-dict inputs instead of DataFrames. The output columns are the input
# columns with their first element dropped, i.e. the last two rows survive.
input_df = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
output_df = {name: values[1:] for name, values in input_df.items()}

# False is passed as the third positional argument; the displayed result is a
# dense 3x2 int8 tensor.
provenance_index_matching(input_df, output_df, False)

tensor([[0, 0],
        [1, 0],
        [0, 1]], dtype=torch.int8)