In [1]:
import sys
import os
import pandas as pd
import torch

# Make the parent of the notebook's working directory importable; the
# `filter.horizontal_reduction` import below presumably relies on this.
# The membership guard keeps re-running this cell from piling up duplicate
# sys.path entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from filter.horizontal_reduction import (
    transform,
    provenance_index_matching,
    compare,
    provenance_by_hashing,
)

In [2]:
# Toy five-row, two-column frame used as the filter input for the
# index-matching walkthrough. The raw mapping stays available as `data`.
data = dict(
    col1=[1, 2, 3, 4, 5],
    col2=[8, 6, 9, 7, 10],
)
input_df = pd.DataFrame.from_dict(data)
input_df

Unnamed: 0,col1,col2
0,1,8
1,2,6
2,3,9
3,4,7
4,5,10


In [3]:
# Row predicate, written as an expression string over the column names of
# `input_df`; it is handed to transform() in the following cell.
# NOTE(review): this name shadows Python's built-in `filter`. It is kept as-is
# because later cells read the variable; rename everywhere in one pass
# (e.g. to `filter_expr`) if the shadowing becomes a problem.
filter = "(col1 > 2) & (col2 > 7)"

In [4]:
# Apply the predicate string from the previous cell to the toy frame.
# In the displayed result the surviving rows keep their original index
# labels (2 and 4), i.e. the rows where col1 > 2 and col2 > 7.
output_df = transform(input_df, filter)
output_df

Unnamed: 0,col1,col2
2,3,9
4,5,10


In [5]:
# Provenance of the filter via index matching, requested in sparse form.
# Reading the printed COO tensor: size=(5, 2) is (input rows, output rows);
# the two stored entries (2, 0) and (4, 1) say that input row 2 produced
# output row 0 and input row 4 produced output row 1.
provenance_tensor = provenance_index_matching(input_df, output_df, sparse=True)

print(provenance_tensor)

tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)


In [6]:
# Same index-matching provenance, this time requested as a dense tensor.
# The printed 5x2 int8 matrix has a 1 at (2, 0) and (4, 1) and zeros elsewhere.
# Note: this rebinds `provenance_tensor`, replacing the sparse result from the
# previous cell.
provenance_tensor = provenance_index_matching(input_df, output_df, sparse=False)

print(provenance_tensor)

tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)


In [8]:
# Rebuild the toy frame for the hashing walkthrough. The values are identical
# to the frame used in the index-matching cells above.
data = dict(
    col1=[1, 2, 3, 4, 5],
    col2=[8, 6, 9, 7, 10],
)
input_df = pd.DataFrame.from_dict(data)
input_df

Unnamed: 0,col1,col2
0,1,8
1,2,6
2,3,9
3,4,7
4,5,10


In [9]:
# Re-run the filter on the rebuilt frame. `filter` is the predicate string
# assigned in an earlier cell; it is not redefined in this section, so that
# cell must have been executed first.
output_df = transform(input_df, filter)
output_df

Unnamed: 0,col1,col2
2,3,9
4,5,10


In [10]:
# Provenance computed by the hashing-based variant, called with its default
# options. The printed result is a sparse COO tensor with the same indices,
# values and size as the sparse index-matching output above.
print(provenance_by_hashing(input_df, output_df))

tensor(indices=tensor([[2, 4],
                       [0, 1]]),
       values=tensor([1, 1]),
       size=(5, 2), nnz=2, dtype=torch.int8, layout=torch.sparse_coo)


In [11]:
# Hashing-based provenance with the third positional argument set to False;
# the printed result is a dense 5x2 int8 tensor.
# NOTE(review): the flag is presumably the same `sparse` switch that
# provenance_index_matching takes by keyword. Confirm the parameter name in
# filter.horizontal_reduction and pass it by keyword for readability.
print(provenance_by_hashing(input_df, output_df, False))

tensor([[0, 0],
        [0, 0],
        [1, 0],
        [0, 0],
        [0, 1]], dtype=torch.int8)


In [12]:
# Larger benchmark input: a 500x500 tensor of uniform [0, 1) samples handed to
# compare(), which prints timings and provenance tensors for both methods.
# The draw uses a locally seeded generator so the rows selected by the filter
# (and the printed provenance) are the same on every Restart & Run All, without
# touching torch's global RNG state.
# NOTE(review): despite its name, `input_df` is a torch.Tensor here, not a
# DataFrame. The name is kept because later cells may read it.
benchmark_rng = torch.Generator().manual_seed(0)
input_df = torch.rand(500, 500, generator=benchmark_rng)
compare(input_df, filter="col1 > 0.95")

INDEX MATCHING

Sparse Tensor Time: 0.000681s

Provenance Sparse Tensor :
tensor(indices=tensor([[ 40,  61,  86, 166, 169, 171, 177, 192, 205, 265, 281,
                        306, 321, 357, 401, 416, 455, 456, 466, 472, 490],
                       [  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
                         11,  12,  13,  14,  15,  16,  17,  18,  19,  20]]),
       values=tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1]),
       size=(500, 21), nnz=21, dtype=torch.int8, layout=torch.sparse_coo)

Dense Tensor Time: 0.000361s

Provenance dense Tensor :
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)

Results Consistent: True

 -------------------- 

BY HASHING

Sparse Tensor Time: 0.329230s

Provenance Sparse Tensor :
tensor(indices=tens