# Tensorprov Class Testing

In [1]:
import sys
import os
import torch

# Make the notebook's parent directory importable (presumably where the
# `tensorprov` module imported below lives -- confirm against the repo layout).
# The membership guard keeps this cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
_parent_dir = os.path.abspath("..")  # os.path.join("..") with one argument was a no-op
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from tensorprov import Tensorprov
import pandas as pd

# Toy inputs shared by every section of this notebook.

# data1: five rows, col1 = 1..5 and col2 = 6..10.
data1 = pd.DataFrame(
    {
        "col1": list(range(1, 6)),
        "col2": list(range(6, 11)),
    }
)

# data2: three rows whose col1 values (2, 3, 4) also occur in data1's col1.
data2 = pd.DataFrame(
    {
        "col1": [2, 3, 4],
        "col2": [15, 25, 35],
    }
)

# Row filter, written as a string expression over the column names.
mask = "(col1 > 1) & (col2 > 7)"
# Column list handed to the vertical-reduction step.
columns = ["col2"]

# One Tensorprov instance, created with default arguments and reused by
# every transform/provenance call in the sections that follow.
tp = Tensorprov()

## 1. Horizontal Reduction

In [2]:
# Horizontal reduction transform: apply the `mask` expression to data1.
# In the recorded run this keeps the rows with index 2, 3 and 4.
horizontal_result = tp.horizontal_reduction_transform(data1, mask)
print("Horizontal Reduction Result:")
print(horizontal_result)

# Horizontal reduction provenance, computed two ways from the same
# (input, output) pair. The third positional argument is False in both calls;
# presumably it is the `sparse` flag that the vertical-reduction calls pass
# by keyword -- TODO confirm against the Tensorprov signatures.
horizontal_prov = tp.provenance_horizontal_reduction_index_matching(
    data1, horizontal_result, False
)
print("Horizontal Reduction Provenance (Index Matching):")
print(horizontal_prov)

horizontal_prov_hash = tp.provenance_horizontal_reduction_hashing(
    data1, horizontal_result, False
)
print("Horizontal Reduction Provenance (Hashing):")
print(horizontal_prov_hash)

# Both strategies describe the same transformation, so their tensors must be
# identical. Asserting it turns the eyeball comparison of the two printouts
# into an actual check that fails loudly on a regression.
assert torch.equal(horizontal_prov, horizontal_prov_hash), (
    "index-matching and hashing provenance disagree for horizontal reduction"
)

Horizontal Reduction Result:
   col1  col2
2     3     8
3     4     9
4     5    10
Horizontal Reduction Provenance (Index Matching):
tensor([[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]], dtype=torch.int8)
Horizontal Reduction Provenance (Hashing):
tensor([[0, 0, 0],
        [0, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]], dtype=torch.int8)


## 2. Vertical Reduction

In [3]:
# Vertical reduction transform on data1 with the `columns` list.
# NOTE(review): in the recorded run, columns=["col2"] yields a frame that
# contains only col1, i.e. the listed columns appear to be the ones removed
# -- confirm against Tensorprov.vertical_reduction_transform.
vertical_result = tp.vertical_reduction_transform(data1, columns)
print("Vertical Reduction Result:")
print(vertical_result)

# Vertical reduction provenance, computed two ways from the same
# (input, output) pair; sparse=False requests the dense representation
# (a 1-D int8 tensor with one entry per input column in the recorded run).
vertical_prov = tp.provenance_vertical_reduction_column_matching(
    data1, vertical_result, sparse=False
)
print("Vertical Reduction Provenance (Column Matching):")
print(vertical_prov)

vertical_prov_hash = tp.provenance_vertical_reduction_hashing(
    data1, vertical_result, sparse=False
)
print("Vertical Reduction Provenance (Hashing):")
print(vertical_prov_hash)

# Both strategies describe the same transformation, so their tensors must be
# identical. Asserting it makes this cell a real test rather than a printout.
assert torch.equal(vertical_prov, vertical_prov_hash), (
    "column-matching and hashing provenance disagree for vertical reduction"
)

Vertical Reduction Result:
   col1
0     1
1     2
2     3
3     4
4     5
Vertical Reduction Provenance (Column Matching):
tensor([1, 0], dtype=torch.int8)
Vertical Reduction Provenance (Hashing):
tensor([1, 0], dtype=torch.int8)


## 3. Join

In [4]:
# Join data1 with data2 on their shared "col1" column and show the result.
join_result = tp.join_transform(data1, data2, join_on="col1")
print("Join Result:", join_result, sep="\n")

# Provenance of the join, derived from both inputs plus the joined frame.
# NOTE(review): join_on is a list here but a plain string in the transform
# call above -- presumably both forms are accepted; confirm against Tensorprov.
join_prov = tp.provenance_join(data1, data2, join_result, join_on=["col1"])
print("Join Provenance:", join_prov, sep="\n")

Join Result:
   col1  col2_x  col2_y
0     2       7      15
1     3       8      25
2     4       9      35
Join Provenance:
tensor([[[0, 0, 0],
         [1, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 1, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 1],
         [0, 0, 0]]], dtype=torch.int32)


## 4. Oversampling

In [5]:
# Oversampling transform on a small 2x2 integer tensor.
# In the recorded run the result is 2x4: the input columns repeated once.
data = torch.tensor([[1, 2], [3, 4]])
oversample_result = tp.oversample_transform(data)
print("Oversampling Result:")
print(oversample_result)

# Oversampling provenance (Sparse): a sparse COO tensor in the recorded run.
oversample_prov_sparse = tp.provenance_oversample_sparse(
    data, oversample_result, method="horizontal"
)
print("Oversampling Provenance (Sparse):")
print(oversample_prov_sparse)

# Oversampling provenance (Dense): same inputs, dense representation.
oversample_prov_dense = tp.provenance_oversample_dense(
    data, oversample_result, method="horizontal"
)
print("Oversampling Provenance (Dense):")
print(oversample_prov_dense)

# The sparse and dense variants encode the same provenance, so densifying the
# sparse tensor must reproduce the dense one exactly. Asserting it makes this
# cell a real test rather than a visual comparison of two printouts.
assert torch.equal(oversample_prov_sparse.to_dense(), oversample_prov_dense), (
    "sparse and dense oversampling provenance disagree"
)

Oversampling Result:
tensor([[1, 2, 1, 2],
        [3, 4, 3, 4]])
Oversampling Provenance (Sparse):
tensor(indices=tensor([[0, 1, 0, 1],
                       [0, 1, 2, 3]]),
       values=tensor([1., 1., 1., 1.]),
       size=(2, 4), nnz=4, layout=torch.sparse_coo)
Oversampling Provenance (Dense):
tensor([[1., 0., 1., 0.],
        [0., 1., 0., 1.]])


## 5. Union

In [6]:
# Provenance of the union of data1 and data2.
# In the recorded run the return value is a tuple: (combined DataFrame,
# int32 tensor with one row per combined row and one column per source).
union_prov = tp.provenance_union(data1, data2)
print("Union Provenance:", union_prov, sep="\n")

# Same union, with the provenance attached to the DataFrame itself
# (a `source_id` column in the recorded run).
union_df_prov = tp.provenance_with_df_union(data1, data2)
print("Union Provenance with DataFrame:", union_df_prov, sep="\n")

Union Provenance:
(  col1  col2
0    1     6
1    2     7
2    3     8
3    4     9
4    5    10
5    2    15
6    3    25
7    4    35, tensor([[1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 1],
        [0, 1],
        [0, 1]], dtype=torch.int32))
Union Provenance with DataFrame:
  col1  col2 source_id
0    1     6        D¹
1    2     7        D¹
2    3     8        D¹
3    4     9        D¹
4    5    10        D¹
5    2    15        D²
6    3    25        D²
7    4    35        D²
