In [1]:
import sys
import os
import pandas as pd
import torch

# Append the absolute path of ".." (the parent of the current working directory)
# to the module search path before importing the project-local helpers below.
# NOTE(review): this resolves relative to the kernel's working directory, not the
# notebook file itself; presumably `oversampling` lives one level up — confirm.
sys.path.append(os.path.abspath(os.path.join("..")))

from oversampling  import oversample, determine_provenance_dense,determine_provenance_sparse,compare_methods

In [2]:
# Small 2x2 integer tensor used as the input for the oversampling and
# provenance checks in the following cells.
data = torch.tensor(
    [
        [1, 2],
        [3, 4],
    ]
)

print("\nOriginal Data:\n", data)



Original Data:
 tensor([[1, 2],
        [3, 4]])


In [3]:
# Horizontal oversampling check: with factor=2 the result is expected to be the
# input repeated side by side, i.e. twice as many columns.
expected_horizontal = torch.tensor(
    [
        [1, 2, 1, 2],
        [3, 4, 3, 4],
    ]
)

print("\nTesting Horizontal Oversampling...")
augmented_horizontal = oversample(data, method='horizontal', factor=2)
print("Oversampled Data:\n", augmented_horizontal)

# torch.equal is True only when both shape and every element match.
assert torch.equal(
    augmented_horizontal, expected_horizontal
), "Horizontal oversampling failed!"



Testing Horizontal Oversampling...
Oversampled Data:
 tensor([[1, 2, 1, 2],
        [3, 4, 3, 4]])


In [4]:
# Vertical oversampling check: with factor=2 the result is expected to be the
# input stacked on top of itself, i.e. twice as many rows.
expected_vertical = torch.tensor(
    [
        [1, 2],
        [3, 4],
        [1, 2],
        [3, 4],
    ]
)

print("\nTesting Vertical Oversampling...")
augmented_vertical = oversample(data, method='vertical', factor=2)
print("Oversampled Data:\n", augmented_vertical)

# torch.equal is True only when both shape and every element match.
assert torch.equal(
    augmented_vertical, expected_vertical
), "Vertical oversampling failed!"


Testing Vertical Oversampling...
Oversampled Data:
 tensor([[1, 2],
        [3, 4],
        [1, 2],
        [3, 4]])


In [5]:
# Sparse provenance: run the sparse routine on both oversampled tensors.
print("\nTesting Sparse Provenance...")

sparse_provenance_h = determine_provenance_sparse(
    data, augmented_horizontal, method='horizontal'
)
sparse_provenance_v = determine_provenance_sparse(
    data, augmented_vertical, method='vertical'
)

# Densify only for display so the full matrix is printed instead of the
# index/value representation of the sparse tensor.
print("Sparse Provenance (Horizontal):\n", sparse_provenance_h.to_dense())
print("Sparse Provenance (Vertical):\n", sparse_provenance_v.to_dense())



Testing Sparse Provenance...
Sparse Provenance (Horizontal):
 tensor([[1., 0., 1., 0.],
        [0., 1., 0., 1.]])
Sparse Provenance (Vertical):
 tensor([[1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.]])


In [6]:
# Dense provenance: run the dense routine on the same two oversampled tensors
# so its results can be compared against the sparse ones.
print("\nTesting Dense Provenance...")

dense_provenance_h = determine_provenance_dense(
    data, augmented_horizontal, method='horizontal'
)
dense_provenance_v = determine_provenance_dense(
    data, augmented_vertical, method='vertical'
)

print("Dense Provenance (Horizontal):\n", dense_provenance_h)
print("Dense Provenance (Vertical):\n", dense_provenance_v)



Testing Dense Provenance...
Dense Provenance (Horizontal):
 tensor([[1., 0., 1., 0.],
        [0., 1., 0., 1.]])
Dense Provenance (Vertical):
 tensor([[1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.]])


In [7]:
# The sparse and dense routines must agree: once densified, each sparse result
# has to match the corresponding dense result in shape and in every element.
assert torch.equal(
    sparse_provenance_h.to_dense(), dense_provenance_h
), "Horizontal provenance mismatch!"
assert torch.equal(
    sparse_provenance_v.to_dense(), dense_provenance_v
), "Vertical provenance mismatch!"

# Reached only if none of the assertions above raised.
print("\nAll tests passed successfully!")



All tests passed successfully!


In [8]:
# Performance evaluation on larger data.
# compare_methods is invoked once per oversampling direction on the same input;
# per the recorded output it reports sparse vs. dense timings and whether the
# two results are consistent.
BENCHMARK_SIDE = 500     # rows and columns of the square random input
BENCHMARK_FACTOR = 1000  # oversampling factor handed to compare_methods
BENCHMARK_SEED = 0       # fixed so the random input is identical on every run

# A dedicated generator keeps the input reproducible under Restart & Run All
# without altering torch's global RNG state.
benchmark_rng = torch.Generator().manual_seed(BENCHMARK_SEED)
large_data = torch.rand(BENCHMARK_SIDE, BENCHMARK_SIDE, generator=benchmark_rng)

# NOTE(review): the recorded output shows a 500 x 500000 provenance matrix for
# this factor; held densely as float32 that is roughly 1 GB, so lower
# BENCHMARK_FACTOR on memory-constrained machines.
compare_methods(large_data, method='horizontal', factor=BENCHMARK_FACTOR)
compare_methods(large_data, method='vertical', factor=BENCHMARK_FACTOR)


Evaluating horizontal oversampling with factor 1000:
Sparse Tensor Time: 0.003003s


Provenance Sparse Tensor : tensor(indices=tensor([[     0,      1,      2,  ...,    497,    498,    499],
                       [     0,      1,      2,  ..., 499997, 499998, 499999]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(500, 500000), nnz=500000, layout=torch.sparse_coo)
Provenance dense Tensor : tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])
Dense Tensor Time: 4.216685s
Results Consistent: True

Evaluating vertical oversampling with factor 1000:
Sparse Tensor Time: 0.154997s
Provenance Sparse Tensor : tensor(indices=tensor([[     0,      1,      2,  ..., 499997, 499998, 499999],
                       [     0,      1,      2,  ...,    497,    498,    499]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(500000, 500), nnz=500000,