In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join("..")))

import pandas as pd
import torch
import time
from union.union import (
    sparse_tensor_prov,
    add_source_identifiers,
    append_with_provenance,
)
import string
import random
import numpy as np

import tracemalloc

In [2]:
# Create sample dataframes for testing


# Function to generate random strings
def random_string(length=5):
    return "".join(random.choices(string.ascii_lowercase, k=length))


# Create first dataset (data_tensor_1) with 1000 rows
data_tensor_1 = pd.DataFrame(
    {
        "A": np.random.randint(1, 100, 2000),  # Random integers between 1 and 100
        "B": np.random.uniform(1.0, 100.0, 2000),  # Random floats between 1.0 and 100.0
        "C": [random_string() for _ in range(2000)],  # Random strings of length 5
        "D": np.random.choice([True, False], 2000),  # Random booleans
    }
)

# Create second dataset (data_tensor_2) with 1000 rows
data_tensor_2 = pd.DataFrame(
    {
        "A": np.random.randint(1, 100, 2000),
        "B": np.random.uniform(1.0, 100.0, 2000),
        "C": [random_string() for _ in range(2000)],
        "D": np.random.choice([True, False], 2000),
    }
)

In [3]:
# Step 1: Validate the DataFrames
validate_dataframes(data_tensor_1, data_tensor_2)

NameError: name 'validate_dataframes' is not defined

In [41]:
# Start memory tracking
tracemalloc.start()

# Start time tracking
start_time = time.time()

# Call the function
combined_df, provenance = sparse_tensor_prov(data_tensor_1, data_tensor_2)

# Track elapsed time
elapsed_time = time.time() - start_time

# Track peak memory usage
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

# Convert memory usage to MB
peak_memory_usage = peak / (1024 * 1024)

# Print results
print(f"Combined DataFrame:\n{combined_df}\n")
print(f"Provenance Matrix:\n{provenance}\n")
print(f"Time elapsed: {elapsed_time:.6f} seconds")
print(f"Peak memory usage: {peak_memory_usage:.2f} MB")

Combined DataFrame:
       A          B      C      D
0     98  13.949184  fpekr   True
1     80  16.734645  jmnbk  False
2     54  25.818972  ipcvu  False
3     17  55.114824  ytgwc  False
4     91  51.298058  adslu  False
...   ..        ...    ...    ...
3995  34  38.029495  yxrxl   True
3996  26  11.738725  qxavw   True
3997  96  70.566756  fbwsb   True
3998   6  32.212114  ouplw   True
3999  86  18.534082  kwjaj  False

[4000 rows x 4 columns]


Sparse Provenance Matrix:
 tensor([[1, 0],
        [1, 0],
        [1, 0],
        ...,
        [0, 1],
        [0, 1],
        [0, 1]], dtype=torch.int32)
Time elapsed for appending: 0.000000 seconds


In [42]:
# Step 3: Test the `provenance_matrix_sparse` method
provenance_sparse = provenance_matrix_sparse(data_tensor_1, data_tensor_2, combined_df)
print("\nSparse Provenance Matrix:\n", provenance_sparse)


Sparse Provenance Matrix:
 tensor([[1, 0],
        [1, 0],
        [1, 0],
        ...,
        [0, 1],
        [0, 1],
        [0, 1]], dtype=torch.int32)


In [44]:
tracemalloc.start()
start_time = time.time()


data_tensor_1_with_source, data_tensor_2_with_source = add_source_identifiers(
    data_tensor_1, data_tensor_2
)
combined_with_source, elapsed_time_source = append_with_provenance(
    data_tensor_1_with_source, data_tensor_2_with_source
)

elapsed_time = time.time() - start_time
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()


peak_memory_usage = peak / (1024 * 1024)


print("\nCombined DataFrame with Source Identifiers:\n", combined_with_source)
print(f"Time elapsed for appending with source identifiers: {elapsed_time:.6f} seconds")
print(f"Peak memory usage: {peak_memory_usage:.2f} MB")


Combined DataFrame with Source Identifiers:
        A          B      C      D source_id
0     98  13.949184  fpekr   True        D¹
1     80  16.734645  jmnbk  False        D¹
2     54  25.818972  ipcvu  False        D¹
3     17  55.114824  ytgwc  False        D¹
4     91  51.298058  adslu  False        D¹
...   ..        ...    ...    ...       ...
3995  34  38.029495  yxrxl   True        D²
3996  26  11.738725  qxavw   True        D²
3997  96  70.566756  fbwsb   True        D²
3998   6  32.212114  ouplw   True        D²
3999  86  18.534082  kwjaj  False        D²

[4000 rows x 5 columns]
Time elapsed for appending with source identifiers: 0.002541 seconds
Peak memory usage: 0.25 MB
