In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join("..")))


import pandas as pd
import torch
import time
from union.union import sparse_tensor_prov,add_source_identifiers,append_with_provenance
import string
import random
import numpy as np

import tracemalloc

In [2]:
# Create sample dataframes for testing

def random_string(length=5):
    """Return a string of ``length`` random lowercase ASCII letters.

    Letters are drawn independently (with replacement) from
    ``string.ascii_lowercase`` using the module-level ``random`` generator,
    so the result is reproducible after ``random.seed(...)``.
    """
    alphabet = string.ascii_lowercase
    letters = random.choices(alphabet, k=length)
    return "".join(letters)

# Size of each synthetic dataset. Kept in one place so both frames always
# have the same number of rows and the comments cannot drift from the code.
N_ROWS = 20000

# Fixed seeds so that re-running this cell regenerates exactly the same data.
SEED = 42
random.seed(SEED)     # drives random_string()
np.random.seed(SEED)  # drives the np.random.* calls below

# Create first dataset (data_tensor_1) with N_ROWS rows
data_tensor_1 = pd.DataFrame({
    "A": np.random.randint(1, 100, N_ROWS),  # Random integers in [1, 100): the upper bound is exclusive, so values are 1..99
    "B": np.random.uniform(1.0, 100.0, N_ROWS),  # Random floats in [1.0, 100.0)
    "C": [random_string() for _ in range(N_ROWS)],  # Random strings of length 5
    "D": np.random.choice([True, False], N_ROWS)  # Random booleans
})

# Create second dataset (data_tensor_2) with N_ROWS rows and the same schema
data_tensor_2 = pd.DataFrame({
    "A": np.random.randint(1, 100, N_ROWS),
    "B": np.random.uniform(1.0, 100.0, N_ROWS),
    "C": [random_string() for _ in range(N_ROWS)],
    "D": np.random.choice([True, False], N_ROWS)
})



In [3]:
# Benchmark sparse_tensor_prov on the two sample frames: wall time and peak
# traced memory for a single call.

# Start memory tracking. tracemalloc is active while the timer runs, so the
# elapsed time reported below includes its bookkeeping overhead.
# NOTE(review): tracemalloc only reports allocations made through Python's
# allocators; memory torch allocates natively is presumably not included in
# the peak figure -- confirm before comparing memory numbers.
tracemalloc.start()
try:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with system clock
    # adjustments and may have coarse resolution.
    start_time = time.perf_counter()

    # Call the function
    combined_df, provenance = sparse_tensor_prov(data_tensor_1, data_tensor_2)

    # Track elapsed time
    elapsed_time = time.perf_counter() - start_time

    # Track peak memory usage (bytes) since tracemalloc.start()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Always stop tracing, even if the call above raises; otherwise tracing
    # stays enabled for the rest of the kernel session.
    tracemalloc.stop()

# Convert memory usage from bytes to MB (MiB: 1024 * 1024 bytes)
peak_memory_usage = peak / (1024 * 1024)

# Print results
print(f"Combined DataFrame:\n{combined_df}\n")
print(f"Provenance Matrix:\n{provenance}\n")
print(f"Time elapsed: {elapsed_time:.6f} seconds")
print(f"Peak memory usage: {peak_memory_usage:.2f} MB")




Combined DataFrame:
        A          B      C      D
0      70  91.559363  rstcb  False
1      99   9.028830  nyliv  False
2      67  56.644996  tgzdy  False
3      68  42.205441  pdluc   True
4      69  95.071088  zzbfb  False
...    ..        ...    ...    ...
39995  98   8.535803  fwqtc  False
39996   1  92.022713  wzhni   True
39997  25  11.765293  qpxdo   True
39998  22  11.449024  bmuyt  False
39999  11  78.990516  zxvem  False

[40000 rows x 4 columns]

Provenance Matrix:
tensor([[1, 0],
        [1, 0],
        [1, 0],
        ...,
        [0, 1],
        [0, 1],
        [0, 1]], dtype=torch.int32)

Time elapsed: 0.036386 seconds
Peak memory usage: 0.84 MB


In [4]:
# Benchmark the source-identifier approach: tag each frame with a source
# column via add_source_identifiers, then combine them with
# append_with_provenance. Wall time and peak traced memory cover both calls.

# tracemalloc is active while the timer runs, so the elapsed time reported
# below includes its bookkeeping overhead.
tracemalloc.start()
try:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with system clock
    # adjustments and may have coarse resolution.
    start_time = time.perf_counter()

    data_tensor_1_with_source, data_tensor_2_with_source = add_source_identifiers(data_tensor_1, data_tensor_2)
    combined_with_source = append_with_provenance(data_tensor_1_with_source, data_tensor_2_with_source)

    elapsed_time = time.perf_counter() - start_time
    # Peak traced memory (bytes) since tracemalloc.start()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Always stop tracing, even if one of the calls above raises; otherwise
    # tracing stays enabled for the rest of the kernel session.
    tracemalloc.stop()

# Convert memory usage from bytes to MB (MiB: 1024 * 1024 bytes)
peak_memory_usage = peak / (1024 * 1024)


print("\nCombined DataFrame with Source Identifiers:\n", combined_with_source)
print(f"Time elapsed for appending with source identifiers: {elapsed_time:.6f} seconds")
print(f"Peak memory usage: {peak_memory_usage:.2f} MB")



Combined DataFrame with Source Identifiers:
         A          B      C      D source_id
0      70  91.559363  rstcb  False        D¹
1      99   9.028830  nyliv  False        D¹
2      67  56.644996  tgzdy  False        D¹
3      68  42.205441  pdluc   True        D¹
4      69  95.071088  zzbfb  False        D¹
...    ..        ...    ...    ...       ...
39995  98   8.535803  fwqtc  False        D²
39996   1  92.022713  wzhni   True        D²
39997  25  11.765293  qpxdo   True        D²
39998  22  11.449024  bmuyt  False        D²
39999  11  78.990516  zxvem  False        D²

[40000 rows x 5 columns]
Time elapsed for appending with source identifiers: 0.005700 seconds
Peak memory usage: 2.25 MB
