In [2]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Directory that is scanned for the dataset CSV files.
DATA_DIR = "datasets-p2"
# Accepts "Financial_Records<anything>.csv". It is applied with fullmatch()
# below so the name must also END in ".csv"; a plain match() only anchors the
# start and would let names such as "Financial_Records.csv.bak" through.
pattern = re.compile(r"Financial_Records.*\.csv")
# os.listdir() returns entries in arbitrary order, so sort to keep the dataset
# order (and everything indexed by it) reproducible between runs.
filenames = sorted(
    os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if pattern.fullmatch(f)
)

print(f"Found {len(filenames)} matching datasets:")
for f in filenames:
    print(f" - {f}")

# Fail early with a readable message instead of an IndexError on datasets[0].
if not filenames:
    raise FileNotFoundError(
        f"No files matching {pattern.pattern!r} were found in {DATA_DIR!r}"
    )

# Load every dataset; dataset_names[i] is the bare file name of datasets[i].
datasets = [pd.read_csv(fname) for fname in filenames]
dataset_names = [os.path.basename(fname) for fname in filenames]
num_datasets = len(datasets)

# The first dataset (in sorted file order) defines the grid of cells that is
# compared: its row count and its column labels.
ref_df = datasets[0]
rows, cols = ref_df.shape
columns = ref_df.columns

# Every dataset must contain each reference cell. A dataset with fewer rows or
# a missing column would otherwise surface much later as an opaque KeyError
# from a per-cell lookup. Extra rows/columns are tolerated.
incompatible = [
    name
    for name, frame in zip(dataset_names, datasets)
    if len(frame) < rows or not columns.isin(frame.columns).all()
]
if incompatible:
    raise ValueError(
        "These datasets do not cover the reference dataset's "
        f"{rows} rows and {cols} columns: {', '.join(incompatible)}"
    )

def _comparison_keys(values):
    """Return one hashable comparison key per value of a single cell.

    If every value converts to ``float`` the keys are those floats rounded to
    4 decimals, so differences beyond the 4th decimal are ignored. If any
    value fails to convert, the raw values are used unchanged for the whole
    cell. Missing values (NaN/None/NA) are mapped to ``None`` so that they
    compare equal to each other; NaN itself never equals NaN and would
    otherwise make every missing cell look different in every dataset.
    """
    try:
        keys = [round(float(v), 4) for v in values]
    except (TypeError, ValueError):
        # At least one value is non-numeric: compare the raw values instead.
        keys = list(values)
    return [None if pd.isna(k) else k for k in keys]


# diff_map[row, col] is 1 when the datasets disagree on that cell, else 0.
diff_map = pd.DataFrame(0, index=range(rows), columns=columns)
# uniqueness_counts[i] = number of cells where dataset i holds a value that no
# other dataset shares.
uniqueness_counts = [0] * num_datasets

for row in range(rows):
    for col in columns:
        keys = _comparison_keys([df.at[row, col] for df in datasets])
        counter = Counter(keys)

        if len(counter) <= 1:
            continue  # all datasets agree on this cell

        diff_map.at[row, col] = 1
        # Look up the SAME keys that were counted. Looking up the unrounded
        # values would miss the rounded Counter entries and report 0.
        for i, key in enumerate(keys):
            if counter[key] == 1:
                uniqueness_counts[i] += 1

# Draw the 0/1 difference map as a heatmap and write it to a PNG file.
heatmap_fig = plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(
    diff_map.astype(int),
    cmap="YlOrRd",
    cbar_kws={'label': 'Modified (1 = change)'},
)
heatmap_ax.set_title("Difference Heatmap Across Fingerprinted Datasets")
heatmap_ax.set_xlabel("Columns")
heatmap_ax.set_ylabel("Rows")
heatmap_fig.tight_layout()
heatmap_fig.savefig("diff_heatmap.png")
# Close the figure once it is on disk so it is not kept open.
plt.close(heatmap_fig)
print("Saved diff heatmap to 'diff_heatmap.png'")

# Condense the difference map into three headline numbers and save them.
total_cells = rows * len(columns)
modified_cells = diff_map.values.sum()
modified_fraction = modified_cells / total_cells
percent_modified = round(modified_fraction * 100, 4)

metric_labels = ["Total Cells", "Modified Cells", "Percent Modified"]
metric_values = [total_cells, modified_cells, percent_modified]
summary_df = pd.DataFrame({"Metric": metric_labels, "Value": metric_values})
summary_df.to_csv("diff_summary.csv", index=False)
print("Saved diff summary to 'diff_summary.csv'")

# Rank the datasets by their per-dataset uniqueness count, highest first,
# and save the ranking.
ranking_column = "Unique Cell Contributions"
unique_df = pd.DataFrame({
    "Dataset": dataset_names,
    ranking_column: uniqueness_counts,
})
unique_df = unique_df.sort_values(by=ranking_column, ascending=False)

unique_df.to_csv("unique_contributions.csv", index=False)
print("Saved dataset uniqueness ranking to 'unique_contributions.csv'")


Found 5 matching datasets:
 - datasets-p2\Financial_Records.csv
 - datasets-p2\Financial_Records_Bob.csv
 - datasets-p2\Financial_Records_Bob_Nemanja_Saveski.csv
 - datasets-p2\Financial_Records_Bob_Sabina_Khazari.csv
 - datasets-p2\Financial_Records_Bob_Thomas_Senstyler.csv
Saved diff heatmap to 'diff_heatmap.png'
Saved diff summary to 'diff_summary.csv'
Saved dataset uniqueness ranking to 'unique_contributions.csv'
