In [1]:
import pandas as pd
import numpy as np
from collections import Counter

# 1. Load all datasets
# Each path is read into its own DataFrame; `datasets[0]` is the frame the
# column and row-count checks below compare every other copy against.
filenames = [
    "datasets-p2/Financial_Records.csv",
    "datasets-p2/Financial_Records_Bob.csv",
    "datasets-p2/Financial_Records_Bob_Nemanja_Saveski.csv",
    # "datasets-p2/Financial_Records_Bob_Sabina_Khazari.csv" is left out on purpose:
    # Sabina's Bob dataset is the same as mine, so it is removed to avoid bias.
    "datasets-p2/Financial_Records_Bob_Thomas_Senstyler.csv",
]

datasets = [pd.read_csv(fname) for fname in filenames]

for i, df in enumerate(datasets):
    print(f"Dataset {i+1} shape: {df.shape}")

# The copies are compared position by position later on, so they must agree on
# both the column labels (and their order) and the number of rows.
columns_match = all(datasets[0].columns.equals(df.columns) for df in datasets)
assert columns_match, "Column mismatch detected across datasets!"

# Rows are later addressed by integer label on every copy; a shorter copy would
# otherwise only fail with a KeyError part-way through that loop.
row_counts_match = all(len(df) == len(datasets[0]) for df in datasets)
assert row_counts_match, "Row count mismatch detected across datasets!"

Dataset 1 shape: (70000, 25)
Dataset 2 shape: (70000, 25)
Dataset 3 shape: (70000, 25)
Dataset 4 shape: (70000, 25)


In [2]:
# 2. Reconcile the loaded copies cell by cell into one consensus dataset.
#
# For every (row, column) position the value is read from each frame in
# `datasets`:
#   * if every copy converts to float and the copies agree to within
#     STD_TOLERANCE, the cell becomes their mean rounded to ROUND_DECIMALS;
#   * if they are numeric but disagree, the most frequent rounded value wins;
#   * otherwise the most frequent raw value wins.
#   Ties are resolved in favour of the value encountered first.
# Every position where at least one copy differs from the chosen value is
# flagged with 1 in `diff_map` and appended to `modification_log`.
STD_TOLERANCE = 1e-4   # spread below which numeric copies count as agreeing
ROUND_DECIMALS = 4     # precision used for the mean and for the numeric vote

CLEANED_PATH = "output-datasets/Financial_Records_No_Fingerprint_v1.csv"
DIFF_MAP_PATH = "output-datasets/diff_map.csv"
LOG_PATH = "output-datasets/modification_log.csv"

cleaned_df = datasets[0].copy()
diff_map = pd.DataFrame(0, index=cleaned_df.index, columns=cleaned_df.columns)
modification_log = []

for row in range(cleaned_df.shape[0]):
    for col in cleaned_df.columns:
        values = [df.at[row, col] for df in datasets]

        # Only conversion failures mean "not numeric". A bare `except` here
        # would also swallow KeyboardInterrupt and make this long loop
        # impossible to stop cleanly.
        try:
            float_values = [float(v) for v in values]
            is_numeric = True
        except (TypeError, ValueError, OverflowError):
            is_numeric = False

        if is_numeric:
            std_dev = np.std(float_values)
            if std_dev < STD_TOLERANCE:
                # Copies agree up to noise: average the noise away.
                cleaned_value = round(np.mean(float_values), ROUND_DECIMALS)
            else:
                # Copies genuinely disagree: majority vote on rounded values.
                rounded = [round(v, ROUND_DECIMALS) for v in float_values]
                cleaned_value = Counter(rounded).most_common(1)[0][0]
        else:
            cleaned_value = Counter(values).most_common(1)[0][0]

        # NOTE(review): NaN compares unequal to everything, itself included,
        # so a cell that is NaN in the copies is always flagged as modified.
        if any(val != cleaned_value for val in values):
            # Log the change
            modification_log.append({
                "row": row,
                "column": col,
                "original_values": values,
                "cleaned_value": cleaned_value
            })
            diff_map.at[row, col] = 1  # mark this cell as modified

        cleaned_df.at[row, col] = cleaned_value

cleaned_df.to_csv(CLEANED_PATH, index=False)
diff_map.to_csv(DIFF_MAP_PATH, index=False)
log_df = pd.DataFrame(modification_log)
log_df.to_csv(LOG_PATH, index=False)

# Report the paths that were actually written (the cleaned-dataset message
# previously named a different file than the one saved above).
print(f"Cleaned dataset saved as '{CLEANED_PATH}'")
print(f"Difference map saved as '{DIFF_MAP_PATH}'")
print(f"Log of all modified cells saved as '{LOG_PATH}'")

Cleaned dataset saved as 'output-datasets/Financial_Records_Cleaned.csv'
Difference map saved as 'output-datasets/diff_map.csv'
Log of all modified cells saved as 'output-datasets/modification_log.csv'
