In [2]:
import os
import re
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from collections import Counter

# CSV file that is loaded as ``attacked_df`` and compared against every reference file.
ATTACKED_DATASET_PATH = "output-datasets/7_v1/Financial_Records_No_Fingerprint_7_v1.csv"
# Directory whose entries are listed to find the reference CSV files.
DATA_DIR = "datasets-p2"
# Filename filter, applied with ``pattern.match``: it is anchored at the start of
# the name only, so any name that begins with "Financial_Records" and contains
# ".csv" somewhere after that prefix is accepted.
pattern = re.compile(r"Financial_Records.*\.csv")

def is_floatable(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the exceptions that ``float()`` raises for unconvertible input are
    treated as "not floatable": ``TypeError`` (e.g. ``None``), ``ValueError``
    (e.g. a non-numeric string) and ``OverflowError`` (an int too large for a
    float). Anything else, such as ``KeyboardInterrupt``, propagates instead of
    being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def compare_datasets(ref_df, test_df, name):
    """Print a cell-level and per-column comparison of two DataFrames.

    The report, written to stdout, contains:

    * the percentage of cells that differ between the two frames;
    * for every column whose values are all float-convertible in both frames,
      the two-sample Kolmogorov-Smirnov statistic and p-value plus the absolute
      differences of the means and standard deviations;
    * for every other column, the Jaccard similarity of the sets of unique
      non-null values;
    * the sum of absolute element-wise differences between the correlation
      matrices of the columns that are float-convertible in ``ref_df``.

    Args:
        ref_df: Reference DataFrame.
        test_df: DataFrame compared against ``ref_df``; must have the same
            shape (and, for the element-wise comparison, the same labels).
        name: Label printed in the report header.

    Raises:
        AssertionError: If the two frames do not have the same shape.
    """
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; exception type and message are kept.
    if ref_df.shape != test_df.shape:
        raise AssertionError("Shape mismatch!")

    total_cells = ref_df.size
    # ``!=`` evaluates NaN != NaN as True, so a cell that is missing in both
    # frames would be reported as changed; mask those cells out.
    differs = ref_df != test_df
    both_missing = ref_df.isna() & test_df.isna()
    changed_cells = int((differs & ~both_missing).sum().sum())
    # An empty frame has nothing that could change; avoid dividing by zero.
    if total_cells:
        percent_changed = round((changed_cells / total_cells) * 100, 4)
    else:
        percent_changed = 0.0

    print(f"\nComparing with: {name}")
    print(f" - Percent of changed cells: {percent_changed}%")

    # Columns whose reference values are all float-convertible; collected
    # during the per-column pass so the cells are not scanned a second time.
    numeric_cols = []

    for col in ref_df.columns:
        ref_col = ref_df[col]
        test_col = test_df[col]

        ref_is_numeric = all(is_floatable(v) for v in ref_col)
        if ref_is_numeric:
            numeric_cols.append(col)

        if ref_is_numeric and all(is_floatable(v) for v in test_col):
            ref_vals = pd.to_numeric(ref_col, errors='coerce').dropna()
            test_vals = pd.to_numeric(test_col, errors='coerce').dropna()

            # Nothing is printed for a numeric column that has no usable
            # values on either side.
            if len(ref_vals) > 0 and len(test_vals) > 0:
                ks_stat, ks_p = ks_2samp(ref_vals, test_vals)
                mean_diff = abs(ref_vals.mean() - test_vals.mean())
                std_diff = abs(ref_vals.std() - test_vals.std())

                print(f" - [NUMERIC] {col}:")
                print(f"    - KS Stat = {ks_stat:.4f}, p = {ks_p:.4f}")
                print(f"    - Mean diff = {mean_diff:.4f}, Std diff = {std_diff:.4f}")
        else:
            ref_set = set(ref_col.dropna().unique())
            test_set = set(test_col.dropna().unique())

            intersection = ref_set & test_set
            union = ref_set | test_set
            # Two columns with no non-null values at all count as identical.
            jaccard = len(intersection) / len(union) if union else 1.0

            print(f" - [CATEGORICAL] {col}: Jaccard similarity = {jaccard:.4f}")

    if numeric_cols:
        ref_corr = ref_df[numeric_cols].apply(pd.to_numeric, errors='coerce').corr()
        test_corr = test_df[numeric_cols].apply(pd.to_numeric, errors='coerce').corr()
        corr_diff = (ref_corr - test_corr).abs().values.sum()
        print(f" - Correlation matrix diff (L1): {corr_diff:.4f}")

# Load the attacked dataset once; every reference file is compared against it.
attacked_df = pd.read_csv(ATTACKED_DATASET_PATH)

# Walk the directory entries in sorted order, keeping only the names accepted
# by ``pattern.match``, and report on each reference file in turn.
for fname in filter(pattern.match, sorted(os.listdir(DATA_DIR))):
    original_df = pd.read_csv(os.path.join(DATA_DIR, fname))
    compare_datasets(ref_df=original_df, test_df=attacked_df, name=fname)



Comparing with: Financial_Records.csv
 - Percent of changed cells: 0.7213%
 - [NUMERIC] PID:
    - KS Stat = 0.0000, p = 1.0000
    - Mean diff = 0.0000, Std diff = 0.0000
 - [NUMERIC] age:
    - KS Stat = 0.0005, p = 1.0000
    - Mean diff = 0.0053, Std diff = 0.0113
 - [CATEGORICAL] sex: Jaccard similarity = 1.0000
 - [CATEGORICAL] marital_status: Jaccard similarity = 1.0000
 - [CATEGORICAL] job: Jaccard similarity = 1.0000
 - [CATEGORICAL] employment_since: Jaccard similarity = 1.0000
 - [CATEGORICAL] credit_hist: Jaccard similarity = 1.0000
 - [NUMERIC] credit_amount:
    - KS Stat = 0.0003, p = 1.0000
    - Mean diff = 0.7392, Std diff = 2.8540
 - [CATEGORICAL] purpose: Jaccard similarity = 1.0000
 - [NUMERIC] duration:
    - KS Stat = 0.0006, p = 1.0000
    - Mean diff = 0.0061, Std diff = 0.0039
 - [NUMERIC] installment_rate:
    - KS Stat = 0.0002, p = 1.0000
    - Mean diff = 0.0001, Std diff = 0.0005
 - [CATEGORICAL] debtors: Jaccard similarity = 1.0000
 - [NUMERIC] residenc