## V1: First version of the attack 
This version doesn't work because it doesn't handle mixed-type values correctly. E.g., columns like `employment_since` stay the same across the cleaned and original datasets.

In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from dataset_uniqueness_utils import *

# --- Configuration -----------------------------------------------------------
DATA_DIR = "datasets-p2"
OUT_DIR = "output-datasets"
os.makedirs(OUT_DIR, exist_ok=True)

# get_unique_datasets is provided by dataset_uniqueness_utils. From its use
# below it yields a sequence of DataFrame-like objects, their names and a
# count -- NOTE(review): confirm the exact contract against the helper module.
unique_datasets, unique_names, num_unique = get_unique_datasets(DATA_DIR)

print(f"Using {num_unique} unique datasets:")
for name in unique_names:
    print(f" - {name}")

# The first dataset is used as the template that gets overwritten cell by cell.
base_df = unique_datasets[0].copy()
modification_log = []

# NOTE(review): rows are addressed by position via `.at`, which assumes every
# dataset carries a default 0..n-1 index and identical columns -- confirm.
for row in range(base_df.shape[0]):
    for col in base_df.columns:
        # The same cell as seen in every dataset.
        values = [df.at[row, col] for df in unique_datasets]

        # A cell counts as numeric only if *every* dataset's value converts
        # with float(). A single non-numeric value sends the whole cell down
        # the categorical path (the V1 limitation for mixed cells).
        # Only conversion failures are caught, so Ctrl-C still interrupts
        # this long-running loop.
        try:
            float_values = [float(v) for v in values]
            is_numeric = True
        except (TypeError, ValueError, OverflowError):
            is_numeric = False

        if is_numeric:
            std_dev = np.std(float_values)
            if std_dev < 1e-4:
                # Values agree up to tiny differences: use their rounded mean.
                cleaned_value = round(np.mean(float_values), 4)
            else:
                # Values disagree: majority vote over the rounded values.
                rounded = [round(v, 4) for v in float_values]
                most_common = Counter(rounded).most_common(1)[0][0]
                cleaned_value = most_common
        else:
            # Majority vote over the raw values.
            most_common = Counter(values).most_common(1)[0][0]
            cleaned_value = most_common

        # Record the cell whenever at least one dataset disagrees with the
        # value that was chosen.
        if any(val != cleaned_value for val in values):
            modification_log.append({
                "row": row,
                "column": col,
                "original_values": values,
                "cleaned_value": cleaned_value
            })

        base_df.at[row, col] = cleaned_value

save_cleaned_output_versioned(base_df, modification_log, num_unique)

Using 5 unique datasets:
 - Financial_Records.csv
 - Financial_Records_Bob.csv
 - Financial_Records_Bob_Hedda_Fiedler.csv
 - Financial_Records_Bob_Nemanja_Saveski.csv
 - Financial_Records_Bob_Thomas_Senstyler.csv


KeyboardInterrupt: 

## V2: Version works properly but has no noise
This version addresses the issue from V1. It also stores data as integers (as in the original) rather than as floats like V1.

In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import random
from dataset_uniqueness_utils import *

def is_floatable(x) -> bool:
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the exceptions that signal a failed conversion are treated as
    "not floatable": ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. a
    non-numeric string) and ``OverflowError`` (an int too large for a float).
    Anything else -- notably ``KeyboardInterrupt`` -- propagates to the caller
    instead of being silently turned into ``False``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# --- Configuration -----------------------------------------------------------
DATA_DIR = "datasets-p2"
OUT_DIR = "output-datasets"
RANDOM_SEED = 42  # fixed so the random choices for mixed cells are repeatable
os.makedirs(OUT_DIR, exist_ok=True)
random.seed(RANDOM_SEED)

# get_unique_datasets is provided by dataset_uniqueness_utils. From its use
# below it yields a sequence of DataFrame-like objects, their names and a
# count -- NOTE(review): confirm the exact contract against the helper module.
unique_datasets, unique_names, num_unique = get_unique_datasets(DATA_DIR)

print(f"Using {num_unique} unique datasets:")
for name in unique_names:
    print(f" - {name}")

# The first dataset is used as the template that gets overwritten cell by cell.
base_df = unique_datasets[0].copy()
modification_log = []

# NOTE(review): rows are addressed by position via `.at`, which assumes every
# dataset carries a default 0..n-1 index and identical columns -- confirm.
for row in range(base_df.shape[0]):
    for col in base_df.columns:
        # The same cell as seen in every dataset.
        values = [df.at[row, col] for df in unique_datasets]

        # Collect the values that convert with float(); the rest are treated
        # as categorical. Only conversion failures are caught so that Ctrl-C
        # still interrupts the loop.
        float_values = []
        for v in values:
            try:
                float_values.append(float(v))
            except (TypeError, ValueError, OverflowError):
                continue
        num_parseable = len(float_values)

        if num_parseable == len(values):
            # Fully numeric cell.
            std_dev = np.std(float_values)
            if std_dev < 1e-4:
                # Values agree up to tiny differences: use their rounded mean.
                cleaned_value = round(np.mean(float_values), 4)
            else:
                # Values disagree: majority vote over the rounded values.
                rounded = [round(v, 4) for v in float_values]
                most_common = Counter(rounded).most_common(1)[0][0]
                cleaned_value = most_common

        elif num_parseable == 0:
            # Fully categorical cell: majority vote over the raw values.
            most_common = Counter(values).most_common(1)[0][0]
            cleaned_value = most_common

        else:
            # Mixed cell: flip a coin between the mean of the numeric values
            # and one of the non-numeric values picked at random.
            if random.random() < 0.5:
                cleaned_value = round(np.mean(float_values), 4)
            else:
                cleaned_value = random.choice([v for v in values if not is_floatable(v)])

        # Record the cell whenever at least one dataset disagrees with the
        # value that was chosen.
        if any(val != cleaned_value for val in values):
            modification_log.append({
                "row": row,
                "column": col,
                "original_values": values,
                "cleaned_value": cleaned_value
            })

        # Store numeric results as integers; anything that cannot be converted
        # (categorical strings, NaN, infinities) is stored unchanged.
        # NOTE(review): int() truncates toward zero, so a fractional mean such
        # as 2.75 is stored as 2 -- confirm that every numeric column is
        # integer-valued in the source data.
        try:
            base_df.at[row, col] = int(float(cleaned_value))
        except (TypeError, ValueError, OverflowError):
            base_df.at[row, col] = cleaned_value

save_cleaned_output_versioned(base_df, modification_log, num_unique)

Using 7 unique datasets:
 - Financial_Records.csv
 - Financial_Records_Bob.csv
 - Financial_Records_Bob_Cakmak_Dilara.csv
 - Financial_Records_Bob_Hedda_Fiedler.csv
 - Financial_Records_Bob_Lorenz_Horburger.csv
 - Financial_Records_Bob_Nemanja_Saveski.csv
 - Financial_Records_Bob_Thomas_Senstyler.csv
Cleaned dataset saved as 'Financial_Records_No_Fingerprint_7_v1.csv' in 'output-datasets\7_v1'
Modification log saved as 'modification_log_7_v1.csv' in 'output-datasets\7_v1'


'output-datasets\\7_v1'

## V3: Version with 0.5% noise

In [3]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import random
from dataset_uniqueness_utils import *

def is_floatable(x) -> bool:
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the exceptions that signal a failed conversion are treated as
    "not floatable": ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. a
    non-numeric string) and ``OverflowError`` (an int too large for a float).
    Anything else -- notably ``KeyboardInterrupt`` -- propagates to the caller
    instead of being silently turned into ``False``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# --- Configuration -----------------------------------------------------------
DATA_DIR = "datasets-p2"
OUT_DIR = "output-datasets"
NOISE_RATE = 0.005  # probability of injecting noise into an eligible cell (0.5%)
RANDOM_SEED = 42    # fixed so the injected noise is repeatable across runs
os.makedirs(OUT_DIR, exist_ok=True)
random.seed(RANDOM_SEED)

# Label used in the log notes; renders as "0.5%" for the default NOISE_RATE.
noise_label = f"{NOISE_RATE:.1%}"

# get_unique_datasets is provided by dataset_uniqueness_utils. From its use
# below it yields a sequence of DataFrame-like objects, their names and a
# count -- NOTE(review): confirm the exact contract against the helper module.
unique_datasets, unique_names, num_unique = get_unique_datasets(DATA_DIR)

print(f"Using {num_unique} unique datasets:")
for name in unique_names:
    print(f" - {name}")

# Prepare a list of unique non-null values per column (from all datasets).
# The list is sorted so that random.choice() picks the same element on every
# run for a given seed; plain set iteration order is not stable across runs.
all_possible_values = {}
for col in unique_datasets[0].columns:
    seen = set()
    for df in unique_datasets:
        seen.update(df[col].dropna().unique())
    all_possible_values[col] = sorted(seen, key=repr)

# The first dataset is used as the template that gets overwritten cell by cell.
base_df = unique_datasets[0].copy()
modification_log = []

# NOTE(review): rows are addressed by position via `.at`, which assumes every
# dataset carries a default 0..n-1 index and identical columns -- confirm.
for row in range(base_df.shape[0]):
    for col in base_df.columns:
        # The same cell as seen in every dataset.
        values = [df.at[row, col] for df in unique_datasets]
        all_identical = all(val == values[0] for val in values)

        # Collect the values that convert with float(); the rest are treated
        # as categorical. Only conversion failures are caught so that Ctrl-C
        # still interrupts the loop.
        float_values = []
        for v in values:
            try:
                float_values.append(float(v))
            except (TypeError, ValueError, OverflowError):
                continue
        num_parseable = len(float_values)

        # Set when a noise entry was written for this cell, so the generic
        # logging step below does not add a second entry. This replaces a
        # scan over the entire modification_log for every cell.
        noise_logged = False

        if num_parseable == 0 and all_identical and random.random() < NOISE_RATE:
            # Fully categorical and identical: inject noise by picking a
            # different category seen anywhere in this column.
            original = values[0]
            possible = [v for v in all_possible_values[col] if v != original]
            if possible:
                cleaned_value = random.choice(possible)
                modification_log.append({
                    "row": row,
                    "column": col,
                    "original_values": values,
                    "cleaned_value": cleaned_value,
                    "note": f"{noise_label} noise injected (categorical)"
                })
                noise_logged = True
            else:
                cleaned_value = original  # fallback to original if no options
        elif num_parseable == len(values):
            # Fully numeric
            std_dev = np.std(float_values)
            if std_dev < 1e-4:
                # Values agree up to tiny differences: use their rounded mean.
                cleaned_value = round(np.mean(float_values), 4)
            else:
                # Values disagree: majority vote over the rounded values.
                rounded = [round(v, 4) for v in float_values]
                most_common = Counter(rounded).most_common(1)[0][0]
                cleaned_value = most_common
        elif num_parseable == 0:
            # Fully categorical without noise: majority vote, as in V2.
            # Without this branch such cells fell into the mixed-type branch,
            # where a random value was picked and the cell could be logged
            # with a misleading "mixed" noise note.
            cleaned_value = Counter(values).most_common(1)[0][0]
        else:
            # Mixed type: at least one numeric and one non-numeric value.
            if random.random() < NOISE_RATE:
                # Override with a random categorical value. The list cannot be
                # empty here because at least one value failed float().
                candidates = [v for v in values if not is_floatable(v)]
                cleaned_value = random.choice(candidates)
                modification_log.append({
                    "row": row,
                    "column": col,
                    "original_values": values,
                    "cleaned_value": cleaned_value,
                    "note": f"{noise_label} noise injected (mixed → categorical)"
                })
                noise_logged = True
            else:
                cleaned_value = round(np.mean(float_values), 4)

        # Log modifications (excluding noise already logged). As in V1/V2, a
        # cell is logged when at least one dataset disagrees with the chosen
        # value; requiring *all* datasets to disagree would skip nearly every
        # majority-vote clean-up.
        if not noise_logged and any(val != cleaned_value for val in values):
            modification_log.append({
                "row": row,
                "column": col,
                "original_values": values,
                "cleaned_value": cleaned_value
            })

        # Store numeric results as integers; anything that cannot be converted
        # (categorical strings, NaN, infinities) is stored unchanged.
        # NOTE(review): int() truncates toward zero, so a fractional mean such
        # as 2.75 is stored as 2 -- confirm that every numeric column is
        # integer-valued in the source data.
        try:
            base_df.at[row, col] = int(float(cleaned_value))
        except (TypeError, ValueError, OverflowError):
            base_df.at[row, col] = cleaned_value

save_cleaned_output_versioned(base_df, modification_log, num_unique)

Using 7 unique datasets:
 - Financial_Records.csv
 - Financial_Records_Bob.csv
 - Financial_Records_Bob_Cakmak_Dilara.csv
 - Financial_Records_Bob_Hedda_Fiedler.csv
 - Financial_Records_Bob_Lorenz_Horburger.csv
 - Financial_Records_Bob_Nemanja_Saveski.csv
 - Financial_Records_Bob_Thomas_Senstyler.csv
Cleaned dataset saved as 'Financial_Records_No_Fingerprint_7_v2.csv' in 'output-datasets\7_v2'
Modification log saved as 'modification_log_7_v2.csv' in 'output-datasets\7_v2'


'output-datasets\\7_v2'