# Full attack & Comparison

In [1]:
# Runs the external dataset-shuffler.py script via the shell to produce
# Shuffled_Financial_Records.csv from Financial_Records.csv.
# NOTE(review): the third argument is presumably the shuffler's secret/key and is
# stored here in clear text — confirm, and consider reading it from the environment.
# NOTE(review): `python.exe` and the `.\` path prefix look Windows-specific.
!python.exe .\dataset-shuffler.py Financial_Records.csv Shuffled_Financial_Records.csv "Marko123!@#"

## Attack Code 

In [2]:
import pandas as pd
import numpy as np
import hashlib
import random

# Load the shuffled dataset. `df` is the module-level frame that the
# perturb_* / flip_binary helpers below read and overwrite column by column.
df = pd.read_csv("Shuffled_Financial_Records.csv")

# Seed both generators used by the attack: `random` drives the categorical and
# binary flips, `np.random` drives the numeric noise. With fixed seeds the
# result depends on the order in which the helpers are called.
random.seed(42)
np.random.seed(42)

### STEP 1: UID MANIPULATION ###
# Generate a synthetic UID from stable features and assign back to PID
def generate_uid(row, cols):
    """Hash selected fields of ``row`` into an integer in ``[0, 10**8)``.

    The string forms of ``row[c]`` for each ``c`` in ``cols`` are joined
    without any separator, SHA-256 hashed, and the digest (read as a
    big-endian integer) is reduced modulo 10**8.

    Because no separator is used, different field splits that produce the same
    concatenated text (e.g. '1'+'23' and '12'+'3') map to the same value.
    """
    pieces = [str(row[name]) for name in cols]
    digest = hashlib.sha256("".join(pieces).encode()).digest()
    # Interpreting the raw digest big-endian equals int(hexdigest, 16).
    return int.from_bytes(digest, "big") % 10 ** 8

# Columns whose values are concatenated and hashed into the replacement ID.
uid_cols = ['age', 'sex', 'job', 'residence_since', 'property', 'housing']
# Overwrite PID for every row. This runs before any perturbation below, so the
# hash is computed from the values as loaded from the shuffled CSV.
df['PID'] = df.apply(lambda row: generate_uid(row, uid_cols), axis=1)

### STEP 2: BIT COLLISION ENGINEERING ###

# Categorical columns to attack.
# These are the same twelve names that prepare_dataset_for_fidelity and
# compare_fidelity list as their categorical columns.
categorical_cols = [
    'sex', 'marital_status', 'job', 'credit_hist', 'purpose', 'debtors',
    'property', 'installment_other', 'housing', 'tel', 'online_banking', 'foreign'
]

# Function to randomly flip categorical values
def perturb_categorical(col):
    """Randomly overwrite values of ``df[col]`` with one of its most common values.

    Operates on the module-level ``df`` in place. The (up to) five most
    frequent values of the column are collected first; each cell is then
    replaced with a random pick from that list when a uniform draw from
    ``random`` falls below 0.3, and is left unchanged otherwise. The pick may
    coincide with the cell's current value.
    """
    frequent = df[col].value_counts().nlargest(5).index.tolist()

    def maybe_replace(current):
        # Draw the coin first; the replacement is drawn only on a hit.
        if random.random() < 0.3:
            return random.choice(frequent)
        return current

    df[col] = df[col].apply(maybe_replace)

# Perturb every listed categorical column in place. Each call consumes draws
# from the seeded `random` generator, so the column order affects the result.
for col in categorical_cols:
    perturb_categorical(col)

# Mixed-type columns: numeric part perturbation only
def perturb_mixed_numeric(col, value_range=0.1):
    """Scale only the numeric entries of a mixed text/number column of ``df``.

    Operates on the module-level ``df`` in place. Entries that
    ``pd.to_numeric`` can parse are multiplied by ``1 + u`` with ``u`` drawn
    uniformly from ``[-value_range, value_range)`` via ``np.random``, clipped
    at 0, rounded and cast to int. Entries that cannot be parsed are left
    untouched.
    """
    is_number = pd.to_numeric(df[col], errors='coerce').notna()
    as_float = df.loc[is_number, col].astype(float)
    jitter = np.random.uniform(-value_range, value_range, size=len(as_float))
    scaled = as_float * (1 + jitter)
    # Clipping at 0 means scaled values that come out negative are stored as 0.
    df.loc[is_number, col] = scaled.clip(lower=0).round().astype(int)

# Perturb the numeric entries of the three mixed text/number columns; the
# second argument is the half-width of the uniform relative noise.
# NOTE(review): 'savings' is perturbed here but is not listed in
# prepare_dataset_for_fidelity or compare_fidelity, so its change is never measured.
perturb_mixed_numeric('employment_since', 0.2)
perturb_mixed_numeric('checking_account', 0.25)
perturb_mixed_numeric('savings', 0.25)

# Numeric columns to attack ('default' is handled separately by flip_binary).
numeric_cols = [
    'age', 'credit_amount', 'duration', 'installment_rate',
    'residence_since', 'existing_credits', 'liable_people',
    'monthly_rent_or_mortgage'
]

def perturb_numeric(col, shift_percent=0.1):
    """Apply multiplicative Gaussian noise to the numeric column ``df[col]``.

    Operates on the module-level ``df`` in place. Every value is cast to
    float and multiplied by ``1 + e`` where ``e`` is drawn from a normal
    distribution with mean 0 and standard deviation ``shift_percent`` via
    ``np.random``. The result is clipped at 0, rounded, and stored as int.
    """
    baseline = df[col].astype(float)
    factors = 1 + np.random.normal(0, shift_percent, len(baseline))
    jittered = baseline * factors
    df[col] = jittered.clip(lower=0).round().astype(int)

# Apply Gaussian relative noise with standard deviation 0.15 to every listed
# numeric column. Draws come from the seeded np.random generator, so the
# column order affects the result.
for col in numeric_cols:
    perturb_numeric(col, 0.15)

# Binary numeric columns (1 or 2) — flip occasionally
def flip_binary(col, prob=0.15):
    """Swap the two codes of a 1/2-coded column of ``df`` with probability ``prob``.

    Operates on the module-level ``df`` in place. For each cell one uniform
    draw is taken from ``random``; when it is below ``prob`` the cell becomes
    ``3 - value`` (so 1 becomes 2 and 2 becomes 1), otherwise it is kept.
    """
    def maybe_flip(value):
        if random.random() < prob:
            return 3 - value
        return value

    df[col] = df[col].apply(maybe_flip)

# Flip the 1/2-coded 'default' label in roughly 20% of the rows.
flip_binary('default', prob=0.2)

# Ensure all numeric columns are integers
# (covers every column pandas reports as numeric, not only the attacked ones).
# NOTE(review): astype(int) raises if such a column holds NaN or inf — confirm
# the input has no missing numeric values.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].round().astype(int)

# Save the attacked dataset (without the pandas index column).
df.to_csv("ex1_1_12442103_Georgiev_Marko_1.csv", index=False)

## Preparation of the Data for Comparison

In [3]:
import pandas as pd

def prepare_dataset_for_fidelity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df`` ready for fidelity comparison.

    The input frame is not modified. Processing steps, in order:
    - Numeric columns are coerced with ``pd.to_numeric``; rows whose value
      cannot be parsed are dropped.
    - Categorical columns are converted to stripped, lower-cased strings.
    - ``employment_since`` is encoded as a number: 'unemployed' -> -1,
      '<1 year' -> 0, anything else is parsed as float; rows that cannot be
      parsed are dropped.
    - ``checking_account`` keeps 'no checking account' as a (lower-cased)
      string and parses everything else as float; rows that cannot be parsed
      are dropped.

    Surviving rows keep their original index labels.
    """
    numeric_columns = [
        "age", "credit_amount", "duration", "monthly_rent_or_mortgage",
        "installment_rate", "residence_since", "existing_credits",
        "liable_people", "default"
    ]

    categorical_columns = [
        "sex", "marital_status", "job", "credit_hist", "purpose", "debtors",
        "property", "installment_other", "housing", "tel", "online_banking",
        "foreign"
    ]

    # Textual employment labels and the numeric codes they are mapped to.
    employment_codes = {'unemployed': -1, '<1 year': 0}

    def convert_employment(val):
        text = str(val).strip().lower()
        if text in employment_codes:
            return employment_codes[text]
        try:
            return float(text)
        except ValueError:
            return None  # unparseable -> treated as missing and dropped below

    def clean_checking_account(val):
        text = str(val).strip().lower()
        if text != 'no checking account':
            try:
                return float(text)
            except ValueError:
                return None  # unparseable -> treated as missing and dropped below
        return text

    cleaned = df.copy()

    # 1. Numeric columns: coerce, then discard rows that failed to parse.
    for name in numeric_columns:
        cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')
        cleaned = cleaned[cleaned[name].notna()]

    # 2. Categorical columns: normalise whitespace and case.
    for name in categorical_columns:
        cleaned[name] = cleaned[name].astype(str).str.strip().str.lower()

    # 3. employment_since: map labels to codes, drop what cannot be parsed.
    cleaned["employment_since"] = cleaned["employment_since"].apply(convert_employment)
    cleaned = cleaned[cleaned["employment_since"].notna()]

    # 4. checking_account: mixed string/float column, drop what cannot be parsed.
    cleaned["checking_account"] = cleaned["checking_account"].apply(clean_checking_account)
    cleaned = cleaned[cleaned["checking_account"].notna()]

    return cleaned

## Comparison Code

In [4]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import jensenshannon
import numpy as np

def compare_fidelity(prepared_df: pd.DataFrame, prepared_df_modified: pd.DataFrame):
    """
    Compare an original and a modified dataset column by column.

    Both frames are expected to contain the columns listed below in the form
    produced by ``prepare_dataset_for_fidelity`` (numeric columns numeric,
    ``checking_account`` holding floats or the string 'no checking account').
    Rows are matched by index label and only labels present in both frames
    are compared, so the two frames may have lost different rows during
    preparation. Index labels are assumed to be unique.

    Metrics (``None`` whenever there is nothing to compare):
    - numeric columns: mean squared error over rows that are non-missing in
      both frames.
    - categorical columns: share of rows with identical values, and the
      Jensen-Shannon distance (``scipy.spatial.distance.jensenshannon``)
      between the two value distributions.
    - ``checking_account``: MSE over rows that are numeric in both frames;
      agreement of the 'no checking account' flag (rows flagged in both
      divided by rows flagged in either); Jensen-Shannon distance between the
      two-outcome distributions ``[share flagged, share not flagged]``.

    Returns a DataFrame with one row per compared column.
    """
    results = []

    # Define columns by type
    numeric_columns = [
        "age", "credit_amount", "duration", "monthly_rent_or_mortgage",
        "installment_rate", "residence_since", "existing_credits",
        "liable_people", "default", "employment_since"
    ]

    categorical_columns = [
        "sex", "marital_status", "job", "credit_hist", "purpose", "debtors",
        "property", "installment_other", "housing", "tel", "online_banking",
        "foreign"
    ]

    # Restrict both frames to the shared row labels, in the same order.
    # Element-wise `==` on Series requires identically labelled operands and
    # would otherwise raise when the frames dropped different rows.
    shared_rows = prepared_df.index.intersection(prepared_df_modified.index)
    original = prepared_df.loc[shared_rows]
    modified = prepared_df_modified.loc[shared_rows]

    # 1. Purely Numeric Columns
    for col in numeric_columns:
        original_values = original[col]
        modified_values = modified[col]
        valid_mask = original_values.notna() & modified_values.notna()

        # Same convention as the checking_account branch: no usable rows -> None.
        if valid_mask.any():
            mse = mean_squared_error(original_values[valid_mask], modified_values[valid_mask])
        else:
            mse = None
        results.append({
            "column": col,
            "type": "numeric",
            "MSE": mse
        })

    # 2. Purely Categorical Columns
    for col in categorical_columns:
        original_values = original[col].astype(str)
        modified_values = modified[col].astype(str)

        # Agreement Rate
        match_mask = original_values == modified_values
        agreement_rate = match_mask.sum() / len(match_mask) if len(match_mask) > 0 else None

        # Distance between the two value distributions, on a common category axis
        dist_orig = original_values.value_counts(normalize=True)
        dist_mod = modified_values.value_counts(normalize=True)
        all_categories = sorted(set(dist_orig.index).union(dist_mod.index))
        if all_categories:
            p = np.array([dist_orig.get(cat, 0) for cat in all_categories])
            q = np.array([dist_mod.get(cat, 0) for cat in all_categories])
            jsd = jensenshannon(p, q)
        else:
            jsd = None

        results.append({
            "column": col,
            "type": "categorical",
            "Agreement Rate": agreement_rate,
            "JSD": jsd
        })

    # 3. Mixed-Type: checking_account
    checking_orig = original["checking_account"]
    checking_mod = modified["checking_account"]

    # Numeric part: rows holding a float in both frames.
    # astype(bool) keeps the mask boolean even when the Series is empty.
    numeric_mask = checking_orig.apply(lambda x: isinstance(x, float)).astype(bool) & \
                   checking_mod.apply(lambda x: isinstance(x, float)).astype(bool)
    if numeric_mask.sum() > 0:
        mse_checking = mean_squared_error(checking_orig[numeric_mask], checking_mod[numeric_mask])
    else:
        mse_checking = None

    # Categorical part ('no checking account')
    cat_mask = checking_orig == "no checking account"
    mod_cat_mask = checking_mod == "no checking account"
    agreement_count = (cat_mask & mod_cat_mask).sum()
    total_cat = (cat_mask | mod_cat_mask).sum()
    cat_agreement_rate = agreement_count / total_cat if total_cat > 0 else None

    # Distributional comparison for the 'no checking account' frequency.
    # jensenshannon normalises each input to sum to 1, so a single-element
    # vector [share] always becomes [1.0] and the distance is always 0. The
    # complementary outcome must be included for the shares to be compared.
    if len(checking_orig) > 0:
        share_orig = cat_mask.mean()
        share_mod = mod_cat_mask.mean()
        p_check = np.array([share_orig, 1 - share_orig])
        q_check = np.array([share_mod, 1 - share_mod])
        jsd_checking = jensenshannon(p_check, q_check)
    else:
        jsd_checking = None

    results.append({
        "column": "checking_account",
        "type": "mixed",
        "MSE (numeric)": mse_checking,
        "Agreement Rate (categorical)": cat_agreement_rate,
        "JSD (categorical)": jsd_checking
    })

    return pd.DataFrame(results)

## Visualizing the Comparison

In [5]:
# Reload the pre-attack and post-attack files from disk.
# NOTE(review): this rebinds the module-level name `df`, which the perturb_* and
# flip_binary helpers in the attack cell read at call time; calling them after
# this cell would modify this freshly loaded frame instead of the attacked one.
df = pd.read_csv('Shuffled_Financial_Records.csv')
df_modified = pd.read_csv('ex1_1_12442103_Georgiev_Marko_1.csv')

# Clean both frames independently with the same preparation function.
prepared_df = prepare_dataset_for_fidelity(df)
prepared_df_modified = prepare_dataset_for_fidelity(df_modified)

# One result row per compared column.
results_df = compare_fidelity(prepared_df, prepared_df_modified)

In [6]:
# compare_fidelity emits 10 numeric + 12 categorical + 1 mixed row = 23 rows,
# so head(22) would cut off the final checking_account summary. The table is
# small, so display it in full.
results_df

Unnamed: 0,column,type,MSE,Agreement Rate,JSD,MSE (numeric),Agreement Rate (categorical),JSD (categorical)
0,age,numeric,35.217943,,,,,
1,credit_amount,numeric,421119.294171,,,,,
2,duration,numeric,17.212443,,,,,
3,monthly_rent_or_mortgage,numeric,3227.077114,,,,,
4,installment_rate,numeric,0.304414,,,,,
5,residence_since,numeric,0.291686,,,,,
6,existing_credits,numeric,0.047786,,,,,
7,liable_people,numeric,0.017514,,,,,
8,default,numeric,0.200343,,,,,
9,employment_since,numeric,1.154229,,,,,
