# Data Fidelity Comparison

## Preparation of the Data for Comparison

In [13]:
import pandas as pd

def prepare_dataset_for_fidelity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df`` that is ready for fidelity comparison.

    Steps:
    - Numeric columns are coerced with ``pd.to_numeric``; rows holding a value
      that cannot be parsed are dropped.
    - Categorical columns are cast to ``str``, stripped and lower-cased.
    - ``employment_since`` is encoded numerically ('unemployed' -> -1,
      '<1 year' -> 0, anything else is parsed as a float); rows that cannot be
      parsed are dropped.
    - ``checking_account`` keeps the literal 'no checking account' as a string
      and parses every other value as a float; rows that cannot be parsed are
      dropped.

    The input frame is not modified. Surviving rows keep their original index
    labels (the index is not reset).

    Args:
        df: Raw dataset containing all numeric and categorical columns listed
            below plus ``employment_since`` and ``checking_account``.

    Returns:
        A new DataFrame containing only the rows that passed every check.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """

    numeric_columns = [
        "age", "credit_amount", "duration", "monthly_rent_or_mortgage",
        "installment_rate", "residence_since", "existing_credits",
        "liable_people", "default"
    ]

    categorical_columns = [
        "sex", "marital_status", "job", "credit_hist", "purpose", "debtors",
        "property", "installment_other", "housing", "tel", "online_banking",
        "foreign"
    ]

    special_columns = ["employment_since", "checking_account"]

    # Fail early with the full list of missing columns instead of stopping at
    # whichever one happens to be accessed first.
    required_columns = numeric_columns + categorical_columns + special_columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns: {missing_columns}")

    df = df.copy()

    # 1. Clean and convert numeric columns.
    # Coerce every column first and drop the invalid rows once afterwards.
    # Filtering inside the loop and then assigning into the filtered frame is
    # the pattern that triggers pandas' SettingWithCopyWarning.
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.dropna(subset=numeric_columns).copy()

    # 2. Clean categorical columns (standardize text)
    for col in categorical_columns:
        df[col] = df[col].astype(str).str.strip().str.lower()

    # 3. Custom handling for employment_since
    def convert_employment(val):
        val = str(val).strip().lower()
        if val == 'unemployed':
            return -1
        elif val == '<1 year':
            return 0
        else:
            try:
                return float(val)
            except ValueError:
                return None  # unparseable -> row is dropped below

    df["employment_since"] = df["employment_since"].apply(convert_employment)
    df = df.dropna(subset=["employment_since"]).copy()

    # 4. Handling mixed-type 'checking_account'
    def clean_checking_account(val):
        val = str(val).strip().lower()
        if val == 'no checking account':
            return val  # keep as string
        try:
            return float(val)  # keep numeric
        except ValueError:
            return None  # invalid entries removed

    df["checking_account"] = df["checking_account"].apply(clean_checking_account)
    df = df.dropna(subset=["checking_account"]).copy()

    return df

## Actual Comparison Function 

In [14]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import jensenshannon
import numpy as np

def compare_fidelity(prepared_df: pd.DataFrame, prepared_df_modified: pd.DataFrame):
    """
    Compare two prepared datasets column by column.

    Row-wise metrics (MSE, agreement rates) are computed on the rows whose
    index labels exist in both frames, so the two inputs may have had
    different rows removed during preparation. Distribution metrics (JSD) use
    every row of each frame.

    Metrics per column type:
    - numeric columns: ``MSE`` between original and modified values.
    - categorical columns: ``Agreement Rate`` (share of shared rows with an
      identical value) and ``JSD`` between the two category distributions.
    - ``checking_account`` (mixed): ``MSE (numeric)`` over rows where both
      values are floats, ``Agreement Rate (categorical)`` over rows where at
      least one side is 'no checking account', and ``JSD (categorical)``
      between the [no-account, has-account] shares of the two frames.

    All JSD values come from ``scipy.spatial.distance.jensenshannon``, i.e.
    the Jensen-Shannon distance (square root of the divergence).

    Args:
        prepared_df: Original dataset after preparation.
        prepared_df_modified: Modified dataset after preparation.

    Returns:
        DataFrame with one row per compared column. A metric that cannot be
        computed (no shared rows, empty column) is reported as ``None``.
    """
    # Define columns by type
    numeric_columns = [
        "age", "credit_amount", "duration", "monthly_rent_or_mortgage",
        "installment_rate", "residence_since", "existing_credits",
        "liable_people", "default", "employment_since"
    ]

    categorical_columns = [
        "sex", "marital_status", "job", "credit_hist", "purpose", "debtors",
        "property", "installment_other", "housing", "tel", "online_banking",
        "foreign"
    ]

    no_account = "no checking account"
    results = []

    # 1. Purely Numeric Columns
    for col in numeric_columns:
        # Pair rows by index label and ignore pairs with a missing value.
        original_values, modified_values = prepared_df[col].align(
            prepared_df_modified[col], join="inner"
        )
        valid_mask = original_values.notna() & modified_values.notna()

        results.append({
            "column": col,
            "type": "numeric",
            "MSE": _paired_mse(original_values[valid_mask], modified_values[valid_mask])
        })

    # 2. Purely Categorical Columns
    for col in categorical_columns:
        original_values = prepared_df[col].astype(str)
        modified_values = prepared_df_modified[col].astype(str)

        # Agreement Rate: `==` refuses Series with different labels, so
        # compare only the rows present in both frames.
        paired_orig, paired_mod = original_values.align(modified_values, join="inner")
        match_mask = paired_orig == paired_mod
        agreement_rate = match_mask.sum() / len(match_mask) if len(match_mask) else None

        results.append({
            "column": col,
            "type": "categorical",
            "Agreement Rate": agreement_rate,
            "JSD": _distribution_distance(original_values, modified_values)
        })

    # 3. Mixed-Type: checking_account
    checking_orig = prepared_df["checking_account"]
    checking_mod = prepared_df_modified["checking_account"]
    paired_orig, paired_mod = checking_orig.align(checking_mod, join="inner")

    # Numeric part: rows where both sides hold a real number.
    # astype(bool) keeps the mask boolean even when there are no rows.
    numeric_mask = (
        paired_orig.apply(_is_float_value).astype(bool)
        & paired_mod.apply(_is_float_value).astype(bool)
    )
    mse_checking = _paired_mse(paired_orig[numeric_mask], paired_mod[numeric_mask])

    # Categorical part ('no checking account')
    cat_mask = paired_orig == no_account
    mod_cat_mask = paired_mod == no_account
    agreement_count = (cat_mask & mod_cat_mask).sum()
    total_cat = (cat_mask | mod_cat_mask).sum()
    cat_agreement_rate = agreement_count / total_cat if total_cat > 0 else None

    # Distributional comparison for 'no checking account' frequency.
    # jensenshannon() normalises its inputs, so a one-element vector always
    # becomes [1.0] and the distance is always 0. Pass the full two-bin
    # distribution [no account, has account] instead.
    if len(checking_orig) > 0 and len(checking_mod) > 0:
        p_share = (checking_orig == no_account).sum() / len(checking_orig)
        q_share = (checking_mod == no_account).sum() / len(checking_mod)
        jsd_checking = jensenshannon([p_share, 1 - p_share], [q_share, 1 - q_share])
    else:
        jsd_checking = None

    results.append({
        "column": "checking_account",
        "type": "mixed",
        "MSE (numeric)": mse_checking,
        "Agreement Rate (categorical)": cat_agreement_rate,
        "JSD (categorical)": jsd_checking
    })

    return pd.DataFrame(results)


def _paired_mse(original_values, modified_values):
    """Return the MSE of two equally indexed Series, or None when there are no rows."""
    if len(original_values) == 0:
        return None
    return mean_squared_error(original_values.astype(float), modified_values.astype(float))


def _distribution_distance(original_values, modified_values):
    """Return the Jensen-Shannon distance between the category frequencies of two Series."""
    if len(original_values) == 0 or len(modified_values) == 0:
        return None
    dist_orig = original_values.value_counts(normalize=True)
    dist_mod = modified_values.value_counts(normalize=True)

    # Put both distributions on the same, sorted category axis.
    all_categories = sorted(set(dist_orig.index).union(dist_mod.index))
    p = np.array([dist_orig.get(cat, 0) for cat in all_categories])
    q = np.array([dist_mod.get(cat, 0) for cat in all_categories])
    return jensenshannon(p, q)


def _is_float_value(val):
    """Return True for float values other than NaN."""
    return isinstance(val, float) and not np.isnan(val)

## Comparing the Datasets!

We must first create a shuffled version of the dataset by running: 
```
!python.exe .\dataset-shuffler.py Financial_Records.csv Shuffled_Financial_Records.csv "Marko123!@#"
```
Next, we execute:
```
!python dataset-attack.py
```
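This creates `Attacked_Financial_Records.csv`, which is needed to run the comparison below. The comparison cell reads it from `extra_and_old/Attacked_Financial_Records.csv`, so make sure the file is located there (or adjust the path in the cell).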

In [15]:
# Load the shuffled baseline and its attacked counterpart.
# NOTE(review): the instructions above name Attacked_Financial_Records.csv without
# the extra_and_old/ prefix — confirm where dataset-attack.py writes its output.
df, df_modified = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Shuffled_Financial_Records.csv',
        'extra_and_old/Attacked_Financial_Records.csv',
    )
)

# Clean both frames with the same routine, then score every column.
prepared_df, prepared_df_modified = map(prepare_dataset_for_fidelity, (df, df_modified))
results_df = compare_fidelity(prepared_df, prepared_df_modified)

In [16]:
# compare_fidelity returns one row per compared column
# (10 numeric + 12 categorical + 1 mixed = 23 rows). head(22) cut off the
# final checking_account row, so display the whole summary instead.
results_df

Unnamed: 0,column,type,MSE,Agreement Rate,JSD,MSE (numeric),Agreement Rate (categorical),JSD (categorical)
0,age,numeric,35.217943,,,,,
1,credit_amount,numeric,421119.294171,,,,,
2,duration,numeric,17.212443,,,,,
3,monthly_rent_or_mortgage,numeric,3227.077114,,,,,
4,installment_rate,numeric,0.304414,,,,,
5,residence_since,numeric,0.291686,,,,,
6,existing_credits,numeric,0.047786,,,,,
7,liable_people,numeric,0.017514,,,,,
8,default,numeric,0.200343,,,,,
9,employment_since,numeric,1.154229,,,,,
