In [9]:
# --- CONFIGURATION ---
# Files: the raw CSV that is read and the perturbed copy that is written.
input_file = 'Financial_Records.csv'
output_file = 'Cleaned_Financial_Records_2.csv'

# Number of nearest neighbours each record may borrow categorical values from.
k_neighbors = 5

# Columns treated as categories (candidates for neighbour swapping and
# rare-value replacement). List order is significant: it is the order in
# which random draws are consumed per row.
categorical_cols = [
    'sex',
    'marital_status',
    'job',
    'foreign',
    'credit_hist',
    'purpose',
    'installment_rate',
    'debtors',
    'checking_account',
    'savings',
    'housing',
    'property',
    'online_banking',
    'tel',
    'employment_since',
]

# Columns treated as numbers; these define the space in which neighbours
# are searched.
numerical_cols = [
    'age',
    'credit_amount',
    'duration',
    'existing_credits',
    'liable_people',
    'residence_since',
    'monthly_rent_or_mortgage',
]

# Name of the record-identifier column. Rename if it's different in your dataset.
pid_col = 'PID'

# A category value whose relative frequency is below this share (1%) counts
# as rare.
rare_threshold = 0.01

# When True (and pid_col exists), identifiers are regenerated.
change_pids = True

# Seed for NumPy's global random generator, for reproducible runs.
random_state = 42
# ----------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Seed NumPy's global generator first so every later np.random call in this
# script is reproducible for a given random_state.
np.random.seed(random_state)

# Load the raw records.
df = pd.read_csv(input_file)

# Keep things simple: discard any row that lacks a value in at least one of
# the numerical or categorical columns used below. The frame is modified in
# place, so it keeps its original index labels (with gaps where rows were
# removed).
df.dropna(
    axis=0,
    how='any',
    subset=[*numerical_cols, *categorical_cols],
    inplace=True,
)

# Steps 1-3: perturb categorical values by borrowing them from rows that are
# close in the (standardised) numerical space.
#
# Everything below works on row *positions* (0..n-1). The neighbour search
# returns positions into the array it was fitted on, whereas df may carry
# arbitrary index labels (e.g. gaps left by dropna), so label-based access
# such as df.loc / df.at would address the wrong row or fail outright.
n_rows = len(df)

# A row has at most n_rows - 1 neighbours other than itself; asking for more
# makes the neighbour query raise.
effective_k = min(k_neighbors, n_rows - 1)

if effective_k >= 1:
    # Step 1: Normalize numerical columns so that no single column dominates
    # the distance computation merely because of its scale.
    scaler = StandardScaler()
    numerical_data = scaler.fit_transform(df[numerical_cols])

    # Step 2: Fit KNN model
    knn = NearestNeighbors(n_neighbors=effective_k)
    knn.fit(numerical_data)
    # Calling kneighbors() without a query matrix returns, for every fitted
    # row, its neighbours with the row itself excluded. Slicing off the first
    # hit instead is unreliable: with duplicate numerical rows the first hit
    # may be the duplicate rather than the row itself.
    distances, indices = knn.kneighbors()

    # Step 3: Category Swapping with neighbors
    # Column positions are resolved once; column names are assumed unique.
    categorical_positions = [df.columns.get_loc(col) for col in categorical_cols]
    for row_pos in range(n_rows):
        neighbor_positions = indices[row_pos]
        for col_pos in categorical_positions:
            if np.random.rand() < 0.5:  # 50% chance to swap
                donor_pos = np.random.choice(neighbor_positions)
                # Values are read from the frame as it currently stands, so a
                # donor that was already perturbed passes on its new value.
                df.iat[row_pos, col_pos] = df.iat[donor_pos, col_pos]
# With fewer than two usable rows (or k_neighbors < 1) there is nobody to
# swap with, so the categorical columns are left as they are.

# Step 4: Replace rare values using frequent values — only for safe fields
# The three account/employment fields below are deliberately left untouched.
excluded_from_replacement = ('checking_account', 'savings', 'employment_since')
safe_for_replacement = [
    name for name in categorical_cols if name not in excluded_from_replacement
]

for column in safe_for_replacement:
    # Relative frequency of every value currently present in the column.
    shares = df[column].value_counts(normalize=True)
    below_threshold = shares < rare_threshold
    rare_pool = shares[below_threshold].index
    common_pool = shares[~below_threshold].index.tolist()

    # A rare value is swapped for a uniformly drawn common one; if the column
    # has no common value at all, everything is kept as is.
    df[column] = df[column].apply(
        lambda value: np.random.choice(common_pool)
        if common_pool and value in rare_pool
        else value
    )

# Step 5: Change PIDs (optional)
# Identifiers become 'PID' followed by the zero-padded (6 digits) row
# position, in the current row order.
if pid_col in df.columns and change_pids:
    df[pid_col] = [f"PID{seq:06d}" for seq in range(len(df))]

# Save output (without the DataFrame index) and report where it went.
df.to_csv(output_file, index=False)
print(f"Fingerprint-breaking dataset written to: {output_file}")

Fingerprint-breaking dataset written to: Cleaned_Financial_Records_2.csv
