# 3 Synthetic Data Privacy

The resample exposure risk index was developed to measure the risk of re-identification of individuals in a dataset. Variables are weighted in the similarity calculation by the risk of rerolling the query variable into the target. Frequent variables are more vulnerable to re-identification.

In [None]:
import pandas as pd

from pandas import DataFrame
from KNN_adapters import KNNAdapter

def distance_to_closest_record(
    queries: DataFrame, targets: DataFrame, adapter: "type[KNNAdapter]"
) -> DataFrame:
    """
    Summarise the distance to the closest record (DCR) between two datasets.

    A nearest-neighbour model is built from ``adapter``, fitted on ``targets``,
    and used to look up, for every row of ``queries``, the distance to its
    closest row in ``targets``. The per-query distances are then reduced to a
    set of summary statistics.

    Arguments:
        queries (DataFrame): DataFrame containing query records.
        targets (DataFrame): DataFrame containing target records.
        adapter (type[KNNAdapter]): The adapter *class* (or any zero-argument
            factory), not an instance. It is called with no arguments here and
            the result must provide ``fit_nn(targets)`` and
            ``get_neighbors(queries, n_neighbors=...)``.
    Returns:
        DataFrame: A single-row frame indexed by ``'DCR'`` with the columns
        ``mean``, ``std``, ``min``, ``25%``, ``median``, ``75%`` and ``max``
        describing the distribution of closest-record distances.
    """
    knn_model = adapter()
    # Fit the neighbour index on the targets before querying it.
    knn_model.fit_nn(targets)

    if queries.equals(targets):
        # Every query is also present in the targets, so ask for two
        # neighbours and keep the second column.
        # Assumes get_neighbors returns distances sorted ascending per row, so
        # column 0 is the record's match with itself - TODO confirm per adapter.
        neighbour_dists = knn_model.get_neighbors(queries, n_neighbors=2)[0]
        closest = neighbour_dists[:, 1]
    else:
        # Distinct datasets: the single nearest neighbour is the closest record.
        closest = knn_model.get_neighbors(queries, n_neighbors=1)[0]

    # Going through a one-column frame accepts both (n,) and (n, 1) inputs.
    dcr = pd.DataFrame(closest, columns=['DCR'])['DCR']

    # Compute each statistic under an explicit label instead of aggregating
    # with anonymous lambdas and renaming the result positionally.
    summary = {
        'mean': dcr.mean(),
        'std': dcr.std(),
        'min': dcr.min(),
        '25%': dcr.quantile(0.25),
        'median': dcr.median(),
        '75%': dcr.quantile(0.75),
        'max': dcr.max(),
    }
    return pd.DataFrame(summary, index=['DCR'])

In [15]:
# Load the UCI dataset with id 2 (the saved output reports it as "Adult") and
# keep the first element returned by preprocess_data, using 'class' as the
# column argument.
# NOTE(review): presumably 'class' is the target column and element [0] is the
# processed feature frame - confirm against prepare_data.
from prepare_data import uci_dataset_id_import, preprocess_data
df = uci_dataset_id_import(2, silent_import=False)
df = preprocess_data(df, 'class')[0]

# Small sample of 100 rows (seed 42), re-indexed from 0.
df_1 = df.sample(100, random_state=42).reset_index(drop=True)

# Larger sample of 1000 rows (seed 43), re-indexed from 0. It is drawn
# independently from the same frame, so it may share rows with df_1.
df_2 = df.sample(1000, random_state=43).reset_index(drop=True)
df_1.head()

Dataset Name: Adult


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,-0.338691,4.0,-0.639643,15.0,-0.030373,2.0,4.0,0.0,4.0,1.0,-0.144804,-0.217127,-0.034087,39.0,0.0
1,-0.192816,7.0,0.315861,15.0,-0.030373,0.0,8.0,4.0,2.0,0.0,-0.144804,-0.217127,-1.64812,39.0,0.0
2,1.484746,6.0,-0.225345,9.0,1.136512,2.0,4.0,0.0,4.0,1.0,-0.144804,-0.217127,-1.244612,39.0,1.0
3,1.484746,4.0,2.494276,15.0,-0.030373,2.0,10.0,0.0,4.0,1.0,-0.144804,-0.217127,-0.356894,39.0,2.0
4,-0.192816,4.0,-1.325788,9.0,1.136512,2.0,10.0,0.0,4.0,1.0,-0.144804,-0.217127,0.77293,39.0,3.0


In [16]:
# DCR summary using the REX_KNN adapter.
from KNN_adapters import REX_KNN
# res21: df_2 rows queried against df_1 as targets.
res21 = distance_to_closest_record(df_2, df_1, REX_KNN)
# res11: df_1 queried against itself, which takes the second-neighbour branch
# of distance_to_closest_record because queries equal targets.
res11 = distance_to_closest_record(df_1, df_1, REX_KNN)

# Element-wise ratio of the two summary tables.
print(res21/res11)

         mean       std  min      25%    median       75%       max
REX  0.849581  0.976807  0.0  0.76591  0.806574  0.896608  1.177645


In [17]:
# DCR summary using the GowerKNN adapter.
from KNN_adapters import GowerKNN
# res21: df_2 rows queried against df_1 as targets.
res21 = distance_to_closest_record(df_2, df_1, GowerKNN)
# res11: df_1 queried against itself, which takes the second-neighbour branch
# of distance_to_closest_record because queries equal targets.
res11 = distance_to_closest_record(df_1, df_1, GowerKNN)
# Element-wise ratio of the two summary tables.
print(res21/res11)

           mean       std  min       25%    median      75%       max
Gower  0.789561  0.859841  0.0  0.736899  0.804781  0.80165  1.038999


In [20]:
# DCR summary using the EuclideanKNN adapter.
from KNN_adapters import EuclideanKNN
# res21: df_2 rows queried against df_1 as targets.
res21 = distance_to_closest_record(df_2, df_1, EuclideanKNN)
# res11: df_1 queried against itself, which takes the second-neighbour branch
# of distance_to_closest_record because queries equal targets.
res11 = distance_to_closest_record(df_1, df_1, EuclideanKNN)
# Element-wise ratio of the two summary tables.
print(res21/res11)

        mean       std  min       25%   median       75%       max
L2  1.059166  1.338597  0.0  0.952425  0.99763  1.028331  2.025563


## 2.1 Data Generation

In [2]:
import uci_dataset as dataset
from synthpop_adapter import rSynthpop

# Note: this rebinds `df`, replacing the frame loaded in the earlier cells.
df = dataset.load_breast_cancer()
# Build the synthpop wrapper around the loaded frame, then call it.
# NOTE(review): the saved output of this cell is
# "PermissionError: [WinError 5] Access is denied: 'synthesis_info_synthpop_temp_0'".
# Presumably the adapter creates or removes a temporary directory of that name
# in the working directory - confirm and fix before relying on this cell.
df_syn = rSynthpop(df)
df_syn()

PermissionError: [WinError 5] Access is denied: 'synthesis_info_synthpop_temp_0'