In [3]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import glob
from scipy.stats import pearsonr
import seaborn as sns

In [4]:
def convert_data2string(df):
    vals = df.values.astype(str)
    vals = np.array([' '.join(x) for x in vals])
    # should be fine now, but this checks that nothing is getting truncated due to type casting
    test = np.array([[y.isdigit() for y in x.split(' ')] for x in vals])
    assert (test.sum(axis=-1) == df.shape[-1]).all()
    return vals

In [6]:
# Tracts random
files_orig_rand = glob.glob('../results/init_random/tracts/*original*.t7')
files_rec_rand = glob.glob('../results/init_random/tracts/*reconstructed*.t7')
files_orig_rand = sorted(files_orig_rand)
files_rec_rand = sorted(files_rec_rand)
print(len(files_orig_rand), len(files_rec_rand))

50 50


In [12]:
# Invistigate confidential attribute distribution
all_orig_data = pd.concat([torch.load(files_orig_rand[i]) for i in range(len(files_orig_rand))])

data = all_orig_data['CENRACE'].values
label_frequency = [] 
for l in range(data.min(), data.max() + 1):
    c = len(np.where(data == l)[0])
    label_frequency.append(c)

label_frequency = np.array(label_frequency)
q1, q3 = np.quantile(label_frequency, q = [0.25, 0.75])
iqr = q3-q1
drop_races = np.where(label_frequency > q3 +15*iqr)[0]
print('Races to drop:\n', drop_races)

Races to drop:
 [0 1 2 3 5]


In [17]:
# Attribute disclosure experiments
orig_prototype_confidential_attr_same = []
orig_prototype_confidential_attr_diff = []
rec_prototype_confidential_attr_match = []
rec_prototype_confidential_attr_no_match = []
cnfd_attr_in_origi = []

for i in range(len(files_orig_rand)):
    orig_data = torch.load(files_orig_rand[i])
    filtered_orig_data = orig_data[~orig_data['CENRACE'].isin(drop_races)]
    rec_data =  pd.concat([d for d in torch.load(files_rec_rand[i])])

    # Remove 'CENRACE' column from original and reconstructed data
    orig_data_public_attr = filtered_orig_data.drop('CENRACE', axis=1)
    rec_data_public_attr = rec_data.drop('CENRACE', axis=1)

    # Convert DataFrames to strings
    orig_data_public_attr_str = convert_data2string(orig_data_public_attr)
    rec_data_public_attr_str = convert_data2string(rec_data_public_attr)

    # Create prototype sets and initialize dictionary
    prototype_orig_data_public_attr = sorted(set(orig_data_public_attr_str))

    # Collect confidential attribute data
    for prototype in prototype_orig_data_public_attr:
        idxs1 = np.where(orig_data_public_attr_str == prototype)[0]
        idxs2 = np.where(rec_data_public_attr_str == prototype)[0]
        cnfd1 = filtered_orig_data.iloc[idxs1]['CENRACE'].values
        cnfd2 = rec_data.iloc[idxs2]['CENRACE'].values
        if np.all(cnfd1 == cnfd1[0]):
            orig_prototype_confidential_attr_same.append(len(cnfd1))
            if np.all(cnfd2 == cnfd1[0]):
                rec_prototype_confidential_attr_match.append(cnfd2)
            else:
                cnfd_attr_in_origi.append(cnfd1[0])
                rec_prototype_confidential_attr_no_match.append(cnfd2)
        else:
            orig_prototype_confidential_attr_diff.append(len(cnfd1))

orig_prototype_confidential_attr_same = np.array(orig_prototype_confidential_attr_same)
orig_prototype_confidential_attr_diff = np.array(orig_prototype_confidential_attr_diff)
cnfd_attr_in_origi = np.array(cnfd_attr_in_origi)

In [18]:
# Number of prototypes in D with and without identical confidential attribute 
print('# Prototypes in D:', len(orig_prototype_confidential_attr_same) + len(orig_prototype_confidential_attr_diff))
print('# Prototypes with the same Race in D:', len(orig_prototype_confidential_attr_same))

# Prototypes in D: 1885
# Prototypes with the same Race in D: 1872


In [None]:
# # Plot a histogram with their distribution by their multiplicity. 
idxs = (orig_prototype_confidential_attr_same > 0)
ax = sns.histplot(orig_prototype_confidential_attr_same[idxs], kde = False, binwidth=2)
ax.set(xlabel='Multiplicity of prototype with the same Race', ylabel='Frequency')
# plt.savefig('figures/tract_rand_orig_prototype_same_multiplicity_hist_filter_0.png', dpi = 300)
plt.show()

# Plot the histogram also without singletons (prototypes with multiplicity=1). 
idxs = (orig_prototype_confidential_attr_same > 15)
ax = sns.histplot(orig_prototype_confidential_attr_same[idxs], kde = False, binwidth=2)
ax.set(xlabel='Multiplicity of prototype with the same Race', ylabel='Frequency')
# plt.savefig('figures/tract_rand_orig_prototype_same_multiplicity_hist_filtered_15.png', dpi = 300)
plt.show()
# # Also, which is the maximum multiplicity of these prototypes?
# print('Maximum multiplicity of these prototypes:', orig_prototype_confidential_attr_same.max())

In [15]:
# How many of them appear in D' by matching confidential attributes
print('Appear in D\' with matching confidential attributes:', len(rec_prototype_confidential_attr_match))
print('Appear in D\' with non-matching confidential attributes:', len(rec_prototype_confidential_attr_no_match))

Appear in D' with matching confidential attributes: 108
Appear in D' with non-matching confidential attributes: 1764


In [None]:
# Histogram of matching prototypes in D'
data = np.array([len(d) for d in rec_prototype_confidential_attr_match])
for filtr in [0, 1, 2]:
    idxs = (data > filtr)
    ax = sns.histplot(data=data[idxs], label='Matching', kde = False, binwidth=1, color = 'red')
    ax.set(xlabel='Multiplicity of prototype in D\'', ylabel='Frequency')
#     plt.savefig('figures/tract_rand_rec_prototype_matching_multiplicity_hist_filter_{}.png'.format(filtr), dpi = 300)
    plt.show()

In [None]:
# Histogram of non-matching prototypes in D'
data = np.array([len(d) for d in rec_prototype_confidential_attr_no_match])
for filtr in [0, 20, 50, 200]:
    idxs = (data > filtr)
    ax = sns.histplot(data=data[idxs], label='Non-matching', kde = False, binwidth=20, color = 'green')
    ax.set(xlabel='Multiplicity of prototype in D\'', ylabel='Frequency')
#     plt.savefig('figures/tract_rand_rec_prototype_non_matching_multiplicity_hist_filter_{}.png'.format(filtr), dpi = 300)
    plt.show()

In [19]:
# Divergence in confidential attribute of non-matching records
avg_diff = []
for cnfd, cnfds in zip(cnfd_attr_in_origi, rec_prototype_confidential_attr_no_match):
    a = (cnfds!=cnfd).mean()
    avg_diff.append(a)

print('Average difference in confidential attribute for non-matching records:{:0.2f}%'.format(np.mean(avg_diff)*100))

Average difference in confidential attribute for non-matching records:77.33%


In [20]:
# Tracts baseline
files_orig_base = glob.glob('../results/init_baseline/tracts/*original*.t7')
files_rec_base = glob.glob('../results/init_baseline/tracts/*reconstructed*.t7')
files_orig_base = sorted(files_orig_base)
files_rec_base = sorted(files_rec_base)
print(len(files_orig_base), len(files_rec_base))

50 50


In [21]:
# Attribute disclosure experiments
orig_prototype_confidential_attr_same = []
orig_prototype_confidential_attr_diff = []
rec_prototype_confidential_attr_match = []
rec_prototype_confidential_attr_no_match = []
cnfd_attr_in_origi = []

for i in range(len(files_orig_base)):
    orig_data = torch.load(files_orig_base[i])
    filtered_orig_data = orig_data[~orig_data['CENRACE'].isin(drop_races)]
    rec_data =  pd.concat([d for d in torch.load(files_rec_base[i])])

    # Remove 'CENRACE' column from original and reconstructed data
    orig_data_public_attr = filtered_orig_data.drop('CENRACE', axis=1)
    rec_data_public_attr = rec_data.drop('CENRACE', axis=1)

    # Convert DataFrames to strings
    orig_data_public_attr_str = convert_data2string(orig_data_public_attr)
    rec_data_public_attr_str = convert_data2string(rec_data_public_attr)

    # Create prototype sets and initialize dictionary
    prototype_orig_data_public_attr = sorted(set(orig_data_public_attr_str))

    # Collect confidential attribute data
    for prototype in prototype_orig_data_public_attr:
        idxs1 = np.where(orig_data_public_attr_str == prototype)[0]
        idxs2 = np.where(rec_data_public_attr_str == prototype)[0]
        cnfd1 = filtered_orig_data.iloc[idxs1]['CENRACE'].values
        cnfd2 = rec_data.iloc[idxs2]['CENRACE'].values
        if np.all(cnfd1 == cnfd1[0]):
            orig_prototype_confidential_attr_same.append(len(cnfd1))
            if np.all(cnfd2 == cnfd1[0]):
                rec_prototype_confidential_attr_match.append(cnfd2)
            else:
                cnfd_attr_in_origi.append(cnfd1[0])
                rec_prototype_confidential_attr_no_match.append(cnfd2)
        else:
            orig_prototype_confidential_attr_diff.append(len(cnfd1))

orig_prototype_confidential_attr_same = np.array(orig_prototype_confidential_attr_same)
orig_prototype_confidential_attr_diff = np.array(orig_prototype_confidential_attr_diff)
cnfd_attr_in_origi = np.array(cnfd_attr_in_origi)

In [22]:
# Number of prototypes in D with and without identical confidential attribute 
print('# Prototypes in D:', len(orig_prototype_confidential_attr_same) + len(orig_prototype_confidential_attr_diff))
print('# Prototypes with the same Race in D:', len(orig_prototype_confidential_attr_same))

# Prototypes in D: 1885
# Prototypes with the same Race in D: 1872


In [None]:
# # Plot a histogram with their distribution by their multiplicity. 
idxs = (orig_prototype_confidential_attr_same > 0)
ax = sns.histplot(orig_prototype_confidential_attr_same[idxs], kde = False, binwidth=2)
ax.set(xlabel='Multiplicity of prototype with the same Race', ylabel='Frequency')
plt.savefig('figures/tract_base_orig_prototype_same_multiplicity_hist_filter_0.png', dpi = 300)
plt.show()

# Plot the histogram also without singletons (prototypes with multiplicity=1). 
idxs = (orig_prototype_confidential_attr_same > 15)
ax = sns.histplot(orig_prototype_confidential_attr_same[idxs], kde = False, binwidth=2)
ax.set(xlabel='Multiplicity of prototype with the same Race', ylabel='Frequency')
plt.savefig('figures/tract_base_orig_prototype_same_multiplicity_hist_filtered_15.png', dpi = 300)
plt.show()
# # Also, which is the maximum multiplicity of these prototypes?
print('Maximum multiplicity of these prototypes:', orig_prototype_confidential_attr_same.max())

In [23]:
# How many of them appear in D' by matching confidential attributes
print('Appear in D\' with matching confidential attributes:', len(rec_prototype_confidential_attr_match))
print('Appear in D\' with non-matching confidential attributes:', len(rec_prototype_confidential_attr_no_match))

Appear in D' with matching confidential attributes: 221
Appear in D' with non-matching confidential attributes: 1651


In [None]:
# Histogram of matching prototypes in D'
data = np.array([len(d) for d in rec_prototype_confidential_attr_match])
for filtr in [0, 20, 100, 200]:
    idxs = (data > filtr)
    ax = sns.histplot(data=data[idxs], label='Matching', kde = False, binwidth=20, color = 'red')
    ax.set(xlabel='Multiplicity of prototype in D\'', ylabel='Frequency')
    plt.savefig('figures/tract_base_rec_prototype_matching_multiplicity_hist_filter_{}.png'.format(filtr), dpi = 300)
    plt.show()

In [None]:
# Histogram of non-matching prototypes in D'
data = np.array([len(d) for d in rec_prototype_confidential_attr_no_match])
for filtr in [0, 200, 500, 1000]:
    idxs = (data > filtr)
    ax = sns.histplot(data=data[idxs], label='Non-matching', kde = False, binwidth=150, color = 'green')
    ax.set(xlabel='Multiplicity of prototype in D\'', ylabel='Frequency')
    plt.savefig('figures/tract_base_rec_prototype_non_matching_multiplicity_hist_filter_{}.png'.format(filtr), dpi = 300)
    plt.show()

In [24]:
# Divergence in confidential attribute of non-matching records
avg_diff = []
for cnfd, cnfds in zip(cnfd_attr_in_origi, rec_prototype_confidential_attr_no_match):
    a = (cnfds!=cnfd).mean()
    avg_diff.append(a)

print('Average difference in confidential attribute for non-matching records:{:0.2f}%'.format(np.mean(avg_diff)*100))

Average difference in confidential attribute for non-matching records:27.56%
