In [1]:
%load_ext autoreload
%autoreload 2

import gfkmc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
import random
from tree.gentree import read_tree
import math
import timeit
from typing import List
import itertools
from record_linkage import record_linkage
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment configuration: pick the dataset variant, load the already
# anonymized frame (`df`) and the original frame (`df_orig`), and declare which
# columns are treated as categorical, numerical and sensitive.
dataset = 'adult'
results_dir = f'results/{dataset}/gfkmc'

if dataset == 'adult':
    # Only the first 1000 rows of both files are used in this variant.
    df_orig = pd.read_csv('./datasets/adult_orig.csv').iloc[:1000]
    # Select the anonymized columns in the same order as df_orig's columns.
    df = pd.read_csv('./das/adult_num_anon.csv')[df_orig.columns].iloc[:1000]
    categoricals = ['sex', 'race', 'workclass', 'marital-status', 'occupation', 'native-country', 'education']
    numericals = ['age']
    sensitives = ['salary-class']

elif dataset == 'adult_cat':
    # Categorical-only variant: the 'age' column is dropped from both frames.
    # NOTE(review): unlike 'adult', no row truncation is applied here, so the
    # full files are loaded -- confirm this is intentional.
    df_orig = pd.read_csv('./datasets/adult_orig.csv').drop('age', axis=1)
    df = pd.read_csv('./das/adult_num_anon.csv')[df_orig.columns]
    categoricals = ['sex', 'race', 'workclass', 'marital-status', 'occupation', 'native-country', 'education']
    numericals = []
    sensitives = ['salary-class']

else:
    # Fail fast: without this branch an unknown name only surfaces later as a
    # NameError on `df`, which hides the real cause.
    raise ValueError(
        f"Unsupported dataset {dataset!r}; expected 'adult' or 'adult_cat'"
    )

# Create the output directory only once the dataset name has been validated and
# the inputs loaded, so a typo in `dataset` does not leave a stray directory.
os.makedirs(results_dir, exist_ok=True)

# sensitives = ['salary-class', 'race', 'sex']

# `k` is passed to the clustering phases below and used in the result file name
# (presumably the minimum cluster size -- confirm against gfkmc).
k = 60
df.shape

(1000, 9)

In [3]:
# Inspect the frame loaded from './das/adult_num_anon.csv' (restricted to the
# first 1000 rows when dataset == 'adult').
df

Unnamed: 0,sex,age,race,marital-status,education,native-country,workclass,occupation,salary-class
0,Male,40,White,Never-married,Bachelors,United-States,State-gov,Adm-clerical,<=50K
1,Male,51,White,Married-civ-spouse,Bachelors,United-States,Self-emp-not-inc,Exec-managerial,<=50K
2,Male,40,White,Divorced,HS-grad,United-States,Private,Handlers-cleaners,<=50K
3,Male,51,Black,Married-civ-spouse,11th,United-States,Private,Handlers-cleaners,<=50K
4,Female,26,Black,Married-civ-spouse,Bachelors,Cuba,Private,Prof-specialty,<=50K
...,...,...,...,...,...,...,...,...,...
995,Female,45,White,Separated,Bachelors,United-States,Private,Prof-specialty,<=50K
996,Female,20,White,Divorced,HS-grad,United-States,Local-gov,Exec-managerial,<=50K
997,Male,63,White,Married-civ-spouse,HS-grad,United-States,Self-emp-not-inc,Farming-fishing,<=50K
998,Male,40,White,Married-civ-spouse,Some-college,United-States,Local-gov,Protective-serv,>50K


In [4]:
# Inspect the frame loaded from './datasets/adult_orig.csv' for a side-by-side
# comparison with `df` shown in the previous cell.
df_orig

Unnamed: 0,sex,age,race,marital-status,education,native-country,workclass,occupation,salary-class
0,Male,39,White,Never-married,Bachelors,United-States,State-gov,Adm-clerical,<=50K
1,Male,50,White,Married-civ-spouse,Bachelors,United-States,Self-emp-not-inc,Exec-managerial,<=50K
2,Male,38,White,Divorced,HS-grad,United-States,Private,Handlers-cleaners,<=50K
3,Male,53,Black,Married-civ-spouse,11th,United-States,Private,Handlers-cleaners,<=50K
4,Female,28,Black,Married-civ-spouse,Bachelors,Cuba,Private,Prof-specialty,<=50K
...,...,...,...,...,...,...,...,...,...
995,Female,43,White,Separated,Bachelors,United-States,Private,Prof-specialty,<=50K
996,Female,19,White,Divorced,HS-grad,United-States,Local-gov,Exec-managerial,<=50K
997,Male,58,White,Married-civ-spouse,HS-grad,United-States,Self-emp-not-inc,Farming-fishing,<=50K
998,Male,41,White,Married-civ-spouse,Some-college,United-States,Local-gov,Protective-serv,>50K


In [5]:
# Column labels of the categorical and sensitive attributes, taken from `df`
# itself so that a missing column raises a KeyError here rather than later.
att_names = df[[*categoricals, *sensitives]].columns
# Read one tree per listed attribute from ./tree/adult/
# (presumably the generalization hierarchies -- confirm against tree.gentree).
att_tree = read_tree('./tree/adult/', att_names)

In [6]:
# Build the GFKMC working table from both frames, the column role lists and the
# attribute trees; all clustering phases below operate on this object.
table = gfkmc.GFKMCTable(
    df,
    df_orig,
    numericals,
    categoricals,
    sensitives,
    att_tree,
)

In [7]:
# Initial clustering phase, timed with timeit's default timer.
# `remaining_groups` is consumed by the weighting phase in the next cell.
phase_started = timeit.default_timer()
remaining_groups = table.initial_clustering_phase(k)
execution_time = timeit.default_timer() - phase_started
print(f"initial_clustering_phase execution time: {execution_time}")


initial_clustering_phase execution time: 0.010947572000077344


In [8]:

# Weighting phase. The timer starts before `beta` is derived, so that (cheap)
# computation is included in the reported time.
phase_started = timeit.default_timer()
# beta is 5% of the number of remaining groups, truncated toward zero by int().
# NOTE(review): this yields 0 for fewer than 20 groups -- confirm that
# weighting_phase handles beta == 0.
beta = int(0.05 * len(remaining_groups))
table.weighting_phase(beta, remaining_groups)
execution_time = timeit.default_timer() - phase_started
print(f"weighting_phase execution time: {execution_time}")


weighting_phase execution time: 0.2481860459993186


In [9]:
# Grouping phase, timed with timeit's default timer. The "centroid group=..."
# lines in the output are not printed by this cell, so they presumably come
# from inside grouping_phase().
phase_started = timeit.default_timer()
table.grouping_phase(k)
execution_time = timeit.default_timer() - phase_started
print(f"grouping_phase execution time: {execution_time}")


centroid group=[732]
centroid group=[820]
centroid group=[928]
centroid group=[785]
centroid group=[208]
centroid group=[892]
centroid group=[630]
centroid group=[469]
centroid group=[417, 945]
centroid group=[99, 596]
centroid group=[601]
centroid group=[235]
centroid group=[479]
centroid group=[502]
centroid group=[510]
centroid group=[715]
grouping_phase execution time: 13.996983540000656


In [10]:
# Adjustment phase, timed with timeit's default timer.
phase_started = timeit.default_timer()
table.adjustment_phase()
execution_time = timeit.default_timer() - phase_started
print(f"adjustment_phase execution time: {execution_time}")


adjustment_phase execution time: 2.2067533820008975


In [11]:
# Generalization strategy handed to table.cluster_generalization().
# Alternative value kept for reference:
# gen_method = 'most_common_values'
gen_method = 'cluster_centroid'

# Produce the anonymized frame from the table using the selected strategy.
df_anon = table.cluster_generalization(gen_method)

# Disabled debugging aid: tag each row with its cluster index and print the
# size of every cluster.
# df_anon['cluster'] = -1
# for i, cluster in enumerate(table.clusters):
#     df_anon.loc[cluster.r_indices, 'cluster'] = i
#     print(f"cluster={i}, len={cluster.size}")

# Score the anonymized frame with table.ncp() and show the value
# (presumably an information-loss metric -- confirm against gfkmc).
ncp_value = table.ncp(df_anon)
print(ncp_value)

# Disabled export of the anonymized frame to the results directory.
# df_anon.to_csv(f'./{results_dir}/adult_anon_k{k}_test.csv', index=False)



# Disabled export of the metrics.
# NOTE(review): `matches` is only assigned by the record_linkage() call in the
# next cell, so this block would raise NameError on a fresh kernel if it were
# re-enabled here; move it after that call before uncommenting.
# with open(f'./{results_dir}/adult_anon_k{k}.txt', 'w') as f:
#     f.write(f'{ncp_value}\n{matches}')


0.25345503421674387


In [12]:
# Run record linkage between the original and the anonymized frame and print
# the value it returns.
# NOTE(review): the per-record "dn/dc execution time" and "i of N" lines in the
# output are not printed by this cell, so they presumably come from inside
# record_linkage(); they flood the notebook output and are worth silencing.
matches = record_linkage(df_orig, df_anon, att_tree, numericals, categoricals)
print(matches)

dn execution time: 0.000789631998486584s
dc execution time: 0.030196547999366885s
0 of 1000
dn execution time: 0.0006025249986123526s
dc execution time: 0.028225051999470452s
1 of 1000
dn execution time: 0.0005724929997086292s
dc execution time: 0.029628315000081784s
2 of 1000
dn execution time: 0.0005678140005329624s
dc execution time: 0.028331072999208118s
3 of 1000
dn execution time: 0.000558105999516556s
dc execution time: 0.02803543099980743s
4 of 1000
dn execution time: 0.0005407160006143386s
dc execution time: 0.028518947001430206s
5 of 1000
dn execution time: 0.0005581060013355454s
dc execution time: 0.028831837998950505s
6 of 1000
dn execution time: 0.0005605509995803004s
dc execution time: 0.02795232000062242s
7 of 1000
dn execution time: 0.0005539850008062785s
dc execution time: 0.027749568998842733s
8 of 1000
dn execution time: 0.0006719489992974559s
dc execution time: 0.02857866100021056s
9 of 1000
dn execution time: 0.0005870209988643182s
dc execution time: 0.027946173999

In [13]:
# for i, cluster in enumerate(table.clusters):
#     intra_diss = table.intra_diss(cluster)
#     print(f'table.intra_diss(table.clusters[{i}]) = {intra_diss}')

In [14]:
# for pair in itertools.combinations(range(len(table.clusters)), 2):
#     inter_diss = table.inter_diss(table.clusters[pair[0]], table.clusters[pair[1]])
#     print(f'table.inter_diss(table.clusters[{pair[0]}], table.clusters[{pair[1]}]) = {inter_diss}')