In [1]:
# Load IPython's autoreload extension and select mode 2, which reloads
# imported modules before each cell executes, so edits to the project's .py
# files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Environment Setup (for Colab)

In [None]:
import os
from datetime import datetime

os.chdir('/content')
current_time = datetime.utcnow().isoformat().replace(':', '_')
os.makedirs(os.path.join(current_time))
os.chdir(f'./{current_time}')
WORKSPACE = 'random_sampling'

!git clone https://github.com/lng-ng/random_sampling.git

os.chdir(f'./{WORKSPACE}')

!pip install -r requirements.txt

## Download required data for the experiment

In [None]:
import gdown
import os

# Local directory the Drive folder is downloaded into. The experiment cell
# reads from "../expr1_data" and the cleanup cell deletes this directory.
data_folder = "expr1_data"
# Google Drive id of the shared folder that is downloaded.
drive_folderid = "1G-7anLLgO9bZbg7fL_dAuxHhqf_VK67Y"
# Pass the target explicitly so the local directory name is fixed by
# data_folder instead of by whatever the Drive folder happens to be called.
gdown.download_folder(id=drive_folderid, output=data_folder)

Retrieving folder contents


Retrieving folder 1lsoqReQJ2e0MiQXI8CCYYsgk2rD_pLld arx
Retrieving folder 1mzW5gaUoBMc9B-rzYfY5MTz7OKbAzXBu 50
Processing file 1rQskp6R01XuWQ2MqbYSp8B6lYKmJuyra diabetes-anonymized.csv
Retrieving folder 17u7scK4YWArLn9mN206gUd57196X6X30 100
Processing file 1g7rlypojiqUgeVAdqQ_apjoAVh2OviUS diabetes-anonymized.csv
Retrieving folder 1x-T0NwLe8edJ69DxXboxoPBCYSh2fV98 150
Processing file 1wtOjEGJ1HS7wGnHBpwxpb_0JEqAxfBQE diabetes-anonymized.csv
Retrieving folder 1bwQ2XgYFSwa_Y1CG-IWPJFcbMI9ibaIh 200
Processing file 1d2pOIrtSFOGmoS4t5NMc5ZvjhhZeB2OF diabetes-anonymized.csv
Retrieving folder 1vUhFAPUKyHRyWBDlGf2X85asIZqC7tZW 250
Processing file 1ii-Ebd_xK3D6MojBxtWAesPr9ky54BWn diabetes-anonymized.csv
Retrieving folder 1HrMYd_yJg9HA0RUmuHa-9HneO9gwj-81 300
Processing file 1FXYg5X8L65hCvw2JvUqprZtVDA0k6qad diabetes-anonymized.csv
Retrieving folder 1n5b1Ucs1eet9xX9X-7X0LwXW4HI5oVUC 350
Processing file 1xfscx7U_kuNu56fpcjWjVlwLcXB33I8x diabetes-anonymized.csv
Retrieving folder 1yvFUKp5wYTDv1wpp

Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1rQskp6R01XuWQ2MqbYSp8B6lYKmJuyra
To: F:\Informatik\tkhiwi\random_sampling\expr1_data\arx\50\diabetes-anonymized.csv
100%|█████████████████████████████████████████████████████████████████████████████| 4.51M/4.51M [00:00<00:00, 9.46MB/s]
Downloading...
From: https://drive.google.com/uc?id=1g7rlypojiqUgeVAdqQ_apjoAVh2OviUS
To: F:\Informatik\tkhiwi\random_sampling\expr1_data\arx\100\diabetes-anonymized.csv
100%|█████████████████████████████████████████████████████████████████████████████| 4.51M/4.51M [00:00<00:00, 9.12MB/s]
Downloading...
From: https://drive.google.com/uc?id=1wtOjEGJ1HS7wGnHBpwxpb_0JEqAxfBQE
To: F:\Informatik\tkhiwi\random_sampling\expr1_data\arx\150\diabetes-anonymized.csv
100%|█████████████████████████████████████████████████████████████████████████████| 4.52M/4.52M [00:00<00:00, 9.79MB/s]
Downloading...
From: https

## Run the experiment

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Switch into the kanonymity sub-directory before importing the project
# modules below (presumably they are resolved relative to the working
# directory — TODO confirm). The data paths used later ("../expr1_data") are
# relative to this directory as well.
# NOTE(review): this chdir is relative, so re-running the cell without a kernel
# restart will try to enter ./kanonymity/kanonymity.
os.chdir("./kanonymity")
from datasets.categorical import DATASET_ATTRIBUTES_DICT
from basic_mondrian.utils.read_adult_data import read_tree

import clustering_based.clustering_based_k_anon as cbka



In [None]:
def init(data_path):
    """Load one anonymized diabetes CSV and initialise ``cbka`` for it.

    The column roles are looked up in ``DATASET_ATTRIBUTES_DICT['diabetes']``:
    element ``[0]`` of each entry selects the quasi-identifier (QI) columns and
    element ``[1]`` is forwarded to ``read_tree`` as the per-QI ``IS_CAT`` flag
    (presumably "is categorical" — confirm against the dataset definition).
    Every column that is not a QI is treated as a sensitive attribute.

    Args:
        data_path: Path of the comma-separated file to read.

    Returns:
        The ``pandas.DataFrame`` read from ``data_path``.

    Raises:
        KeyError: If a CSV column has no entry in the attribute dictionary.

    Side effects:
        Calls ``cbka.init`` with the generalization trees, the frame, the
        sensitive-attribute column indices and the number of QI columns.
    """
    dataset = 'diabetes'
    # Generalization hierarchies path; the empty last component yields a
    # trailing path separator.
    gen_path = os.path.join('generalization', 'hierarchies', dataset, '')
    # No numeric folder is supplied to read_tree.
    numeric_folder = None

    df = pd.read_csv(data_path, delimiter=',')
    att_names = list(df.columns)
    attributes_dict = DATASET_ATTRIBUTES_DICT[dataset]

    qi_index = [i for i, attr in enumerate(att_names) if attributes_dict[attr][0]]
    is_cat = [attributes_dict[att_names[i]][1] for i in qi_index]
    # Set membership keeps the complement computation linear in the column count.
    qi_lookup = set(qi_index)
    sa_index = [i for i in range(len(att_names)) if i not in qi_lookup]

    att_trees = read_tree(gen_path, numeric_folder, dataset, att_names, qi_index, is_cat)
    cbka.init(att_trees, df, sa_index, len(qi_index))
    return df

In [None]:
# Column names of the diabetes dataset, split into the two lists used here.
SA_NAMES = ['hypertension', 'heart_disease', 'smoking_history', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
QID_NAMES = ['gender', 'age', 'bmi']

ks = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
methods = ['arx', 'mondrian', 'oka']

# Collect one record per (algorithm, k) and build the result frame once at the
# end instead of enlarging an empty DataFrame row by row.
records = []
for m in methods:
    for k in ks:
        # The arx files have a fixed name; the other methods embed k in it.
        if m == "arx":
            file_name = "diabetes-anonymized.csv"
        else:
            file_name = f"diabetes_anonymized_{k}_0.csv"
        data_path = os.path.join("../expr1_data", f"{m}", f"{k}", file_name)

        # init() reads the file and (re)initialises cbka for it.
        df = init(data_path)

        # One row per distinct combination of QID values, with its record count.
        grp = df.groupby(QID_NAMES).size().to_frame(name='size').reset_index()
        # Weight the cbka.NCP value of each QID combination by its group size.
        grp['ncp'] = grp.apply(lambda x: cbka.NCP(x[QID_NAMES].tolist()) * x['size'], axis='columns')

        # Average over the QID attributes and over all records; express as a percentage.
        ncp = grp['ncp'].sum() / len(QID_NAMES) / len(df) * 100
        records.append({'Algorithm': m, 'k': k, 'NCP': ncp})

res_df = pd.DataFrame(records, columns=['Algorithm', 'k', 'NCP'])

In [None]:
# Display the collected results: one row per (algorithm, k) pair with its NCP value.
res_df

In [None]:
# Split the results by algorithm, each ordered by increasing k.
per_algorithm = {
    name: res_df.loc[res_df['Algorithm'] == name].sort_values('k')
    for name in ('arx', 'mondrian', 'oka')
}
arx = per_algorithm['arx']
mondrian = per_algorithm['mondrian']
oka = per_algorithm['oka']

# (legend label, x values, y values) for every curve.
data = [
    (label, frame['k'], frame['NCP'])
    for label, frame in (("ARX", arx), ("Mondrian", mondrian), ("OKA", oka))
]

# Line chart of NCP% against k, one line per algorithm.
plt.figure(figsize=(10, 6))
for series_label, xs, ys in data:
    plt.plot(xs, ys, label=series_label)

font_size = 12
plt.xlabel("k values", fontsize=font_size)
plt.ylabel("NCP%", fontsize=font_size)
plt.xticks(fontsize=font_size)
plt.yticks(fontsize=font_size)
plt.legend(fontsize=font_size)
plt.grid(True)
# Fixed y-range of 0 to 80.
plt.ylim(0, 80)
plt.show()

In [None]:
# cleanup data folder
import shutil

# The working directory was changed to ./kanonymity earlier, so the downloaded
# data lives one level up.
data_dir = os.path.join("..", data_folder)
# Guard the removal so re-running this cell after a successful cleanup is a
# no-op instead of raising FileNotFoundError.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)