In [None]:
from utils import *
from datasets import *
from mdav import *
from train import *
from models import *
from attacks import *

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset, Subset


import random
import time
import copy
import pickle
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import scipy
import csv

import os
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously, so a
# device-side error is reported at the call that caused it.
# NOTE(review): this is a debugging aid that also slows GPU work, which presumably
# inflates the runtimes measured later in this notebook — confirm it should stay on.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


%matplotlib inline

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

# Silence the warning categories that would otherwise flood the cell outputs.
# The filters are registered in the listed order: scikit-learn's ConvergenceWarning
# and FitFailedWarning first, then every UserWarning.
for _ignored_category in (ConvergenceWarning, FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [None]:
def seed_everything(seed=7):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus every CUDA device), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators. Defaults to 7.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can select different kernels from run to run, so it
    # has to be switched off for the deterministic flag above to be effective.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=7)

In [None]:
# Load and preprocess data
# This cell reads the Adult train/test CSV files, builds the preprocessing pipeline,
# derives the feature matrices and binary targets, splits the training set into
# retain/forget subsets, wraps everything in DataLoaders and defines the shared
# training hyper-parameters used by the cells below.
columns = ["age", "workClass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex", 
           "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# The separator regex swallows the spaces around each comma; '?' is read as NaN.
# The first line of the test file is skipped (skiprows=1).
train_data = pd.read_csv('data/adult/adult.data', names=columns, sep=r' *, *', engine='python', na_values='?')
test_data = pd.read_csv('data/adult/adult.test', names=columns, sep=r' *, *', skiprows=1, engine='python', na_values='?')

# Numeric branch: select the 'int' columns and standardise them.
num_pipeline = Pipeline(steps=[
    ("num_attr_selector", ColumnsSelector(type='int')),
    ("scaler", StandardScaler())
])

# Categorical branch: select the 'object' columns, impute the three listed columns
# and encode them.
# NOTE(review): CategoricalEncoder receives train_data and test_data *before* the
# in-place drops below, so it holds the very objects those drops mutate. Presumably
# it uses both frames to fix a common category set — confirm against its definition
# before replacing the in-place operations with copies.
cat_pipeline = Pipeline(steps=[
    ("cat_attr_selector", ColumnsSelector(type='object')),
    ("cat_imputer", CategoricalImputer(columns=['workClass','occupation', 'native-country'])),
    ("encoder", CategoricalEncoder(train_data, test_data, dropFirst=True))
])

# Concatenate the outputs of the numeric and categorical branches.
full_pipeline = FeatureUnion([("num_pipe", num_pipeline), ("cat_pipeline", cat_pipeline)])

# Drop useless columns
# (both frames are modified in place; rows containing any NaN are removed as well)
train_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
train_data.dropna(inplace=True)
test_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
test_data.dropna(inplace=True)

# copy the data before preprocessing
train_copy = train_data.copy()
# convert the income column to 0 or 1 and then drop the column for the feature vectors
# ('<=50K' -> 0, any other value -> 1)
train_copy["income"] = train_copy["income"].apply(lambda x:0 if x=='<=50K' else 1)
# creating the feature vector 
X_train = train_copy.drop('income', axis =1)
# target values
y_train = train_copy['income'].values
# pass the data through the full_pipeline
# (fit_transform: the pipeline is fitted on the training features here)
X_train = full_pipeline.fit_transform(X_train)

# take a copy of the test data set
test_copy = test_data.copy()
# convert the income column to 0 or 1
# (the label string compared against here carries a trailing '.', unlike the training one)
test_copy["income"] = test_copy["income"].apply(lambda x:0 if x=='<=50K.' else 1)
# separating the feature vectors and the target values
X_test = test_copy.drop('income', axis =1)
y_test = test_copy['income'].values
# preprocess the test data using the full pipeline
# transform only (no refit): the pipeline fitted on the training features is reused
X_test = full_pipeline.transform(X_test)


# Randomly sample retain and forget sets
# The index array is shuffled in place with Python's `random` module; the first m
# shuffled indices form the forget set and the remainder the retain set.
forget_ratio = 0.05
idxs = np.arange(len(y_train))
random.shuffle(idxs)
m = int(len(y_train)*forget_ratio)
retain_idxs = idxs[m:]
forget_idxs = idxs[:m]
X_retain = X_train[retain_idxs]
y_retain = y_train[retain_idxs]
X_forget = X_train[forget_idxs]
y_forget = y_train[forget_idxs]

# Create TensorDatasets
# (features as float32, labels as int64)
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.int64))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.int64))
retain_dataset = TensorDataset(torch.tensor(X_retain, dtype=torch.float32), torch.tensor(y_retain, dtype=torch.int64))
forget_dataset = TensorDataset(torch.tensor(X_forget, dtype=torch.float32), torch.tensor(y_forget, dtype=torch.int64))

# Create DataLoader instances
# Only the loaders used for training (train, retain) shuffle their batches.
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
retain_loader = DataLoader(retain_dataset, batch_size=batch_size, shuffle=True)
forget_loader = DataLoader(forget_dataset, batch_size=batch_size, shuffle=False)

# Report the class balance of the training labels.
counter = Counter(y_train)
for k,v in counter.items():
    per = v / len(y_train) * 100
    print('Class=%s, Count=%d, Percentage=%.2f%%' % (k, v, per))
    
# Model dimensions are derived from the data: feature count from the last axis of
# X_train, class count from the distinct training labels.
num_features = X_train.shape[-1]
num_classes = len(set(y_train))

# Shared experiment settings. Every run below starts from a deep copy of
# `initial_model`, so all runs share the same initial weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
initial_model = MLPModel(num_features, 128, num_classes)
criterion = nn.CrossEntropyLoss()
lr = 1e-2
n_repeat = 3
max_epochs = 100
# Forwarded to train_model as-is; presumably None disables early stopping — TODO confirm.
patience = None

In [None]:
# Step 2: Define and train M on D
# Baseline: train a fresh copy of `initial_model` on the full training set D
# `n_repeat` times, recording for every repeat the training time, the train/test
# accuracy and the membership-inference (MIA) metrics, then report mean and
# standard deviation and store them in a CSV file.
train_accs = []
test_accs = []
mia_aucs = []
mia_advs = []
runtimes = []
for r in range(n_repeat):
    torch.cuda.empty_cache()
    # Every repeat starts from the same initial weights.
    model = copy.deepcopy(initial_model)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    t0 = time.time()
    model = train_model(model, train_loader, test_loader, criterion, optimizer, 
                        max_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                        patience = patience)

    t1 = time.time()
    rt = t1-t0
    runtimes.append(rt)
    
    # Evaluate the model accuracy, and MIA
    model.eval()
    # Accuracy (stored as percentages)
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    train_accs.append(100.0*train_acc)
    test_accs.append(100.0*test_acc)
    #MIA
    # Draw m random test samples so the attack compares the forget set against an
    # equally sized subset of the test set. `rand_idxs` is redrawn on every repeat;
    # the value left after the last repeat is also read by the retraining and
    # k-anonymity cells further down, so this cell has to run before them.
    idxs = np.arange(len(test_dataset))
    random.shuffle(idxs)
    rand_idxs = idxs[:m]
    logits_test, loss_test, test_labels = compute_attack_components(model, test_loader)
    logits_forget, loss_forget, forget_labels = compute_attack_components(model, forget_loader)
    attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                          forget_labels, test_labels[rand_idxs])
    auc = attack_result.get_result_with_max_auc().get_auc()
    adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
    mia_aucs.append(100.0*auc)
    mia_advs.append(100.0*adv)

# Aggregate over the repeats (np.std is the population standard deviation).
mean_runtime = np.mean(runtimes)
std_runtime = np.std(runtimes)
mean_train_acc = np.mean(train_accs)
std_train_acc = np.std(train_accs)
mean_test_acc = np.mean(test_accs)
std_test_acc = np.std(test_accs)
mean_mia_auc = np.mean(mia_aucs)
std_mia_auc = np.std(mia_aucs)
mean_mia_adv = np.mean(mia_advs)
std_mia_adv = np.std(mia_advs)

# Print the results
print('Training M on D time:{:0.2f}(±{:0.2f}) seconds'.format(mean_runtime, std_runtime))
print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_train_acc, std_train_acc))
print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_test_acc, std_test_acc))
print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_mia_auc, std_mia_auc))
print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_mia_adv, std_mia_adv))

# Save to CSV
csv_file_path = 'results/adult/mlp_m_d_fr={}.csv'.format(forget_ratio)
# Create the output folder first: without it open() raises FileNotFoundError and
# the metrics of the whole (long) training run above would be lost.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
    writer.writerow(['Training Time', mean_runtime, std_runtime])
    writer.writerow(['Train accuracy', mean_train_acc, std_train_acc])
    writer.writerow(['Test accuracy', mean_test_acc, std_test_acc])
    writer.writerow(['MIA AUC', mean_mia_auc, std_mia_auc])
    writer.writerow(['MIA Advantage', mean_mia_adv, std_mia_adv])


In [None]:
# Step 3: Train M_retain on D_retain
# Retraining reference: train a fresh copy of `initial_model` on the retain set only,
# `n_repeat` times, recording the training time, the retain/forget/test accuracy and
# the membership-inference (MIA) metrics, then report mean and standard deviation
# and store them in a CSV file.
retain_accs = []
forget_accs = []
test_accs = []
mia_aucs = []
mia_advs = []
runtimes = []
for r in range(n_repeat):
    torch.cuda.empty_cache()
    # Every repeat starts from the same initial weights.
    model_ret = copy.deepcopy(initial_model)
    optimizer = optim.Adam(model_ret.parameters(), lr=lr)
    t0 = time.time()
    model_ret = train_model(model_ret, retain_loader, test_loader, criterion, optimizer, 
                    max_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                        patience = patience)

    t1 = time.time()
    rt = t1-t0
    runtimes.append(rt)
    
    # Evaluate the model accuracy, and MIA
    model_ret.eval()
    # Accuracy (stored as percentages)
    retain_acc = accuracy(model_ret, retain_loader)
    test_acc = accuracy(model_ret, test_loader)
    forget_acc = accuracy(model_ret, forget_loader)
    retain_accs.append(100.0*retain_acc)
    forget_accs.append(100.0*forget_acc)
    test_accs.append(100.0*test_acc)
    #MIA
    # NOTE: `rand_idxs` is not defined in this cell. It is the test-sample selection
    # left behind by the cell that trains M on D, which therefore must run first.
    logits_test, loss_test, test_labels = compute_attack_components(model_ret, test_loader)
    logits_forget, loss_forget, forget_labels = compute_attack_components(model_ret, forget_loader)
    attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                          forget_labels, test_labels[rand_idxs])
    auc = attack_result.get_result_with_max_auc().get_auc()
    adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
    mia_aucs.append(100.0*auc)
    mia_advs.append(100.0*adv)
    

# Aggregate over the repeats (np.std is the population standard deviation).
mean_retrain_runtime = np.mean(runtimes)
std_retrain_runtime = np.std(runtimes)
mean_retain_acc = np.mean(retain_accs)
std_retain_acc = np.std(retain_accs)
mean_forget_acc = np.mean(forget_accs)
std_forget_acc = np.std(forget_accs)
mean_retrain_test_acc = np.mean(test_accs)
std_retrain_test_acc = np.std(test_accs)
mean_retrain_mia_auc = np.mean(mia_aucs)
std_retrain_mia_auc = np.std(mia_aucs)
mean_retrain_mia_adv = np.mean(mia_advs)
std_retrain_mia_adv = np.std(mia_advs)

# Print the results
print('Retraining M on D_ret time:{:0.2f}(±{:0.2f}) seconds'.format(mean_retrain_runtime, std_retrain_runtime))
print('Retain accuracy:{:0.2f}(±{:0.2f})%'.format(mean_retain_acc, std_retain_acc))
print('Forget accuracy:{:0.2f}(±{:0.2f})%'.format(mean_forget_acc, std_forget_acc))
print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_retrain_test_acc, std_retrain_test_acc))
print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_retrain_mia_auc, std_retrain_mia_auc))
print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_retrain_mia_adv, std_retrain_mia_adv))

# Save to CSV
csv_retrain_file_path = 'results/adult/mlp_mret_dret_fr={}.csv'.format(forget_ratio)
# Create the output folder first: without it open() raises FileNotFoundError and
# the metrics of the whole (long) training run above would be lost.
os.makedirs(os.path.dirname(csv_retrain_file_path), exist_ok=True)

with open(csv_retrain_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
    writer.writerow(['Retraining Time', mean_retrain_runtime, std_retrain_runtime])
    writer.writerow(['Retain accuracy', mean_retain_acc, std_retain_acc])
    writer.writerow(['Forget accuracy', mean_forget_acc, std_forget_acc])
    writer.writerow(['Test accuracy', mean_retrain_test_acc, std_retrain_test_acc])
    writer.writerow(['MIA AUC', mean_retrain_mia_auc, std_retrain_mia_auc])
    writer.writerow(['MIA Advantage', mean_retrain_mia_adv, std_retrain_mia_adv])


# k-anonymity

In [None]:
# Step 1: k-anonymize and prepare D_k
# For every fine-tuning budget and every k this cell
#   1. builds D_k from the training data with mdav,
#   2. trains M_k on D_k from the shared initial weights (n_repeat times),
#   3. fine-tunes a copy of M_k on the full set D for ft_epochs,
#   4. fine-tunes another copy of M_k on the retain set for ft_epochs,
# evaluating accuracy and membership inference (MIA) after each stage, and finally
# prints and stores mean / standard deviation of every metric.
# NOTE: `rand_idxs` is not defined in this cell. It is the test-sample selection
# left behind by the cell that trains M on D, which therefore must run first.
ft_epochs_list = [5, 10, 20]
for ft_epochs in ft_epochs_list:
    K = [3, 5, 10, 20, 80]
    for k in K:
        # Time the anonymisation together with the construction of the D_k loader.
        # This is measured once per (ft_epochs, k), so the list holds a single
        # value and its reported standard deviation is always 0.
        runtimes_k = []
        t0 = time.time()
        # mdav works on deep copies, so X_train / y_train themselves are not handed over.
        centroids, clusters, labels, X_train_k, y_train_k = mdav(copy.deepcopy(X_train), copy.deepcopy(y_train), k)
        print('Shape of X_train_k:{}, y_train_k:{}'.format(X_train_k.shape, y_train_k.shape))
        # Create TensorDatasets
        train_dataset_k = TensorDataset(torch.tensor(X_train_k, dtype=torch.float32), torch.tensor(y_train_k, dtype=torch.int64))
        train_loader_k = DataLoader(train_dataset_k, batch_size=batch_size, shuffle=True)
        t1 = time.time()
        rt_k = t1- t0
        runtimes_k.append(rt_k)

        # Metrics of M_k trained on D_k
        train_accs_k = []
        test_accs_k = []
        mia_aucs_k = []
        mia_advs_k = []
        runtimes_train_k = []

        # Metrics of M_k fine-tuned on D
        train_accs_k_D = []
        test_accs_k_D = []
        mia_aucs_k_D = []
        mia_advs_k_D = []
        runtimes_train_k_D = []

        # Metrics of M_k fine-tuned on D_ret
        retain_accs_k_ret = []
        forget_accs_k_ret = []
        test_accs_k_ret = []
        mia_aucs_k_ret = []
        mia_advs_k_ret = []
        runtimes_train_k_ret = []

        for r in range(n_repeat):
            # ---- Stage 1: train M_k on D_k from the shared initial weights ----
            torch.cuda.empty_cache()
            model_k = copy.deepcopy(initial_model)
            optimizer = optim.Adam(model_k.parameters(), lr=lr)
            t0 = time.time()
            model_k = train_model(model_k, train_loader_k, test_loader, criterion, optimizer, 
                            max_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                            patience = patience)

            t1 = time.time()
            rt_train = t1- t0
            runtimes_train_k.append(rt_train)

            # Evaluate the model accuracy, and MIA
            model_k.eval()
            #Accuracy
            # The "train" accuracy is measured on the original D (train_loader),
            # not on the anonymised D_k the model was trained on.
            train_acc = accuracy(model_k, train_loader)
            test_acc = accuracy(model_k, test_loader)
            train_accs_k.append(100.0*train_acc)
            test_accs_k.append(100.0*test_acc)
            #MIA
            logits_test, loss_test, test_labels = compute_attack_components(model_k, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k.append(100.0*auc)
            mia_advs_k.append(100.0*adv)

            # ---- Stage 2: fine-tune a copy of M_k on the full set D ----
            model_k_D = copy.deepcopy(model_k)
            torch.cuda.empty_cache()
            optimizer = optim.Adam(model_k_D.parameters(), lr=lr)
            t0 = time.time()
            model_k_D = train_model(model_k_D, train_loader, test_loader, criterion, optimizer, 
                                ft_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                                  patience = patience)

            t1 = time.time()
            rt = t1-t0
            runtimes_train_k_D.append(rt)

            # Evaluate the model accuracy, and MIA
            model_k_D.eval()
            #Accuracy
            train_acc = accuracy(model_k_D, train_loader)
            test_acc = accuracy(model_k_D, test_loader)
            train_accs_k_D.append(100.0*train_acc)
            test_accs_k_D.append(100.0*test_acc)
            #MIA
            logits_test, loss_test, test_labels = compute_attack_components(model_k_D, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k_D, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k_D.append(100.0*auc)
            mia_advs_k_D.append(100.0*adv)

            # ---- Stage 3: fine-tune another copy of M_k on the retain set ----
            # Starts from M_k again (not from the model fine-tuned on D).
            model_k_ret = copy.deepcopy(model_k)
            torch.cuda.empty_cache()
            optimizer = optim.Adam(model_k_ret.parameters(), lr=lr)
            t0 = time.time()
            model_k_ret = train_model(model_k_ret, retain_loader, test_loader, criterion, optimizer, 
                                ft_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                                  patience = patience)

            t1 = time.time()
            rt = t1-t0
            runtimes_train_k_ret.append(rt)
            # Evaluate the model accuracy, and MIA
            model_k_ret.eval()
            #Accuracy
            retain_acc = accuracy(model_k_ret, retain_loader)
            forget_acc = accuracy(model_k_ret, forget_loader)
            test_acc = accuracy(model_k_ret, test_loader)
            retain_accs_k_ret.append(100.0*retain_acc)
            forget_accs_k_ret.append(100.0*forget_acc)
            test_accs_k_ret.append(100.0*test_acc)
            #MIA
            logits_test, loss_test, test_labels = compute_attack_components(model_k_ret, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k_ret, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k_ret.append(100.0*auc)
            mia_advs_k_ret.append(100.0*adv)


        # Anonymizing D and training M_k on D_k
        mean_anonymize_time = np.mean(runtimes_k)
        std_anonymize_time = np.std(runtimes_k)
        mean_train_k_time = np.mean(runtimes_train_k)
        std_train_k_time = np.std(runtimes_train_k)
        mean_train_k_acc = np.mean(train_accs_k)
        std_train_k_acc = np.std(train_accs_k)
        mean_test_k_acc = np.mean(test_accs_k)
        std_test_k_acc = np.std(test_accs_k)
        mean_mia_k_auc = np.mean(mia_aucs_k)
        std_mia_k_auc = np.std(mia_aucs_k)
        mean_mia_k_adv = np.mean(mia_advs_k)
        std_mia_k_adv = np.std(mia_advs_k)

        # Finetuning M_k on D
        mean_finetune_D_time = np.mean(runtimes_train_k_D)
        std_finetune_D_time = np.std(runtimes_train_k_D)
        mean_finetune_D_train_acc = np.mean(train_accs_k_D)
        std_finetune_D_train_acc = np.std(train_accs_k_D)
        mean_finetune_D_test_acc = np.mean(test_accs_k_D)
        std_finetune_D_test_acc = np.std(test_accs_k_D)
        mean_finetune_D_mia_auc = np.mean(mia_aucs_k_D)
        std_finetune_D_mia_auc = np.std(mia_aucs_k_D)
        mean_finetune_D_mia_adv = np.mean(mia_advs_k_D)
        std_finetune_D_mia_adv = np.std(mia_advs_k_D)

        # Finetuning M_k on D_ret
        mean_finetune_D_ret_time = np.mean(runtimes_train_k_ret)
        std_finetune_D_ret_time = np.std(runtimes_train_k_ret)
        mean_finetune_D_ret_train_acc = np.mean(retain_accs_k_ret)
        std_finetune_D_ret_train_acc = np.std(retain_accs_k_ret)
        mean_finetune_D_ret_forget_acc = np.mean(forget_accs_k_ret)
        std_finetune_D_ret_forget_acc = np.std(forget_accs_k_ret)
        mean_finetune_D_ret_test_acc = np.mean(test_accs_k_ret)
        std_finetune_D_ret_test_acc = np.std(test_accs_k_ret)
        mean_finetune_D_ret_mia_auc = np.mean(mia_aucs_k_ret)
        std_finetune_D_ret_mia_auc = np.std(mia_aucs_k_ret)
        mean_finetune_D_ret_mia_adv = np.mean(mia_advs_k_ret)
        std_finetune_D_ret_mia_adv = np.std(mia_advs_k_ret)



        # Print the results
        print('----------------------------------------')
        print('k=', k, 'Fine-tuning epochs=', ft_epochs)
        print('----------------------------------------')
        print('-----Anonymizing D and training M_k on D_k-----')
        print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_anonymize_time, std_anonymize_time))
        print('Training M_k on D_k time:{:0.2f}(±{:0.2f})'.format(mean_train_k_time, std_train_k_time))
        print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_train_k_acc, std_train_k_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_test_k_acc, std_test_k_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_mia_k_auc, std_mia_k_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_mia_k_adv, std_mia_k_adv))

        print('-----Finetuning M_k on D-----')
        print('Training M_k on D time:{:0.2f}(±{:0.2f})'.format(mean_finetune_D_time, std_finetune_D_time))
        print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_train_acc, std_finetune_D_train_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_test_acc, std_finetune_D_test_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_mia_auc, std_finetune_D_mia_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_mia_adv, std_finetune_D_mia_adv))

        print('-----Finetuning M_k on D_ret-----')
        print('Finetuning M_k on D_retain time:{:0.2f}(±{:0.2f}) seconds'.format(mean_finetune_D_ret_time, std_finetune_D_ret_time))
        print('Retain accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_train_acc, std_finetune_D_ret_train_acc))
        print('Forget accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_forget_acc, std_finetune_D_ret_forget_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_test_acc, std_finetune_D_ret_test_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_mia_auc, std_finetune_D_ret_mia_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_mia_adv, std_finetune_D_ret_mia_adv))
        print('----------------------------------------')

        # Save to CSV
        # The anonymisation file name contains no ft_epochs, so it is rewritten on
        # every outer-loop iteration and ends up holding the last budget's run.
        csv_anonymize_file_path = 'results/adult/mlp_mk={}_dk_fr={}.csv'.format(k, forget_ratio)
        csv_finetune_D_file_path = 'results/adult/mlp_mk={}_d_fr={}_epochs={}.csv'.format(k, forget_ratio, ft_epochs)
        csv_finetune_D_ret_file_path = 'results/adult/mlp_mk={}_dret_fr={}_epochs={}.csv'.format(k, forget_ratio, ft_epochs)
        # All three files share one folder. Create it first: without it open() raises
        # FileNotFoundError and the metrics of the long runs above would be lost.
        os.makedirs(os.path.dirname(csv_anonymize_file_path), exist_ok=True)

        # Writing to CSV for anonymizing, finetuning on D, and finetuning on D_ret
        with open(csv_anonymize_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
            writer.writerow(['Anonymizing Time', mean_anonymize_time, std_anonymize_time])
            writer.writerow(['Training M_k on D_k Time', mean_train_k_time, std_train_k_time])
            writer.writerow(['Train accuracy', mean_train_k_acc, std_train_k_acc])
            writer.writerow(['Test accuracy', mean_test_k_acc, std_test_k_acc])
            writer.writerow(['MIA AUC', mean_mia_k_auc, std_mia_k_auc])
            writer.writerow(['MIA Advantage', mean_mia_k_adv, std_mia_k_adv])

        with open(csv_finetune_D_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
            writer.writerow(['Training M_k on D Time', mean_finetune_D_time, std_finetune_D_time])
            writer.writerow(['Train accuracy', mean_finetune_D_train_acc, std_finetune_D_train_acc])
            writer.writerow(['Test accuracy', mean_finetune_D_test_acc, std_finetune_D_test_acc])
            writer.writerow(['MIA AUC', mean_finetune_D_mia_auc, std_finetune_D_mia_auc])
            writer.writerow(['MIA Advantage', mean_finetune_D_mia_adv, std_finetune_D_mia_adv])

        with open(csv_finetune_D_ret_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
            writer.writerow(['Finetuning M_k on D_retain Time', mean_finetune_D_ret_time, std_finetune_D_ret_time])
            writer.writerow(['Retain accuracy', mean_finetune_D_ret_train_acc, std_finetune_D_ret_train_acc])
            writer.writerow(['Forget accuracy', mean_finetune_D_ret_forget_acc, std_finetune_D_ret_forget_acc])
            writer.writerow(['Test accuracy', mean_finetune_D_ret_test_acc, std_finetune_D_ret_test_acc])
            writer.writerow(['MIA AUC', mean_finetune_D_ret_mia_auc, std_finetune_D_ret_mia_auc])
            writer.writerow(['MIA Advantage', mean_finetune_D_ret_mia_adv, std_finetune_D_ret_mia_adv])


# Differential privacy

In [None]:
# Step 1: Load the differentially private training data D_dp for each epsilon and prepare it
ft_epochs_list = [5, 10, 20]
for ft_epochs in ft_epochs_list:
    EPS = [0.5, 2.5, 5.0, 25.0, 50.0, 100.0]
    for eps in EPS:
        dp_train_data = pd.read_csv('dp_data/adult/dp_adult_eps={}.csv'.format(eps), sep=r' *, *', engine='python', na_values='?')
        dp_train_data.head()
        # Drop useless columns
        dp_train_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
        dp_train_data.dropna(inplace=True)
        # convert the income column to 0 or 1 and then drop the column for the feature vectors
        dp_train_data["income"] = dp_train_data["income"].apply(lambda x:0 if x=='<=50K' else 1)
        # creating the feature vector 
        X_train_dp = dp_train_data.drop('income', axis =1)
        # target values
        y_train_dp = dp_train_data['income'].values
        # pass the data through the full_pipeline
        X_train_dp = full_pipeline.fit_transform(X_train_dp)
        # Create TensorDatasets
        train_dataset_k = TensorDataset(torch.tensor(X_train_dp, dtype=torch.float32), torch.tensor(y_train_dp, dtype=torch.int64))
        train_loader_k = DataLoader(train_dataset_k, batch_size=batch_size, shuffle=True)

        train_accs_k = []
        test_accs_k = []
        mia_aucs_k = []
        mia_advs_k = []
        runtimes_train_k = []

        train_accs_k_D = []
        test_accs_k_D = []
        mia_aucs_k_D = []
        mia_advs_k_D = []
        runtimes_train_k_D = []

        retain_accs_k_ret = []
        forget_accs_k_ret = []
        test_accs_k_ret = []
        mia_aucs_k_ret = []
        mia_advs_k_ret = []
        runtimes_train_k_ret = []

        for r in range(n_repeat):
            torch.cuda.empty_cache()
            model_k = copy.deepcopy(initial_model)
            optimizer = optim.Adam(model_k.parameters(), lr=lr)
            t0 = time.time()
            model_k = train_model(model_k, train_loader_k, test_loader, criterion, optimizer, 
                            max_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                            patience = patience)

            t1 = time.time()
            rt_train = t1- t0
            runtimes_train_k.append(rt_train)

            # Evaluate the model accuracy, and MIA
            model_k.eval()
            #Accuracy
            train_acc = accuracy(model_k, train_loader)
            test_acc = accuracy(model_k, test_loader)
            train_accs_k.append(100.0*train_acc)
            test_accs_k.append(100.0*test_acc)
            #MIA
            idxs = np.arange(len(test_dataset))
            random.shuffle(idxs)
            rand_idxs = idxs[:m]
            logits_test, loss_test, test_labels = compute_attack_components(model_k, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k.append(100.0*auc)
            mia_advs_k.append(100.0*adv)

            model_k_D = copy.deepcopy(model_k)
            torch.cuda.empty_cache()
            optimizer = optim.Adam(model_k_D.parameters(), lr=lr)
            t0 = time.time()
            model_k_D = train_model(model_k_D, train_loader, test_loader, criterion, optimizer, 
                                ft_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                                  patience = patience)

            t1 = time.time()
            rt = t1-t0
            runtimes_train_k_D.append(rt)

            # Evaluate the model accuracy, and MIA
            model_k_D.eval()
            #Accuracy
            train_acc = accuracy(model_k_D, train_loader)
            test_acc = accuracy(model_k_D, test_loader)
            train_accs_k_D.append(100.0*train_acc)
            test_accs_k_D.append(100.0*test_acc)
            #MIA
            logits_test, loss_test, test_labels = compute_attack_components(model_k_D, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k_D, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k_D.append(100.0*auc)
            mia_advs_k_D.append(100.0*adv)

            model_k_ret = copy.deepcopy(model_k)
            torch.cuda.empty_cache()
            optimizer = optim.Adam(model_k_ret.parameters(), lr=lr)
            t0 = time.time()
            model_k_ret = train_model(model_k_ret, retain_loader, test_loader, criterion, optimizer, 
                                ft_epochs, device=device, verbose_epoch = int(max_epochs/10), 
                                  patience = patience)

            t1 = time.time()
            rt = t1-t0
            runtimes_train_k_ret.append(rt)
            # Evaluate the model accuracy, and MIA
            model_k_ret.eval()
            #Accuracy
            retain_acc = accuracy(model_k_ret, retain_loader)
            forget_acc = accuracy(model_k_ret, forget_loader)
            test_acc = accuracy(model_k_ret, test_loader)
            retain_accs_k_ret.append(100.0*retain_acc)
            forget_accs_k_ret.append(100.0*forget_acc)
            test_accs_k_ret.append(100.0*test_acc)
            #MIA
            logits_test, loss_test, test_labels = compute_attack_components(model_k_ret, test_loader)
            logits_forget, loss_forget, forget_labels = compute_attack_components(model_k_ret, forget_loader)
            attack_result = tf_attack(logits_forget, logits_test[rand_idxs], loss_forget, loss_test[rand_idxs], 
                                  forget_labels, test_labels[rand_idxs])
            auc = attack_result.get_result_with_max_auc().get_auc()
            adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
            mia_aucs_k_ret.append(100.0*auc)
            mia_advs_k_ret.append(100.0*adv)


        # Collapse every per-run list into a (mean, std) pair. np.std is called with
        # its default arguments, so no ddof correction is applied.

        # Stage 1 -- anonymizing D and training M_k on D_k
        mean_train_k_time, std_train_k_time = np.mean(runtimes_train_k), np.std(runtimes_train_k)
        mean_train_k_acc, std_train_k_acc = np.mean(train_accs_k), np.std(train_accs_k)
        mean_test_k_acc, std_test_k_acc = np.mean(test_accs_k), np.std(test_accs_k)
        mean_mia_k_auc, std_mia_k_auc = np.mean(mia_aucs_k), np.std(mia_aucs_k)
        mean_mia_k_adv, std_mia_k_adv = np.mean(mia_advs_k), np.std(mia_advs_k)

        # Stage 2 -- finetuning M_k on D
        mean_finetune_D_time, std_finetune_D_time = np.mean(runtimes_train_k_D), np.std(runtimes_train_k_D)
        mean_finetune_D_train_acc, std_finetune_D_train_acc = np.mean(train_accs_k_D), np.std(train_accs_k_D)
        mean_finetune_D_test_acc, std_finetune_D_test_acc = np.mean(test_accs_k_D), np.std(test_accs_k_D)
        mean_finetune_D_mia_auc, std_finetune_D_mia_auc = np.mean(mia_aucs_k_D), np.std(mia_aucs_k_D)
        mean_finetune_D_mia_adv, std_finetune_D_mia_adv = np.mean(mia_advs_k_D), np.std(mia_advs_k_D)

        # Stage 3 -- finetuning M_k on D_ret (the "train" accuracy here is the retain-set accuracy)
        mean_finetune_D_ret_time, std_finetune_D_ret_time = (
            np.mean(runtimes_train_k_ret), np.std(runtimes_train_k_ret))
        mean_finetune_D_ret_train_acc, std_finetune_D_ret_train_acc = (
            np.mean(retain_accs_k_ret), np.std(retain_accs_k_ret))
        mean_finetune_D_ret_forget_acc, std_finetune_D_ret_forget_acc = (
            np.mean(forget_accs_k_ret), np.std(forget_accs_k_ret))
        mean_finetune_D_ret_test_acc, std_finetune_D_ret_test_acc = (
            np.mean(test_accs_k_ret), np.std(test_accs_k_ret))
        mean_finetune_D_ret_mia_auc, std_finetune_D_ret_mia_auc = (
            np.mean(mia_aucs_k_ret), np.std(mia_aucs_k_ret))
        mean_finetune_D_ret_mia_adv, std_finetune_D_ret_mia_adv = (
            np.mean(mia_advs_k_ret), np.std(mia_advs_k_ret))

        # Print a human-readable summary (mean ± std over the runs) of the three stages
        # for the current (eps, ft_epochs) configuration.
        # The labels now use the M_dp / D_dp naming of the first stage banner throughout;
        # previously the remaining lines still said M_k / D_k, which mislabeled the
        # eps-parameterized model in the report.
        # NOTE(review): only the D_retain timing line states a unit ("seconds"); if the
        # other two runtimes are measured the same way, add the unit there as well.
        print('----------------------------------------')
        print('Epsilon=', eps, 'Fine-tuning epochs=', ft_epochs)
        print('----------------------------------------')
        print('-----Anonymizing D and training M_dp on D_dp-----')
        print('Training M_dp on D_dp time:{:0.2f}(±{:0.2f})'.format(mean_train_k_time, std_train_k_time))
        print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_train_k_acc, std_train_k_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_test_k_acc, std_test_k_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_mia_k_auc, std_mia_k_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_mia_k_adv, std_mia_k_adv))

        print('-----Finetuning M_dp on D-----')
        print('Training M_dp on D time:{:0.2f}(±{:0.2f})'.format(mean_finetune_D_time, std_finetune_D_time))
        print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_train_acc, std_finetune_D_train_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_test_acc, std_finetune_D_test_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_mia_auc, std_finetune_D_mia_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_mia_adv, std_finetune_D_mia_adv))

        print('-----Finetuning M_dp on D_ret-----')
        print('Finetuning M_dp on D_retain time:{:0.2f}(±{:0.2f}) seconds'.format(mean_finetune_D_ret_time, std_finetune_D_ret_time))
        print('Retain accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_train_acc, std_finetune_D_ret_train_acc))
        print('Forget accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_forget_acc, std_finetune_D_ret_forget_acc))
        print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_test_acc, std_finetune_D_ret_test_acc))
        print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_mia_auc, std_finetune_D_ret_mia_auc))
        print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_finetune_D_ret_mia_adv, std_finetune_D_ret_mia_adv))
        print('----------------------------------------')

        # Save the aggregated metrics to CSV, one file per stage.
        # The anonymizing file is keyed by eps and forget_ratio only; the two fine-tuning
        # files are additionally keyed by ft_epochs.
        csv_anonymize_file_path = 'results/adult/mlp_mdp_eps={}_fr={}.csv'.format(eps, forget_ratio)
        csv_finetune_D_file_path = 'results/adult/mlp_mdpd_eps={}_fr={}_epochs={}.csv'.format(eps, forget_ratio, ft_epochs)
        csv_finetune_D_ret_file_path = 'results/adult/mlp_mdpret_eps={}_fr={}_epochs={}.csv'.format(eps, forget_ratio, ft_epochs)

        # One (path, rows) entry per stage. Each row is [metric label, mean, std].
        # The row labels are kept exactly as they were, since readers of these files
        # may look metrics up by label.
        csv_outputs = [
            (csv_anonymize_file_path, [
                ['Training M_k on D_k Time', mean_train_k_time, std_train_k_time],
                ['Train Accuracy', mean_train_k_acc, std_train_k_acc],
                ['Test Accuracy', mean_test_k_acc, std_test_k_acc],
                ['MIA AUC', mean_mia_k_auc, std_mia_k_auc],
                ['MIA Advantage', mean_mia_k_adv, std_mia_k_adv],
            ]),
            (csv_finetune_D_file_path, [
                ['Training M_k on D Time', mean_finetune_D_time, std_finetune_D_time],
                ['Train Accuracy', mean_finetune_D_train_acc, std_finetune_D_train_acc],
                ['Test Accuracy', mean_finetune_D_test_acc, std_finetune_D_test_acc],
                ['MIA AUC', mean_finetune_D_mia_auc, std_finetune_D_mia_auc],
                ['MIA Advantage', mean_finetune_D_mia_adv, std_finetune_D_mia_adv],
            ]),
            (csv_finetune_D_ret_file_path, [
                ['Finetuning M_k on D_retain Time', mean_finetune_D_ret_time, std_finetune_D_ret_time],
                ['Retain Accuracy', mean_finetune_D_ret_train_acc, std_finetune_D_ret_train_acc],
                ['Forget Accuracy', mean_finetune_D_ret_forget_acc, std_finetune_D_ret_forget_acc],
                ['Test Accuracy', mean_finetune_D_ret_test_acc, std_finetune_D_ret_test_acc],
                ['MIA AUC', mean_finetune_D_ret_mia_auc, std_finetune_D_ret_mia_auc],
                ['MIA Advantage', mean_finetune_D_ret_mia_adv, std_finetune_D_ret_mia_adv],
            ]),
        ]

        for csv_path, metric_rows in csv_outputs:
            # Create the output directory on demand: open(..., 'w') raises
            # FileNotFoundError when results/adult/ is missing, which would discard
            # the results of the runs that have just finished.
            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
            # newline='' is what the csv module expects for files it writes to.
            with open(csv_path, mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
                writer.writerows(metric_rows)