In [None]:
import pandas as pd
import warnings, sys, random, os, csv
from contextlib import redirect_stdout
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('./DP_Random_Forest')
from Smooth_Random_Trees import DP_Random_Forest
from sklearn.feature_selection import SelectKBest, f_classif
from diffprivlib.models import LogisticRegression as DPLR
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate, GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from opendp.smartnoise.synthesizers.quail import QUAILSynthesizer
from opendp.smartnoise.synthesizers.pytorch.pytorch_synthesizer import PytorchDPSynthesizer
from opendp.smartnoise.synthesizers.preprocessors.preprocessing import GeneralTransformer
from opendp.smartnoise.synthesizers.pytorch.nn.patectgan import PATECTGAN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline 
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

# One fixed seed for both the stdlib and the NumPy global random number
# generators, so that repeated runs of the notebook draw the same numbers.
myseed = 42
random.seed(myseed)
np.random.seed(myseed)

# create dirs for datasets and figures
def create_dir_if_not_exists(dir_name):
    """Create the directory ``dir_name`` unless it already exists.

    Missing parent directories are created as well; an already existing
    directory is left untouched.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True replaces the racy "check with os.path.exists, then os.mkdir"
    # sequence and makedirs additionally supports nested paths.
    os.makedirs(dir_name, exist_ok=True)

# Output folders used by the cells below (figures, result tables and
# freshly generated synthetic datasets).
for output_dir in ('figures', 'tmp_results', 'tmp_synthetic_data', 'results'):
    create_dir_if_not_exists(output_dir)
      
# plot is based on the example of diffprivlib, see https://github.com/IBM/differential-privacy-library 
# privacy budgets to evaluate: 50 values, logarithmically spaced from 1e-2 to 1e2
epsilons = np.logspace(-2, 2, 50)
def make_privacy_utility_plot(xlabel, clf_epsilons, clf_accuracies, baseline_accuracy=0.955):
    """Plot accuracy against epsilon (log-scaled x axis), save the figure and show it.

    Two dotted horizontal reference lines are drawn: one labelled
    "Random Forest" at ``baseline_accuracy`` and one labelled
    "Random Classifier" at 0.5.

    Args:
        xlabel: Legend label of the plotted curve. Despite its name it is not
            the x-axis label; with blanks removed it is also used as the file
            name of the PDF written to ``./figures/``.
        clf_epsilons: Epsilon values (x axis).
        clf_accuracies: Accuracy for each entry of ``clf_epsilons`` (y axis).
        baseline_accuracy: Height of the "Random Forest" reference line. The
            default 0.955 is the value that used to be hard-coded here;
            presumably the accuracy of the non-private random forest -- verify
            against the output of section 2.1 and pass the measured value.
    """
    fig, ax = plt.subplots()
    ax.semilogx(clf_epsilons, clf_accuracies, label=xlabel)
    ax.set_xlabel("Epsilon")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0, 1)
    # reference lines
    ax.axhline(y=baseline_accuracy, color='coral', linestyle='dotted', label="Random Forest")
    ax.axhline(y=0.5, color='grey', linestyle='dotted', label="Random Classifier")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.legend(loc='lower right', frameon=False)
    ax.grid(True, alpha=0.15, linestyle='--')
    fig.savefig(f"./figures/{xlabel.replace(' ', '')}.pdf", bbox_inches='tight')
    plt.show()

## 1. Read data and select features

In [None]:
df = pd.read_csv("../data/browsererkennung_features_tranco500.csv", delimiter=",")
# drop columns (features) in which at least half of the rows are NaN
df = df.loc[:, df.isnull().sum() < 0.5*df.shape[0]]
# remaining missing values are replaced by the sentinel -1
df = df.fillna(-1)
# binary target: firefox -> 0, every other value -> 1
df["browser"] = [0 if val=="firefox" else 1 for val in df.browser]

# use only URLs with 10 runs
counts = df['url'].value_counts()
# Select with a boolean mask: indexing the URL-labelled Series with integer
# positions (counts[[0, 1, ...]]) is deprecated in recent pandas versions.
complete = counts.index[counts == 10]
urls = list(complete)
df = df[df['url'].isin(urls)]

# filter constant features
df = df.loc[:, df.nunique() != 1]
# URL of every remaining row; used as group label for GroupKFold below
groups = df['url'] 

# select 10 best features
# NOTE(review): the selector is fitted on the complete dataset, i.e. before any
# of the train/test splits below, so the test folds influence the selection.
target = np.array(df["browser"])
df = df.drop(["run_id", "url", "browser"], axis=1)
selector = SelectKBest(f_classif, k=10)
features_df = selector.fit_transform(df, target)

# selected features: name -> score, sorted by ascending score
selected_features = selector.get_support(indices=True)
plot_features = {df.columns[s_feature]: selector.scores_[s_feature]
                 for s_feature in selected_features}
plot_features = dict(sorted(plot_features.items(), key=lambda x: x[1]))
features_df = pd.DataFrame(features_df, columns=df.columns[selector.get_support()])

# insert browser at first position
features_df.insert(loc=0,column='browser', value=target)

## 2. Check Accuracy

### 2.1 Random Forest without Privacy

In [None]:
# labels and features of the real dataset
y = features_df["browser"]
X = features_df.drop(columns=["browser"])
gkf = GroupKFold(n_splits=10)
accuracies_cv, recalls_chrome_cv, recalls_ff_cv = [], [], []
precisions_chrome_cv, precisions_ff_cv = [], []

# cross validation; the folds are built from the URL groups
for train_idx, test_idx in gkf.split(X, y, groups):
    X_train_cv, y_train_cv = X.iloc[train_idx], y.iloc[train_idx]
    X_test_cv, y_test_cv = X.iloc[test_idx], y.iloc[test_idx]

    # the scaler is part of the pipeline and therefore fitted on the training fold only
    scaler = StandardScaler()
    rf_clf = RandomForestClassifier(n_jobs=-1, random_state=myseed, n_estimators=50)
    pipeline = Pipeline([('scaler', scaler), ('clf', rf_clf)])
    pipeline.fit(X_train_cv, y_train_cv)
    y_pred = pipeline.predict(X_test_cv)

    # label 1 is Chrome, label 0 is Firefox
    accuracies_cv.append(accuracy_score(y_test_cv, y_pred))
    precisions_chrome_cv.append(precision_score(y_test_cv, y_pred, pos_label=1))
    precisions_ff_cv.append(precision_score(y_test_cv, y_pred, pos_label=0))
    recalls_chrome_cv.append(recall_score(y_test_cv, y_pred, pos_label=1))
    recalls_ff_cv.append(recall_score(y_test_cv, y_pred, pos_label=0))

# report results: mean and standard deviation over the folds
report_lines = [
    ("Accuracy: %0.3f \t\t Std: %0.3f", accuracies_cv),
    ("Precision - FF: %0.3f \t\t Std: %0.3f", precisions_ff_cv),
    ("Recall - FF: %0.3f \t\t Std: %0.3f", recalls_ff_cv),
    ("Precision - Chrome: %0.3f \t Std: %0.3f", precisions_chrome_cv),
    ("Recall - Chrome: %0.3f \t\t Std: %0.3f\n", recalls_chrome_cv),
]
for template, fold_scores in report_lines:
    print(template % (np.mean(fold_scores), np.std(fold_scores)))

### 2.2 Differentially Private Random Forest

### 2.2.1 Results for Epsilon = 1

In [None]:
# re-seed so this cell gives the same result regardless of what ran before
random.seed(myseed)
np.random.seed(myseed)
y = features_df["browser"]
X = features_df.drop(columns=["browser"])
gkf = GroupKFold(n_splits=10)
accuracies_cv, recalls_chrome_cv, recalls_ff_cv = [], [], []
precisions_chrome_cv, precisions_ff_cv = [], []

# cross validation
for train_idx, test_idx in gkf.split(X, y, groups):
    X_train_cv, y_train_cv = X.iloc[train_idx], y.iloc[train_idx]
    X_test_cv, y_test_cv = X.iloc[test_idx], y.iloc[test_idx]

    # the label is prepended as first column of the matrices handed to DP_Random_Forest
    class_labels_train = np.array(y_train_cv).reshape(-1, 1)
    class_labels_test = np.array(y_test_cv).reshape(-1, 1)
    X_train_new = np.concatenate((class_labels_train, X_train_cv), axis=1)
    X_test_new = np.concatenate((class_labels_test, X_test_cv), axis=1)

    # everything the forest prints goes to dprf_output.txt (rewritten for every fold)
    with open('dprf_output.txt', 'w') as dprf_file, redirect_stdout(dprf_file):
        # epsilon is 1 here (last argument)
        forest = DP_Random_Forest(X_train_new, X_test_new, [], 50, 1)

    predicted = forest._predicted_labels
    accuracies_cv.append(forest._accuracy)
    precisions_chrome_cv.append(precision_score(y_test_cv, predicted, pos_label=1))
    precisions_ff_cv.append(precision_score(y_test_cv, predicted, pos_label=0))
    recalls_chrome_cv.append(recall_score(y_test_cv, predicted, pos_label=1))
    recalls_ff_cv.append(recall_score(y_test_cv, predicted, pos_label=0))

# report results: mean and standard deviation over the folds
report_lines = [
    ("Accuracy: %0.3f \t\t Std: %0.3f", accuracies_cv),
    ("Precision - FF: %0.3f \t\t Std: %0.3f", precisions_ff_cv),
    ("Recall - FF: %0.3f \t\t Std: %0.3f", recalls_ff_cv),
    ("Precision - Chrome: %0.3f \t Std: %0.3f", precisions_chrome_cv),
    ("Recall - Chrome: %0.3f \t\t Std: %0.3f\n", recalls_chrome_cv),
]
for template, fold_scores in report_lines:
    print(template % (np.mean(fold_scores), np.std(fold_scores)))

### 2.2.2 Results for Different Values of Epsilon

In [None]:
# mean cross-validation accuracy for every entry of `epsilons`
accuracies_rf = []
np.random.seed(myseed)
random.seed(myseed)
X = features_df.drop(["browser"], axis=1)
y = features_df["browser"]

# The GroupKFold split does not depend on epsilon, so the fold indices are
# computed once and reused for every privacy budget.
gkf = GroupKFold(n_splits = 10)
cv_folds = list(gkf.split(X, y, groups))

# calculate the accuracy for different privacy budgets
for val in epsilons:
    accuracies_cv = []
    for train_index, test_index in cv_folds:
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]
        # the label is prepended as first column of the matrices handed to DP_Random_Forest
        class_labels_train = np.array(y_train_cv).reshape(-1,1)
        class_labels_test = np.array(y_test_cv).reshape(-1,1)
        X_train_new = np.concatenate((class_labels_train, X_train_cv), axis=1)
        X_test_new = np.concatenate((class_labels_test, X_test_cv), axis=1)
        # everything the forest prints goes to dprf_output.txt (rewritten for every fold)
        with open('dprf_output.txt', 'w') as dprf_file:
            with redirect_stdout(dprf_file):
                # val is epsilon here
                forest = DP_Random_Forest(X_train_new, X_test_new, [], 50, val)
        accuracies_cv.append(forest._accuracy)
    # NOTE(review): the grid does not contain 1 and 10 exactly; every epsilon that
    # rounds to 1, 10 or 100 is reported, so several grid points are printed for 1.
    if round(val) in [1, 10, 100]:
        print("Epsilon: ", val)
        print("Accuracy: %0.3f \t Std: %0.3f\n" % (np.mean(accuracies_cv), np.std(accuracies_cv)))
    accuracies_rf.append(np.mean(accuracies_cv))

# plot values for different epsilons
make_privacy_utility_plot("DP Random Forest", epsilons, accuracies_rf)

## 3. Read Dataframe from CSV and Obtain Results
To obtain results on newly generated dataframes, first execute the cell in section 4 below (PATE-CTGAN + QUAIL), then set the `use_given_dataset` variable to `False` before executing this cell.

In [None]:
use_given_dataset = True
tmp_path = ""
# create dataframes
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.3, 
                                                    stratify=target, random_state=myseed)
# use the given datasets in synthetic_data/ or not
if not use_given_dataset:
    tmp_path = "tmp_"

synthetic_data_dir = f"./{tmp_path}synthetic_data/"
result_file_synth_data = f"./{tmp_path}results/results_synth_data.csv"
result_file_synth_real_data = f"./{tmp_path}results/results_synth_data_real_data.csv"

if not use_given_dataset:
    # delete temporary results files if present
    for stale_file in (result_file_synth_data, result_file_synth_real_data):
        if os.path.isfile(stale_file):
            os.remove(stale_file)


def _append_result_row(path, header, row):
    """Append ``row`` to the CSV file ``path``; write ``header`` first if the file is new."""
    file_exists = os.path.isfile(path)
    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows
    with open(path, "a", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        if not file_exists:
            writer.writerow(header)
        writer.writerow(row)


# evaluations for each dataset 
for file in sorted(os.listdir(synthetic_data_dir)):
    # skip hidden entries and sub-directories (e.g. .ipynb_checkpoints), which
    # pd.read_csv cannot read
    if file.startswith(".") or not os.path.isfile(os.path.join(synthetic_data_dir, file)):
        continue
    print(file, "\n")
    mydf = pd.read_csv(synthetic_data_dir + file, index_col=0)
    y = mydf["browser"]
    X = mydf.drop(["browser"], axis=1)
    skf = StratifiedKFold(n_splits=10, random_state=myseed, shuffle=True)
    
    # cross validation only on synthetic data
    accuracies_cv = []
    recalls_chrome_cv = []
    recalls_ff_cv = []
    precisions_chrome_cv = []
    precisions_ff_cv = []
    for train_index, test_index in skf.split(X, y):
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]
        pipeline = Pipeline([('scaler', StandardScaler()), 
                             ('rfc', RandomForestClassifier(n_jobs=-1,random_state=myseed, 
                                                            n_estimators=50))])
        pipeline.fit(X_train_cv, y_train_cv)
        y_pred = pipeline.predict(X_test_cv)
        # label 1 is Chrome, label 0 is Firefox
        accuracies_cv.append(accuracy_score(y_test_cv, y_pred))
        recalls_chrome_cv.append(recall_score(y_test_cv, y_pred, pos_label=1))
        recalls_ff_cv.append(recall_score(y_test_cv, y_pred, pos_label=0))
        precisions_chrome_cv.append(precision_score(y_test_cv, y_pred, pos_label=1))
        precisions_ff_cv.append(precision_score(y_test_cv, y_pred, pos_label=0))
    
    # write results to file: mean and standard deviation of every metric
    cv_scores = (accuracies_cv, precisions_chrome_cv, precisions_ff_cv,
                 recalls_chrome_cv, recalls_ff_cv)
    _append_result_row(result_file_synth_data,
                       ["Accuracy", "Acc. StdDev", 
                        "Precision Chrome", "Prec. Chr. StdDev", 
                        "Precision Firefox", "Prec. FF. StdDev",
                        "Recall Chrome", "Rec. Ch. StdDev", 
                        "Recall Firefox", "Rec. FF. StdDev"],
                       ["%0.3f" % stat
                        for scores in cv_scores
                        for stat in (np.mean(scores), np.std(scores))])

    # training on synthetic data and evaluation on real data 
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    rfc = RandomForestClassifier(n_jobs=-1, random_state=myseed, n_estimators=50)
    rfc.fit(X, y)

    # test model with real test dataframe (scaled with the statistics of the synthetic data)
    X_test_real = X_test.drop(["browser"], axis=1)
    X_test_real = scaler.transform(X_test_real)
    X_test_real = pd.DataFrame(X_test_real)
    y_pred = rfc.predict(X_test_real)
    
    # write results to file
    _append_result_row(result_file_synth_real_data,
                       ["Accuracy", "Precision Chrome", "Precision Firefox",
                        "Recall Chrome", "Recall Firefox"],
                       ["%0.3f" % accuracy_score(y_test, y_pred),
                        "%0.3f" % precision_score(y_test, y_pred, pos_label=1),
                        "%0.3f" % precision_score(y_test, y_pred, pos_label=0),
                        "%0.3f" % recall_score(y_test, y_pred, pos_label=1),
                        "%0.3f" % recall_score(y_test, y_pred, pos_label=0)])

## 4. Create Synthetic Dataframes: PATE-CTGAN + QUAIL

Note: Executing this cell might take some time!

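This cell generates 10 synthetic datasets. A generated dataset is only kept if it is reasonably balanced, i.e. if the ratio between the sizes of the smaller and the larger class is at least 0.75; otherwise it is discarded and generated again.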

In [None]:
# the code is based on the example in the smartnoise-samples repository
# https://github.com/opendp/smartnoise-samples/blob/master/whitepaper-demos/5-ml-synthetic-data.ipynb
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.3, 
                                                    stratify=target, random_state=myseed)

# differentially private classifier
# we clip values to 100, suitable for most features
def get_dp_classifier(epsilon):
    """Return a diffprivlib logistic regression with budget ``epsilon`` and ``data_norm=100``."""
    return DPLR(epsilon=epsilon, data_norm=100)

# differentially private synthezizer
def get_pate_ctgan_synthezizer(epsilon):
    """Return a PATE-CTGAN based synthesizer (cross-entropy loss) with budget ``epsilon``."""
    return PytorchDPSynthesizer(epsilon=epsilon, preprocessor=None, gan=PATECTGAN(loss='cross_entropy'))

# specify size of synthetic data frame as training dataset size
training_data_size = int(X_train.shape[0])

i = 1
list_of_df = []
# we create 10 datasets
while len(list_of_df) < 10:
    class_balance = 0
    # ensure that the dataset is somewhat balanced
    while class_balance < 0.75:
        # use 50% of the privacy budget for the classifier, 50% for the synthezizer
        # (the first argument, 2, is presumably the total epsilon -- confirm in the QUAIL docs)
        quail_synth = QUAILSynthesizer(2, get_pate_ctgan_synthezizer, get_dp_classifier, 'browser',
                                       eps_split=0.5, seed=myseed)
        quail_synth.fit(X_train)

        synthetic_data = quail_synth.sample(training_data_size)
        synthetic_dataframe = pd.DataFrame(synthetic_data, columns=X_train.columns)
        
        # check if dataset is somewhat balanced
        class_occurences = synthetic_dataframe.browser.value_counts()
        if len(class_occurences) < 2:
            # Only one class was generated: min/max would be 1.0 and the
            # dataset would wrongly pass as balanced, so force a retry.
            class_balance = 0.0
        else:
            class_balance = class_occurences.min() / class_occurences.max()
        print("Try Number:", i)
        i+=1
        print("Ratio between min_instances and max_instances of classes is:", class_balance)
        if class_balance < 0.75:
            print("...trying again to create a balanced dataset.\n")
    print(class_occurences, "\n")
    list_of_df.append(synthetic_dataframe)
    print(f"so far we have {len(list_of_df)} datasets")

# the first CSV column is the dataframe index (read back with index_col=0)
for index, bdf in enumerate(list_of_df):
    bdf.to_csv(f"./tmp_synthetic_data/balanced_df_v{index}.csv")