# SPEML Exercise 1 - Anonymization

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

%load_ext autoreload
%autoreload 2

## Load Data

In [3]:
# Column names from https://archive.ics.uci.edu/ml/datasets/Adult
# They are passed to read_csv via `names`, so the file itself is read without a header row.
header = ["age" ,"workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country", "target"] 

# The multi-character separator ", " requires the python parser engine; "?" and "nan" are read as missing values.
adults = pd.read_csv("../data/adult.data", sep=", ", engine='python', names=header, na_values=["?","nan"], index_col=False)

# Encode the income label as 0/1 and assign the result back to the frame explicitly.
# `adults["target"].replace(..., inplace=True)` would operate on a temporary Series, which
# is not guaranteed to update `adults` (and does not under pandas copy-on-write).
adults["target"] = adults["target"].map({"<=50K": 0, ">50K": 1})
adults.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


## Define some re-usable functions

In [4]:
%%file preproc.py

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

def preproc(data, onehot_encoder=None, scaler=None, numeric=None, categorical=None):
    """Drop incomplete rows, scale numeric columns and one-hot encode categorical columns.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame. Must contain an "education-num" column (it is dropped) as well
        as every column listed in ``numeric`` and ``categorical``.
    onehot_encoder : object, optional
        Already fitted encoder to re-use (e.g. the one returned for the training
        split). If None, a new ``OneHotEncoder(handle_unknown='ignore')`` is fitted
        on ``data``.
    scaler : object, optional
        Already fitted scaler to re-use. If None, a new ``MinMaxScaler`` is fitted
        on ``data``.
    numeric : list of str, optional
        Columns to scale. "education-num" is ignored if present. The list passed in
        is left untouched. Defaults to the numeric columns of the Adult dataset.
    categorical : list of str, optional
        Columns to one-hot encode. Defaults to the categorical columns of the Adult
        dataset.

    Returns
    -------
    tuple
        ``(frame, encoder, scaler)``: the preprocessed frame (index 0..n-1) and the
        encoder / scaler that were used, so they can be passed to a later call.

    Raises
    ------
    AssertionError
        If the number of output rows differs from the number of complete input rows.
    """
    # Keep only complete rows and renumber them so the positional concat below lines up.
    this_data = data.dropna().reset_index(drop=True)
    expected_rows = this_data.shape[0]

    # drop this since we have education and education-num
    this_data = this_data.drop(columns=["education-num"])

    # numericals to scale
    if numeric is None:
        to_scale = ["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]
    else:
        # Build a new list instead of calling .remove() on the argument, which would
        # silently alter the caller's list.
        to_scale = [col for col in numeric if col != "education-num"]

    if scaler is None:
        this_scaler = MinMaxScaler()
        this_scaler.fit(this_data[to_scale])
    else:
        this_scaler = scaler
    this_data[to_scale] = this_scaler.transform(this_data[to_scale])

    # categorical columns to encode
    if categorical is None:
        onehot = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
    else:
        onehot = categorical

    # in case no encoder is provided we create one which can then be re-used (train-test split)
    if onehot_encoder is None:
        this_enc = OneHotEncoder(handle_unknown='ignore')
        this_enc.fit(this_data[onehot])
    else:
        this_enc = onehot_encoder

    # Prefer the legacy accessor where it still exists so previously produced column
    # names stay the same; fall back to get_feature_names_out() on encoders that no
    # longer provide get_feature_names().
    if hasattr(this_enc, "get_feature_names"):
        feature_names = this_enc.get_feature_names()
    else:
        feature_names = this_enc.get_feature_names_out()

    encoded = pd.DataFrame(this_enc.transform(this_data[onehot]).toarray(), columns=feature_names)
    this_data = pd.concat([this_data.drop(columns=onehot), encoded], axis=1)

    # assert that all went well and we did not lose or add any rows...
    assert expected_rows == this_data.shape[0]

    return this_data, this_enc, this_scaler

Overwriting preproc.py


In [5]:
%%file classify.py

import time
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
#from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import average_precision_score

def classify(X_train, y_train, X_test, y_test, random_state=42):
    """Fit three classifiers on the training data and score them on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to each classifier's ``fit``.
    X_test, y_test
        Test features (passed to ``predict``) and the true labels used for scoring.
    random_state : int or None, optional
        Seed for the random forest so repeated runs give the same scores.
        Pass None to get an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        One column per classifier ("SVC", "RandomForest", "GaussianNB") and one row
        per metric ("f1", "roc_auc").
    """
    classifiers = {
        "SVC": SVC(gamma=2, C=1),
        "RandomForest": RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=random_state),
        "GaussianNB": GaussianNB(),
    }

    results = pd.DataFrame(columns=list(classifiers), index=["f1", "roc_auc"])

    for cname, clf in classifiers.items():
        print("Fitting {} now...".format(cname))
        started = time.time()
        clf.fit(X_train, y_train)
        print("--- Done! Took {:1.4f}s".format(time.time()-started))
        y_pred = clf.predict(X_test)

        # Write through .loc: the chained form results[cname]["f1"] = ... assigns into
        # an intermediate object and is not guaranteed to reach `results`.
        # NOTE(review): with average='micro' on a single-label problem this value
        # should coincide with plain accuracy -- confirm this is the intended metric.
        results.loc["f1", cname] = f1_score(y_test, y_pred, average='micro')
        # NOTE(review): the AUC is computed from hard class predictions, not from
        # scores/probabilities -- confirm this is intended.
        results.loc["roc_auc", cname] = roc_auc_score(y_test, y_pred)

    return results

Overwriting classify.py


In [6]:
import classify
import preproc

## Pre-Anonymisation

In [7]:
from sklearn.model_selection import train_test_split

# Keep complete records only and renumber them from 0 (both calls modify `adults` in place).
adults.dropna(inplace=True)
adults.reset_index(drop=True, inplace=True)

# Separate the label column from the features.
y = adults["target"].dropna()
X = adults.drop(columns="target")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

# The first call creates the encoder and scaler from the training split;
# the second call re-uses them for the test split.
pp_train, encoder, scaler = preproc.preproc(X_train)
pp_test, encoder, scaler = preproc.preproc(X_test, onehot_encoder=encoder, scaler=scaler)

# Sanity check: the row counts of both splits must add up to the number of complete records.
print("Data: ", len(pp_train) + len(pp_test), len(adults.dropna()))
print("---    Train, Test")
print("X:    ", len(pp_train), len(pp_test))
print("Y:    ", len(y_train), len(y_test))
pp_train.head(3)

Data:  30162 30162
---    Train, Test
X:     20208 9954
Y:     20208 9954


Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,x0_Federal-gov,x0_Local-gov,x0_Private,x0_Self-emp-inc,x0_Self-emp-not-inc,...,x7_Portugal,x7_Puerto-Rico,x7_Scotland,x7_South,x7_Taiwan,x7_Thailand,x7_Trinadad&Tobago,x7_United-States,x7_Vietnam,x7_Yugoslavia
0,0.671233,0.079722,0.0,0.418962,0.397959,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.219178,0.19157,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.082192,0.121931,0.0,0.0,0.397959,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Baseline scores on the original (not anonymized) data.
results_raw = classify.classify(
    X_train=pp_train,
    y_train=y_train,
    X_test=pp_test,
    y_test=y_test,
)
results_raw

Fitting SVC now...
--- Done! Took 116.8917s
Fitting RandomForest now...
--- Done! Took 0.0426s
Fitting GaussianNB now...
--- Done! Took 0.0394s


Unnamed: 0,SVC,RandomForest,GaussianNB
f1,0.816858,0.747438,0.500904
roc_auc,0.713279,0.5,0.653093


---
## Anonymize
The quasi-identifiers were anonymized with __k-anonymity__ using the **ARX** Java library, see ```anonymization/arx.java```
### Quasi-Identifiers
- workclass
- education
- marital-status
- relationship
- native-country
- age
- occupation
- race
- sex


In [9]:
# The quasi-identifiers are one-hot encoded after anonymization. "age" is in this list
# too -- presumably because the anonymized files hold generalized age values rather
# than plain numbers; TODO confirm against the ARX output.
cols_categorical = ["workclass", "education", "marital-status", "relationship", "native-country", "age", "occupation", "race", "sex"]

# All remaining columns except the label are scaled. Built as an ordered list
# (instead of set arithmetic) so the column order is identical on every run.
# we dont want to use target in preprocessing later
cols_numerical = [col for col in header if col not in cols_categorical and col != "target"]

ks = [1,2,3,5,10]
results_anon = []  # one result table per entry of `ks`, in the same order

for k in ks:
    adults_anonym = pd.read_csv("../data/k-anonymity/anonymized_k{}.data".format(k), sep=",", engine='python', na_values=["?","nan"], index_col=False)

    # Encode the label as 0/1 and assign it back explicitly; an in-place replace on
    # `adults_anonym["target"]` is not guaranteed to update the frame.
    adults_anonym["target"] = adults_anonym["target"].map({"<=50K": 0, ">50K": 1})

    # Complete records only, renumbered from 0.
    adults_anonym = adults_anonym.dropna().reset_index(drop=True)

    X = adults_anonym.drop(columns="target")
    y = adults_anonym["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    # Encoder and scaler come from the training split and are re-used for the test split.
    pp_train, encoder, scaler = preproc.preproc(X_train, numeric=cols_numerical, categorical=cols_categorical)
    pp_test, encoder, scaler = preproc.preproc(X_test, encoder, scaler, numeric=cols_numerical, categorical=cols_categorical)

    print("---    Train, Test")
    print("X:    ", pp_train.shape[0], pp_test.shape[0])
    print("Y:    ", y_train.shape[0], y_test.shape[0])

    results_anon.append(classify.classify(pp_train, y_train, pp_test, y_test))

---    Train, Test
X:     20208 9954
Y:     20208 9954
Fitting SVC now...
--- Done! Took 167.8935s
Fitting RandomForest now...
--- Done! Took 0.0446s
Fitting GaussianNB now...
--- Done! Took 0.0588s
---    Train, Test
X:     20528 10112
Y:     20528 10112
Fitting SVC now...
--- Done! Took 45.0864s
Fitting RandomForest now...
--- Done! Took 0.0457s
Fitting GaussianNB now...
--- Done! Took 0.0224s
---    Train, Test
X:     20640 10167
Y:     20640 10167
Fitting SVC now...
--- Done! Took 30.7569s
Fitting RandomForest now...
--- Done! Took 0.0453s
Fitting GaussianNB now...
--- Done! Took 0.0177s
---    Train, Test
X:     20788 10240
Y:     20788 10240
Fitting SVC now...
--- Done! Took 21.1690s
Fitting RandomForest now...
--- Done! Took 0.0432s
Fitting GaussianNB now...
--- Done! Took 0.0145s
---    Train, Test
X:     20867 10279
Y:     20867 10279
Fitting SVC now...
--- Done! Took 20.2027s
Fitting RandomForest now...
--- Done! Took 0.0474s
Fitting GaussianNB now...
--- Done! Took 0.0164s


In [10]:
# Stack the per-k result tables into one frame with a two-level index:
# "k-anon" holds the k value, "index" holds the metric name ("f1" / "roc_auc").
# One concat with `keys` replaces growing a DataFrame inside a loop, which re-copies
# all previous rows on every iteration and starts from an empty frame.
_results = pd.concat(results_anon, keys=ks, names=["k-anon", "index"])

_results

Unnamed: 0_level_0,Unnamed: 1_level_0,SVC,RandomForest,GaussianNB
k-anon,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,f1,0.782098,0.747438,0.462929
1,roc_auc,0.62063,0.5,0.632957
2,f1,0.818434,0.756131,0.620847
2,roc_auc,0.720695,0.504954,0.721045
3,f1,0.81617,0.764237,0.715354
3,roc_auc,0.716781,0.523024,0.770721
5,f1,0.81709,0.755176,0.725098
5,roc_auc,0.717696,0.511436,0.77586
10,f1,0.827804,0.772838,0.727114
10,roc_auc,0.728224,0.531637,0.778962


In [13]:
# Print the combined result table as GitHub-style markdown text (tablefmt="github"),
# so it can be copied into a markdown cell.
print(_results.to_markdown(tablefmt="github"))

|                 |      SVC |   RandomForest |   GaussianNB |
|-----------------|----------|----------------|--------------|
| (1, 'f1')       | 0.782098 |       0.747438 |     0.462929 |
| (1, 'roc_auc')  | 0.62063  |       0.5      |     0.632957 |
| (2, 'f1')       | 0.818434 |       0.756131 |     0.620847 |
| (2, 'roc_auc')  | 0.720695 |       0.504954 |     0.721045 |
| (3, 'f1')       | 0.81617  |       0.764237 |     0.715354 |
| (3, 'roc_auc')  | 0.716781 |       0.523024 |     0.770721 |
| (5, 'f1')       | 0.81709  |       0.755176 |     0.725098 |
| (5, 'roc_auc')  | 0.717696 |       0.511436 |     0.77586  |
| (10, 'f1')      | 0.827804 |       0.772838 |     0.727114 |
| (10, 'roc_auc') | 0.728224 |       0.531637 |     0.778962 |


The generalization from k-anonymity had a slightly positive effect on the selected classifiers. 

Raw data:

|         |      SVC |   RandomForest |   GaussianNB |
|---------|----------|----------------|--------------|
| f1      | 0.816858 |       0.747438 |     0.500904 |
| roc_auc | 0.713279 |       0.5      |     0.653093 |

Anonymization with different k=`[1,2,3,5,10]`:

|                 |      SVC |   RandomForest |   GaussianNB |
|-----------------|----------|----------------|--------------|
| (1, 'f1')       | 0.782098 |       0.747438 |     0.462929 |
| (1, 'roc_auc')  | 0.62063  |       0.5      |     0.632957 |
| (2, 'f1')       | 0.818434 |       0.756131 |     0.620847 |
| (2, 'roc_auc')  | 0.720695 |       0.504954 |     0.721045 |
| (3, 'f1')       | 0.81617  |       0.764237 |     0.715354 |
| (3, 'roc_auc')  | 0.716781 |       0.523024 |     0.770721 |
| (5, 'f1')       | 0.81709  |       0.755176 |     0.725098 |
| (5, 'roc_auc')  | 0.717696 |       0.511436 |     0.77586  |
| (10, 'f1')      | 0.827804 |       0.772838 |     0.727114 |
| (10, 'roc_auc') | 0.728224 |       0.531637 |     0.778962 |

Arguably, the generalization might have helped the classifiers to not overfit on some less important variables, especially on simpler methods like RandomForest or the GaussianNB. 


## Microaggregation