# SPEML Exercise 1 - Anonymization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

## Load Data

In [2]:
# column names from https://archive.ics.uci.edu/ml/datasets/Adult
header = ["age" ,"workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country", "target"] 

adults = pd.read_csv("../data/adult.data", sep=", ", engine='python', names=header, na_values=["?","nan"], index_col=False)
adults["target"].replace(["<=50K", ">50K"], [0,1], inplace=True)
adults.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


## Define some re-usable functions

In [37]:
%%file preproc.py

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

def preproc(data, onehot_encoder=None, scaler=None):
    this_data = data.copy()
    this_data.dropna(inplace=True) # drop all lines with missing values
    this_data.reset_index(inplace=True, drop=True)

    drops = ["education-num"] # drop this since we have education and education-num 
    this_data.drop(columns=drops, axis=1, inplace=True)

    # numericals to scale
    to_scale = ["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]

    if not scaler:
        this_scaler = MinMaxScaler()
        this_scaler.fit(this_data[to_scale])
    else:
        this_scaler = scaler 
    this_data[to_scale] = this_scaler.transform(this_data[to_scale])

    # categorical columns to drop
    onehot = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

    # in case no encoder is provided we create one which can then be re-used (train-test split)
    if not onehot_encoder:
        this_enc = OneHotEncoder(handle_unknown='ignore')
        this_enc.fit(this_data[onehot])
    else:
        this_enc = onehot_encoder

    encoded = pd.DataFrame(this_enc.transform(this_data[onehot]).toarray(), columns=this_enc.get_feature_names())
    this_data = pd.concat([this_data.drop(columns=onehot),
                    encoded],
                    axis=1)

    # assert that all went well and we did not loose or add any rows...
    assert(data.dropna().shape[0] == this_data.shape[0])

    return this_data, this_enc, this_scaler

Writing preproc.py


In [38]:
%%file classify.py

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
#from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import average_precision_score

def classify(X_train, y_train, X_test, y_test):   
    names = ["SVC", "RandomForest", "GaussianNB"]

    classifiers = [
        SVC(gamma=2, C=1),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        GaussianNB()]

    results = pd.DataFrame(columns=names, index=["f1", "roc_auc"])

    for i in enumerate(classifiers):
        cname = names[i[0]]
        c = i[1]

        print("Fitting {} now...".format(cname))
        s = time.time()
        c.fit(X_train, y_train)
        print("--- Done! Took {:1.4f}s".format(time.time()-s))
        y_pred = c.predict(X_test)
    
        results[cname]["f1"] = f1_score(y_test, y_pred, average='micro')
        results[cname]["roc_auc"] = roc_auc_score(y_test, y_pred)
    
    return results

Writing classify.py


## Pre-Anonymisation

In [35]:
from sklearn.model_selection import train_test_split

adults.dropna(inplace=True)
adults.reset_index(inplace=True, drop=True)

X = adults.drop("target", axis=1)
y = adults["target"].dropna()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
pp_train, encoder, scaler = preproc(X_train)

pp_test, encoder, scaler = preproc(X_test, encoder, scaler)

print("Data: ", pp_train.shape[0]+pp_test.shape[0], adults.dropna().shape[0])
print("X:    ", pp_train.shape[0], pp_test.shape[0])
print("Y:    ", y_train.shape[0], y_test.shape[0])
pp_train.head(3)

Data:  30162 30162
X:     20208 9954
Y:     20208 9954


Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,x0_Federal-gov,x0_Local-gov,x0_Private,x0_Self-emp-inc,x0_Self-emp-not-inc,...,x7_Portugal,x7_Puerto-Rico,x7_Scotland,x7_South,x7_Taiwan,x7_Thailand,x7_Trinadad&Tobago,x7_United-States,x7_Vietnam,x7_Yugoslavia
0,0.671233,0.079722,0.0,0.418962,0.397959,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.219178,0.19157,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.082192,0.121931,0.0,0.0,0.397959,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [36]:
results_raw = classify(pp_train, y_train, pp_test, y_test)
results_raw

Fitting SVC now...
--- Done! Took 116.7426s
Fitting RandomForest now...
--- Done! Took 0.0447s
Fitting GaussianNB now...
--- Done! Took 0.0395s


Unnamed: 0,SVC,RandomForest,GaussianNB
f1,0.816858,0.747438,0.500904
roc_auc,0.713279,0.5,0.653093
