In [1]:
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import operator

import pandas as pd
import numpy as np

In [2]:
ATT_NAME = ['age', 'workclass', 'final_weight', 'education',
            'education_num', 'marital_status', 'occupation', 'relationship',
            'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
            'native_country', 'class']

In [3]:
df = pd.read_csv('../data/adult.data', names=ATT_NAME)

In [4]:
df.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
def cat_to_num(df):
    """ Convert categorical values to numerical values from 0 to n. """
    category_maps = defaultdict(dict)
    for col in df.columns:
        unique_values = sorted(df[col].unique())
        if col in CAT_COLS:
            for i, v in enumerate(unique_values):
                category_maps[col][v] = i
            df[col] = df[col].apply(lambda x: category_maps[col][x])
    return df, category_maps

In [6]:
class Partition(object):

    def __init__(self, data):
        self.df = data
        self.disallowed = []

    def __len__(self):
        return len(self.df)

    def get_normalized_width(self):
        width = self.df.max() - self.df.min()
        return width / np.array(RANGE)

    def choose_dimension(self):
        # largest amount of unique values first
        most_values = sorted([(n, len(col.unique())) for n, col in self.df.iteritems()],
                             key=operator.itemgetter(1), reverse=True)
        for x,_ in most_values:
            if x not in self.disallowed:
                return x
        return None
        # highest normalized width first
        # TODO: check what method makes more sense and what exactly the normalized width is suppose to be
        # for x in np.argsort(self.get_normalized_width())[::-1].index.values:
        #     if x not in self.disallowed:
        #         return x
        # return None
    
    def get_split_value(self, dim):
        return self.df[dim].median()

In [7]:
ANOM_COLS = ['age', 'education_num', 'race', 'sex']
CAT_COLS = ['race', 'sex']
K = 5

# select columns to anonymize
df_anom = df[ANOM_COLS]
# convert categorical values to numerical
df_anom, category_maps = cat_to_num(df_anom)
# compute range of values
RANGE = (df_anom.max() - df_anom.min()).values
# subsample
# df_anom = df_anom.sample(1000)
# create partition object
partition = Partition(df_anom)

# recursively call mondrian
results = []
def mondrian(partition):
    if len(partition) <= (2 * K + 1):
        results.append(partition.df)
    else:
        dim = partition.choose_dimension()
        if dim is None:
            results.append(partition.df)
        else:
            split_val = partition.get_split_value(dim)
            if partition.df[dim].max == split_val:
                lhs = Partition(partition.df[partition.df[dim] < split_val])
                rhs = Partition(partition.df[partition.df[dim] >= split_val])
            else:
                lhs = Partition(partition.df[partition.df[dim] <= split_val])
                rhs = Partition(partition.df[partition.df[dim] > split_val])
            if not len(lhs) or not len(rhs):
                partition.disallowed.append(dim)
                mondrian(partition)
            else:
                mondrian(lhs)
                mondrian(rhs)

In [8]:
mondrian(partition)

In [9]:
def aggregate_partitions(partitions):

    df_list = []
    for res in partitions:
        for col in res.columns:
            res[col] = int(res[col].mean())
        df_list.append(res)

    return pd.concat(df_list)

In [10]:
anom = aggregate_partitions(results)

In [11]:
def num_to_cat(df, category_maps):
    for col in df.columns:
        if col in CAT_COLS:
            reversed_map = dict([(v, k) for k, v in category_maps[col].items()])
            df[col] = df[col].apply(lambda x: reversed_map[x])
    return df

In [12]:
anom = num_to_cat(anom, category_maps)

In [13]:
tmp = df[df.columns.difference(ANOM_COLS)]

In [14]:
res = anom.join(tmp)

In [15]:
res.head()

Unnamed: 0,age,education_num,race,sex,capital_gain,capital_loss,class,education,final_weight,hours_per_week,marital_status,native_country,occupation,relationship,workclass
106,17,6,Other,Female,34095,0,<=50K,10th,304873,32,Never-married,United-States,?,Own-child,?
209,17,6,Other,Female,0,0,<=50K,11th,65368,12,Never-married,United-States,Sales,Own-child,Private
262,17,6,Other,Female,0,0,<=50K,11th,245918,12,Never-married,United-States,Other-service,Own-child,Private
271,17,6,Other,Female,1055,0,<=50K,9th,191260,24,Never-married,United-States,Other-service,Own-child,Private
335,17,6,Other,Female,0,0,<=50K,5th-6th,270942,48,Never-married,Mexico,Other-service,Other-relative,Private
