# Perturbing histograms using the Laplace mechanism

Acknowledgement: The core functions defined in this notebook are derived from source code developed at Data61 by Arthur Street and Gianpaolo Gioiosa.

In [1]:
import numpy as np

from pandas import DataFrame, Index, MultiIndex, RangeIndex, read_csv
from typing import cast, Mapping, NewType, Sequence, Text, Tuple

In [2]:
# A record identifies one histogram cell: a tuple with one integer code per feature.
Record = NewType('Record', Tuple[int, ...])
# A dataset is a histogram: it maps every record to its (true or perturbed) count.
Dataset = NewType('Dataset', Mapping[Record, float])

## Utility functions

In [3]:
def encode_values_as_integers(raw_data: DataFrame, include_all_in_range: bool = False) -> DataFrame:
    """Replace every value with its zero-based rank within its own column.

    Args:
        raw_data: Frame to encode; it is not modified.
        include_all_in_range: When true, codes are assigned over every integer
            between the column's minimum and maximum (inclusive), so values
            that never occur still reserve a code. This path passes the
            minimum and maximum to ``range`` and therefore needs integer
            columns. When false, codes are assigned over the sorted distinct
            values that actually occur.

    Returns:
        A copy of ``raw_data`` with each column mapped to its integer codes.
    """
    encoded = raw_data.copy()
    for name in encoded.columns:
        column = encoded[name]
        if include_all_in_range:
            ordered = list(range(min(column), max(column) + 1))
        else:
            ordered = sorted(set(column))
        # Position in the ordered value list is the integer code.
        codes = dict(zip(ordered, range(len(ordered))))
        encoded[name] = raw_data[name].map(codes)
    return encoded

In [4]:
def get_dataset_from_data(data: DataFrame) -> Dataset:
    """Count how many rows fall into each cell of the full feature grid.

    The grid is the Cartesian product of ``0..max`` (inclusive) over every
    column, so combinations that never occur in ``data`` are present with a
    count of 0.

    Args:
        data: Frame whose column maxima are used as ``range`` bounds, i.e. it
            is expected to hold non-negative integer codes.

    Returns:
        Mapping from record tuple (one entry per column, in column order) to
        the number of matching rows.
    """
    column_names = list(data.columns)
    maxima = data.agg('max')
    if len(column_names) > 1:
        levels = [RangeIndex.from_range(range(maxima[name] + 1)) for name in column_names]
        index = MultiIndex.from_product(levels, names=column_names)
    else:
        only_column = column_names[0]
        index = Index(range(maxima[only_column] + 1), name=only_column)
    observed_counts = data.groupby(column_names).size()
    # Reindexing onto the full grid inserts the unobserved cells with count 0.
    histogram = observed_counts.reindex(index=index, fill_value=0).to_dict()
    if len(column_names) == 1:
        # A flat index yields scalar keys; wrap them so every key is a tuple.
        histogram = {(value,): count for value, count in histogram.items()}
    return cast(Dataset, histogram)

In [5]:
def as_data_frame(dataset: Dataset, feature_names: Sequence[Text], output_index_name: Text) -> DataFrame:
    """Turn a histogram mapping into a single-column frame indexed by record.

    Args:
        dataset: Mapping from record tuple to count.
        feature_names: Names for the index levels, one per record entry.
        output_index_name: Label of the single column holding the counts.

    Returns:
        Frame with a ``MultiIndex`` built from the records and one column of
        counts.
    """
    record_index = MultiIndex.from_tuples(dataset.keys(), names=feature_names)
    frame = DataFrame.from_dict(cast(dict, dataset), orient='index', columns=[output_index_name])
    # Reindexing attaches the level names and fills any unmatched row with 0.
    return frame.reindex(record_index, fill_value=0)

## Read in the raw data

In [6]:
# Relative path of the input CSV file.
data_path = '1_PIF/our-synthetic.csv'
# Columns of the CSV to keep as histogram features.
# NOTE(review): an earlier comment here said only numeric features are supported, but
# 'eye_color' and 'countryofresidence' contain strings in the preview below. The
# include_all_in_range=True path of encode_values_as_integers passes the column
# min/max to range() and so needs integers; the include_all_in_range=False path
# only sorts the distinct values.
feature_names = ['AGE', 'eye_color', 'countryofresidence', 'POSTCODE']
# Load the file and keep only the selected feature columns, in the order listed above.
raw_data = read_csv(data_path)[feature_names]
# Preview the first 10 rows.
raw_data.head(10)

Unnamed: 0,AGE,eye_color,countryofresidence,POSTCODE
0,99,Brown,Australia,2649
1,108,Hazel,Australia,1780
2,59,Hazel,Australia,2940
3,58,Blue,Australia,2945
4,30,Brown,Australia,2729
5,6,Grey,Australia,2531
6,104,Brown,Australia,839
7,11,Grey,Australia,2901
8,105,Green,Australia,846
9,115,Grey,Australia,2912


## Encode the raw data

In [7]:
# Map each column's values to integer codes; with include_all_in_range=False only
# the values that actually occur in a column receive a code.
data_encoded = encode_values_as_integers(raw_data=raw_data, include_all_in_range=False)
# Preview the first 10 encoded rows.
data_encoded.head(10)

Unnamed: 0,AGE,eye_color,countryofresidence,POSTCODE
0,99,1,0,1592
1,108,4,0,812
2,59,4,0,1883
3,58,0,0,1888
4,30,1,0,1672
5,6,3,0,1474
6,104,1,0,139
7,11,3,0,1844
8,105,2,0,146
9,115,3,0,1855


## Represent the encoded data as a histogram

In [8]:
# Build the histogram: one count per combination of encoded feature values,
# including combinations that never occur (count 0).
dataset = get_dataset_from_data(data_encoded)
# Preview the first 10 histogram cells as a frame with a 'count' column.
as_data_frame(dataset, feature_names, 'count').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
AGE,eye_color,countryofresidence,POSTCODE,Unnamed: 4_level_1
0,0,0,0,0
0,0,0,1,0
0,0,0,2,0
0,0,0,3,0
0,0,0,4,0
0,0,0,5,0
0,0,0,6,0
0,0,0,7,0
0,0,0,8,0
0,0,0,9,0


## The Laplace mechanism

In [9]:
def apply_laplace_mechanism(dataset: Dataset, epsilon: float, sensitivity: float = 1.0) -> Dataset:
    """Return a perturbed histogram with Laplace noise added to every count.

    Each count is replaced by a draw from a Laplace distribution centred on
    the true count with scale ``sensitivity / epsilon``.

    Args:
        dataset: Histogram mapping each possible record to its true count.
        epsilon: Privacy parameter, expected to be positive. It divides the
            sensitivity, so smaller values give a larger noise scale.
        sensitivity: Numerator of the noise scale.

    Returns:
        A new mapping with the same records, in the same order, and noisy
        float counts. The input mapping is not modified.
    """
    # The scale is identical for every cell, so compute it once.
    scale = sensitivity / epsilon
    records = list(dataset.keys())
    true_counts = np.fromiter(dataset.values(), dtype=float, count=len(records))
    # A single vectorised draw replaces one NumPy call per histogram cell.
    noisy_counts = np.random.laplace(loc=true_counts, scale=scale)
    # tolist() converts back to plain Python floats, as the scalar call returned.
    return cast(Dataset, dict(zip(records, noisy_counts.tolist())))

## Perturb the histogram with a 'low' value of epsilon (= 0.05)

In [10]:
# epsilon = 0.05 with sensitivity 1 gives a Laplace noise scale of 1 / 0.05 = 20.
perturbed_dataset_low_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=0.05, sensitivity=1)
# Preview the first 10 perturbed counts.
as_data_frame(perturbed_dataset_low_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
AGE,eye_color,countryofresidence,POSTCODE,Unnamed: 4_level_1
0,0,0,0,11.46219
0,0,0,1,-40.728309
0,0,0,2,3.400876
0,0,0,3,-107.575802
0,0,0,4,4.410668
0,0,0,5,22.48533
0,0,0,6,8.359264
0,0,0,7,4.493887
0,0,0,8,-17.592277
0,0,0,9,-0.853373


## Perturb the histogram with a 'medium' value of epsilon (= 0.1)

In [11]:
# epsilon = 0.1 with sensitivity 1 gives a Laplace noise scale of 1 / 0.1 = 10.
perturbed_dataset_medium_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=0.1, sensitivity=1)
# Preview the first 10 perturbed counts.
as_data_frame(perturbed_dataset_medium_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
AGE,eye_color,countryofresidence,POSTCODE,Unnamed: 4_level_1
0,0,0,0,1.210392
0,0,0,1,-2.169745
0,0,0,2,7.739395
0,0,0,3,9.1573
0,0,0,4,-3.741585
0,0,0,5,-4.918998
0,0,0,6,15.768169
0,0,0,7,17.821234
0,0,0,8,4.396213
0,0,0,9,3.000579


## Perturb the histogram with a 'high' value of epsilon (= 1.0)

In [12]:
# epsilon = 1.0 with sensitivity 1 gives a Laplace noise scale of 1 / 1.0 = 1.
perturbed_dataset_high_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=1.0, sensitivity=1)
# Preview the first 10 perturbed counts.
as_data_frame(perturbed_dataset_high_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
AGE,eye_color,countryofresidence,POSTCODE,Unnamed: 4_level_1
0,0,0,0,1.418013
0,0,0,1,-3.811734
0,0,0,2,0.116891
0,0,0,3,-1.248403
0,0,0,4,-0.210335
0,0,0,5,-0.678026
0,0,0,6,-6.153651
0,0,0,7,-0.251566
0,0,0,8,0.184327
0,0,0,9,1.001882


## Perturb the histogram with a 'very high' value of epsilon (= 50.0)

In [13]:
# epsilon = 50.0 with sensitivity 1 gives a Laplace noise scale of 1 / 50.0 = 0.02.
perturbed_dataset_very_high_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=50.0, sensitivity=1)
# Preview the first 10 perturbed counts.
as_data_frame(perturbed_dataset_very_high_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
AGE,eye_color,countryofresidence,POSTCODE,Unnamed: 4_level_1
0,0,0,0,-0.098949
0,0,0,1,0.006863
0,0,0,2,0.007042
0,0,0,3,0.039668
0,0,0,4,-0.057906
0,0,0,5,0.005682
0,0,0,6,-0.015913
0,0,0,7,0.009139
0,0,0,8,-0.033469
0,0,0,9,-0.002333
