# Perturbing histograms using the Laplace mechanism

Acknowledgement: The core functions defined in this notebook are derived from source code developed at Data61 by Arthur Street and Gianpaolo Gioiosa.

In [1]:
import numpy as np

from pandas import DataFrame, Index, MultiIndex, RangeIndex, read_csv
from typing import cast, Mapping, NewType, Sequence, Text, Tuple

In [2]:
# A record is a tuple of integer-encoded feature values, one entry per feature column.
Record = NewType('Record', Tuple[int, ...])
# A dataset is a histogram: it maps each possible record to its (true or perturbed) count.
Dataset = NewType('Dataset', Mapping[Record, float])

## Utility functions

In [3]:
def encode_values_as_integers(raw_data: DataFrame, include_all_in_range: bool = False) -> DataFrame:
    """Return a copy of ``raw_data`` with every column's values replaced by integer codes.

    Each column is encoded independently. A value's code is its zero-based position
    in that column's ascending list of candidate values.

    Args:
        raw_data: Frame to encode; it is left unmodified.
        include_all_in_range: If true, the candidate values are every integer from the
            column minimum to the column maximum (inclusive), so the code equals
            ``value - minimum``. Otherwise the candidates are only the distinct values
            actually present in the column.

    Returns:
        A new DataFrame with the same columns, holding the codes.
    """
    encoded = raw_data.copy()
    for name in encoded.columns:
        column = raw_data[name]
        if include_all_in_range:
            lowest, highest = min(column), max(column)
            ordered_values = range(lowest, highest + 1)
        else:
            ordered_values = sorted(set(column))
        # Position in the ordered candidate list becomes the integer code.
        codes = {value: code for code, value in enumerate(ordered_values)}
        encoded[name] = column.map(codes)
    return encoded

In [4]:
def get_dataset_from_data(data: DataFrame) -> Dataset:
    """Build a histogram over every possible record of integer-encoded ``data``.

    The set of possible records is the Cartesian product, over all columns, of
    ``0 .. column maximum``. Records that never occur in ``data`` get a count of 0.

    Args:
        data: Frame of non-negative integer codes, one column per feature.

    Returns:
        A mapping from record tuple (one entry per column, in column order) to the
        number of rows of ``data`` equal to that record.
    """
    names = list(data.columns)
    upper_bounds = data.agg('max')

    if len(names) > 1:
        levels = [RangeIndex.from_range(range(upper_bounds[name] + 1)) for name in names]
        full_index = MultiIndex.from_product(levels, names=names)
    else:
        only_name = names[0]
        full_index = Index(range(upper_bounds[only_name] + 1), name=only_name)

    observed_counts = data.groupby(names).size()
    # Reindexing onto the full index inserts the unobserved records with a zero count.
    histogram = observed_counts.reindex(index=full_index, fill_value=0).to_dict()

    if len(names) == 1:
        # A single column yields scalar keys; wrap them so every key is a tuple.
        histogram = {(value,): count for value, count in histogram.items()}
    return cast(Dataset, histogram)

In [5]:
def as_data_frame(dataset: Dataset, feature_names: Sequence[Text], output_index_name: Text) -> DataFrame:
    """Convert a histogram mapping into a one-column DataFrame indexed by record.

    Args:
        dataset: Mapping from record tuple to count.
        feature_names: Names for the index levels, one per entry of a record tuple.
        output_index_name: Label of the single value column in the result (despite the
            parameter's name, it is used as a column label, not an index name).

    Returns:
        A DataFrame with a MultiIndex built from the record tuples and one column
        holding the counts.
    """
    record_index = MultiIndex.from_tuples(dataset.keys(), names=feature_names)
    frame = DataFrame.from_dict(cast(dict, dataset), orient='index', columns=[output_index_name])
    # Re-align the rows onto the named MultiIndex; any unmatched label would be filled with 0.
    return frame.reindex(record_index, fill_value=0)

## Read in the raw data

In [6]:
# Relative path to the input CSV file.
data_path = '1_PIF/our-synthetic.csv'
feature_names = ['AGE']  # Currently only numeric features are supported, but this is easily fixed.
# Load the CSV and keep only the selected feature columns.
raw_data = read_csv(data_path)[feature_names]
# Display the first ten rows as the cell output.
raw_data.head(10)

Unnamed: 0,AGE
0,99
1,108
2,59
3,58
4,30
5,6
6,104
7,11
8,105
9,115


## Encode the raw data

In [7]:
# include_all_in_range=True assigns a code to every integer between each column's minimum
# and maximum, so values that never occur in the data still get their own code.
data_encoded = encode_values_as_integers(raw_data=raw_data, include_all_in_range=True)
# Display the first ten encoded rows as the cell output.
data_encoded.head(10)

Unnamed: 0,AGE
0,99
1,108
2,59
3,58
4,30
5,6
6,104
7,11
8,105
9,115


## Represent the raw data as a histogram

In [8]:
# Count the occurrences of every possible encoded record (zero for unobserved ones).
dataset = get_dataset_from_data(data_encoded)
# Display the first ten histogram bins as a table with a 'count' column.
as_data_frame(dataset, feature_names, 'count').head(10)

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
0,324
1,316
2,333
3,369
4,326
5,310
6,357
7,354
8,342
9,324


## The Laplace mechanism

In [9]:
def apply_laplace_mechanism(dataset: Dataset, epsilon: float, sensitivity: float = 1.0) -> Dataset:
    """Perturb every count of a histogram with independent Laplace noise.

    Each output count is a single draw from a Laplace distribution centred on the
    true count with scale ``sensitivity / epsilon``. Draws use NumPy's global random
    state (``np.random``). The results are neither rounded nor clamped, so perturbed
    counts can be fractional or negative.

    Args:
        dataset: Mapping from record to true count.
        epsilon: Privacy parameter; a smaller value gives a larger noise scale.
        sensitivity: Sensitivity of the histogram query (defaults to 1.0).

    Returns:
        A new mapping with the same keys as ``dataset`` and the perturbed counts.

    Raises:
        ZeroDivisionError: If ``epsilon`` is a Python number equal to zero.
        ValueError: If ``sensitivity / epsilon`` is negative.
    """
    # The scale is the same for every record, so compute and validate it once up front.
    scale = sensitivity / epsilon
    if scale < 0:
        raise ValueError(
            f'Laplace noise scale must be non-negative, got sensitivity / epsilon = {scale}'
        )
    return cast(Dataset, {possible_record: np.random.laplace(loc=true_count, scale=scale)
                          for possible_record, true_count in dataset.items()})

## Perturb the histogram with a 'low' value of epsilon (= 0.05)

In [10]:
# Laplace noise scale is sensitivity / epsilon = 1 / 0.05 = 20.
perturbed_dataset_low_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=0.05, sensitivity=1)
# Display the first ten perturbed counts as the cell output.
as_data_frame(perturbed_dataset_low_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
0,341.86099
1,336.886311
2,383.084638
3,350.315667
4,320.422668
5,372.855539
6,344.952186
7,391.471956
8,334.137174
9,389.91434


## Perturb the histogram with a 'medium' value of epsilon (= 0.1)

In [11]:
# Laplace noise scale is sensitivity / epsilon = 1 / 0.1 = 10.
perturbed_dataset_medium_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=0.1, sensitivity=1)
# Display the first ten perturbed counts as the cell output.
as_data_frame(perturbed_dataset_medium_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
0,302.399802
1,313.677804
2,326.779528
3,369.615059
4,301.574694
5,312.631608
6,353.808789
7,363.911027
8,348.8811
9,327.362779


## Perturb the histogram with a 'high' value of epsilon (= 1.0)

In [12]:
# Laplace noise scale is sensitivity / epsilon = 1 / 1.0 = 1.
perturbed_dataset_high_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=1.0, sensitivity=1)
# Display the first ten perturbed counts as the cell output.
as_data_frame(perturbed_dataset_high_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
0,322.984761
1,319.045338
2,333.407014
3,372.198134
4,325.36321
5,309.775674
6,357.602996
7,353.752264
8,342.111329
9,324.203053


## Perturb the histogram with a 'very high' value of epsilon (= 50.0)

In [13]:
# Laplace noise scale is sensitivity / epsilon = 1 / 50.0 = 0.02, so counts barely move.
perturbed_dataset_very_high_epsilon = apply_laplace_mechanism(dataset=dataset, epsilon=50.0, sensitivity=1)
# Display the first ten perturbed counts as the cell output.
as_data_frame(perturbed_dataset_very_high_epsilon, feature_names, 'count').head(10)

Unnamed: 0_level_0,count
AGE,Unnamed: 1_level_1
0,324.000381
1,316.0145
2,332.999968
3,369.000058
4,325.98239
5,310.050011
6,356.959147
7,354.02251
8,341.986699
9,324.01576
