# MWEM input generator

This notebook converts a given data set into a histogram (one normalised weight per possible record) and writes those weights to a CSV file. The CSV file can then be read to obtain the data set in the form expected by the implementation of MWEM (Multiplicative Weights Exponential Mechanism) available at https://github.com/mrtzh/PrivateMultiplicativeWeights.jl.

In [12]:
from pandas import cut, DataFrame, Index, MultiIndex, RangeIndex, read_csv
from typing import cast, Mapping, NewType, Sequence, Text, Tuple

In [13]:
# A record is a tuple of integer-encoded feature values, one entry per feature.
Record = NewType('Record', Tuple[int, ...])
# A dataset maps each record to a numeric weight (for example, its count).
Dataset = NewType('Dataset', Mapping[Record, float])

## Utility functions

Acknowledgement: The functions in this section are derived from source code developed at Data61 by Arthur Street and Gianpaolo Gioiosa.

In [14]:
def encode_values_as_integers(raw_data: DataFrame, include_all_in_range: bool = False) -> DataFrame:
    """Replace every value in every column by a 0-based integer code.

    Codes are assigned per column, following the sorted order of that
    column's candidate values.

    Args:
        raw_data: The frame to encode. It is not modified.
        include_all_in_range: If False, the candidate values are the distinct
            values that occur in the column. If True, the candidate values are
            all integers from the column minimum to the column maximum
            (inclusive), so the column must hold integers and values that do
            not occur still reserve a code.

    Returns:
        A copy of ``raw_data`` in which each value is replaced by its code.
    """
    encoded = raw_data.copy()
    for name in encoded.columns:
        column = encoded[name]
        if include_all_in_range:
            candidates = list(range(min(column), max(column) + 1))
        else:
            candidates = sorted(set(column))
        code_of = {value: code for code, value in enumerate(candidates)}
        # Map from the untouched input so each column is encoded from its raw values.
        encoded[name] = raw_data[name].map(code_of)
    return encoded

In [15]:
def get_dataset_from_data(data: DataFrame) -> Dataset:
    """Count how often every possible record occurs in ``data``.

    The possible records are all combinations of the values ``0..max`` of each
    column, where ``max`` is the largest value found in that column.
    Combinations that never occur are included with a count of 0.

    Args:
        data: Frame of integer codes; codes are assumed to start at 0
            (e.g. the output of ``encode_values_as_integers``).

    Returns:
        A mapping from each possible record to the number of rows of ``data``
        equal to it. Keys are always tuples, also for a single column.
    """
    largest_codes = data.agg('max')
    features = list(data.columns)
    if len(features) <= 1:
        # Single feature: a flat index is used instead of a MultiIndex.
        sole_feature = features[0]
        all_records = Index(range(largest_codes[sole_feature] + 1), name=sole_feature)
    else:
        codes_per_feature = [RangeIndex.from_range(range(largest_codes[feature] + 1))
                             for feature in features]
        all_records = MultiIndex.from_product(codes_per_feature, names=features)
    observed_counts = data.groupby(features).size()
    # Reindexing against the full product adds the records that never occur.
    counts_for_all_records = observed_counts.reindex(index=all_records, fill_value=0)
    histogram = counts_for_all_records.to_dict()
    if len(features) == 1:
        # Wrap the scalar keys so that every key is a tuple.
        histogram = {(code,): count for code, count in histogram.items()}
    return cast(Dataset, histogram)

In [16]:
def as_data_frame(dataset: Dataset, feature_names: Sequence[Text], output_index_name: Text) -> DataFrame:
    """Convert a record-to-weight mapping into a single-column data frame.

    Args:
        dataset: Mapping from record tuples to weights.
        feature_names: Names of the index levels, one per entry of a record.
        output_index_name: Name of the single column that holds the weights.

    Returns:
        A frame indexed by a MultiIndex built from the records of ``dataset``.
    """
    record_index = MultiIndex.from_tuples(dataset.keys(), names=feature_names)
    weights = DataFrame.from_dict(cast(dict, dataset), orient='index', columns=[output_index_name])
    return weights.reindex(record_index, fill_value=0)

## Read in the raw data

In [17]:
# Location of the raw CSV file and the columns to keep from it.
data_path = '1_PIF/our-synthetic.csv'
feature_names = ['gender', 'AGE', 'POSTCODE', 'blood_group', 'eye_color']
# Number of leading rows to keep; a falsy value (0 or None) keeps every row.
number_of_records = 2000
raw_data = read_csv(data_path)[feature_names]
if number_of_records:
    raw_data = raw_data.iloc[:number_of_records, :]
# Missing values are filled only for this file and only when the 'job' feature is selected.
if 'our-synthetic.csv' in data_path and 'job' in feature_names:
    raw_data = raw_data.fillna('Unemployed')
raw_data.head(10)

Unnamed: 0,gender,AGE,POSTCODE,blood_group,eye_color
0,F,99,2649,B-,Brown
1,M,108,1780,A-,Hazel
2,M,59,2940,B+,Hazel
3,M,58,2945,B+,Blue
4,M,30,2729,AB-,Brown
5,M,6,2531,A-,Grey
6,F,104,839,B+,Brown
7,M,11,2901,AB-,Grey
8,F,105,846,A-,Green
9,M,115,2912,A+,Grey


In [33]:
# Check how many records were loaded.
len(raw_data)

2000

In [19]:
# Bin a copy, so that raw_data keeps the original (unbinned) values.
data_binned = raw_data.copy()

## Bin the data (if desired)

In [20]:
# Number of bins to use for each feature that should be binned.
binning_specification = {'AGE': 5, 'POSTCODE': 50}
# Integer labels 0..n-1 for each binned feature (not referenced by the code below).
bin_labels_specification = dict((feature, list(range(bin_count)))
                                for feature, bin_count in binning_specification.items())
for name_of_feature_to_bin, number_of_bins_for_feature in binning_specification.items():
    # labels=False makes cut return integer bin indicators instead of interval labels.
    data_binned['{0}_binned'.format(name_of_feature_to_bin)] = cut(
        data_binned[name_of_feature_to_bin],
        number_of_bins_for_feature,
        labels=False,
    )
    # The binned column replaces the original feature.
    data_binned = data_binned.drop(columns=[name_of_feature_to_bin])
data_binned.head(10)

Unnamed: 0,gender,blood_group,eye_color,AGE_binned,POSTCODE_binned
0,F,B-,Brown,4,12
1,M,A-,Hazel,4,8
2,M,B+,Hazel,2,13
3,M,B+,Blue,2,14
4,M,AB-,Brown,1,12
5,M,A-,Grey,0,11
6,F,B+,Brown,4,3
7,M,AB-,Grey,0,13
8,F,A-,Green,4,3
9,M,A+,Grey,4,13


## Encode the binned data

In [26]:
# Also encode the unbinned data so that the two histograms can be compared.
data_encoded = encode_values_as_integers(raw_data=raw_data, include_all_in_range=False)

# NOTE: with include_all_in_range=False only values that actually occur receive a code,
# so bins that contain no records are skipped and the remaining bins are renumbered.
data_binned_and_encoded = encode_values_as_integers(raw_data=data_binned, include_all_in_range=False)
data_binned_and_encoded.head(10)

Unnamed: 0,gender,blood_group,eye_color,AGE_binned,POSTCODE_binned
0,0,5,1,4,10
1,1,1,4,4,6
2,1,4,4,2,11
3,1,4,0,2,12
4,1,3,1,1,10
5,1,1,3,0,9
6,0,4,1,4,1
7,1,3,3,0,11
8,0,1,2,4,1
9,1,0,3,4,11


## Represent the binned and encoded data as a histogram

The histogram is represented as a list of normalised weights.

In [27]:
# Divide by the number of loaded records so that each weight is a fraction of the records.
number_of_records = len(raw_data)

# Histogram of the binned and encoded data, in the iteration order of the dataset mapping.
protected_dataset = get_dataset_from_data(data_binned_and_encoded)
protected_histogram = [count / number_of_records for count in protected_dataset.values()]

# Histogram of the encoded data without binning, for comparison.
unprotected_dataset = get_dataset_from_data(data_encoded)
unprotected_histogram = [count / number_of_records for count in unprotected_dataset.values()]

## Output the histogram

Because the MWEM implementation is written in a different language (Julia), we output the histogram weights as a CSV file.

In [28]:
# Put each list of weights into a frame with a single 'weight' column, one row per possible record.
protected_histogram_df = DataFrame(protected_histogram, columns=['weight'])
unprotected_histogram_df = DataFrame(unprotected_histogram, columns=['weight'])
protected_histogram_df.head(10)

Unnamed: 0,weight
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.001
8,0.0005
9,0.0005


In [24]:
# Write the weights of the binned data; index=False leaves the row index out of the file.
output_path = 'binned_synthetic_weights.csv'
protected_histogram_df.to_csv(output_path, index=False)

In [29]:
# Write the weights of the unbinned data; index=False leaves the row index out of the file.
output_path = 'unbinned_synthetic_weights.csv'
unprotected_histogram_df.to_csv(output_path, index=False)

In [31]:
# Number of possible records (histogram weights) for the binned data.
len(protected_histogram_df)

19200

In [32]:
# Number of possible records (histogram weights) for the unbinned data.
len(unprotected_histogram_df)

9275760