In [3]:
import pandas as pd
import json

# Dataset

In [38]:
dataset_path = "../../../assets/datasets/teacher_survey.csv"
df = pd.read_csv(dataset_path)
df.columns = ['name',
              'sex',
              'age',
              'maritalStatus',
              'hasChildren',
              'highestEducationLevel',
              'sourceOfStress',
              'smoker',
              'optimism',
              'lifeSatisfaction',
              'selfEsteem']
df.head()

Unnamed: 0,name,sex,age,maritalStatus,hasChildren,highestEducationLevel,sourceOfStress,smoker,optimism,lifeSatisfaction,selfEsteem
0,Ariel Lucero,2,21,1,2,5,1,2,19,20,31
1,Sienna Shah,2,42,4,1,2,5,2,23,26,33
2,Denzel Crawford,2,47,4,1,2,4,2,24,27,36
3,Emelia Norman,2,48,4,2,3,3,2,30,22,40
4,Abigail Koch,2,41,7,1,6,1,2,21,21,35


# Definitions

**Unit of privacy** - often referred to as an "individual" or "participant," depending on the context. It's the smallest unit of data about which privacy is preserved. For example, in a dataset of users, the unit of privacy would be the data pertaining to a single user. Differential privacy mechanisms ensure that the privacy of each individual unit is protected, making it difficult to infer information about any specific individual from the aggregated data or analysis results.

**Privacy parameter / ϵ (epsilon)** - quantifies the level of privacy or the privacy loss of a mechanism. A smaller ϵ value indicates stronger privacy guarantees, meaning the output of the differential privacy mechanism is less dependent on any single individual's data. Conversely, a larger ϵ suggests weaker privacy protections. This parameter controls the trade-off between privacy and accuracy in the data analysis or query results. A common rule-of-thumb is to limit your overall ϵ spend to 1.0. 

**Measurement** - randomized function that takes a dataset and returns a differentially private release.

**Transformations** - how we compute statistical summaries in a way that can be privatized.

In [19]:
max_contributions = 1 # Unit of privacy

# Processing

## Transformation

In [60]:
from opendp.transformations import make_split_dataframe, make_select_column, then_cast, then_impute_constant, then_clamp, then_sum, then_mean, then_resize
from opendp.measurements import then_laplace
from opendp.mod import enable_features
enable_features("contrib")

In [52]:
age_transf = (
    make_split_dataframe(separator=",", col_names=df.columns.tolist())  >>
    make_select_column("age", str) >>
    then_cast(TOA=float) >>
    then_impute_constant(0.0)
) # select age column, vectorize, cast from str to float

age_processed = age_transf(open(dataset_path).read())
age_processed[:6]

[45.0, 21.0, 42.0, 47.0, 48.0, 41.0]

## Measurement

In [91]:
# Count mean age without privacy protection
non_private_mean = (
    age_transf >> 
    then_clamp(bounds=(18.0,70.0)) >> 
    then_resize(size=len(df), constant=20.) >> 
    then_mean()
)

print(non_private_mean(open(dataset_path).read()))

37.36648092584655


In [114]:
sensitivity = 1
epsilon = 1.1

# Add noise to query result for DP protection
private_mean = (
    age_transf >> 
    then_clamp(bounds=(18.0,70.0)) >> 
    then_resize(size=len(df), constant=20.) >> 
    then_mean() >>
    then_laplace(scale=(sensitivity / epsilon)) # adding noise to result to make if DP. Higher scale == more privacy & less accuracy
)

print(private_mean(open(dataset_path).read()))

38.290615457381
