## DIFFERENTIAL PRIVACY USING PYDP

### Imports

In [1]:
import pydp as dp # by convention our package is to be imported as dp (dp for Differential Privacy!)
from pydp.algorithms.laplacian import BoundedSum, BoundedMean, Count, Max
import pandas as pd


### Fetching required data

In [4]:
# Repository-relative locations of the five CSV shards that make up the demo data.
base_dir = '../../'
paths = [
    'data/dp_data/01.csv',
    'data/dp_data/02.csv',
    'data/dp_data/03.csv',
    'data/dp_data/04.csv',
    'data/dp_data/05.csv',
]

# Parse each shard into its own DataFrame, then stack them row-wise.
# ignore_index is left at its default, so each shard keeps its own row labels.
combined_df_temp = [
    pd.read_csv(base_dir + url, sep=',', engine='python')
    for url in paths
]
original_dataset = pd.concat(combined_df_temp)

# Quick sanity check: first rows and overall (rows, columns) shape.
print(original_dataset.head())
print(original_dataset.shape)

   id first_name   last_name                      email  sales_amount  \
0   1   Osbourne    Gillions  ogillions0@feedburner.com         31.94   
1   2      Glynn      Friett          gfriett1@blog.com         12.46   
2   3       Jori    Blockley      jblockley2@unesco.org        191.14   
3   4     Garald      Dorian     gdorian3@webeden.co.uk        126.58   
4   5      Mercy  Pilkington      mpilkington4@jugem.jp         68.32   

        state  
0     Florida  
1  California  
2    Colorado  
3       Texas  
4     Florida  
(5000, 6)


### Creating a Parallel Database
The parallel database differs from the original by only one record: the first row is removed.

In [5]:
# Neighbouring dataset: an independent copy of the original with the first
# row (by position) dropped, so the two datasets differ by exactly one record.
redact_dataset = original_dataset.copy().iloc[1:]

# Show both side by side to make the missing first record visible.
print(original_dataset.head())
print(redact_dataset.head())

   id first_name   last_name                      email  sales_amount  \
0   1   Osbourne    Gillions  ogillions0@feedburner.com         31.94   
1   2      Glynn      Friett          gfriett1@blog.com         12.46   
2   3       Jori    Blockley      jblockley2@unesco.org        191.14   
3   4     Garald      Dorian     gdorian3@webeden.co.uk        126.58   
4   5      Mercy  Pilkington      mpilkington4@jugem.jp         68.32   

        state  
0     Florida  
1  California  
2    Colorado  
3       Texas  
4     Florida  
   id first_name   last_name                    email  sales_amount  \
1   2      Glynn      Friett        gfriett1@blog.com         12.46   
2   3       Jori    Blockley    jblockley2@unesco.org        191.14   
3   4     Garald      Dorian   gdorian3@webeden.co.uk        126.58   
4   5      Mercy  Pilkington    mpilkington4@jugem.jp         68.32   
5   6       Elle  McConachie  emcconachie5@census.gov         76.91   

        state  
1  California  
2    C

#### Successful membership inference

In [9]:
# Exact (non-private) totals of the sales column for both datasets, rounded to cents.
# Python's built-in sum over plain lists is used for both so the arithmetic matches.
sum_original_dataset, sum_redact_dataset = (
    round(sum(frame['sales_amount'].to_list()), 2)
    for frame in (original_dataset, redact_dataset)
)

# Differencing attack: subtracting the two exact totals exposes the value of
# the single record that is missing from the neighbouring dataset.
sales_amount_Osbourne = round(sum_original_dataset - sum_redact_dataset, 2)

# The recovered value must equal the first row's sales_amount in the original data.
assert original_dataset['sales_amount'].iloc[0] == sales_amount_Osbourne
sales_amount_Osbourne

31.94

In [10]:
# Show the two exact totals whose difference leaks the removed record's value.
print(f"{sum_original_dataset} - {sum_redact_dataset}")

636594.59 - 636562.65


### Differentially Private Sum for original dataset

In [11]:
# Differentially private sum over the full dataset.
# epsilon is the privacy parameter; lower_bound/upper_bound are the per-value
# bounds handed to BoundedSum (presumably the clamping range -- confirm in the PyDP docs).
dp_sum_original_dataset = BoundedSum(
    epsilon=1.5,
    lower_bound=5,
    upper_bound=250,
    dtype='float',
)

# quick_result consumes the whole list in one call; the noisy total is rounded to cents.
dp_sum_og = round(
    dp_sum_original_dataset.quick_result(original_dataset['sales_amount'].to_list()),
    2,
)
print(dp_sum_og)

636553.73


### Differentially Private Sum for parallel dataset

In [12]:
# Differentially private sum over the neighbouring dataset, configured with
# the same epsilon and bounds as the sum over the original dataset.
dp_redact_dataset = BoundedSum(
    epsilon=1.5,
    lower_bound=5,
    upper_bound=250,
    dtype='float',
)

# Two-step API this time: feed the entries first, then ask for the noisy result.
sales_redacted = redact_dataset['sales_amount'].to_list()
dp_redact_dataset.add_entries(sales_redacted)
dp_sum_redact = round(dp_redact_dataset.result(), 2)
print(dp_sum_redact)

636734.35


In [13]:
# Rebind both names to fresh BoundedSum instances with the same configuration.
# NOTE(review): no cell visible in this notebook uses these fresh instances --
# confirm whether this cell is still needed.
dp_sum_original_dataset = BoundedSum(
    epsilon=1.5,
    lower_bound=5,
    upper_bound=250,
    dtype='float',
)
dp_redact_dataset = BoundedSum(
    epsilon=1.5,
    lower_bound=5,
    upper_bound=250,
    dtype='float',
)


### Summary

In [14]:
# --- Original dataset: exact total vs. noisy total --------------------------
print(f"Sum of sales_value in the orignal dataset: {sum_original_dataset}")
print(f"Sum of sales_value in the orignal dataset with DP: {dp_sum_og}")
# The DP total is expected to deviate from the exact total.
assert sum_original_dataset != dp_sum_og

# --- Neighbouring dataset: exact total vs. noisy total ----------------------
print(f"Sum of sales_value in the second dataset: {sum_redact_dataset}")
print(f"Sum of sales_value in the second dataset with DP: {dp_sum_redact}")
assert sum_redact_dataset != dp_sum_redact

# --- Differencing attack on the noisy totals --------------------------------
# Computed once and reused for both the report line and the final check.
dp_difference = round(dp_sum_og - dp_sum_redact, 2)
print(f"Difference in Sum with DP: {dp_difference}")
print(f"Actual Difference in Sum: {sales_amount_Osbourne}")
# With noise added, the difference no longer reveals the removed record's value.
assert sales_amount_Osbourne != dp_difference

Sum of sales_value in the orignal dataset: 636594.59
Sum of sales_value in the orignal dataset with DP: 636553.73
Sum of sales_value in the second dataset: 636562.65
Sum of sales_value in the second dataset with DP: 636734.35
Difference in Sum with DP: -180.62
Actual Difference in Sum: 31.94
