# Assignment 2: Privacy-Preserving Data Sharing
### Security and Privacy Project - MECD (2022/2023)

### Autores

- Duarte Meneses - 2019216949
- Patricia Costa - 2019213995

### Import dataset

In [57]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

infri = pd.read_csv('data/infringement_dataset_v2.csv')

infri.fillna(value = -1,  
          inplace = True) 

display(infri)

Unnamed: 0,loan_id,infringed,contract_type,gender,has_own_car,has_own_realty,num_children,annual_income,credit_amount,credit_annuity,...,first_name,last_name,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_approved,past_loans_refused,past_loans_canceled,past_loans_unused,past_loans_total
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,Robert,Watkins,9251.77500,179055.000,179055.00,1.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Jane,Navarro,56553.99000,435436.500,484191.00,3.0,0.0,0.0,0.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,David,Seagraves,5357.25000,24282.000,20106.00,1.0,0.0,0.0,0.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,Deborah,Tandy,23651.17500,272203.260,291695.50,5.0,3.0,1.0,0.0,9.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,David,Walker,12278.80500,150530.250,166638.75,6.0,0.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,Lawrence,Mcwayne,6605.91000,40455.000,40455.00,1.0,0.0,0.0,0.0,1.0
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,Belinda,Loveland,10074.46500,57595.500,56821.50,1.0,0.0,0.0,0.0,1.0
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,Aileen,Rasmussen,4770.40500,24162.750,20625.75,2.0,0.0,0.0,0.0,2.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,Mary,Marbury,10681.13250,121317.750,134439.75,2.0,0.0,0.0,0.0,2.0


### 2.2. Sensitivity

##### Mean of past events

In [60]:
from matplotlib import pyplot as plt
columns = ['past_avg_amount_annuity', 'past_avg_amt_application', 'past_avg_amt_credit', 'past_loans_total']
colunas = infri[['age','past_avg_amount_annuity', 'past_avg_amt_application', 'past_avg_amt_credit', 'past_loans_total']]
colunas = colunas.groupby("age").mean()
display(colunas)

Unnamed: 0_level_0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,6225.39,76134.74625,74877.75,4.0
21,9159.34453,80978.706301,81849.636828,2.464115
22,10051.850022,89170.454517,90652.144514,3.011933
23,10098.28183,90166.832392,92516.294039,3.144195
24,10615.73388,95217.211373,99730.425124,3.65096
25,10714.413734,100203.209124,105048.83986,3.829655
26,10902.062821,104514.162115,111127.259162,3.917781
27,11587.003144,109004.296578,119014.075642,4.25059
28,11930.996138,112678.478874,123953.178302,4.319749
29,12248.130484,117989.971491,131008.289447,4.427379


In [77]:
def sensitivity_mean(dataset):
    l = []
    mean_past = dataset.mean()
    print("Total mean:", mean_past)

    for i in range(len(dataset)):
        aux = dataset.drop(dataset.index[i])
        l.append(abs(mean_past - aux.mean()))

    return max(l)

df_sensitivity = pd.DataFrame(columns=columns, index = ["Original Sensitivity", "Epsilon = 0.01", "Epsilon = 0.2"])

s_annuity = sensitivity_mean(colunas['past_avg_amount_annuity'])
s_application = sensitivity_mean(colunas['past_avg_amt_application'])
s_credit = sensitivity_mean(colunas['past_avg_amt_credit'])
s_loans = sensitivity_mean(colunas['past_loans_total'])

df_sensitivity.loc["Original Sensitivity"] = [s_annuity, s_application, s_credit, s_loans]

display(df_sensitivity)

Total mean: 13328.026144908183
Total mean: 139488.50141987667
Total mean: 152558.7080045867
Total mean: 4.503573521762351


Unnamed: 0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
Original Sensitivity,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.01,,,,
Epsilon = 0.2,,,,


### 2.3. Differential Privacy

In [78]:
def add_laplace_noise(data, sensitivity, epsilon):
  return data + np.random.laplace(loc=0, scale=sensitivity/epsilon)

def percentage_error(orig, est):
  return ((orig-est) / orig) * 100

##### Adding noise

In [79]:
noise_annuity_001 = add_laplace_noise(colunas['past_avg_amount_annuity'], s_annuity, 0.01)
s_annuity_001 = sensitivity_mean(noise_annuity_001)

noise_application_001 = add_laplace_noise(colunas['past_avg_amt_application'], s_application, 0.01)
s_application_001 = sensitivity_mean(noise_application_001)

noise_credit_001 = add_laplace_noise(colunas['past_avg_amt_credit'], s_credit, 0.01)
s_credit_001 = sensitivity_mean(noise_credit_001)

noise_loans_001 = add_laplace_noise(colunas['past_loans_total'], s_loans, 0.01)
s_loans_001 = sensitivity_mean(noise_loans_001)

df_sensitivity.loc["Epsilon = 0.01"] = [s_annuity_001, s_application_001, s_credit_001, s_loans_001]
display(df_sensitivity)


Total mean: 23177.468405881864
Total mean: -30891.66486203754
Total mean: 86368.0761593605
Total mean: 6.466062073275817


Unnamed: 0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
Original Sensitivity,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.01,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.2,,,,


In [80]:
noise_annuity_02= add_laplace_noise(colunas['past_avg_amount_annuity'], s_annuity, 0.2)
s_annuity_02 = sensitivity_mean(noise_annuity_02)

noise_application_02 = add_laplace_noise(colunas['past_avg_amt_application'], s_application, 0.2)
s_application_02 = sensitivity_mean(noise_application_02)

noise_credit_02= add_laplace_noise(colunas['past_avg_amt_credit'], s_credit, 0.2)
s_credit_02 = sensitivity_mean(noise_credit_02)

noise_loans_02 = add_laplace_noise(colunas['past_loans_total'], s_loans, 0.2)
s_loans_02 = sensitivity_mean(noise_loans_02)

df_sensitivity.loc["Epsilon = 0.2"] = [s_annuity_02, s_application_02, s_credit_02, s_loans_02]
display(df_sensitivity)

Total mean: 10979.471401499528
Total mean: 133585.07775246655
Total mean: 147385.53412954367
Total mean: 4.578264740587639


Unnamed: 0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
Original Sensitivity,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.01,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.2,144.951758,1292.933779,1585.325674,0.041622


In [81]:
noise_annuity_log2 = add_laplace_noise(colunas['past_avg_amount_annuity'], s_annuity, np.log(2))
s_annuity_log2 = sensitivity_mean(noise_annuity_log2)

noise_application_log2 = add_laplace_noise(colunas['past_avg_amt_application'], s_application, np.log(2))
s_application_log2 = sensitivity_mean(noise_application_log2)

noise_credit_log2 = add_laplace_noise(colunas['past_avg_amt_credit'], s_credit, np.log(2))
s_credit_log2 = sensitivity_mean(noise_credit_log2)

noise_loans_log2 = add_laplace_noise(colunas['past_loans_total'], s_loans, np.log(2))
s_loans_log2 = sensitivity_mean(noise_loans_log2)

df_sensitivity.loc["Epsilon = np.log(2)"] = [s_annuity_log2, s_application_log2, s_credit_log2, s_loans_log2]
display(df_sensitivity)

Total mean: 13507.737338863528
Total mean: 140530.422289607
Total mean: 151004.7096377545
Total mean: 4.502872637520872


Unnamed: 0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
Original Sensitivity,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.01,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.2,144.951758,1292.933779,1585.325674,0.041622
Epsilon = np.log(2),144.951758,1292.933779,1585.325674,0.041622


In [82]:
noise_annuity_log3 = add_laplace_noise(colunas['past_avg_amount_annuity'], s_annuity, np.log(3))
s_annuity_log3 = sensitivity_mean(noise_annuity_log3)

noise_application_log3 = add_laplace_noise(colunas['past_avg_amt_application'], s_application, np.log(3))
s_application_log3 = sensitivity_mean(noise_application_log3)

noise_credit_log3 = add_laplace_noise(colunas['past_avg_amt_credit'], s_credit, np.log(3))
s_credit_log3 = sensitivity_mean(noise_credit_log3)

noise_loans_log3 = add_laplace_noise(colunas['past_loans_total'], s_loans, np.log(3))
s_loans_log3 = sensitivity_mean(noise_loans_log3)

df_sensitivity.loc["Epsilon = np.log(3)"] = [s_annuity_log3, s_application_log3, s_credit_log3, s_loans_log3]
display(df_sensitivity)

Total mean: 13334.775762754067
Total mean: 141001.68617699275
Total mean: 152944.89776986823
Total mean: 4.5150038845887375


Unnamed: 0,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_total
Original Sensitivity,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.01,144.951758,1292.933779,1585.325674,0.041622
Epsilon = 0.2,144.951758,1292.933779,1585.325674,0.041622
Epsilon = np.log(2),144.951758,1292.933779,1585.325674,0.041622
Epsilon = np.log(3),144.951758,1292.933779,1585.325674,0.041622


### Errors

In [76]:
orig_mean = colunas['past_avg_amount_annuity'].mean()
noise_mean = noise_annuity.mean()
print(percentage_error(orig_mean, noise_mean))

-4.207459748923787


##### Mean of people age of infringements

In [120]:
df = infri[['infringed', 'age']]
display(df)

Unnamed: 0,infringed,age
0,1,25
1,0,45
2,0,52
3,0,52
4,0,54
...,...,...
307506,0,25
307507,0,56
307508,0,41
307509,1,32


In [121]:
analyse = df[df['infringed'] == 1]
original_count = len(analyse)

l = []
mean_age = analyse['age'].mean()
print("Total mean:", mean_age)
for i in range(original_count):
    aux = analyse.drop(analyse.index[i])
    l.append(abs(mean_age - aux['age'].mean()))

sensitivity_age = max(l)
print("Sensitivity: ", sensitivity_age)


Total mean: 40.279959718026184
Sensitivity:  0.0011166629182213228


##### Mean of people age of infringements

In [129]:
dp_count = add_laplace_noise(original_count, sensitivity_age, 0.01)
print(original_count, dp_count, percentage_error(original_count, dp_count))

24825 24824.946790178597 0.0002143396632530148
