# CS211: Data Privacy
## Final Project
### Nikhil and James

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names, which newer releases no longer accept. Use the
# new name when this install provides it, otherwise fall back to the old one.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

# Preliminary utility functions
def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed by a single draw of zero-mean Laplace noise.

    Args:
        v: the exact query answer to perturb.
        sensitivity: sensitivity of the query that produced `v`.
        epsilon: privacy budget; the noise scale is `sensitivity / epsilon`,
            so a smaller epsilon means more noise.

    Returns:
        `v` plus one sample from Laplace(0, sensitivity / epsilon).
    """
    noise_scale = sensitivity / epsilon
    laplace_noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + laplace_noise

def laplace_mech_vec(qs, sensitivity, epsilon):
    """Apply `laplace_mech` independently to every answer in `qs`.

    Each entry is noised with the same `sensitivity` and the full `epsilon`;
    this function does not split the budget across entries.

    Args:
        qs: iterable of exact query answers.
        sensitivity: sensitivity passed through to `laplace_mech`.
        epsilon: privacy budget passed through to `laplace_mech`.

    Returns:
        A list of noisy answers, in the same order as `qs`.
    """
    noisy_answers = []
    for answer in qs:
        noisy_answers.append(laplace_mech(answer, sensitivity, epsilon))
    return noisy_answers

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` perturbed by a single draw of zero-mean Gaussian noise.

    Args:
        v: the exact query answer to perturb.
        sensitivity: sensitivity of the query that produced `v`.
        epsilon: privacy budget (divides the noise scale).
        delta: failure probability used in the noise calibration.

    Returns:
        `v` plus one sample from Normal(0, sigma), where
        sigma = sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    gaussian_noise = np.random.normal(loc=0, scale=sigma)
    return v + gaussian_noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add an independent zero-mean Gaussian noise draw to every entry of `vec`.

    Args:
        vec: iterable of exact values.
        sensitivity: sensitivity used in the noise calibration.
        epsilon: privacy budget (divides the noise scale).
        delta: failure probability used in the noise calibration.

    Returns:
        A list with one noisy value per entry of `vec`, in the same order.
        Every draw uses sigma = sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon.
    """
    noisy_values = []
    for value in vec:
        # Same calibration for every entry; kept inside the loop so that an
        # empty `vec` never evaluates the formula at all.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        noisy_values.append(value + np.random.normal(loc=0, scale=sigma))
    return noisy_values

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to `orig`.

    The magnitude of `orig` is used as the denominator so the result is
    non-negative even when the exact value is negative (for example a sum of
    a column that contains negative entries).

    Args:
        orig: exact value (scalar or numpy array).
        priv: noisy value, broadcastable against `orig`.

    Returns:
        |orig - priv| / |orig| * 100. A zero `orig` is not guarded and
        produces inf or nan under NumPy's division rules.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Credit-card default records for clients in Taiwan during 2005.
# The CSV is read straight from the project's GitHub repository, so running
# this cell requires network access.
cc_default_db = pd.read_csv('https://raw.githubusercontent.com/nikhilchoppa/cs211-final-project/main/UCI_Credit_Card.csv')

  plt.style.use('seaborn-whitegrid')


In [2]:
# Preview the first rows to check that the CSV loaded with the expected columns
cc_default_db.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


### There are 25 variables in the `cc_default_db` Dataset:

- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
- SEX: Gender (1=male, 2=female)
- EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
- MARRIAGE: Marital status (1=married, 2=single, 3=others)
- AGE: Age in years
- PAY_1: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- default.payment.next.month: Default payment (1=yes, 0=no)

## Privacy Budget
Define the privacy budget in terms of $\epsilon$.

In [3]:
# Notebook-wide privacy budget, read as a global by `dp()`.
# The noise scale is sensitivity / epsilon, so a smaller value means more noise.
epsilon = 0.1

Adding Differential Privacy to `AGE, LIMIT_BAL, BILL_AMT, PAY_AMT`

In [5]:
def dp(columns=('AGE', 'LIMIT_BAL', 'BILL_AMT', 'PAY_AMT'), sensitivity=1, eps=None):
    """Draw one zero-mean Laplace noise sample per entry of `columns`.

    This function returns only the noise. It does not read any column of the
    dataset and does not add the noise to anything; callers must add each
    draw to the query answer they want to protect.

    Args:
        columns: sequence of column names the draws are intended for. Only its
            length is used: one sample is produced per name.
        sensitivity: sensitivity of the query being protected. The default of 1
            matches the previously hard-coded value.
            NOTE(review): 1 suits counting queries; sums or means over these
            columns would need a sensitivity derived from clipping bounds.
        eps: privacy budget. When None, the module-level `epsilon` is used,
            which is what the original zero-argument call did.

    Returns:
        1-D numpy array of `len(columns)` samples from
        Laplace(0, sensitivity / eps).
    """
    if eps is None:
        # Fall back to the notebook-wide privacy budget
        eps = epsilon

    # One independent draw per listed column name
    return np.random.laplace(0, sensitivity / eps, (len(columns),))

# Show one set of noise draws; no random seed is set in the cells above,
# so the values differ on every run
dp()

array([ 0.19685231,  3.2577291 , -0.17727136,  1.2171817 ])

## Calculating non DP statistics
These exact (non-private) results serve as a baseline for showing how adding noise affects the queries.

In [6]:
# Exact (non-private) statistics for every column except the 'ID' identifier:
# value frequencies, column total and column mean, each keyed by column name.
col_val_counts = {}
col_sums = {}
col_averages = {}
for col in cc_default_db.columns:
    if col == 'ID':
        # The identifier column is skipped in all three statistics
        continue
    column_values = cc_default_db[col]
    col_val_counts[col] = column_values.value_counts()
    col_sums[col] = column_values.sum()
    col_averages[col] = column_values.mean()

## Format database in order to calculate some statistics
### ...Merge all the columns that carry a month index (`PAY_#`, `BILL_AMT#`, `PAY_AMT#`)
The numeric suffix only marks the month a value belongs to, so the data is easier to work with once each family (PAY, BILL_AMT, and PAY_AMT) is collected into a single list-valued column.
The time complexity of the merge below is a little unreasonable.

In [7]:
# Build the six per-month column names for each family of monthly columns.
# Repayment-status columns have an underscore before the month index
# ('PAY_1'), while the amount columns do not ('BILL_AMT1', 'PAY_AMT1').
merge_cols = [
    [prefix + ('_' if prefix == 'PAY' else '') + str(month) for month in range(1, 7)]
    for prefix in ['PAY', 'BILL_AMT', 'PAY_AMT']
]

In [8]:
# For each family of monthly columns, collect every row's six monthly values
# into one list. fresh_cols[k][r] is the list for row r of family merge_cols[k].
#
# Each family is pulled out as a 2-D array in a single call rather than doing
# one Series lookup per cell in nested Python loops, which is very slow on a
# frame of this size. Rows are taken positionally, which matches the original
# label lookup on the default 0..n-1 index produced by read_csv.
# NOTE(review): assumes the six columns of a family share one dtype (the
# previews above show ints for PAY_* and floats for the amount columns);
# otherwise to_numpy() would upcast the values.
fresh_cols = []
for col in merge_cols:
    month_matrix = cc_default_db[col].to_numpy()
    fresh_cols.append([list(month_values) for month_values in month_matrix])

In [9]:
# Replace the 18 per-month columns with the three list-valued columns.
# The frame is modified in place, so re-running this cell raises a KeyError
# because the per-month columns are already gone.
monthly_column_names = [name for family in merge_cols for name in family]
cc_default_db.drop(monthly_column_names, axis=1, inplace=True)
cc_default_db['PAY'] = fresh_cols[0]
cc_default_db['BILL_AMT'] = fresh_cols[1]
cc_default_db['PAY_AMT'] = fresh_cols[2]

In [10]:
# Preview the reshaped frame: PAY, BILL_AMT and PAY_AMT now hold one list per row
cc_default_db.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,default.payment.next.month,PAY,BILL_AMT,PAY_AMT
0,1,20000.0,2,2,1,24,1,"[2, 2, -1, -1, -2, -2]","[3913.0, 3102.0, 689.0, 0.0, 0.0, 0.0]","[0.0, 689.0, 0.0, 0.0, 0.0, 0.0]"
1,2,120000.0,2,2,2,26,1,"[-1, 2, 0, 0, 0, 2]","[2682.0, 1725.0, 2682.0, 3272.0, 3455.0, 3261.0]","[0.0, 1000.0, 1000.0, 1000.0, 0.0, 2000.0]"
2,3,90000.0,2,2,2,34,0,"[0, 0, 0, 0, 0, 0]","[29239.0, 14027.0, 13559.0, 14331.0, 14948.0, ...","[1518.0, 1500.0, 1000.0, 1000.0, 1000.0, 5000.0]"
3,4,50000.0,2,2,1,37,0,"[0, 0, 0, 0, 0, 0]","[46990.0, 48233.0, 49291.0, 28314.0, 28959.0, ...","[2000.0, 2019.0, 1200.0, 1100.0, 1069.0, 1000.0]"
4,5,50000.0,1,2,1,57,0,"[-1, 0, -1, 0, 0, 0]","[8617.0, 5670.0, 35835.0, 20940.0, 19146.0, 19...","[2000.0, 36681.0, 10000.0, 9000.0, 689.0, 679.0]"
