# CS211: Data Privacy
## Final Project
### Nikhil and James

In [215]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import random
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'; the old
# names were deprecated and later removed. Use whichever name this install provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

# Preliminary utility functions
def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed by Laplace noise of scale `sensitivity / epsilon`.

    Args:
        v: the true query answer (scalar or array-like; one noise draw is added).
        sensitivity: sensitivity of the query that produced `v`.
        epsilon: privacy parameter for this release.

    Returns:
        `v` plus a single sample from Laplace(0, sensitivity / epsilon).
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(qs, sensitivity, epsilon):
    """Apply `laplace_mech` independently to every answer in `qs`.

    Args:
        qs: iterable of true query answers.
        sensitivity: sensitivity passed to `laplace_mech` for each answer.
        epsilon: privacy parameter passed to `laplace_mech` for each answer.

    Returns:
        A list of noisy answers, in the same order as `qs`.
    """
    noisy_answers = []
    for answer in qs:
        noisy_answers.append(laplace_mech(answer, sensitivity, epsilon))
    return noisy_answers

def pct_error(orig, priv):
    """Return the absolute percent error of `priv` relative to `orig`.

    The denominator is |orig| so the result is non-negative even when the
    reference value is negative (dividing by a signed `orig` would flip the sign).

    Args:
        orig: the true (non-private) value, scalar or numpy array.
        priv: the differentially private estimate, same shape as `orig`.

    Returns:
        |orig - priv| / |orig| * 100. A zero `orig` is not special-cased.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Default-of-credit-card-clients data for Taiwan (2005), one row per client.
# The CSV is read straight from the project's GitHub repository, so this needs network access.
cc_default_db = pd.read_csv('https://raw.githubusercontent.com/nikhilchoppa/cs211-final-project/main/UCI_Credit_Card.csv')

  plt.style.use('seaborn-whitegrid')


In [216]:
# Preview the first rows to sanity-check that the CSV loaded as expected
cc_default_db.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


### There are 25 variables in the `cc_default_db` Dataset:

- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
- SEX: Gender (1=male, 2=female)
- EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
- MARRIAGE: Marital status (1=married, 2=single, 3=others)
- AGE: Age in years
- PAY_1 (named PAY_0 in some descriptions of this dataset): Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- default.payment.next.month: Default payment (1=yes, 0=no)

## Generalize ages
A good first step toward privacy, but generalization only helps satisfy k-anonymity; differential privacy will require additional work.

In [217]:
# Generalization depth per column: a depth of n zeroes out the last n digits,
# i.e. each value is truncated to a multiple of 10**n (AGE 37 -> 30 for n = 1)
depths = {
    'AGE': 1
}

# Same generalization approach as HW2 / the textbook, applied to the AGE column.
# Note that this overwrites AGE in place, so later cells only see the generalized ages.
age_bucket = 10 ** depths['AGE']
cc_default_db['AGE'] = cc_default_db['AGE'].apply(lambda age: int(int(age / age_bucket) * age_bucket))

## Differential Privacy
### Choosing a privacy budget
Define the privacy budget in terms of epsilon $\epsilon$, keeping in mind that each row is an individual and allocating the budget in terms of the individual. An epsilon of 0.1 should be decent in this case but might change depending on how many queries we end up including throughout the document.

In [218]:
# Privacy parameter; read as a module-level global by the DP helper functions in this notebook
epsilon = 0.1

Create differentially private functions to easily achieve privacy

In [219]:
# Applying the sparse vector technique to our differentially private statistics
# This will have to be done for each column in BILL_AMT1-6 and PAY_AMT1-6
# Takes a long time to run, but generates the correct b values!
def sparse_vector_averages(col):
    """Differentially private sum and average of one column of `cc_default_db`.

    The clipping bound is chosen with the sparse vector technique (AboveThreshold)
    over a stream of "how much does the clipped sum still change at bound b" queries.
    The global `epsilon` is split in thirds: one for AboveThreshold, one for the
    noisy clipped sum and one for the noisy count.

    Args:
        col: name of a numeric column in `cc_default_db`.

    Returns:
        [noisy_sum, noisy_average] for the column, with values clipped to [0, final_b].
    """
    # preserves epsilon-differential privacy
    def above_threshold(queries, df, T, epsilon):
        """Return the index of the first query whose noisy answer reaches the noisy threshold."""
        T_hat = T + np.random.laplace(loc=0, scale=2/epsilon)
        for idx, q in enumerate(queries):
            nu_i = np.random.laplace(loc=0, scale=4/epsilon)
            if q(df) + nu_i >= T_hat:
                return idx
        # if the algorithm "fails", return a random index
        # more convenient in certain use cases
        return random.randint(0, len(queries)-1)

    def create_query(b):
        # Bind `b` per query. A bare `lambda df: ... for b in bs` closes over the loop
        # variable itself, so every query would be evaluated with the LAST value of b
        # and the chosen clipping bound would be essentially arbitrary.
        return lambda df: df.clip(lower=0, upper=b).sum() - df.clip(lower=0, upper=b+1).sum()

    df = cc_default_db[col]
    # Construct the stream of queries, the more the better even though this has a huge impact on runtime
    bs = range(1, 150000, 10)
    queries = [create_query(b) for b in bs]

    # Run AboveThreshold, using 1/3 of the privacy budget, to find a good clipping parameter
    epsilon_svt = epsilon / 3
    final_b = bs[above_threshold(queries, df, 0, epsilon_svt)]

    # Compute the noisy sum and noisy count, using 1/3 of the privacy budget for each
    epsilon_sum = epsilon / 3
    epsilon_count = epsilon / 3

    # After clipping to [0, final_b], one row changes the sum by at most final_b
    noisy_sum = laplace_mech(df.clip(lower=0, upper=final_b).sum(), final_b, epsilon_sum)
    noisy_count = laplace_mech(len(df), 1, epsilon_count)
    noisy_average = noisy_sum / noisy_count

    return [noisy_sum, noisy_average]

def dp_averages(col):
    """Differentially private sum/average of a column, clipping bound found by a power-of-ten scan.

    Tries clipping bounds 1, 10, ..., 10**9 in order and stops at the first bound where
    the noisy average moves by less than 1% of the bound compared with the previous
    bound. The global `epsilon` is split evenly across the single noisy count and the
    (up to ten) noisy sums.

    Args:
        col: name of a numeric column in `cc_default_db`.

    Returns:
        [noisy_sum, average]. On convergence `noisy_sum` is the sum at the current
        bound and `average` is the noisy average from the PREVIOUS bound. If the scan
        never converges, returns [-1, last noisy average].
    """
    column = cc_default_db[col]
    clip_bounds = [10**power for power in range(10)]
    # One share of the budget per candidate bound, plus one for the count
    per_query_epsilon = epsilon / (len(clip_bounds) + 1)

    noisy_count = laplace_mech(len(column), 1, per_query_epsilon)
    previous_avg = -1000
    for bound in clip_bounds:
        clipped_total = column.clip(lower=0, upper=bound).sum()
        noisy_sum = laplace_mech(clipped_total, bound, per_query_epsilon)
        current_avg = noisy_sum / noisy_count

        # Stop once raising the bound no longer changes the average noticeably
        if abs(current_avg - previous_avg) < .01*bound:
            return [noisy_sum, previous_avg]
        previous_avg = current_avg

    return [-1, previous_avg]

# Plain Laplace mechanism for the counts of the other columns.
# Kept as a one-argument callable so it can be handed to pandas' `.apply`.
def f(x):
    """Return `x` plus one Laplace(0, 1/epsilon) noise draw (global `epsilon`)."""
    return x + np.random.laplace(loc=0, scale=1/epsilon)

def dp_education_counts():
    """Return the EDUCATION value counts with `f` applied to each count."""
    education_histogram = cc_default_db['EDUCATION'].value_counts()
    return education_histogram.apply(f)

## Calculating non DP statistics
These non-private values are the baseline used to show how adding noise affects the queries.

In [220]:
# Baseline (non-private) statistics for every column except ID,
# which is skipped because it is only a row identifier
stat_columns = [name for name in cc_default_db.columns if name != 'ID']

# Histogram of values per column
col_counts = {name: cc_default_db[name].value_counts() for name in stat_columns}

# Column totals
col_sums = {name: cc_default_db[name].sum() for name in stat_columns}

# Column means
col_averages = {name: cc_default_db[name].mean() for name in stat_columns}

## Comparing Differentially Private and non DP statistics
This will involve looking at the differences in the histograms for the LIMIT_BAL and AGE columns, while the percent error will be contrasted between the BILL_AMT and PAY_AMT statistics.

In [221]:
def _column_family_averages(prefix, dp_average_fn):
    """Return (true averages, DP averages, percent errors) for every column containing `prefix`.

    `dp_average_fn` is `dp_averages` or `sparse_vector_averages`; element [1] of its
    result is the DP average. The three lists follow the column order of `cc_default_db`.
    """
    true_avgs, private_avgs, errors = [], [], []
    for col in cc_default_db.columns:
        if prefix in col:
            true_avgs.append(np.mean(cc_default_db[col]))
            private_avgs.append(dp_average_fn(col)[1])
            errors.append(pct_error(true_avgs[-1], private_avgs[-1]))
    return true_avgs, private_avgs, errors


def _conditional_dp_average(subset, prefix, dp_average_fn):
    """Mean of the per-column DP averages, computed over the rows of `subset` only.

    `dp_averages` and `sparse_vector_averages` read the module-level `cc_default_db`
    instead of taking a frame argument, so the global is temporarily rebound to
    `subset` and always restored. Filtering the frame and then passing only column
    names (as before) silently computed the statistic over ALL clients.
    """
    global cc_default_db
    full_db = cc_default_db
    cc_default_db = subset
    try:
        per_column = [dp_average_fn(col)[1] for col in subset.columns if prefix in col]
    finally:
        cc_default_db = full_db
    return sum(per_column) / len(per_column)


# Counts of most common education level
print(dp_education_counts())

# Average client age (AGE was generalized in place earlier, so the baseline is over generalized ages)
age_average = dp_averages('AGE')[1]
reg_age_average = np.mean(cc_default_db['AGE'])
print('Average Age:', age_average)

# Compare the percent error of age average
age_average_error = pct_error(reg_age_average, age_average)
print('PCT ERROR for DP age column:', age_average_error)

# Average limit balance of a credit card clients accounts
limit_bal_average = sparse_vector_averages('LIMIT_BAL')[1]
reg_limit_bal_average = np.mean(cc_default_db['LIMIT_BAL'])
print(limit_bal_average)

# Compare the percent error for limit balance average
limit_bal_average_error = pct_error(reg_limit_bal_average, limit_bal_average)
print('PCT ERROR for DP limit_bal column:', limit_bal_average_error)

# Average monthly bill amount
reg_bill_averages, bill_averages, bill_averages_errors = _column_family_averages('BILL_AMT', sparse_vector_averages)
print(bill_averages)
print('PCT ERROR for bill_amt columns:', bill_averages_errors)

# Average monthly payment amount
reg_pay_averages, pay_averages, pay_averages_errors = _column_family_averages('PAY_AMT', dp_averages)
print(pay_averages)
print('PCT ERROR for pay_amt columns:', pay_averages_errors)

# Average monthly bill amount for credit clients in higher education (EDUCATION = 1 or 2)
# NOTE(review): the per-column averages are already differentially private, so the
# extra laplace_mech call below is post-processing noise rather than a privacy
# requirement; kept to match the reported number of mechanism invocations.
higher_ed_avg_db = cc_default_db[cc_default_db['EDUCATION'] <= 2]
higher_ed_bill_average = _conditional_dp_average(higher_ed_avg_db, 'BILL_AMT', sparse_vector_averages)
higher_ed_bill_average = laplace_mech(higher_ed_bill_average, 1, epsilon)
print(higher_ed_bill_average)

# Contrast average monthly payments between sexes
# This is going to be two different queries, one with SEX=1 and the other SEX=2
male_pay_average = _conditional_dp_average(cc_default_db[cc_default_db['SEX'] == 1], 'PAY_AMT', dp_averages)
male_pay_average = laplace_mech(male_pay_average, 1, epsilon)
print(male_pay_average)

female_pay_average = _conditional_dp_average(cc_default_db[cc_default_db['SEX'] == 2], 'PAY_AMT', dp_averages)
female_pay_average = laplace_mech(female_pay_average, 1, epsilon)
print(female_pay_average)

2    14027.301399
1    10586.564091
3     4912.121562
5      275.720706
4      105.269538
6       44.002262
0        4.293737
Name: EDUCATION, dtype: float64
Average Age: 31.20976557848411
PCT ERROR for DP age column: 1.1443196882924616
70236.44063304328
PCT ERROR for DP limit_bal column: 58.06387158227915
[6747.682384134839, 40909.1362246198, 40122.50602006076, 29700.003265331543, 30749.945552836445, 17753.207764314455]
PCT ERROR for bill_amt columns: [86.82693556709948, 16.815970845365136, 14.656852553828688, 31.35002588886192, 23.718985658019058, 54.32877857439548]
[5237.268270952111, 5792.280070997873, 4416.8381477934045, 4076.447844183984, 4061.0487066776204, 4770.83403813229]
PCT ERROR for pay_amt columns: [7.527256459900034, 2.17665715533994, 15.47823670858232, 15.532886093470886, 15.384023610172793, 8.525899908023131]
27613.987430139972
4939.084253846592
4893.4072704546725


In [222]:
# Counts of an individual’s marital status in comparison to if they default
default_marriage_ct = pd.crosstab(cc_default_db['MARRIAGE'], cc_default_db['default.payment.next.month'])
# DataFrame.apply(f) hands f one whole column at a time, so a single Laplace draw was
# shared by every cell in that column, leaving the differences between cells exact.
# Draw independent Laplace(0, 1/epsilon) noise for every cell instead.
default_marriage_ct = default_marriage_ct + np.random.laplace(loc=0, scale=1/epsilon, size=default_marriage_ct.shape)

## Generate the PDF writeup document

In [224]:
from pylatex import Document, Section, Subsection, Command, Math, NoEscape, MediumText, Subsubsection
from pylatex.utils import italic, NoEscape, verbatim

# Months behind the *_AMT1..6 columns, most recent first; this matches the order in
# which the per-column average lists were filled
REPORT_MONTHS = ['September', 'August', 'July', 'June', 'May', 'April']

# Release the education histogram once and reuse it. The report previously called
# `education_counts()`, which is not defined anywhere in this notebook.
education_counts_dp = dp_education_counts()

doc = Document()

doc.preamble.append(Command('title', 'Differentially Private Credit Card Clients Report from Taiwan'))
doc.preamble.append(Command('date', NoEscape(r'\today')))
doc.append(NoEscape(r'\maketitle'))
with doc.create(Section('Discussion')):
    doc.append('The database can be found here: https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset\n')
    doc.append('The github can be found here: https://github.com/nikhilchoppa/cs211-final-project\n')
    doc.append('Our project video can be found here: https://youtu.be/zT8L7tV1GQ8\n')
    doc.append('It should be noted that reported dollar amounts are in the New Tiawan dollar')
    with doc.create(Subsection('Privacy Budget')):
        doc.append(f'Overall in the document our epsilon value is {epsilon}, and from since we invoke the laplace mechanism 33 times, through sequential composition our total privacy budget would be 3.3')
with doc.create(Section('Statistics')):
    with doc.create(Subsection('Basic Averages')):
        doc.append('The bill_amt and limit_bal differentially private average statistics were generated using the sparse vector technique to determine a clipping parameter for the data, and then generating differentially private sums and counts to find a differentially private average.')
        doc.append('Whereas the remaining averages and counts were calculated using the good old laplace mechanism or in the case of the averages, we used the method for choosing a clipping which does not require thousands of queries as displayed in Homework 4')

        # Average credit card clients age
        with doc.create(Subsubsection('Average Age')):
            doc.append(f'The average age of all credit card clients is {round(reg_age_average, 2)}, the differentially private average age of all customers is {round(age_average, 2)}. This gives an error of {round(age_average_error, 2)}%.')

        # Average limit balance of a credit card clients accounts
        with doc.create(Subsubsection('Average Credit Limit Balance')):
            doc.append(f'The average credit card balance limit of all credit card customers is {round(reg_limit_bal_average, 2)}, the differentially private average credit card balance limit of all customers is {round(limit_bal_average, 2)}. This gives an error of {round(limit_bal_average_error, 2)}%.')

        # Average monthly bill amount, one sentence per month
        with doc.create(Subsubsection('Average Bill Amount')):
            for month, reg_avg, dp_avg, err in zip(REPORT_MONTHS, reg_bill_averages, bill_averages, bill_averages_errors):
                doc.append(f'The average bill amount of all credit card customers for the month of {month} is {round(reg_avg, 2)}, the differentially private average bill amount of all customers is {round(dp_avg, 2)}. This gives an error of {round(err, 2)}%.')

        # Average monthly payment amount, one sentence per month
        with doc.create(Subsubsection('Average Pay Amount')):
            for month, reg_avg, dp_avg, err in zip(REPORT_MONTHS, reg_pay_averages, pay_averages, pay_averages_errors):
                doc.append(f'The average pay amount of all credit card customers for the month of {month} is {round(reg_avg, 2)}, the differentially private average pay amount of all customers is {round(dp_avg, 2)}. This gives an error of {round(err, 2)}%.')

    with doc.create(Subsection('Basic Counts')):
        doc.append('The follwing statistics were generated using the value counts function and applying the laplace mechanism as a lambda function to preserve the dataframe.')

        # Counts of most common education level
        with doc.create(Subsubsection('Education Levels')):
            doc.append(f'The most common education level as determined by using a differentially private method is {education_counts_dp}.')

    with doc.create(Subsection('Conditional Averages')):

        # Contrast average monthly payments between sexes
        with doc.create(Subsubsection('Average Monthly Payments of Male credit card clients')):
            doc.append(f'The average monthly credit card payments of male customers is {round(male_pay_average, 2)} with differential privacy applied.')
        with doc.create(Subsubsection('Average Monthly Payments of Female credit card customers')):
            doc.append(f'The average monthly credit card payments of female customers is is {round(female_pay_average, 2)} with differential privacy applied.')

        # Average monthly bill amount for credit clients in higher education (EDUCATION = 1 or 2)
        with doc.create(Subsubsection('Average Bill Amount credit clients in Higher Education')):
            doc.append(f'The average bill amount of all credit card customers in Higher Education is {round(higher_ed_bill_average, 2)} with differential privacy applied.')

    # Counts of an individual’s marital status in comparison to if they default
    with doc.create(Subsection('Conditional Counts')):
        with doc.create(Subsubsection('Most Common Marital Status with defalt \'YES\'')):
            doc.append(f'The comparison of marital status with if they HAVE defaulted using a differentially private method is {default_marriage_ct[1]}.')
        with doc.create(Subsubsection('Most Common Marital Status with defalt \'NO\'')):
            doc.append(f'The comparison of marital status with if they HAVE NOT defaulted using a differentially private method is {default_marriage_ct[0]}.')

# To generate pdf file you need to install latexmk
doc.generate_pdf('final_report', clean_tex=False)

# Write the LaTeX source next to the PDF. Mode "w" (not "a") so re-running the cell
# replaces the file instead of appending a second copy of the document; the
# misspelled "fianl_report.tex" name is corrected, and the context manager closes
# the file even if the write fails.
with open("final_report.tex", "w") as out:
    out.write(doc.dumps())