In [73]:
import pandas as pd
from mechanisms import TrivialMechanism, LaplaceMechanism
from attacks import cn19

In [74]:
# Read the hospital records and discard the 'Unnamed: 0' column
# (presumably an index that was saved into the CSV -- verify against the file).
raw_data = pd.read_csv("hospital_data.csv").drop(columns='Unnamed: 0')
raw_data.head()

Unnamed: 0,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,Type of Admission,Patient Disposition,Discharge Year,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality
0,18 to 29,100,F,White,Spanish/Hispanic,4,Urgent,Home or Self Care,2015,189,134,540,14,1,Minor
1,0 to 17,112,F,Black/African American,Not Span/Hispanic,3,Elective,Home or Self Care,2015,58,71,421,10,3,Minor
2,0 to 17,104,M,White,Spanish/Hispanic,1,Elective,Home or Self Care,2015,124,30,93,3,2,Minor
3,50 to 69,115,F,White,Not Span/Hispanic,6,Elective,Home or Self Care,2015,14,78,221,6,2,Minor
4,50 to 69,100,F,White,Spanish/Hispanic,39,Elective,Expired,2015,237,49,4,5,4,Extreme


In [75]:
# Integer-encode every column for the LP.
# factor_dicts keeps, per column, the original values indexed by their integer
# code, so the codes can be decoded again later.
factor_dicts = {}
encoded = {}
for col in raw_data.columns:
    codes, uniques = pd.factorize(raw_data[col])
    encoded[col] = codes
    factor_dicts[col] = uniques
data = pd.DataFrame(encoded)

column_names = data.columns.tolist()
data.head()

Unnamed: 0,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,Type of Admission,Patient Disposition,Discharge Year,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,0
2,1,2,1,0,0,2,1,0,0,2,2,2,2,2,0
3,2,3,0,0,1,3,1,0,0,3,3,3,3,2,0
4,2,0,0,0,0,4,1,1,0,4,4,4,4,3,1


In [81]:
# The multi-column setting is modelled as one mechanism per column, stored in
# dictionaries keyed by column name.
epsilon = 1
trivial_mechs, laplace_mechs = {}, {}
for col in data.columns:
    # Number of distinct values minus one, i.e. the largest integer code in the column
    sensitivity = len(factor_dicts[col]) - 1
    trivial_mechs[col] = TrivialMechanism(data[col].values)
    laplace_mechs[col] = LaplaceMechanism(data[col].values, epsilon, sensitivity)

In [82]:
def attack(column_names, mech_dict, factor_dicts, verbose=False, n_queries=1000):
    """
    Uses the CN19 attack to reconstruct the database, one column at a time.

    We assume that the attacker can access the column names, and the key to
    the integer factors, but nothing else.

    Parameters
    ----------
    column_names : iterable of str
        Columns to reconstruct, in the order they should appear in the result.
    mech_dict : dict
        Maps each column name to the mechanism that is handed to `cn19`.
    factor_dicts : dict
        Maps each column name to the index of original values, positioned by
        integer code; used to turn reconstructed codes back into categories.
    verbose : bool, optional
        If True, print a progress line before each column is attacked.
    n_queries : int, optional
        Value returned by the `gen_t` callback given to `cn19`, i.e. the
        number of queries used per column. Defaults to 1000, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per entry of `column_names`, holding the decoded
        reconstruction of that column.
    """
    reconstructed = pd.DataFrame()
    for col in column_names:
        if verbose:
            print(f"Reconstructing {col}...\n")
        keys = factor_dicts[col]
        # gen_t ignores its argument and always yields the fixed query budget.
        # k is the largest integer code of the column (codes run 0..len(keys)-1).
        raw_col = cn19(mech_dict[col], gen_t=lambda n: n_queries, k=len(keys) - 1)
        # Convert the ints back into categorical data
        reconstructed[col] = keys.take(raw_col)
    return reconstructed
    
    

In [78]:
# Run the reconstruction against the trivial mechanisms
trivial_res = attack(
    column_names=column_names,
    mech_dict=trivial_mechs,
    factor_dicts=factor_dicts,
    verbose=True,
)

Reconstructing Age Group...

Reconstructing Zip Code - 3 digits...

Reconstructing Gender...

Reconstructing Race...

Reconstructing Ethnicity...

Reconstructing Length of Stay...

Reconstructing Type of Admission...

Reconstructing Patient Disposition...

Reconstructing Discharge Year...

Reconstructing CCS Diagnosis Code...

Reconstructing CCS Procedure Code...

Reconstructing APR DRG Code...

Reconstructing APR MDC Code...

Reconstructing APR Severity of Illness Code...

Reconstructing APR Risk of Mortality...



In [80]:
def agreement(result, original, n=None):
    """
    Measures how closely two dataframes agree, column by column.

    For every column of `result`, the rows whose value equals the value in the
    same column of `original` are counted and the count is divided by `n`.
    Missing values never compare equal, so they count as disagreement.

    Parameters
    ----------
    result : pandas.DataFrame
        Reconstructed data; its columns determine which columns are scored.
    original : pandas.DataFrame
        Reference data. Must contain every column of `result`, with
        identically labelled rows.
    n : int, optional
        Denominator of the agreement fraction. Defaults to the number of rows
        in `result`.

    Returns
    -------
    dict
        Maps each column name of `result` to its agreement fraction
        (1.0 means every row matched when `n` is the row count).
    """
    if n is None:
        n = len(result)
    # Vectorised count of matching rows instead of a Python-level sum.
    return {col: (result[col] == original[col]).sum() / n for col in result}


# Fraction of rows recovered per column; the denominator is taken from the
# data instead of a hard-coded row count.
agreement(trivial_res, raw_data, len(raw_data))

{'Age Group': 1.0,
 'Zip Code - 3 digits': 0.701,
 'Gender': 1.0,
 'Race': 1.0,
 'Ethnicity': 1.0,
 'Length of Stay': 1.0,
 'Type of Admission': 1.0,
 'Patient Disposition': 1.0,
 'Discharge Year': 1.0,
 'CCS Diagnosis Code': 1.0,
 'CCS Procedure Code': 1.0,
 'APR DRG Code': 1.0,
 'APR MDC Code': 1.0,
 'APR Severity of Illness Code': 1.0,
 'APR Risk of Mortality': 1.0}

In [83]:
# Run the same reconstruction against the Laplace mechanisms
laplace_res = attack(
    column_names=column_names,
    mech_dict=laplace_mechs,
    factor_dicts=factor_dicts,
    verbose=True,
)

Reconstructing Age Group...

Reconstructing Zip Code - 3 digits...

Reconstructing Gender...

Reconstructing Race...

Reconstructing Ethnicity...

Reconstructing Length of Stay...

Reconstructing Type of Admission...

Reconstructing Patient Disposition...

Reconstructing Discharge Year...

Reconstructing CCS Diagnosis Code...

Reconstructing CCS Procedure Code...

Reconstructing APR DRG Code...

Reconstructing APR MDC Code...

Reconstructing APR Severity of Illness Code...

Reconstructing APR Risk of Mortality...



In [84]:
# Fraction of rows recovered per column; the denominator is taken from the
# data instead of a hard-coded row count.
agreement(laplace_res, raw_data, len(raw_data))

{'Age Group': 0.528,
 'Zip Code - 3 digits': 0.207,
 'Gender': 1.0,
 'Race': 0.837,
 'Ethnicity': 0.727,
 'Length of Stay': 0.072,
 'Type of Admission': 0.533,
 'Patient Disposition': 0.509,
 'Discharge Year': 1.0,
 'CCS Diagnosis Code': 0.011,
 'CCS Procedure Code': 0.032,
 'APR DRG Code': 0.021,
 'APR MDC Code': 0.109,
 'APR Severity of Illness Code': 0.745,
 'APR Risk of Mortality': 0.828}