In [1]:
import secrets
import string

import numpy as np
import pandas as pd
from cape_dataframes.pandas import dtypes
from diffprivlib.models import LogisticRegression as priv_LR, StandardScaler, DecisionTreeClassifier as priv_DTC
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier 

In [20]:
# Load the raw dataset, skipping every column whose header starts with 'Unnamed'.
df = pd.read_csv(
    './Assignment2Dataset-1.csv',
    header = 0,
    usecols = lambda column_name: not column_name.startswith('Unnamed'),
)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 60 non-null     object 
 1   Country              60 non-null     object 
 2   SSN                  60 non-null     object 
 3   DOB                  60 non-null     object 
 4   Income               60 non-null     int64  
 5   Sex                  60 non-null     object 
 6   Marital Status       60 non-null     object 
 7   Education            60 non-null     object 
 8   Loan                 60 non-null     object 
 9   House Status         60 non-null     object 
 10  Blood Type           60 non-null     object 
 11  Blood Pressure       60 non-null     object 
 12  Heart Rate           59 non-null     float64
 13  Oxygen Level         59 non-null     object 
 14  Medical Procedure    59 non-null     object 
 15  Smoking              58 non-null     objec

In [5]:
df.shape

(60, 20)

In [6]:
df.head()

Unnamed: 0,Name,Country,SSN,DOB,Income,Sex,Marital Status,Education,Loan,House Status,Blood Type,Blood Pressure,Heart Rate,Oxygen Level,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition
0,John Smith,USA,123-45-6789,7/15/1985,50000,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal
1,Emily Johnson,Canada,987-65-4321,12/2/1990,60000,Female,Single,Master's Degree,No,Rent,A-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal
2,Michael Davis,UK,456-78-9123,3/20/1978,75000,Male,Divorced,High School Diploma,Yes,Own,B+,130/85,75.0,97%,Colonoscopy,No,Yes,Cats,Yes,Abnormal
3,Jessica Martinez,Australia,789-12-3456,9/10/1982,40000,Female,Married,Associate's Degree,No,Own,AB-,115/75,70.0,99%,Mammogram,No,No,Dust,No,Normal
4,David Thompson,USA,234-56-7890,6/25/1995,35000,Male,Single,Some College,No,Rent,O-,125/80,68.0,97%,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal


In [7]:
def randomize_values(df, columns, length = 10):
    """Overwrite the given columns with random alphanumeric placeholder strings.

    Every row receives an independently drawn string, so the new values carry
    no information about the originals and cannot be reversed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to anonymize. It is modified in place.
    columns : str or iterable of str
        Column name, or names, to overwrite.
    length : int, default 10
        Number of characters in each generated string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    alphanumeric_chars = np.array(list(string.ascii_letters + string.digits))
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if isinstance(columns, str):
        columns = [columns]
    for column in columns:
        # Fail loudly on a typo instead of silently creating a new column.
        if column not in df.columns:
            raise KeyError(column)
        # One vectorised draw per column instead of one NumPy call per row.
        drawn = np.random.choice(alphanumeric_chars, size = (len(df), length))
        df[column] = [''.join(row) for row in drawn]
    return df

In [8]:
# Name and SSN provide no value for the model or analysis, so we simply anonymize them
df = randomize_values(df, ["Name", "SSN"])

In [9]:
df.head()

Unnamed: 0,Name,Country,SSN,DOB,Income,Sex,Marital Status,Education,Loan,House Status,Blood Type,Blood Pressure,Heart Rate,Oxygen Level,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition
0,LC6QhRjKT8,USA,MBA7r8vlEZ,7/15/1985,50000,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal
1,wCHS1mybsK,Canada,Rt08QDh9qM,12/2/1990,60000,Female,Single,Master's Degree,No,Rent,A-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal
2,1vHcOXdkG8,UK,srQyLZtF3L,3/20/1978,75000,Male,Divorced,High School Diploma,Yes,Own,B+,130/85,75.0,97%,Colonoscopy,No,Yes,Cats,Yes,Abnormal
3,PWteECvtqG,Australia,PqblM15Vy8,9/10/1982,40000,Female,Married,Associate's Degree,No,Own,AB-,115/75,70.0,99%,Mammogram,No,No,Dust,No,Normal
4,uSGTiTOIGT,USA,GIMXK2aKsQ,6/25/1995,35000,Male,Single,Some College,No,Rent,O-,125/80,68.0,97%,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal


In [10]:
from cape_dataframes.pandas.transformations import ReversibleTokenizer, Tokenizer, DateTruncation, \
                                                    DatePerturbation, NumericPerturbation, \
                                                    NumericRounding

In [11]:
# Convert the two string-encoded vitals into numeric columns:
#   'Oxygen Level'   "98%"    -> 98.0
#   'Blood Pressure' "120/80" -> 'Systolic Pressure' 120.0, 'Diastolic Pressure' 80.0
# The converted columns are appended after the remaining original columns.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.concat([
            df.drop(columns = ['Blood Pressure', 'Oxygen Level']),
            df['Oxygen Level'].str.strip('%').astype(float),
            df['Blood Pressure'].str.split('/', expand = True)\
                                .replace({"": np.nan}).astype('float')\
                                .rename(columns = {0 : 'Systolic Pressure', 1 : 'Diastolic Pressure'})
    ],
    axis = 1
)

In [12]:
df.head()

Unnamed: 0,Name,Country,SSN,DOB,Income,Sex,Marital Status,Education,Loan,House Status,...,Heart Rate,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition,Oxygen Level,Systolic Pressure,Diastolic Pressure
0,LC6QhRjKT8,USA,MBA7r8vlEZ,7/15/1985,50000,Male,Married,Bachelor's Degree,Yes,Own,...,72.0,Appendectomy,No,No,Pollen,Yes,Normal,98.0,120.0,80.0
1,wCHS1mybsK,Canada,Rt08QDh9qM,12/2/1990,60000,Female,Single,Master's Degree,No,Rent,...,68.0,Laser Eye Surgery,No,Yes,Shellfish,No,Normal,96.0,110.0,70.0
2,1vHcOXdkG8,UK,srQyLZtF3L,3/20/1978,75000,Male,Divorced,High School Diploma,Yes,Own,...,75.0,Colonoscopy,No,Yes,Cats,Yes,Abnormal,97.0,130.0,85.0
3,PWteECvtqG,Australia,PqblM15Vy8,9/10/1982,40000,Female,Married,Associate's Degree,No,Own,...,70.0,Mammogram,No,No,Dust,No,Normal,99.0,115.0,75.0
4,uSGTiTOIGT,USA,GIMXK2aKsQ,6/25/1995,35000,Male,Single,Some College,No,Rent,...,68.0,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal,97.0,125.0,80.0


In [13]:
from cryptography.fernet import Fernet

# Generate a fresh symmetric Fernet key; it is only held in the `key` variable.
# NOTE(review): no visible cell persists this key, so anything encrypted with it
# (the tokenizer secret file, the column names) looks unrecoverable once the
# kernel is restarted — confirm and store the key securely if decryption is needed.
key = Fernet.generate_key()

In [14]:
# Fernet instance used below to encrypt the tokenizer secret and the column names.
cipher_suite = Fernet(key)

In [15]:
# Draw the 32-character tokenizer secret from the operating system's CSPRNG.
# NumPy's global generator is a predictable PRNG and is not suitable for
# producing key material.
secret_alphabet = string.ascii_letters + string.digits
tokenizer_secret = "".join(secrets.choice(secret_alphabet) for _ in range(32))
# Only the Fernet-encrypted form of the secret is written to disk.
with open("Tokenizer_secret", 'w') as f:
    f.write(cipher_suite.encrypt(tokenizer_secret.encode()).decode())

In [16]:
# Categorical columns that are imputed and then replaced by reversible tokens.
tokenize_cols = [
    "Country",
    "Education",
    "Marital Status",
    "Blood Type",
    "House Status",
    "Sex",
    "Loan",
    "Medical Procedure",
    "Smoking",
    "Alcohol Consumption",
    "Allergies",
    "Vaccinations",
    "Tumor Condition",
]

In [17]:
# Impute missing categorical values: ordinal-encode the columns, let
# IterativeImputer estimate the missing codes, then map the codes back to labels.
cat_imputer_pipe = Pipeline([
    ('Encoder', OrdinalEncoder()),
    ('Categorical Iterative Imputer', IterativeImputer(tol = 0, max_iter = 100))
])

imputed_data = cat_imputer_pipe.fit_transform(df[tokenize_cols])

# IterativeImputer returns continuous estimates (e.g. 1.7), which are not valid
# ordinal codes. Snap each value to the nearest real category index before
# decoding. Observed cells already hold whole, in-range codes, so only the
# imputed cells are affected.
ordinal_encoder = cat_imputer_pipe.named_steps['Encoder']
# A fitted category list may end with a NaN entry for missing values; it is
# excluded so that an imputed cell can never decode back to "missing".
highest_codes = np.array([(~pd.isna(categories)).sum() - 1
                          for categories in ordinal_encoder.categories_])
imputed_data = np.clip(np.rint(imputed_data), 0, highest_codes)
df[tokenize_cols] = ordinal_encoder.inverse_transform(imputed_data)
# NOTE(review): tokenize_cols contains the label column 'Tumor Condition', and this
# runs before the train/test split, so the label takes part in (and can be filled
# by) the imputation — confirm this is intended.



In [18]:
# Replace every categorical value with a reversible token. The tokenizer depends
# only on the secret, so it is built once instead of once per column.
# NOTE(review): the displayed output shows identical values receiving identical
# tokens (even across columns), so value frequencies remain visible in the
# tokenized data.
tokenizer = ReversibleTokenizer(key = tokenizer_secret)
for col in tokenize_cols:
    df[col] = df[[col]].astype(str).apply(tokenizer).values.flatten()

In [19]:
# Keep only the year of birth as an integer column.
birth_dates = pd.to_datetime(df['DOB'])
df['DOB_year'] = birth_dates.dt.year.astype(int)

In [20]:
# The exact date of birth is removed now that DOB_year has been derived from it.
df = df.drop(columns = ['DOB'])

Differential Privacy - Add perturbations (implemented below by rounding each numeric column to a coarser precision)
Columns: DOB (year only), Income, Heart Rate, Oxygen Level, Systolic Pressure, Diastolic Pressure

In [21]:
# Rounding precision per numeric column, as passed to NumericRounding:
# 0 keeps whole units, -1 rounds to the nearest ten, -4 to the nearest ten thousand.
value_ranges_dict = {
    'Income': -4,
    'Heart Rate': 0,
    'Oxygen Level': 0,
    'Systolic Pressure': -1,
    'Diastolic Pressure': -1,
    'DOB_year': -1,
}

In [22]:
# Round every non-object column to the precision configured for it in
# value_ranges_dict (looked up by column name); results are stored as floats.
numeric_cols = df.select_dtypes(exclude = 'object').columns.tolist()
df[numeric_cols] = df[numeric_cols].apply(
    lambda series: NumericRounding(dtypes.Float, value_ranges_dict.get(series.name))(series)
)

In [23]:
df.head()

Unnamed: 0,Name,Country,SSN,Income,Sex,Marital Status,Education,Loan,House Status,Blood Type,...,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition,Oxygen Level,Systolic Pressure,Diastolic Pressure,DOB_year
0,LC6QhRjKT8,3f8e37e3213d80d50a42247caf83c90913d18e,MBA7r8vlEZ,50000.0,cd9013ca1603146996e2355402b24158557967bf,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d82a9db860a015ec7993fd548f1459dd2568e82ebdc544...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,a754ee605c96cd684e1c3a464790d62b419f,...,b42704f7cb4c8963caae8f6d7907de6123d39d4eab2739...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,0b6dc8d5eac00c556d833c5a2a68b63b4f6145353f1c,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,98.0,120.0,80.0,1980.0
1,wCHS1mybsK,f21aad0389f7703fc5f4f5f2458b0a2832286eb507ac,Rt08QDh9qM,60000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,b5a5e20b55f00bc091f9322efd93544fdba5fbff2545f3...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,e2420e9bcbfc1b80746e5b7883cb5cd2978d,...,796bed9dd7d0c5d79c69e3d06cc2b103a058da177efe8b...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,6ce53d2afd9378d94e29768d5a1e5eaffd14e657d58e89...,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,96.0,110.0,70.0,1990.0
2,1vHcOXdkG8,a3482a2769de3ea402ddfe0683947fa08b13,srQyLZtF3L,80000.0,cd9013ca1603146996e2355402b24158557967bf,c52315148645b9e42b5153adc6d8e603bf0412f1bf741f77,ba1150a7b2455473a28f73671d745ead7af85c042cd864...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,d67b8be192c389fc4fa8afb1d2ee29dd911b,...,482ed0186fbb7fd00b320ed4f6b5a725a092ea5e5c52b7...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,d2feb2f8a168f3795284554ce2635d7d6b8c4b3d,e75050be0da8765b5ed48fbbfc8273e968174b,9f541ddd4f9cd6406ec922b269e7a0b29cfa6c4a0e12840a,97.0,130.0,80.0,1980.0
3,PWteECvtqG,e36933c8b91a22d0f0c35e812a183976fb683e0799350d...,PqblM15Vy8,40000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d7f3c98958ab3c079564900f731511a8fa25d15faa280c...,ce73369d7ec699629297ebc218ee6f8c343f,889f4988059a8c95ffdc591ddc40a900e938ca,39a22736affa903040b11604c22cd7e17af7e6,...,6a30a0e66dd0e0137e0f598cbf5f75bc9405526bad04ac...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,fa4ca553847603ef08ca1261d662735e9dbeb199,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,99.0,120.0,80.0,1980.0
4,uSGTiTOIGT,3f8e37e3213d80d50a42247caf83c90913d18e,GIMXK2aKsQ,40000.0,cd9013ca1603146996e2355402b24158557967bf,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,ddc6dc2eb5474fe490ebef2c13541247a12df88fcff7c5...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,bf462fbbb1d44b72be4b3092c178bb0f5823,...,dc5d5e7b13678509487813a7208ad53d471d4755bfef5c...,e75050be0da8765b5ed48fbbfc8273e968174b,e75050be0da8765b5ed48fbbfc8273e968174b,1cf3850dd5e7623fab43d14a05b27aea5d7ec66362c9ce,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,97.0,120.0,80.0,2000.0


In [24]:
# Encrypt every column name with the Fernet cipher. The label column
# 'Tumor Condition' is renamed to "Target" before being encrypted.
new_column_names = [
    cipher_suite.encrypt(("Target" if column_name == 'Tumor Condition' else column_name).encode())
    for column_name in df.columns
]

In [25]:
# Decode the encrypted names to text so they can be used as column labels.
df.columns = [i.decode() for i in new_column_names]

In [26]:
df.head()

Unnamed: 0,gAAAAABl7QW7E2GLP8YI8Uj-3dxmglubRBk64mfIqLthygRJyAJ33e_3S5rQKAT1JIO6Qrow3GOG5_UCC7zN6sQSrNefosXUhA==,gAAAAABl7QW7NZ7OnDYyW0Vx_RY6Cgr1N3b_HVEAjI1Qwi9oVrQN7rKg-xwzIzXgzRk0xyHDTgBYdm-1HJIsxjWiDRIE9U-mhA==,gAAAAABl7QW75rwHnWKJuKuCqGYkJGXm4A_x1NxsPyaVktNmpZmY5U5KuWX2dFDdURTfm79Po8CZLbsZQ-heSL7M7s-OTUD6eg==,gAAAAABl7QW7iwpZkuvucN84QRsteBepZYNFlrMkovRA9Op5Gfw4c1Mm99Au6OAJ46UJI-wukG1uvU6SQxM4UFqlBlv97rVO3w==,gAAAAABl7QW70lEiX76Y1KbV2E4SbsG2fg-3OKzYLUry70v7EYHxCH2y6lBoCZ55lW7KFN8bGZ2FZI_tSRF0p7xXr3EX2gHCQQ==,gAAAAABl7QW71rJFSD5F0LTaEIRCN56HfeFuMIprewR03CCOv2I7V7qGCZqVRcrUAWM5OyiBjfuRG62jDMEkbcVt3B6ElGjnhg==,gAAAAABl7QW7ARvdyEkrb-vZYv1suOui-PEFz5mUS3fBXq85X0iKUbD5qYAeSCnQezr69iGx3z1aQeq-hFwtNDWAD7CbQGTM4g==,gAAAAABl7QW7j5KII6lNGv9xGH-0Dkap-ZbJGMDVDHN9e0_2EZR1007hGk5dpCF2Mtku0cfcLhhrblWYWSGtZZxfYo1T_lfq3g==,gAAAAABl7QW7mvfznbXfO1b-2MKm5XCpMxB5MN41jrK8x8jjnrcOdzKjjJJ-JV-6I7c2gwi7-tfMTX1L8JhcEslKxWo52hqnVw==,gAAAAABl7QW71uiSoj6PwzYf7gfpfYYgYJbfcCYSZoApwhou--yyCGyJRrLP8HbxFQ32UwYGMSveBbqXgwWKOG-X9mFy8pT-4w==,...,gAAAAABl7QW7rZqmTtHVlu-bOppXCj8_VIgScQcsQqmKd4f2W9TtIJ6TC_HNiobwgOL9vTjAwPIpj6ip3807oBJ1HwYp1cf-yjuUuGh8JXn_X2r1IZ5EXBY=,gAAAAABl7QW7sOMfPEbU8xcV2OxjTtm9m_UMcdI0yEI_PBdaMmnB3b1eY44UCRA5oR9p3Ml2ekHIPTjHQsRvSOnnpkG0cDx8Hg==,gAAAAABl7QW7AB_UUQm2W0OHstnYY3sgBC_zvA4ckWW14epVRkYb1FbQBnEXMvs0CCgiIlgPP-h6DhDrn8zHRlMowaVHOm15tJJlF2TPGRKzgy8inLQksh4=,gAAAAABl7QW7z7mPXn2pwoDmXjfVSvxSDIGJIXtKhJkeSEh5EVALONsYZBYwB_lYCOexYUVLcnMAY5BfTeVjFifqtDSKhJT8ig==,gAAAAABl7QW7zd_wL9ju2CPJKU3yWgJ1N2YDMViuXa0dc-ZhEtdfqdH3K-5dzHM2I2qjhUIa8NnxZYUNk68hrDy7vZasBDDvfA==,gAAAAABl7QW79Iw1MyYnd90fQDxVSmLuJpwuiiwV7rEoDOGmtjlEDP5ieffJyicJCn8U-jgA7uOHehE1NHFqxKqSo5TplOdHRw==,gAAAAABl7QW7OJRKPQUB2mCxWQEPy1ztYDsMjQ9ThOuq2lUqnYtx43Ne3q8ftgfFOtgwQMuSpscUeV1euM_ncH-evpPdaZsNIg==,gAAAAABl7QW72IcFV95LwzfJOqjeeL-wR18wd-DnesgQSPRf6pn0JfMEUCwZMoW7bC-IFTXQtTPkevFPmrUghaIwDckaWmxmaW3VLQCtOcA_XQIMEy6E1sg=,gAAAAABl7QW7Q5J2RFg9PBjIcKhUYwpXG4mZnk1YppLDujkfwyAzuS00WTUhZ80ccG2rBeoNMpfpEsCkWyxkJRdCD9QH4qCI50lxNm3aj9D
7hssJ_OhZEGk=,gAAAAABl7QW7_vmkWQOT3e5hDM6iOknWw3ES8nyIU6MtostgU4jXg-iUdOhi1zI9yvy1PSTkk8oDa818ipX4klOkKWm_ZA8CPw==
0,LC6QhRjKT8,3f8e37e3213d80d50a42247caf83c90913d18e,MBA7r8vlEZ,50000.0,cd9013ca1603146996e2355402b24158557967bf,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d82a9db860a015ec7993fd548f1459dd2568e82ebdc544...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,a754ee605c96cd684e1c3a464790d62b419f,...,b42704f7cb4c8963caae8f6d7907de6123d39d4eab2739...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,0b6dc8d5eac00c556d833c5a2a68b63b4f6145353f1c,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,98.0,120.0,80.0,1980.0
1,wCHS1mybsK,f21aad0389f7703fc5f4f5f2458b0a2832286eb507ac,Rt08QDh9qM,60000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,b5a5e20b55f00bc091f9322efd93544fdba5fbff2545f3...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,e2420e9bcbfc1b80746e5b7883cb5cd2978d,...,796bed9dd7d0c5d79c69e3d06cc2b103a058da177efe8b...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,6ce53d2afd9378d94e29768d5a1e5eaffd14e657d58e89...,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,96.0,110.0,70.0,1990.0
2,1vHcOXdkG8,a3482a2769de3ea402ddfe0683947fa08b13,srQyLZtF3L,80000.0,cd9013ca1603146996e2355402b24158557967bf,c52315148645b9e42b5153adc6d8e603bf0412f1bf741f77,ba1150a7b2455473a28f73671d745ead7af85c042cd864...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,d67b8be192c389fc4fa8afb1d2ee29dd911b,...,482ed0186fbb7fd00b320ed4f6b5a725a092ea5e5c52b7...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,d2feb2f8a168f3795284554ce2635d7d6b8c4b3d,e75050be0da8765b5ed48fbbfc8273e968174b,9f541ddd4f9cd6406ec922b269e7a0b29cfa6c4a0e12840a,97.0,130.0,80.0,1980.0
3,PWteECvtqG,e36933c8b91a22d0f0c35e812a183976fb683e0799350d...,PqblM15Vy8,40000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d7f3c98958ab3c079564900f731511a8fa25d15faa280c...,ce73369d7ec699629297ebc218ee6f8c343f,889f4988059a8c95ffdc591ddc40a900e938ca,39a22736affa903040b11604c22cd7e17af7e6,...,6a30a0e66dd0e0137e0f598cbf5f75bc9405526bad04ac...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,fa4ca553847603ef08ca1261d662735e9dbeb199,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,99.0,120.0,80.0,1980.0
4,uSGTiTOIGT,3f8e37e3213d80d50a42247caf83c90913d18e,GIMXK2aKsQ,40000.0,cd9013ca1603146996e2355402b24158557967bf,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,ddc6dc2eb5474fe490ebef2c13541247a12df88fcff7c5...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,bf462fbbb1d44b72be4b3092c178bb0f5823,...,dc5d5e7b13678509487813a7208ad53d471d4755bfef5c...,e75050be0da8765b5ed48fbbfc8273e968174b,e75050be0da8765b5ed48fbbfc8273e968174b,1cf3850dd5e7623fab43d14a05b27aea5d7ec66362c9ce,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,97.0,120.0,80.0,2000.0


In [27]:
# Write the anonymized frame, with its encrypted headers, to disk without the index.
df.to_csv('private_data.csv', index = False)

This data can now be shared without posing a threat to anyone's privacy.

Employees with the appropriate authorization can use the encryption key to decrypt the column names. This access can be granted when necessary.

In [28]:
# Reload the shared file and restore the readable column names with the Fernet cipher.
df = pd.read_csv('private_data.csv')
decrypted_names = []
for encrypted_name in df.columns:
    decrypted_names.append(cipher_suite.decrypt(encrypted_name).decode())
df.columns = decrypted_names

In [29]:
df.head()

Unnamed: 0,Name,Country,SSN,Income,Sex,Marital Status,Education,Loan,House Status,Blood Type,...,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Target,Oxygen Level,Systolic Pressure,Diastolic Pressure,DOB_year
0,LC6QhRjKT8,3f8e37e3213d80d50a42247caf83c90913d18e,MBA7r8vlEZ,50000.0,cd9013ca1603146996e2355402b24158557967bf,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d82a9db860a015ec7993fd548f1459dd2568e82ebdc544...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,a754ee605c96cd684e1c3a464790d62b419f,...,b42704f7cb4c8963caae8f6d7907de6123d39d4eab2739...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,0b6dc8d5eac00c556d833c5a2a68b63b4f6145353f1c,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,98.0,120.0,80.0,1980.0
1,wCHS1mybsK,f21aad0389f7703fc5f4f5f2458b0a2832286eb507ac,Rt08QDh9qM,60000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,b5a5e20b55f00bc091f9322efd93544fdba5fbff2545f3...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,e2420e9bcbfc1b80746e5b7883cb5cd2978d,...,796bed9dd7d0c5d79c69e3d06cc2b103a058da177efe8b...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,6ce53d2afd9378d94e29768d5a1e5eaffd14e657d58e89...,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,96.0,110.0,70.0,1990.0
2,1vHcOXdkG8,a3482a2769de3ea402ddfe0683947fa08b13,srQyLZtF3L,80000.0,cd9013ca1603146996e2355402b24158557967bf,c52315148645b9e42b5153adc6d8e603bf0412f1bf741f77,ba1150a7b2455473a28f73671d745ead7af85c042cd864...,e75050be0da8765b5ed48fbbfc8273e968174b,889f4988059a8c95ffdc591ddc40a900e938ca,d67b8be192c389fc4fa8afb1d2ee29dd911b,...,482ed0186fbb7fd00b320ed4f6b5a725a092ea5e5c52b7...,ce73369d7ec699629297ebc218ee6f8c343f,e75050be0da8765b5ed48fbbfc8273e968174b,d2feb2f8a168f3795284554ce2635d7d6b8c4b3d,e75050be0da8765b5ed48fbbfc8273e968174b,9f541ddd4f9cd6406ec922b269e7a0b29cfa6c4a0e12840a,97.0,130.0,80.0,1980.0
3,PWteECvtqG,e36933c8b91a22d0f0c35e812a183976fb683e0799350d...,PqblM15Vy8,40000.0,79c32d745577b42562feb70232c94964743bb96bd7e7,246d5018ec460bd65a6d5900b596d1391551bc2b00716d,d7f3c98958ab3c079564900f731511a8fa25d15faa280c...,ce73369d7ec699629297ebc218ee6f8c343f,889f4988059a8c95ffdc591ddc40a900e938ca,39a22736affa903040b11604c22cd7e17af7e6,...,6a30a0e66dd0e0137e0f598cbf5f75bc9405526bad04ac...,ce73369d7ec699629297ebc218ee6f8c343f,ce73369d7ec699629297ebc218ee6f8c343f,fa4ca553847603ef08ca1261d662735e9dbeb199,ce73369d7ec699629297ebc218ee6f8c343f,f088b8df5f374f2451488088f99dc4059d1ef9893d45,99.0,120.0,80.0,1980.0
4,uSGTiTOIGT,3f8e37e3213d80d50a42247caf83c90913d18e,GIMXK2aKsQ,40000.0,cd9013ca1603146996e2355402b24158557967bf,aad2f44ecd40ea9574217ddddf5f17a4f4fda990d981,ddc6dc2eb5474fe490ebef2c13541247a12df88fcff7c5...,ce73369d7ec699629297ebc218ee6f8c343f,50e421b0cf8f9e68b1ae93e3a3f5d859864b7110,bf462fbbb1d44b72be4b3092c178bb0f5823,...,dc5d5e7b13678509487813a7208ad53d471d4755bfef5c...,e75050be0da8765b5ed48fbbfc8273e968174b,e75050be0da8765b5ed48fbbfc8273e968174b,1cf3850dd5e7623fab43d14a05b27aea5d7ec66362c9ce,e75050be0da8765b5ed48fbbfc8273e968174b,f088b8df5f374f2451488088f99dc4059d1ef9893d45,97.0,120.0,80.0,2000.0


In [30]:
# Name and SSN were replaced by random strings earlier, so they are dropped before modelling.
df = df.drop(columns = ['Name', 'SSN'])

In [31]:
# Separate the features from the label, then hold out 30% of the rows for testing.
# The split is stratified on the label and seeded for repeatability.
Y = df['Target']
X = df.drop(columns = ['Target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size = 0.3,
    stratify = Y,
    random_state = 1234,
)

In [32]:
# Object-dtype feature columns (the tokenized categoricals) are the ones that need encoding.
cat_cols = X_train.select_dtypes("object").columns.tolist()

In [33]:
# Ordinal-encode the categorical columns; categories not seen during fit are
# encoded as -1. All other columns are passed through unchanged.
categorical_encoder = OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)
column_transformer = ColumnTransformer(
    [('Categorical Encoder', categorical_encoder, cat_cols)],
    remainder = 'passthrough',
)

In [34]:
# Baseline model: encode categoricals, impute, scale, then fit a scikit-learn
# decision tree.
# NOTE(review): the final step is labelled "Logistic Regression" but holds a
# DecisionTreeClassifier; the label is kept because step names are lookup keys.
# NOTE(review): StandardScaler is the one imported from diffprivlib.models, so
# this baseline also runs a differentially private scaler — confirm intended.
baseline_steps = [
    ("categorical Encoder", column_transformer),
    ("Iterative Imputer", IterativeImputer(random_state = 1234)),
    ("Scaler", StandardScaler(random_state = 1234)),
    ("Logistic Regression", DecisionTreeClassifier(random_state = 1234)),
]
sklearn_pipeline = Pipeline(baseline_steps)

In [35]:
# Fit the baseline on the training split and report its score on the held-out split.
sklearn_pipeline.fit(X_train, y_train)
sklearn_pipeline.score(X_test, y_test)

 This will result in additional privacy leakage.  To ensure differential privacy with no additional privacy loss, specify `bounds` for each valued returned by np.mean().


1.0

In [36]:
# Same preprocessing as the baseline, but the classifier is diffprivlib's
# differentially private decision tree (imported as priv_DTC).
# NOTE(review): the final step is labelled "Logistic Regression" although it
# holds a decision tree; the label is kept because step names are lookup keys.
# NOTE(review): this reuses the same column_transformer object as
# sklearn_pipeline, so fitting this pipeline refits that shared transformer.
private_steps = [
    ("categorical Encoder", column_transformer),
    ("Iterative Imputer", IterativeImputer(random_state = 1234)),
    ("Scaler", StandardScaler(random_state = 1234)),
    ("Logistic Regression", priv_DTC(random_state = 1234)),
]
diff_private_model = Pipeline(private_steps)

In [37]:
# Fit the differentially private pipeline and report its score on the held-out split.
diff_private_model.fit(X_train, y_train)
diff_private_model.score(X_test, y_test)

 This will result in additional privacy leakage.  To ensure differential privacy with no additional privacy loss, specify `bounds` for each valued returned by np.mean().


0.9444444444444444