In [40]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow_privacy

# Load dataset
url = "~/Downloads/adult.csv"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url, names=column_names, sep=r'\s*,\s*', engine='python')

# Preprocess data
X = data.drop('income', axis=1)
y = data['income'].apply(lambda x: 1 if x == '>50K' else 0)
y = np.eye(2)[y]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define preprocessor
numeric_features = ['age', 'fnlwgt', 'education-num',
                    'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status',
                        'occupation', 'relationship', 'race', 'sex', 'native-country']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)])

pipeline = Pipeline([
    ('preprocessor', preprocessor)])

X_train = pipeline.fit_transform(X_train).toarray()
X_test = pipeline.transform(X_test).toarray()

# Define model
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(64, activation='relu',
#                           input_shape=(X_train.shape[1],)),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu',
                          kernel_initializer='glorot_normal', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, kernel_initializer='glorot_normal',activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

optimizer = tensorflow_privacy.VectorizedDPKerasSGDOptimizer(
    l2_norm_clip=1,
    noise_multiplier=100.0,
    num_microbatches=1,
    learning_rate=0.15)

loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, reduction=tf.losses.Reduction.NONE)

model.compile(optimizer=optimizer, loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 73.56%


In [23]:
def e_th():
    import math
    # Given parameters
    D_size = 6000
    b = 250
    epochs = 24
    C = 1
    sigma = 0.73
    delta = 1e-5
    # Compute q, the sampling ratio
    q = b / D_size
    # Compute Delta f (sensitivity of the function)
    Delta_f = C
    # Compute epsilon for each step
    epsilon_per_step = q * Delta_f / sigma
    # Compute the number of steps T across all epochs
    T = (D_size / b) * epochs
    # Compute epsilon_total using advanced composition theorem
    epsilon_total = math.sqrt(2 * T * math.log(1/delta)) * \
    epsilon_per_step + T * epsilon_per_step**2

    return epsilon_per_step, epsilon_total


e_th()


(0.05707762557077625, 11.172607449470878)

In [25]:
import pandas as pd


def calculate_combination_probabilities(data, feature1, feature2):
    """
    Calculate the probabilities of all combinations of two feature values in the dataset.

    Parameters:
    - data: DataFrame containing the dataset
    - feature1: Name of the first feature
    - feature2: Name of the second feature

    Returns:
    - DataFrame containing probabilities of each combination
    """
    # Create a pivot table with counts of each combination
    pivot_table = data.groupby(
        [feature1, feature2]).size().unstack(fill_value=0)

    # Convert counts to probabilities
    total_rows = len(data)
    probability_table = pivot_table / total_rows

    return probability_table


# Load the dataset
data = pd.read_csv('adult.csv', header=None, names=[
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
])

# Calculate the probabilities for the combination of 'education' and 'occupation'
probability_table = calculate_combination_probabilities(
    data, 'education', 'occupation')
print(probability_table)

# If you want to save this table to a CSV file:
# probability_table.to_csv('feature_combination_probabilities.csv')


occupation            ?   Adm-clerical   Armed-Forces   Craft-repair  \
education                                                              
 10th          0.003133       0.001167       0.000000       0.005221   
 11th          0.003655       0.002058       0.000000       0.005375   
 12th          0.001228       0.001167       0.000031       0.001781   
 1st-4th       0.000369       0.000000       0.000000       0.000706   
 5th-6th       0.000921       0.000184       0.000000       0.001321   
 7th-8th       0.002242       0.000338       0.000000       0.003563   
 9th           0.001566       0.000430       0.000000       0.002948   
 Assoc-acdm    0.001443       0.005927       0.000000       0.003532   
 Assoc-voc     0.001873       0.005129       0.000000       0.007739   
 Bachelors     0.005313       0.015540       0.000031       0.006941   
 Doctorate     0.000461       0.000154       0.000000       0.000061   
 HS-grad       0.016369       0.041921       0.000123       0.05

In [28]:
data.groupby(['education', 'occupation']).size()
[-0.001,1.0003]
[1,0]

education      occupation       
 10th           ?                    102
                Adm-clerical          38
                Craft-repair         170
                Exec-managerial       24
                Farming-fishing       44
                                    ... 
 Some-college   Prof-specialty       430
                Protective-serv      202
                Sales               1009
                Tech-support         273
                Transport-moving     283
Length: 217, dtype: int64