# Load dataset

In [34]:
import pandas as pd
df = pd.read_csv('../data/athletes.csv')

### Create Response Variable

In [35]:
# There are Na values in some columns. Fill with 0
df['total_lift'] = df['candj'].fillna(0) + df['snatch'].fillna(0) + df['deadlift'].fillna(0) + df['backsq'].fillna(0)

### Split and Model

In [36]:
from sklearn.model_selection import train_test_split

vars = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong','age','height','weight','candj','snatch','deadlift','backsq']
cats = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong']
numcs = ['age','height','weight','candj','snatch','deadlift','backsq']

# x = pd.get_dummies(x, columns=cats) # encode categorical variables

x = df[numcs].fillna(0) # NAs in numeric columns, fill 0 if any

y = df['total_lift']

# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [45]:
import tensorflow as tf
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

# model hyperparameters
epochs = 10
batch_size = 32

# Define privacy parameters
l2_norm_clip = 1.0  # Clipping norm
noise_multiplier = 0.5  # Noise multiplier
num_microbatches = 1  # Number of microbatches
learning_rate = 0.1  # Learning rate

# Define the model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(x_train.shape[1],)),   # Input layer for the number of features
    tf.keras.layers.Dense(64, activation='relu'),       # Hidden layer 1
    tf.keras.layers.Dense(32, activation='relu'),       # Hidden layer 2
    tf.keras.layers.Dense(1)                            # Output layer (for regression)
])

# Create the optimizer with differential privacy
optimizer = DPGradientDescentGaussianOptimizer(
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=num_microbatches,
    learning_rate=learning_rate)  # Use `dp_sum_query` for better privacy guarantees.


In [46]:
# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Train the model
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

Train on 24023 samples
Epoch 1/10

2023-10-17 20:53:08.188684: W tensorflow/c/c_api.cc:304] Operation '{name:'TFOptimizer_10/iterations/Assign' id:1430 op device:{requested: '', assigned: ''} def:{{{node TFOptimizer_10/iterations/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_INT64, validate_shape=false](TFOptimizer_10/iterations, TFOptimizer_10/iterations/Initializer/initial_value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2b0bbbe50>

### Compute Metrics

In [48]:
# Evaluate the model
test_loss = model.evaluate(x_test, y_test)
print(f'Test loss: {test_loss}')

# Make predictions
y_pred = model.predict(x_test)

Test loss: 198169759.73359972


In [49]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute regression metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # Calculate RMSE

# Display the metrics summary
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 7343.46
Mean Squared Error (MSE): 198169753.58
Root Mean Squared Error (RMSE): 14077.28


### Compute DP epsilon

In [56]:
print(compute_dp_sgd_privacy.compute_dp_sgd_privacy_statement(x_train.shape[0],
                                    batch_size=256,
                                    noise_multiplier=noise_multiplier,
                                    num_epochs=epochs,
                                    delta=.00001
                                    ))

DP-SGD performed over 24023 examples with 256 examples per iteration, noise
multiplier 0.5 for 10 epochs with microbatching, and no bound on number of
examples per user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-05 computed with
RDP accounting:
    Epsilon with each example occurring once per epoch:       138.688
    Epsilon assuming Poisson sampling (*):                    139.347

No user-level privacy guarantee is possible without a bound on the number of
examples per user.

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.

