# 🛡️ Solutions — Private Synthetic Data Generation

Built by **Stu** 🚀

## Solutions to Exercises 1–8

In [1]:
# Short free-text answer: a one-sentence definition of synthetic data.
synthetic_data_definition = (
    "Data generated artificially based on patterns learned from real datasets."
)

In [2]:
# Short free-text answer: how synthetic data can still expose private information.
synthetic_privacy_risks = (
    "Poorly generated synthetic data could still leak sensitive attributes if overfit to real examples."
)

In [3]:
import numpy as np
import pandas as pd
# Build the "real" toy dataset: one price per row plus a binary purchase label.
np.random.seed(42)  # fixed seed so the sample is reproducible
n_samples = 300

# Prices are drawn uniformly from [10, 500); a row is labelled as bought (1)
# exactly when its price is below 250, otherwise 0.
prices = np.random.uniform(low=10, high=500, size=n_samples)
bought = np.where(prices < 250, 1, 0)

real_data = pd.DataFrame({'Price': prices, 'Bought': bought})
real_data.head()

In [4]:
def add_noise_to_data(df, epsilon=1.0, sensitivity=1.0):
    """Return a copy of ``df`` with independent Laplace noise added to every column.

    Each column receives ``len(df)`` draws from a zero-centred Laplace
    distribution with scale ``sensitivity / epsilon``. The input frame is
    not modified.

    Args:
        df: DataFrame whose columns are all numeric.
        epsilon: Privacy parameter; must be strictly positive. Smaller
            values give a larger noise scale.
        sensitivity: Numerator of the noise scale; must be non-negative.
            The default of 1.0 reproduces the original ``1 / epsilon``
            scale. NOTE(review): for the noise to be calibrated to the
            data, this should bound how much one record can change a
            column (e.g. the column's value range) -- confirm per use case.

    Returns:
        A new DataFrame with the same index and columns as ``df``. Integer
        columns become floating point because the noise is continuous.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive or
            ``sensitivity`` is negative.
    """
    # Reject invalid parameters up front: epsilon == 0 would otherwise fail
    # with a bare ZeroDivisionError, while a negative epsilon would fail
    # inside NumPy with a message about "scale" instead of "epsilon".
    if not epsilon > 0:
        raise ValueError(f"epsilon must be strictly positive, got {epsilon!r}")
    if not sensitivity >= 0:
        raise ValueError(f"sensitivity must be non-negative, got {sensitivity!r}")

    scale = sensitivity / epsilon
    n_rows = len(df)
    noisy_df = df.copy()
    # Noise is drawn column by column (not as one 2-D array) so that the
    # sequence of random draws under a fixed seed stays the same as before.
    for col in df.columns:
        noisy_df[col] = noisy_df[col] + np.random.laplace(0, scale, size=n_rows)
    return noisy_df

# Perturb the real dataset with Laplace noise; epsilon=0.5 is below the
# function's default of 1.0, so the noise added here is larger than the default.
noisy_data = add_noise_to_data(df=real_data, epsilon=0.5)
noisy_data.head()

In [5]:
from sklearn.linear_model import LogisticRegression
# Train a classifier on the noisy data only.
# The feature matrix keeps its 'Price' column name (a one-column DataFrame).
X = noisy_data[['Price']]
# The noisy 'Bought' column is no longer 0/1, so it is re-binarised at 0.5.
y = noisy_data['Bought'].gt(0.5).astype(int)
# fit() returns the fitted estimator, so construction and fitting can be chained.
model = LogisticRegression().fit(X, y)
# Mean accuracy on the same data the model was trained on.
model.score(X, y)

In [6]:
# Generate the synthetic dataset: fresh prices from the same uniform range as
# the real data, labelled by the classifier that was trained on noisy data.
synthetic_prices = np.random.uniform(10, 500, n_samples)
# The classifier was fitted on a DataFrame with a 'Price' column. Passing a
# bare ndarray makes scikit-learn warn that X has no valid feature names, so
# the prices are wrapped in a DataFrame carrying the same column name.
synthetic_probs = model.predict_proba(pd.DataFrame({'Price': synthetic_prices}))[:, 1]
# Column 1 of predict_proba is the probability of the positive class (1);
# a row is labelled as bought when that probability exceeds 0.5.
synthetic_bought = (synthetic_probs > 0.5).astype(int)
synthetic_data = pd.DataFrame({'Price': synthetic_prices, 'Bought': synthetic_bought})
synthetic_data.head()

In [7]:
# Short free-text answer: reflection on the quality of the synthetic data.
synthetic_quality_reflection = (
    "Synthetic data roughly matches trends, but noise makes it less sharp. Still useful for safe analytics."
)

In [8]:
# Short free-text answer: closing summary of the privacy/realism trade-off.
synthetic_summary = (
    "DP-protected synthetic data reduces reidentification risk but at the cost of slightly less realism."
)