# 15. Wide-Filtered DP-SGD (Additive)

This notebook runs a filtered wide-table DP-SGD continuation experiment and evaluates benchmark utility.

It uses additive output paths only.

In [1]:
from pathlib import Path
import sys
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from opacus import PrivacyEngine
from IPython.display import display, Markdown

ROOT = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd().resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.experiments.wide_dpsgd_model import DCAWidetableVAE, vae_loss
from src.eval.decompose import decompose_wide_table
from src.pipeline.run_benchmark import run_all
from src.eval.compare import evaluate_all, results_to_dataframe

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
display(Markdown(f"Device: `{device}`"))

Device: `cpu`

In [2]:
FILTERED_WIDE = ROOT / 'data' / 'experiments_additive' / 'wide_filter' / 'wide_training_table_cov3.parquet'
CHECKPOINT_PATH = ROOT / 'data' / 'models' / 'dp_vae_checkpoint.pt'
TRANSFORMER_PATH = ROOT / 'data' / 'models' / 'transformer.pkl'
OUT_ROOT = ROOT / 'data' / 'experiments_additive' / 'wide_filtered_dpsgd'
OUT_ROOT.mkdir(parents=True, exist_ok=True)
SYNTH_WIDE_PATH = OUT_ROOT / 'synth_wide_filtered_dpsgd.parquet'
SYNTH_REPORTING_DIR = OUT_ROOT / 'reporting'
QUERY_RESULTS_DIR = OUT_ROOT / 'query_results'
EVAL_CSV = OUT_ROOT / 'evaluation.csv'

checkpoint = torch.load(CHECKPOINT_PATH, map_location=device, weights_only=False)
with open(TRANSFORMER_PATH, 'rb') as f:
    transformer = pickle.load(f)

cat_cols = checkpoint['cat_cols']
numeric_cols = checkpoint['numeric_cols']
cat_encoder = transformer.named_transformers_['cat']
num_scaler = transformer.named_transformers_['num']

display(Markdown(f"Filtered table: `{FILTERED_WIDE}`"))
display(Markdown(f"Rows in filtered table: `{len(pd.read_parquet(FILTERED_WIDE, columns=['guid'])):,}`"))

Filtered table: `/Users/enscribe/Repositories/School/dsc180-q2/data/experiments_additive/wide_filter/wide_training_table_cov3.parquet`

Rows in filtered table: `282,315`

In [3]:
wide = pd.read_parquet(FILTERED_WIDE)
X = transformer.transform(wide[cat_cols + numeric_cols]).astype(np.float32)
finite_mask = np.isfinite(X)
finite_ratio = float(finite_mask.mean())
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
X_t = torch.tensor(X, dtype=torch.float32)

cat_sizes = checkpoint['model_args']['cat_sizes']
cat_offsets = np.cumsum([0] + cat_sizes)
cat_targets = []
for i in range(len(cat_sizes)):
    a, b = cat_offsets[i], cat_offsets[i + 1]
    idx = np.argmax(X[:, a:b], axis=1).astype(np.int64)
    cat_targets.append(torch.tensor(idx, dtype=torch.long))
num_start = cat_offsets[-1]
X_num = torch.tensor(X[:, num_start:], dtype=torch.float32)

dataset = TensorDataset(X_t, *cat_targets, X_num)
batch_size = 2048
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

display(Markdown(f"Feature matrix shape: `{X.shape}`"))
display(Markdown(f"Finite ratio before sanitization: `{finite_ratio:.6f}`"))

Feature matrix shape: `(282315, 307)`

Finite ratio before sanitization: `1.000000`

In [4]:
model = DCAWidetableVAE(**checkpoint['model_args']).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer = Adam(model.parameters(), lr=2e-4)

target_epsilon = 4.0
target_delta = 1e-5
epochs = 3
max_grad_norm = 1.0

probe_batch = next(iter(loader))
probe_x = probe_batch[0].to(device)
probe_y_cats = [t.to(device) for t in probe_batch[1:1 + len(cat_sizes)]]
probe_y_num = probe_batch[-1].to(device)
with torch.no_grad():
    p_cat_logits, p_num_out, p_mu, p_logvar = model(probe_x)
    probe_loss, probe_ce, probe_mse, probe_kl = vae_loss(p_cat_logits, p_num_out, probe_y_cats, probe_y_num, p_mu, p_logvar, beta=0.1)
if not torch.isfinite(probe_loss):
    raise RuntimeError('Non-finite probe loss before DP wrapping. Check model stability and transformed features.')
display(Markdown(f"Probe loss before DP wrapping: `{float(probe_loss):.6f}`"))

privacy_engine = PrivacyEngine(accountant='rdp')
model, optimizer, loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=loader,
    target_epsilon=target_epsilon,
    target_delta=target_delta,
    epochs=epochs,
    max_grad_norm=max_grad_norm,
)

history = []
for epoch in range(epochs):
    model.train()
    losses = []
    skipped = 0
    steps = 0
    for batch in loader:
        x = batch[0].to(device)
        y_cats = [t.to(device) for t in batch[1:1 + len(cat_sizes)]]
        y_num = batch[-1].to(device)

        optimizer.zero_grad()
        cat_logits, num_out, mu, logvar = model(x)
        loss, ce, mse, kl = vae_loss(cat_logits, num_out, y_cats, y_num, mu, logvar, beta=0.1)
        if not torch.isfinite(loss):
            skipped += 1
            continue
        loss.backward()
        optimizer.step()
        steps += 1
        losses.append(float(loss.detach().cpu()))

    eps = float('nan')
    if steps > 0:
        try:
            eps = float(privacy_engine.accountant.get_epsilon(delta=target_delta))
        except Exception:
            eps = float('nan')
    epoch_loss = float(np.mean(losses)) if len(losses) else float('nan')
    history.append({'epoch': epoch + 1, 'loss': epoch_loss, 'epsilon': eps, 'skipped_batches': skipped, 'steps': steps})

history_df = pd.DataFrame(history)
display(Markdown('## Training summary'))
display(history_df)
if history_df['steps'].sum() == 0:
    raise RuntimeError('No optimizer steps completed. Loss is non-finite for all batches.')

Probe loss before DP wrapping: `149918639145676177408.000000`

  loss.backward()


ValueError: Per sample gradient is not initialized. Not updated in backward pass?

In [None]:
N_SYNTH = len(wide)
model.eval()
base_model = model._module if hasattr(model, "_module") else model
chunk = 8192
cat_preds = []
num_preds = []
with torch.no_grad():
    n_done = 0
    while n_done < N_SYNTH:
        n = min(chunk, N_SYNTH - n_done)
        z = torch.randn(n, checkpoint['model_args']['latent_dim'], device=device)
        cat_logits, num_out = base_model.decode(z)
        if not torch.isfinite(num_out).all():
            raise RuntimeError('Non-finite values in decoder numeric output. Training run is unstable.')
        cat_idx = [torch.argmax(logits, dim=1).cpu().numpy() for logits in cat_logits]
        num_arr = num_out.cpu().numpy()
        cat_preds.append(cat_idx)
        num_preds.append(num_arr)
        n_done += n

cat_arrays = [np.concatenate([chunk_cat[i] for chunk_cat in cat_preds], axis=0) for i in range(len(cat_sizes))]
num_array = np.concatenate(num_preds, axis=0)

synth_cat = {}
for i, col in enumerate(cat_cols):
    cats = cat_encoder.categories_[i]
    idx = np.clip(cat_arrays[i], 0, len(cats) - 1)
    synth_cat[col] = cats[idx]

synth_num = num_scaler.inverse_transform(num_array)
synth_wide = pd.DataFrame(synth_cat)
for j, col in enumerate(numeric_cols):
    synth_wide[col] = synth_num[:, j]
synth_wide.insert(0, 'guid', [f'filtered_synth_{i:07d}' for i in range(N_SYNTH)])
synth_wide.to_parquet(SYNTH_WIDE_PATH, index=False)

display(Markdown(f"Saved synthetic wide table: `{SYNTH_WIDE_PATH}`"))
display(synth_wide.head())

Saved synthetic wide table: `/Users/enscribe/Repositories/School/dsc180-q2/data/experiments_additive/wide_filtered_dpsgd/synth_wide_filtered_dpsgd.parquet`

Unnamed: 0,guid,chassistype,countryname_normalized,modelvendor_normalized,os,cpuname,cpucode,cpu_family,persona,processornumber,...,psys_rap_nrs,psys_rap_avg,pkg_c0_nrs,pkg_c0_avg,avg_freq_nrs,avg_freq_avg,temp_nrs,temp_avg,pkg_power_nrs,pkg_power_avg
0,filtered_synth_0000000,2 in 1,Argentina,AZW,Win Server,10th Gen i3,Other,Atom,Casual Gamer,10,...,,,,,,,,,,
1,filtered_synth_0000001,2 in 1,Argentina,AZW,Win Server,10th Gen i3,Other,Atom,Casual Gamer,10,...,,,,,,,,,,
2,filtered_synth_0000002,2 in 1,Argentina,AZW,Win Server,10th Gen i3,Other,Atom,Casual Gamer,10,...,,,,,,,,,,
3,filtered_synth_0000003,2 in 1,Argentina,AZW,Win Server,10th Gen i3,Other,Atom,Casual Gamer,10,...,,,,,,,,,,
4,filtered_synth_0000004,2 in 1,Argentina,AZW,Win Server,10th Gen i3,Other,Atom,Casual Gamer,10,...,,,,,,,,,,


In [None]:
counts = decompose_wide_table(synth_wide, SYNTH_REPORTING_DIR)
display(Markdown('## Decomposition table row counts'))
display(pd.DataFrame([{'table': k, 'rows': v} for k, v in counts.items()]).sort_values('rows', ascending=False))

## Decomposition table row counts

Unnamed: 0,table,rows
0,sysinfo,282315
1,network_consumption,0
2,memory_utilization,0
3,system_psys_rap_watts,0
4,system_pkg_C0,0
5,system_pkg_avg_freq_mhz,0
6,system_pkg_temp_centigrade,0
7,system_hw_pkg_power,0
8,batt_dc_events,0
9,web_cat_usage,0


In [None]:
run_all(
    queries_dir=ROOT / 'docs' / 'queries',
    reporting_dir=SYNTH_REPORTING_DIR,
    output_dir=QUERY_RESULTS_DIR,
    skip_infeasible=True,
    verbose=False,
)

eval_res = evaluate_all(ROOT / 'data' / 'results' / 'real', QUERY_RESULTS_DIR)
eval_df = results_to_dataframe(eval_res)
eval_df.to_csv(EVAL_CSV, index=False)

ev = eval_df[eval_df['n_metrics'] > 0]
passed = int(ev['passed'].fillna(False).sum()) if len(ev) else 0
avg_score = float(ev['score'].mean()) if len(ev) else 0.0
display(Markdown('## Evaluation summary'))
display(pd.DataFrame([{
    'queries_evaluated': len(ev),
    'queries_passed': passed,
    'pass_rate': passed / len(ev) if len(ev) else 0.0,
    'avg_score': avg_score,
    'evaluation_csv': str(EVAL_CSV),
}]))

  FAIL battery_on_duration_cpu_family_gen: IO Error: No files found that match the pattern "/Users/enscribe/Repositories/School/dsc180-q2/data/experiments_additive/wide_filtered_dpsgd/reporting/system_cpu_metadata.parquet"

LINE 1: ..., avg(duration_mins) as avg_duration_mins_on_battery from read_parquet('/Users/enscribe/Repositories/School/dsc180...
                                                                     ^
  FAIL display_devices_connection_type_resolution_durations_ac_dc: IO Error: No files found that match the pattern "/Users/enscribe/Repositories/School/dsc180-q2/data/experiments_additive/wide_filtered_dpsgd/reporting/system_display_devices.parquet"

LINE 1: ...(avg(duration_dc),2) as average_duration_on_dc_in_seconds from read_parquet('/Users/enscribe/Repositories/School/dsc180...
                                                                          ^
  FAIL display_devices_vendors_percentage: IO Error: No files found that match the pattern "/Users/enscribe/Reposit

## Evaluation summary

Unnamed: 0,queries_evaluated,queries_passed,pass_rate,avg_score,evaluation_csv
0,8,0,0.0,0.0,/Users/enscribe/Repositories/School/dsc180-q2/...


In [None]:
baseline = pd.read_csv(ROOT / 'data' / 'results' / 'evaluation_widetable.csv')
b = baseline[baseline['n_metrics'] > 0]
n = pd.read_csv(EVAL_CSV)
n = n[n['n_metrics'] > 0]
display(Markdown('## Delta vs wide-table baseline'))
display(pd.DataFrame([{
    'baseline_passed': int(b['passed'].fillna(False).sum()),
    'new_passed': int(n['passed'].fillna(False).sum()),
    'delta_passed': int(n['passed'].fillna(False).sum()) - int(b['passed'].fillna(False).sum()),
    'baseline_avg_score': float(b['score'].mean()) if len(b) else 0.0,
    'new_avg_score': float(n['score'].mean()) if len(n) else 0.0,
    'delta_avg_score': (float(n['score'].mean()) - float(b['score'].mean())) if len(b) and len(n) else 0.0,
}]))

## Delta vs wide-table baseline

Unnamed: 0,baseline_passed,new_passed,delta_passed,baseline_avg_score,new_avg_score,delta_avg_score
0,1,0,-1,0.258065,0.0,-0.258065
