In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Fix the global RNG seed so the data, the weight init and training repeat exactly.
torch.manual_seed(0)

# Toy dataset: 500 samples, each with 5 standard-normal features.
X = torch.randn(500, 5)
# Label is 1 when a sample's feature sum is positive, otherwise 0 (an artificial rule).
y = X.sum(dim=1).gt(0).to(torch.long)

# Small MLP classifier: 5 inputs -> 10 hidden units (ReLU) -> 2 class logits.
model = nn.Sequential(nn.Linear(5, 10), nn.ReLU(), nn.Linear(10, 2))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is one gradient step over the whole dataset.
for epoch in range(200):
    optimizer.zero_grad()
    logits = model(X)
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()

In [2]:
# Model-inversion attack: starting from noise, optimise an input until the
# trained `model` scores it as class 1.
target_class = torch.tensor([1])

# start from random noise; this tensor is the only thing being optimised
x_fake = torch.randn(1, 5, requires_grad=True)

attack_optimizer = optim.Adam([x_fake], lr=0.1)

for step in range(300):
    attack_optimizer.zero_grad()

    output = model(x_fake)
    # Negated target-class logit: minimising it maximises the class score.
    # NOTE: the raw logit has no upper bound, so x_fake can keep growing with
    # more steps instead of settling at a fixed point.
    loss = -output[0, target_class]

    # Backpropagate into x_fake only. A plain backward() would also accumulate
    # gradients into every model parameter on each step, leaving stale .grad
    # tensors on the trained model that nothing here ever clears.
    loss.backward(inputs=[x_fake])
    attack_optimizer.step()

# final reconstructed input
print("Reconstructed input:", x_fake.detach())
print("Model prediction:", torch.softmax(model(x_fake), dim=1))

Reconstructed input: tensor([[29.4476, 31.1460, 29.2949, 28.4703, 30.8170]])
Model prediction: tensor([[0., 1.]], grad_fn=<SoftmaxBackward0>)


### 1️⃣ Baseline (already done)

> You already saw:

> + the attacker reaches ≥99% confidence in the target class (the softmax output above is `[0., 1.]`)

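> + the reconstructed input moves steadily toward the target class — every feature is pushed to a large positive value (≈30), consistent with the `sum > 0` labelling rule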

>> + That’s the danger zone.