Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ func (r *Runner) Drain(ctx context.Context) error {
}

r.Log.Info("stopping sandbox", "id", md.ID)
err := r.sbController.Delete(ctx, md.ID)
err := r.sbController.Delete(ctx, md.ID, nil)
if err != nil {
r.Log.Error("failed to stop sandbox", "id", md.ID, "error", err)
drainErr = multierror.Append(drainErr, fmt.Errorf("failed to stop sandbox %s: %w", md.ID, err))
Expand Down
2 changes: 1 addition & 1 deletion controllers/certificate/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ func (c *Controller) Reconcile(ctx context.Context, route *ingress_v1alpha.HttpR
}

// Delete handles http_route deletion - we keep the cert in cache/disk for potential reuse
func (c *Controller) Delete(ctx context.Context, id entity.Id) error {
func (c *Controller) Delete(ctx context.Context, id entity.Id, obj *ingress_v1alpha.HttpRoute) error {
c.Log.Debug("http_route deleted, keeping certificate in cache", "route", id)
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/disk_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (d *DiskController) Update(ctx context.Context, disk *storage_v1alpha.Disk,
}

// Delete handles deletion of a disk entity
func (d *DiskController) Delete(ctx context.Context, id entity.Id) error {
func (d *DiskController) Delete(ctx context.Context, id entity.Id, obj *storage_v1alpha.Disk) error {
d.Log.Info("Processing disk deletion", "disk", id)
// Deletion is handled through the DELETING status in reconcileDisk
return nil
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/disk_lease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func (d *DiskLeaseController) Update(ctx context.Context, lease *storage_v1alpha
}

// Delete handles deletion of a disk lease entity
func (d *DiskLeaseController) Delete(ctx context.Context, id entity.Id) error {
func (d *DiskLeaseController) Delete(ctx context.Context, id entity.Id, obj *storage_v1alpha.DiskLease) error {
d.Log.Info("Processing lease deletion", "lease", id)

leaseId := id.String()
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/disk_lease_controller_directory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ func TestDiskLeaseController_DirectoryMode_Integration(t *testing.T) {
assert.False(t, exists)

// Step 3: Delete the lease
err = dlc.Delete(ctx, lease.ID)
err = dlc.Delete(ctx, lease.ID, nil)
require.NoError(t, err)

// Verify complete cleanup
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/disk_lease_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func TestDiskLeaseController_Delete(t *testing.T) {
}

// Process the deletion
err := dlc.Delete(context.Background(), entity.Id("disk-lease/test-lease"))
err := dlc.Delete(context.Background(), entity.Id("disk-lease/test-lease"), nil)
require.NoError(t, err)

// Should remove from active leases
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/disk_watch_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (d *DiskWatchController) Update(ctx context.Context, disk *storage_v1alpha.
}

// Delete handles deletion of a disk entity
func (d *DiskWatchController) Delete(ctx context.Context, id entity.Id) error {
func (d *DiskWatchController) Delete(ctx context.Context, id entity.Id, obj *storage_v1alpha.Disk) error {
// Disk deleted - dependent leases will handle this
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/disk/mount_watch_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ func (m *MountWatchController) Update(ctx context.Context, mount *storage_v1alph
return nil
}

func (m *MountWatchController) Delete(ctx context.Context, id entity.Id) error {
func (m *MountWatchController) Delete(ctx context.Context, id entity.Id, obj *storage_v1alpha.LsvdMount) error {
return nil
}
2 changes: 1 addition & 1 deletion controllers/disk/volume_watch_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ func (v *VolumeWatchController) Update(ctx context.Context, vol *storage_v1alpha
return nil
}

func (v *VolumeWatchController) Delete(ctx context.Context, id entity.Id) error {
func (v *VolumeWatchController) Delete(ctx context.Context, id entity.Id, obj *storage_v1alpha.LsvdVolume) error {
return nil
}
2 changes: 1 addition & 1 deletion controllers/ingress/defaultroute.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (c *DefaultRouteController) Update(ctx context.Context, route *ingress_v1al
}

// Delete handles http_route deletion events
func (c *DefaultRouteController) Delete(ctx context.Context, id entity.Id) error {
func (c *DefaultRouteController) Delete(ctx context.Context, id entity.Id, obj *ingress_v1alpha.HttpRoute) error {
c.Log.Info("HttpRoute deleted; doing nothing", "route", id)
// Note: We don't automatically promote another route to default when
// the default route is deleted, as per the requirements
Expand Down
2 changes: 1 addition & 1 deletion controllers/ingress/defaultrouteapp.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (c *DefaultRouteAppController) Update(ctx context.Context, app *core_v1alph
}

// Delete handles app deletion events
func (c *DefaultRouteAppController) Delete(ctx context.Context, id entity.Id) error {
func (c *DefaultRouteAppController) Delete(ctx context.Context, id entity.Id, obj *core_v1alpha.App) error {
c.Log.Info("App deleted", "app", id)

// Check if this app had a default route
Expand Down
71 changes: 69 additions & 2 deletions controllers/sandbox/firewall.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/containernetworking/plugins/pkg/utils/sysctl"
compute "miren.dev/runtime/api/compute/compute_v1alpha"
"miren.dev/runtime/network"
"miren.dev/runtime/pkg/netutil"
)

func (c *SandboxController) configureFirewall(sb *compute.Sandbox, ep *network.EndpointConfig) error {
Expand Down Expand Up @@ -39,7 +40,7 @@ func (c *SandboxController) configurePort(p compute.SandboxSpecContainerPort, ep

exe := exec.Command("iptables",
"-t", "nat",
"-A", "PREROUTING",
"-I", "PREROUTING",
"!", "-i", c.Bridge,
"-p", "tcp",
"-m", "tcp",
Expand All @@ -53,7 +54,7 @@ func (c *SandboxController) configurePort(p compute.SandboxSpecContainerPort, ep

exe = exec.Command("iptables",
"-t", "nat",
"-A", "OUTPUT",
"-I", "OUTPUT",
"-p", "tcp",
"-m", "tcp",
"-d", "127.0.0.1",
Expand Down Expand Up @@ -82,3 +83,69 @@ func (c *SandboxController) configurePort(p compute.SandboxSpecContainerPort, ep

return nil
}

func (c *SandboxController) unconfigureFirewall(sb *compute.Sandbox) {
if len(sb.Network) == 0 {
c.Log.Warn("no network info on sandbox, skipping firewall cleanup", "sandbox", sb.ID.String())
return
}

ip, err := netutil.ParseNetworkAddress(sb.Network[0].Address)
if err != nil {
c.Log.Warn("failed to parse sandbox network address for firewall cleanup", "sandbox", sb.ID.String(), "address", sb.Network[0].Address, "err", err)
return
}

for _, co := range sb.Spec.Container {
for _, p := range co.Port {
c.unconfigurePort(p, ip)
}
}
}

func (c *SandboxController) unconfigurePort(p compute.SandboxSpecContainerPort, ip string) {
if p.NodePort == 0 {
return
}

c.Log.Info("removing firewall rules", "nodePort", p.NodePort, "targetPort", p.Port, "ip", ip)

out, err := exec.Command("iptables",
"-t", "nat",
"-D", "PREROUTING",
"!", "-i", c.Bridge,
"-p", "tcp",
"-m", "tcp",
"--dport", strconv.Itoa(int(p.NodePort)),
"-j", "DNAT", "--to-destination", fmt.Sprintf("%s:%d", ip, p.Port),
).CombinedOutput()
if err != nil {
c.Log.Warn("failed to remove PREROUTING rule", "nodePort", p.NodePort, "err", err, "output", string(out))
}

out, err = exec.Command("iptables",
"-t", "nat",
"-D", "OUTPUT",
"-p", "tcp",
"-m", "tcp",
"-d", "127.0.0.1",
"--dport", strconv.Itoa(int(p.NodePort)),
"-j", "DNAT", "--to-destination", fmt.Sprintf("%s:%d", ip, p.Port),
).CombinedOutput()
if err != nil {
c.Log.Warn("failed to remove OUTPUT rule", "nodePort", p.NodePort, "err", err, "output", string(out))
}

out, err = exec.Command("iptables",
"-t", "nat",
"-D", "POSTROUTING",
"-s", "127.0.0.1",
"-p", "tcp",
"-d", ip,
"--dport", strconv.Itoa(int(p.Port)),
"-j", "MASQUERADE",
).CombinedOutput()
if err != nil {
c.Log.Warn("failed to remove POSTROUTING rule", "nodePort", p.NodePort, "err", err, "output", string(out))
}
}
3 changes: 3 additions & 0 deletions controllers/sandbox/firewall_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ import (
func (c *SandboxController) configureFirewall(sb *compute.Sandbox, ep *network.EndpointConfig) error {
return nil
}

func (c *SandboxController) unconfigureFirewall(sb *compute.Sandbox) {
}
31 changes: 19 additions & 12 deletions controllers/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ func (c *SandboxController) reconcileSandboxesOnBoot(ctx context.Context) error
c.Log.Info("marking unhealthy sandbox as DEAD and cleaning up", "id", id)

// Try to clean up the sandbox
err := c.Delete(ctx, id)
err := c.Delete(ctx, id, nil)
if err != nil {
c.Log.Error("failed to cleanup unhealthy sandbox", "id", id, "err", err)
// Continue with other sandboxes even if one fails
Expand Down Expand Up @@ -2229,10 +2229,11 @@ cleanup:
return nil
}

func (c *SandboxController) Delete(ctx context.Context, id entity.Id) error {
func (c *SandboxController) Delete(ctx context.Context, id entity.Id, sb *compute.Sandbox) error {
c.Log.Debug("delete callback received, cleaning up sandbox", "id", id)
// Delete is called when an entity has been deleted from the store.
// Just pass through to stopSandbox which will discover everything from containerd.
if sb != nil {
c.unconfigureFirewall(sb)
}
return c.stopSandbox(ctx, id)
}

Expand Down Expand Up @@ -2270,13 +2271,17 @@ func (c *SandboxController) stopSandbox(ctx context.Context, id entity.Id) error
c.Log.Warn("failed to load pause container", "id", id, "err", err)
}

// Fallback: if we couldn't get IPs from container labels, try the entity store
if len(sandboxIPs) == 0 {
resp, err := c.EAC.Get(ctx, id.String())
if err == nil {
var sb compute.Sandbox
sb.Decode(resp.Entity().Entity())
// Fetch sandbox entity for firewall cleanup and as IP fallback
resp, entityErr := c.EAC.Get(ctx, id.String())
if entityErr == nil {
var sb compute.Sandbox
sb.Decode(resp.Entity().Entity())

// Clean up iptables DNAT rules before destroying containers
c.unconfigureFirewall(&sb)

// Use entity store IPs as fallback if container labels didn't have them
if len(sandboxIPs) == 0 {
sandboxIPs = make(map[string]bool)
for _, net := range sb.Network {
addr := net.Address
Expand All @@ -2293,9 +2298,11 @@ func (c *SandboxController) stopSandbox(ctx context.Context, id entity.Id) error
if len(sandboxIPs) > 0 {
c.Log.Debug("retrieved IPs from entity store for cleanup", "id", id, "ips", sandboxIPs)
}
} else if !errors.Is(err, cond.ErrNotFound{}) {
c.Log.Warn("failed to get sandbox entity for IP cleanup", "id", id, "err", err)
}
} else if !errors.Is(entityErr, cond.ErrNotFound{}) {
c.Log.Warn("failed to get sandbox entity for cleanup", "id", id, "err", entityErr)
} else {
c.Log.Debug("sandbox entity already deleted, firewall cleanup handled by DeleteEntity if available", "id", id)
}
Comment on lines +2302 to 2306
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Firewall cleanup is skipped on NotFound, which can miss direct-delete teardowns.

When sandbox entity lookup fails with not found, cleanup is fully bypassed. In delete-driven teardown flows, that can leave DNAT rules behind. Please add a fallback source for port/IP cleanup data when the entity is absent (or ensure cleanup happens before entity deletion in that flow).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@controllers/sandbox/sandbox.go` around lines 2301 - 2305, The current branch
that skips firewall cleanup when errors.Is(entityErr, cond.ErrNotFound{}) needs
a fallback: instead of bypassing DNAT/port cleanup, fetch the needed port/IP
cleanup metadata from an alternate source and proceed with cleanup. Update the
code around the entityErr check in controllers/sandbox/sandbox.go (the block
logging "failed to get sandbox entity for cleanup" / "sandbox entity already
deleted") so that when cond.ErrNotFound{} is detected you attempt to load
port/IP bindings from a fallback store (e.g., the firewall metadata table,
network binding records, or any SandboxRecord/cleanup cache your service has)
and call the same DNAT/port cleanup routine with that data; ensure the cleanup
routine and logs remain the same path used when the entity is found so DNAT
rules are removed even if the sandbox entity was directly deleted.


// Fallback if we couldn't get LogEntity from labels
Expand Down
10 changes: 5 additions & 5 deletions controllers/sandbox/sandbox_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ func TestSandbox(t *testing.T) {
r.NoError(err)

// Stop the first sandbox (this should set status to DEAD)
err = sbc.Delete(ctx, sbID1)
err = sbc.Delete(ctx, sbID1, nil)
r.NoError(err)

// Manually update the UpdatedAt timestamp to be older than our test time horizon
Expand Down Expand Up @@ -816,7 +816,7 @@ func TestSandbox(t *testing.T) {
r.Equal(compute.RUNNING, remainingSb.Status)

// Clean up the remaining sandbox
err = sbc.Delete(ctx, sbID2)
err = sbc.Delete(ctx, sbID2, nil)
r.NoError(err)
})

Expand Down Expand Up @@ -930,7 +930,7 @@ func TestSandbox(t *testing.T) {
}

// Clean up
err = co.Delete(ctx, id)
err = co.Delete(ctx, id, nil)
r.NoError(err)

// NOTE we only track port binding now, not unbounding.
Expand Down Expand Up @@ -1050,7 +1050,7 @@ func TestSandbox(t *testing.T) {
}

// Clean up
err = co.Delete(ctx, id)
err = co.Delete(ctx, id, nil)
r.NoError(err)
})

Expand Down Expand Up @@ -1226,7 +1226,7 @@ func TestSandbox(t *testing.T) {
}, 15*time.Second, 500*time.Millisecond, "task should stay running and logs should be collected")

// Clean up
err = co1.Delete(ctx, id)
err = co1.Delete(ctx, id, nil)
r.NoError(err)
})

Expand Down
2 changes: 1 addition & 1 deletion controllers/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,6 @@ func (s *ServiceController) Create(ctx context.Context, srv *network_v1alpha.Ser
return nil
}

func (s *ServiceController) Delete(ctx context.Context, id entity.Id) error {
func (s *ServiceController) Delete(ctx context.Context, id entity.Id, obj *network_v1alpha.Service) error {
return nil
}
8 changes: 4 additions & 4 deletions controllers/service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func TestServiceController(t *testing.T) {
r.True(found, "Expected to find endpoints for service")

// Clean up
err = sbC.Delete(ctx, sbID)
err = sbC.Delete(ctx, sbID, nil)
r.NoError(err)
})

Expand Down Expand Up @@ -370,7 +370,7 @@ func TestServiceController(t *testing.T) {
r.NotEmpty(sandboxIP)

// Delete the sandbox
err = sbC.Delete(ctx, sbID)
err = sbC.Delete(ctx, sbID, nil)
r.NoError(err)

// Poll for endpoints to be created
Expand Down Expand Up @@ -611,7 +611,7 @@ func TestServiceController(t *testing.T) {
r.Equal(2, serviceEndpoints, "Expected to find 2 endpoint entities for service")

// Delete first sandbox
err = sbC.Delete(ctx, sbID1)
err = sbC.Delete(ctx, sbID1, nil)
r.NoError(err)

// Poll for endpoints to be created
Expand All @@ -638,7 +638,7 @@ func TestServiceController(t *testing.T) {
r.Equal(1, serviceEndpoints, "Expected to find 1 endpoint entity for service after deleting first sandbox")

// Clean up
err = sbC.Delete(ctx, sbID2)
err = sbC.Delete(ctx, sbID2, nil)
r.NoError(err)
})

Expand Down
16 changes: 12 additions & 4 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,11 @@ type ControllerEntity interface {
type GenericController[P ControllerEntity] interface {
Init(context.Context) error
Create(ctx context.Context, obj P, meta *entity.Meta) error
Delete(ctx context.Context, e entity.Id) error
// Delete is called when an entity is deleted. The obj parameter contains the
// entity data from before deletion (available since delete watch events now
// include the previous value). It may be nil if the entity data was unavailable
// (e.g., watch reconnect after etcd compaction).
Delete(ctx context.Context, id entity.Id, obj P) error
}

// UpdatingController is an optional interface that controllers can implement
Expand Down Expand Up @@ -609,8 +613,13 @@ func AdaptController[
return entity.Diff(meta.Entity, orig), err

case EventDeleted:
if err := cont.Delete(ctx, event.Id); err != nil {
return nil, fmt.Errorf("failed to create entity: %w", err)
var obj P
if event.Entity != nil {
obj = new(T)
obj.Decode(event.Entity)
}
if err := cont.Delete(ctx, event.Id, obj); err != nil {
return nil, fmt.Errorf("failed to delete entity: %w", err)
}
}

Expand Down Expand Up @@ -658,7 +667,6 @@ func AdaptReconcileController[
return entity.Diff(meta.Entity, orig), err

case EventDeleted:
// Check if the controller implements DeletingReconcileController
if deleter, ok := any(cont).(DeletingReconcileController); ok {
if err := deleter.Delete(ctx, event.Id); err != nil {
return nil, fmt.Errorf("failed to delete entity: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ func (c *BasicController) Create(ctx context.Context, obj *TestEntity, meta *ent
return nil
}

func (c *BasicController) Delete(ctx context.Context, id entity.Id) error {
func (c *BasicController) Delete(ctx context.Context, id entity.Id, obj *TestEntity) error {
c.DeleteCalls = append(c.DeleteCalls, string(id))
return nil
}
Expand Down
Loading