diff --git a/internal/controller/vm/vm.go b/internal/controller/vm/vm.go index b335249d1b..d623ef2d7f 100644 --- a/internal/controller/vm/vm.go +++ b/internal/controller/vm/vm.go @@ -188,12 +188,30 @@ func (c *Controller) StartVM(ctx context.Context, opts *StartOptions) (err error }() defer cancel() - // we should set up the necessary listeners for guest-host communication. - // The guest needs to connect to predefined vsock ports. - // The host must already be listening on these ports before the guest attempts to connect, - // otherwise the connection would fail. - c.setupEntropyListener(gctx, g) - c.setupLoggingListener(gctx, g) + // Set up the host-side hvsock listeners for entropy and logs before + // starting the VM. The guest dials predefined vsock ports early in boot, + // so the listeners must be bound up front to avoid a race. + // Each setup call creates the listener synchronously and dispatches an + // accept goroutine onto the errgroup: + // - entropy: writes seed bytes to the guest, then returns. + // - logging: accepts the connection and spawns a long-running relay + // for guest logs; the accept goroutine itself returns immediately. + // + // We intentionally wait on the error group after VM start but before + // establishing the GCS connection, to ensure entropy is seeded and the + // log channel is wired up first. + if err = c.setupEntropyListener(gctx, g); err != nil { + return fmt.Errorf("failed to set up entropy listener: %w", err) + } + if err = c.setupLoggingListener(gctx, g); err != nil { + return fmt.Errorf("failed to set up logging listener: %w", err) + } + // Open the host-side GCS hvsock listener before VM start so the host + // is listening when the in-VM GCS dials. Otherwise, GCS falls back to + // the internal HCS bridge and our accept hangs until timeout. + if err = c.guest.PrepareConnection(opts.GCSServiceID); err != nil { + return fmt.Errorf("failed to prepare guest connection: %w", err) + } err = c.uvm.Start(ctx) if err != nil { @@ -210,7 +228,9 @@ func (c *Controller) StartVM(ctx context.Context, opts *StartOptions) (err error return err } - err = c.guest.CreateConnection(ctx, opts.GCSServiceID, opts.ConfigOptions...) + // VM is started, entropy is seeded and log channel is up. Accept the + // GCS dial on the prepared listener and run the GCS protocol handshake. + err = c.guest.CreateConnection(ctx, opts.ConfigOptions...) if err != nil { return fmt.Errorf("failed to create guest connection: %w", err) } diff --git a/internal/controller/vm/vm_lcow.go b/internal/controller/vm/vm_lcow.go index 72b8edbf87..0775ff1377 100644 --- a/internal/controller/vm/vm_lcow.go +++ b/internal/controller/vm/vm_lcow.go @@ -111,16 +111,18 @@ func (c *Controller) Plan9Controller() *plan9.Controller { // Linux VMs require entropy to initialize their random number generators during boot. // This method listens on a predefined vsock port and provides cryptographically secure // random data to the Linux init process when it connects. -func (c *Controller) setupEntropyListener(ctx context.Context, group *errgroup.Group) { +func (c *Controller) setupEntropyListener(ctx context.Context, group *errgroup.Group) error { + // The Linux guest will connect to this port during init to receive entropy. + entropyConn, err := winio.ListenHvsock(&winio.HvsockAddr{ + VMID: c.uvm.RuntimeID(), + ServiceID: winio.VsockServiceID(vmutils.LinuxEntropyVsockPort), + }) + if err != nil { + return fmt.Errorf("failed to listen on hvSocket for entropy: %w", err) + } + group.Go(func() error { - // The Linux guest will connect to this port during init to receive entropy. - entropyConn, err := winio.ListenHvsock(&winio.HvsockAddr{ - VMID: c.uvm.RuntimeID(), - ServiceID: winio.VsockServiceID(vmutils.LinuxEntropyVsockPort), - }) - if err != nil { - return fmt.Errorf("failed to listen on hvSocket for entropy: %w", err) - } + defer entropyConn.Close() // Prepare to provide entropy to the init process in the background. This // must be done in a goroutine since, when using the internal bridge, the @@ -135,13 +137,14 @@ func (c *Controller) setupEntropyListener(ctx context.Context, group *errgroup.G // Write the required amount of entropy to the connection. // The init process will read this data and use it to seed the kernel's // random number generator (CRNG). - _, err = io.CopyN(conn, rand.Reader, vmutils.LinuxEntropyBytes) - if err != nil { + if _, err = io.CopyN(conn, rand.Reader, vmutils.LinuxEntropyBytes); err != nil { return fmt.Errorf("failed to write entropy to connection: %w", err) } return nil }) + + return nil } // setupLoggingListener sets up logging for LCOW UVMs. @@ -149,17 +152,19 @@ func (c *Controller) setupEntropyListener(ctx context.Context, group *errgroup.G // This method establishes a vsock connection to receive log output from GCS // running inside the Linux VM. The logs are parsed and // forwarded to the host's logging system for monitoring and debugging. -func (c *Controller) setupLoggingListener(ctx context.Context, group *errgroup.Group) { +func (c *Controller) setupLoggingListener(ctx context.Context, group *errgroup.Group) error { + // The GCS will connect to this port to stream log output. + logConn, err := winio.ListenHvsock(&winio.HvsockAddr{ + VMID: c.uvm.RuntimeID(), + ServiceID: winio.VsockServiceID(vmutils.LinuxLogVsockPort), + }) + if err != nil { + close(c.logOutputDone) + return fmt.Errorf("failed to listen on hvSocket for logs: %w", err) + } + group.Go(func() error { - // The GCS will connect to this port to stream log output. - logConn, err := winio.ListenHvsock(&winio.HvsockAddr{ - VMID: c.uvm.RuntimeID(), - ServiceID: winio.VsockServiceID(vmutils.LinuxLogVsockPort), - }) - if err != nil { - close(c.logOutputDone) - return fmt.Errorf("failed to listen on hvSocket for logs: %w", err) - } + defer logConn.Close() // Accept the connection from the GCS. conn, err := vmmanager.AcceptConnection(ctx, c.uvm, logConn, true) @@ -170,6 +175,8 @@ func (c *Controller) setupLoggingListener(ctx context.Context, group *errgroup.G // Launch a separate goroutine to process logs for the lifetime of the VM. go func() { + defer conn.Close() + // Parse GCS log output and forward it to the host logging system. vmutils.ParseGCSLogrus(c.uvm.ID())(conn) @@ -180,6 +187,8 @@ func (c *Controller) setupLoggingListener(ctx context.Context, group *errgroup.G return nil }) + + return nil } // finalizeGCSConnection finalizes the GCS connection for LCOW VMs. diff --git a/internal/controller/vm/vm_wcow.go b/internal/controller/vm/vm_wcow.go index 80d8fa8c0d..d9a57b4b61 100644 --- a/internal/controller/vm/vm_wcow.go +++ b/internal/controller/vm/vm_wcow.go @@ -40,7 +40,7 @@ func (c *Controller) buildConfidentialOptions(_ context.Context) (*guestresource // This is a no-op implementation to satisfy the platform-specific interface. // // For comparison, LCOW VMs require entropy to be provided during boot. -func (c *Controller) setupEntropyListener(_ context.Context, _ *errgroup.Group) {} +func (c *Controller) setupEntropyListener(_ context.Context, _ *errgroup.Group) error { return nil } // setupLoggingListener sets up logging for WCOW UVMs. // @@ -52,25 +52,22 @@ func (c *Controller) setupEntropyListener(_ context.Context, _ *errgroup.Group) // The listener is configured to accept only one concurrent connection at a time // to prevent resource exhaustion, but will accept new connections if the current one is closed. // This supports scenarios where the logging service inside the VM needs to restart. -func (c *Controller) setupLoggingListener(ctx context.Context, _ *errgroup.Group) { +func (c *Controller) setupLoggingListener(ctx context.Context, _ *errgroup.Group) error { + baseListener, err := winio.ListenHvsock(&winio.HvsockAddr{ + VMID: c.uvm.RuntimeID(), + ServiceID: prot.WindowsLoggingHvsockServiceID, + }) + if err != nil { + close(c.logOutputDone) + return fmt.Errorf("failed to listen for windows logging connections: %w", err) + } + // For Windows, the listener can receive a connection later (after VM starts), // so we start the output handler in a goroutine with a non-timeout context. // This allows the output handler to run independently of the VM creation lifecycle. // This is useful for the case when the logging service is restarted. go func() { - baseListener, err := winio.ListenHvsock(&winio.HvsockAddr{ - VMID: c.uvm.RuntimeID(), - ServiceID: prot.WindowsLoggingHvsockServiceID, - }) - if err != nil { - // Close the output done channel to signal that logging setup - // has failed and no logs will be processed. - close(c.logOutputDone) - logrus.WithError(err).Error("failed to listen for windows logging connections") - - // Return early due to error. - return - } + defer baseListener.Close() // Use a WaitGroup to track active log processing goroutines. // This ensures we wait for all log processing to complete before closing logOutputDone. @@ -91,6 +88,8 @@ func (c *Controller) setupLoggingListener(ctx context.Context, _ *errgroup.Group wg.Add(1) go func() { defer wg.Done() + defer conn.Close() + logrus.Info("uvm output handler starting") // Parse GCS log output and forward it to the host logging system. @@ -107,6 +106,8 @@ func (c *Controller) setupLoggingListener(ctx context.Context, _ *errgroup.Group // Signal that log output processing has completed. close(c.logOutputDone) }() + + return nil } // finalizeGCSConnection finalizes the GCS connection for WCOW UVMs. diff --git a/internal/vm/guestmanager/doc.go b/internal/vm/guestmanager/doc.go index af849c2ad3..c901a77248 100644 --- a/internal/vm/guestmanager/doc.go +++ b/internal/vm/guestmanager/doc.go @@ -16,6 +16,8 @@ GCS connection: g, err := guestmanager.New(ctx, uvm) if err != nil { // handle error } + if err := g.PrepareConnection(gcsServiceID); err != nil { // handle error } + // (start the UVM here) if err := g.CreateConnection(ctx); err != nil { // handle error } After the connection is established, use the manager interfaces for guest-side changes: diff --git a/internal/vm/guestmanager/guest.go b/internal/vm/guestmanager/guest.go index 54e0a56e53..a95e5b9265 100644 --- a/internal/vm/guestmanager/guest.go +++ b/internal/vm/guestmanager/guest.go @@ -5,6 +5,7 @@ package guestmanager import ( "context" "fmt" + "net" "sync" "github.com/Microsoft/hcsshim/internal/gcs" @@ -46,6 +47,8 @@ type Guest struct { // gc is the active GCS connection to the guest. // It will be nil if no connection is active. gc *gcs.GuestConnection + // gcListener is bound by PrepareConnection and consumed by CreateConnection. + gcListener net.Listener } // New creates a new Guest Manager. @@ -67,26 +70,22 @@ func WithInitializationState(state *gcs.InitialGuestState) ConfigOption { } } -// CreateConnection accepts the GCS connection and performs initial setup. -func (gm *Guest) CreateConnection(ctx context.Context, GCSServiceID guid.GUID, opts ...ConfigOption) error { +// PrepareConnection opens the host-side hvsock listener for the given GCS +// service ID. Must be called before VM start so the host is listening when +// the in-VM GCS dials. Idempotent for the same service ID. +func (gm *Guest) PrepareConnection(GCSServiceID guid.GUID) error { gm.mu.Lock() defer gm.mu.Unlock() - // Return early if a connection is already active. - if gm.gc != nil { - // If the caller tried to connect to a different GCS service then error out. + // Idempotent if already prepared/connected with the same service ID. + if gm.gcListener != nil || gm.gc != nil { if gm.gcsServiceID != GCSServiceID { return fmt.Errorf("gcs service id mismatch: expected %s, got %s", gm.gcsServiceID, GCSServiceID) } return nil } - gm.gcsServiceID = GCSServiceID - - // The guest needs to connect to predefined GCS port. - // The host must already be listening on these port before the guest attempts to connect, - // otherwise the connection would fail. - vmConn, err := winio.ListenHvsock(&winio.HvsockAddr{ + l, err := winio.ListenHvsock(&winio.HvsockAddr{ VMID: gm.uvm.RuntimeID(), ServiceID: GCSServiceID, }) @@ -94,8 +93,30 @@ func (gm *Guest) CreateConnection(ctx context.Context, GCSServiceID guid.GUID, o return fmt.Errorf("failed to listen for guest connection: %w", err) } - // Accept the connection - conn, err := vmmanager.AcceptConnection(ctx, gm.uvm, vmConn, true) + gm.gcsServiceID = GCSServiceID + gm.gcListener = l + return nil +} + +// CreateConnection accepts the GCS dial on the prepared listener and runs +// the GCS protocol handshake. Must be called after VM start. Idempotent if +// a connection already exists. +func (gm *Guest) CreateConnection(ctx context.Context, opts ...ConfigOption) error { + gm.mu.Lock() + defer gm.mu.Unlock() + + if gm.gc != nil { + return nil + } + if gm.gcListener == nil { + return fmt.Errorf("CreateConnection called before PrepareConnection") + } + + // AcceptConnection takes ownership of the listener and closes it. + l := gm.gcListener + gm.gcListener = nil + + conn, err := vmmanager.AcceptConnection(ctx, gm.uvm, l, true) if err != nil { return fmt.Errorf("failed to connect to GCS: %w", err) } @@ -134,6 +155,9 @@ func (gm *Guest) CloseConnection() error { err = gm.gc.Close() gm.gc = nil } - + if gm.gcListener != nil { + _ = gm.gcListener.Close() + gm.gcListener = nil + } return err } diff --git a/internal/vm/vmmanager/utils.go b/internal/vm/vmmanager/utils.go index 535c963f4d..f6ac5a04f1 100644 --- a/internal/vm/vmmanager/utils.go +++ b/internal/vm/vmmanager/utils.go @@ -24,7 +24,7 @@ func AcceptConnection(ctx context.Context, uvm vmWaiter, l net.Listener, closeCo conn net.Conn err error } - resultCh := make(chan acceptResult) + resultCh := make(chan acceptResult, 1) go func() { c, err := l.Accept() @@ -45,6 +45,7 @@ func AcceptConnection(ctx context.Context, uvm vmWaiter, l net.Listener, closeCo } return res.conn, res.err case <-ctx.Done(): + _ = l.Close() return nil, ctx.Err() case <-vmExitCh: }