Skip to content

Commit

Permalink
Detect GPUs (#121)
Browse files Browse the repository at this point in the history
  • Loading branch information
majst01 committed May 6, 2024
1 parent c92750a commit 20724eb
Show file tree
Hide file tree
Showing 15 changed files with 297 additions and 208 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ jobs:
mv metal-hammer-initrd.img.lz4* images/metal-hammer/
- name: Upload image tarballs to GCS
run: gsutil -m cp -r -p images/metal-hammer gs://$GCS_BUCKET
- uses: release-drafter/release-drafter@v5
- uses: release-drafter/release-drafter@v6
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ RUN curl -fLsS https://sourceforge.net/projects/e1000/files/ice%20stable/${ICE_V

# ipmitool from bookworm is broken and returns with error on most commands
FROM golang:1.22-bullseye as initrd-builder
ENV UROOT_GIT_SHA_OR_TAG=v0.13.0
ENV UROOT_GIT_SHA_OR_TAG=v0.14.0
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
ca-certificates \
Expand Down
4 changes: 2 additions & 2 deletions REINSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ If only the `imageID` is given it tries to guess the primary disk of the old OS.
After wiping the primary disk, the reinstall procedure continues with the usual installation process, starting from the `installImage` method and eventually ending with the `finalizeAllocation` call, which now includes the previously mentioned `BootInfo` parameters.

**metal-core** passes-through the request to **metal-api**, sets the boot order to HD and power cycles the machine again, which in turn boots the new OS.
**metal-api** removes the `allocation.Reinstall` mark and stores the `BootInfo` details together with the newly installed `imageID` in the `allcation.MachineSetup` struct.

**metal-api** removes the `allocation.Reinstall` mark and stores the `BootInfo` details together with the newly installed `imageID` in the `allocation.MachineSetup` struct.

This was the happy path. But of course, things can go wrong. If for any reason the reinstallation process fails, we are potentially in one of the following two states: either the primary disk (and with it the existing OS) has already been wiped, or it has not. In both cases **metal-hammer** calls **metal-core** via the `/machine/abort-reinstall/<id>` endpoint, delivering the bool value `primaryDiskWiped` that indicates the actual state.
If **metal-core** fails to respond or the OS has already been wiped, the machine reboots. Otherwise it gets the `BootInfo` of the previously installed OS stored in the DS and reboots with these details into the existing OS, just as if nothing had happened at all.
Expand Down
2 changes: 1 addition & 1 deletion cmd/firmware/firmware.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func (f *Firmware) Update() {
}
}

// Run execute a comand with arguments, returns output and error
// run executes a command with arguments and returns its output and an error.
func run(log *slog.Logger, command string, args ...string) (string, error) {
path, err := exec.LookPath(command)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion cmd/firmware/intel.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func (r intel) update() error {
if err != nil {
return fmt.Errorf("unable to update intel firmware %w", err)
}
r.log.Info("intel", "updated firware output", output)
r.log.Info("intel", "updated firmware output", output)
return nil
}

Expand Down
9 changes: 6 additions & 3 deletions cmd/metal-client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

v1 "github.com/metal-stack/metal-api/pkg/api/v1"
metalgo "github.com/metal-stack/metal-go"
"github.com/metal-stack/metal-go/api/client/machine"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
Expand All @@ -18,7 +19,7 @@ import (
// MetalAPIClient bundles the handles used to talk to metal-api:
// a logger, a gRPC connection and a metal-go client.
type MetalAPIClient struct {
// log is the structured logger of this client.
log *slog.Logger
// conn is the gRPC connection; Event() builds its service client on top of it.
conn grpc.ClientConnInterface
// NOTE(review): Driver and driver both appear in this listing; Driver looks like
// the former exported spelling of the field below — confirm which one is current.
Driver metalgo.Client
// driver is the metal-go client; Machine() delegates to it.
driver metalgo.Client
}

// NewMetalAPIClient fetches the address,hmac and certificates from pixie needed to communicate with metal-api,
Expand Down Expand Up @@ -71,10 +72,12 @@ func NewMetalAPIClient(log *slog.Logger, spec *Specification) (*MetalAPIClient,
return &MetalAPIClient{
log: log,
conn: conn,
Driver: driver,
driver: driver,
}, nil
}

// Machine returns the machine client service exposed by the wrapped metal-go driver.
func (c *MetalAPIClient) Machine() machine.ClientService {
	machineService := c.driver.Machine()
	return machineService
}
// Event returns an event service client built on the gRPC connection of this client.
// A new client value is created on every call.
func (c *MetalAPIClient) Event() v1.EventServiceClient {
	eventService := v1.NewEventServiceClient(c.conn)
	return eventService
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/network/ethtool.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
"github.com/metal-stack/metal-hammer/pkg/os/command"
)

// EthtoolCommand to gather ethernet informations
// ethtoolCommand is the command used to gather ethernet information.
const ethtoolCommand = command.Ethtool

// Ethtool to query/set ethernet interfaces
Expand Down Expand Up @@ -89,7 +89,7 @@ func (e *Ethtool) disableFirmwareLLDP(ifi string) {

var buggyIntelNicDriverNames = []string{"i40e"}

// stopFirmwareLLDP stop Firmeware LLDP not persistent over reboots, only during runtime.
// stopFirmwareLLDP stops the firmware LLDP agent. This is not persistent across reboots; it only lasts during runtime.
// mount -t debugfs none /sys/kernel/debug
// echo lldp stop > /sys/kernel/debug/i40e/0000:01:00.2/command
// where <0000:01:00.2> is the pci address of the ethernet nic, this can be inspected by lspci,
Expand Down
2 changes: 1 addition & 1 deletion cmd/network/ntpdate.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func getTime(log *slog.Logger, servers []string) (t time.Time, err error) {
return
}

// NtpDate set the system time to the time comming from a ntp source
// NtpDate sets the system time to the time coming from an NTP source.
func NtpDate(log *slog.Logger) {
t, err := getTime(log, ntpServers)
if err != nil {
Expand Down
61 changes: 60 additions & 1 deletion cmd/register/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/metal-stack/metal-hammer/cmd/network"
"github.com/metal-stack/metal-hammer/cmd/storage"
"github.com/metal-stack/v"
"github.com/u-root/u-root/pkg/pci"
"github.com/vishvananda/netlink"
)

Expand Down Expand Up @@ -81,6 +82,32 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
if err != nil {
return nil, fmt.Errorf("unable to get system cpu(s) %w", err)
}
r.log.Info("cpu", "processors", cpu.String())
var metalCPUs []*v1.MachineCPU
for _, cpu := range cpu.Processors {
metalCPUs = append(metalCPUs, &v1.MachineCPU{
Vendor: cpu.Vendor,
Model: cpu.Model,
Cores: cpu.NumCores,
Threads: cpu.NumThreads,
})
}

// 0000:bd:00.0: DisplayVGA: NVIDIA Corporation AD102GL [RTX 6000 Ada Generation]

gpus, err := r.detectGPUs()
if err != nil {
return nil, fmt.Errorf("unable to get system gpu(s) %w", err)
}

var metalGPUs []*v1.MachineGPU
for _, g := range gpus {
r.log.Info("found gpu", "gpu", g.String())
metalGPUs = append(metalGPUs, &v1.MachineGPU{
Vendor: g.VendorName,
Model: g.DeviceName,
})
}

// Nics
nics := []*v1.MachineNic{}
Expand Down Expand Up @@ -166,6 +193,8 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
CpuCores: uint32(cpu.TotalCores),
Nics: nics,
Disks: disks,
Cpus: metalCPUs,
Gpus: metalGPUs,
}

// IPMI
Expand All @@ -178,7 +207,7 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
board := r.inband.Board()
b := board.BIOS
if b == nil {
return nil, fmt.Errorf("unable to read bios informations from bmc")
return nil, fmt.Errorf("unable to read bios information from bmc")
}
bios := &v1.MachineBIOS{
Version: b.Version,
Expand All @@ -198,6 +227,36 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
return request, nil
}

// detectGPUs reads all PCI devices, resolves their vendor and device names and
// returns those that look like supported GPUs.
//
// A device is kept when its vendor name contains "nvidia" and its device name
// contains "rtx" (both compared case-insensitively), e.g.
// vendor "NVIDIA Corporation", device "AD102GL [RTX 6000 Ada Generation]".
// Every kept device is logged. Errors from creating the bus reader or from
// reading the bus are returned unchanged.
func (r *Register) detectGPUs() (pci.Devices, error) {
	reader, err := pci.NewBusReader("*")
	if err != nil {
		return nil, err
	}

	scanned, err := reader.Read()
	if err != nil {
		return nil, err
	}

	// Fill in the human readable VendorName/DeviceName used for matching below.
	scanned.SetVendorDeviceName()

	var gpus pci.Devices
	for _, dev := range scanned {
		vendor := strings.ToLower(dev.VendorName)
		model := strings.ToLower(dev.DeviceName)

		// TODO if new models must be supported, this code must be refactored
		if !strings.Contains(vendor, "nvidia") || !strings.Contains(model, "rtx") {
			continue
		}

		r.log.Info("add gpu", "vendor", dev.VendorName, "device", dev.DeviceName)
		gpus = append(gpus, dev)
	}

	return gpus, nil
}

// save the content of kernel ring buffer to /var/log/syslog
// by calling the appropriate syscall.
// Only required if Memory is gathered by ghw.Memory()
Expand Down
4 changes: 2 additions & 2 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func Run(log *slog.Logger, spec *Specification, hal hal.InBand) (*event.EventEmi
return eventEmitter, fmt.Errorf("register %w", err)
}

resp, err := metalAPIClient.Driver.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
resp, err := metalAPIClient.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
if err != nil {
return eventEmitter, fmt.Errorf("fetch %w", err)
}
Expand Down Expand Up @@ -143,7 +143,7 @@ func Run(log *slog.Logger, spec *Specification, hal hal.InBand) (*event.EventEmi
if err != nil {
return eventEmitter, fmt.Errorf("wait for installation %w", err)
}
resp, err = metalAPIClient.Driver.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
resp, err = metalAPIClient.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
if err != nil {
return eventEmitter, fmt.Errorf("wait for installation %w", err)
}
Expand Down
10 changes: 8 additions & 2 deletions cmd/supwd.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ func (h *Hammer) createBmcSuperuser() error {
return fmt.Errorf("failed to fetch SuperUser password %w", err)
}

if resp.FeatureDisabled {
if resp.SuperUserPassword == "" {
h.log.Warn("creation of superuser disabled because password is empty")
return nil
}

bmcConn := h.Hal.BMCConnection()

err = bmcConn.CreateUser(bmcConn.SuperUser(), api.AdministratorPrivilege, resp.SuperUserPassword)
if err != nil {
return fmt.Errorf("failed to create bmc superuser: %s %w", bmcConn.SuperUser().Name, err)
// FIXME: this happens always after the first creation on X12 and newer boards
// return fmt.Errorf("failed to create bmc superuser: %s %w", bmcConn.SuperUser().Name, err)
h.log.Error("failed to create bmc superuser", "user", bmcConn.SuperUser().Name, "error", err)
return nil
}

h.log.Info("created superuser", "user", bmcConn.SuperUser().Name)
return nil
}
Loading

0 comments on commit 20724eb

Please sign in to comment.