Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OCPCLOUD-812] Enable support for instances with GPUs on GCP #172

Merged
merged 1 commit into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,26 @@ type GCPMachineProviderSpec struct {
// CredentialsSecret is a reference to the secret with GCP credentials.
CredentialsSecret *corev1.LocalObjectReference `json:"credentialsSecret,omitempty"`

CanIPForward bool `json:"canIPForward"`
DeletionProtection bool `json:"deletionProtection"`
Disks []*GCPDisk `json:"disks,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Metadata []*GCPMetadata `json:"gcpMetadata,omitempty"`
NetworkInterfaces []*GCPNetworkInterface `json:"networkInterfaces,omitempty"`
ServiceAccounts []GCPServiceAccount `json:"serviceAccounts"`
Tags []string `json:"tags,omitempty"`
TargetPools []string `json:"targetPools,omitempty"`
MachineType string `json:"machineType"`
Region string `json:"region"`
Zone string `json:"zone"`
ProjectID string `json:"projectID,omitempty"`
CanIPForward bool `json:"canIPForward"`
DeletionProtection bool `json:"deletionProtection"`
Disks []*GCPDisk `json:"disks,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Metadata []*GCPMetadata `json:"gcpMetadata,omitempty"`
NetworkInterfaces []*GCPNetworkInterface `json:"networkInterfaces,omitempty"`
ServiceAccounts []GCPServiceAccount `json:"serviceAccounts"`
Tags []string `json:"tags,omitempty"`
TargetPools []string `json:"targetPools,omitempty"`
MachineType string `json:"machineType"`
Region string `json:"region"`
Zone string `json:"zone"`
ProjectID string `json:"projectID,omitempty"`
GuestAccelerators []*GCPAcceleratorConfig `json:"guestAccelerators,omitempty"`

// Preemptible indicates if created instance is preemptible
Preemptible bool `json:"preemptible,omitempty"`

OnHostMaintenance string `json:"onHostMaintenance,omitempty"`
AutomaticRestart *bool `json:"automaticRestart,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down Expand Up @@ -104,3 +108,12 @@ type GCPKMSKeyReference struct {
// Location is the GCP location in which the Key Ring exists.
Location string `json:"location"`
}

// GCPAcceleratorConfig describes the type and number of accelerator cards (GPUs)
// attached to a GCP instance. Both fields are omitted from the JSON form when
// they hold their zero value.
type GCPAcceleratorConfig struct {
// AcceleratorCount is the number of accelerators (GPUs) of type AcceleratorType
// to be attached to an instance.
AcceleratorCount int64 `json:"acceleratorCount,omitempty"`
// AcceleratorType is the type of accelerator (GPU) to be attached to an instance.
// Supported accelerator types are: nvidia-tesla-k80, nvidia-tesla-p100,
// nvidia-tesla-v100, nvidia-tesla-a100, nvidia-tesla-p4, nvidia-tesla-t4.
AcceleratorType string `json:"acceleratorType,omitempty"`
}
31 changes: 31 additions & 0 deletions pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

122 changes: 117 additions & 5 deletions pkg/cloud/gcp/actuators/machine/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ package machine
import (
"context"
"fmt"
"time"

"strconv"
"strings"
"time"

"github.com/openshift/cluster-api-provider-gcp/pkg/apis/gcpprovider/v1beta1"
machinev1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
Expand All @@ -25,6 +24,8 @@ const (
requeueAfterSeconds = 20
instanceLinkFmt = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/instances/%s"
kmsKeyNameFmt = "projects/%s/locations/%s/keyRings/%s/cryptoKeys/%s"
machineTypeFmt = "zones/%s/machineTypes/%s"
acceleratorTypeFmt = "zones/%s/acceleratorTypes/%s"
)

// Reconciler are list of services required by machine actuator, easy to create a fake
Expand All @@ -39,6 +40,98 @@ func newReconciler(scope *machineScope) *Reconciler {
}
}

// supportedGpuTypes maps every accepted guest accelerator type name to the
// name of the quota metric that is looked up for it when checking GPU quota.
var supportedGpuTypes = map[string]string{
	"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
	"nvidia-tesla-k80":  "NVIDIA_K80_GPUS",
	"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
	"nvidia-tesla-p4":   "NVIDIA_P4_GPUS",
	"nvidia-tesla-t4":   "NVIDIA_T4_GPUS",
	"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
}

// containsString reports whether str is equal to at least one element of sli.
// A nil or empty slice never contains anything.
func containsString(sli []string, str string) bool {
	for idx := 0; idx < len(sli); idx++ {
		if sli[idx] == str {
			return true
		}
	}
	return false
}

// checkQuota verifies that the accelerator requested for the machine is
// available in the configured zone and that the configured region still has
// enough quota for it.
//
// machineTypeAcceleratorCount is the number of accelerators that come
// pre-attached to the machine type. A non-zero value is treated as that many
// nvidia-tesla-a100 GPUs (A2 machine family) and the providerSpec
// GuestAccelerators are ignored. With zero, the first providerSpec
// GuestAccelerators entry is the one that gets checked.
//
// Every failure is reported as an InvalidMachineConfiguration error; nil is
// returned only when a matching quota metric was found and has enough room.
func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
	region, err := r.computeService.RegionGet(r.projectID, r.providerSpec.Region)
	if err != nil {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: %v", r.providerSpec.Region, err))
	}
	// A nil region without an error would otherwise panic on region.Quotas.
	if region == nil {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Failed to get region %s via compute service: empty response", r.providerSpec.Region))
	}

	// When the machine type has associated accelerator instances (A2 machine
	// family), accelerators will be nvidia-tesla-a100s. Additional guest
	// accelerators are not allowed, so the providerSpec GuestAccelerators are
	// ignored in that case.
	guestAccelerators := r.providerSpec.GuestAccelerators
	if machineTypeAcceleratorCount != 0 {
		guestAccelerators = []*v1beta1.GCPAcceleratorConfig{
			&v1beta1.GCPAcceleratorConfig{AcceleratorType: "nvidia-tesla-a100", AcceleratorCount: machineTypeAcceleratorCount},
		}
	}
	// Guard the index below: there is nothing to validate without an entry.
	if len(guestAccelerators) == 0 || guestAccelerators[0] == nil {
		return machinecontroller.InvalidMachineConfiguration("No guest accelerator provided to check quota for.")
	}

	// Validate the zone first and the quota afterwards.
	// Only the first element is inspected because an instance can not carry
	// more than one accelerator config. More than one accelerator included in
	// a request results in: googleapi: Error 413: Value for field
	// 'resource.guestAccelerators' is too large: maximum size 1 element(s).
	accelerator := guestAccelerators[0]
	if _, err := r.computeService.AcceleratorTypeGet(r.projectID, r.providerSpec.Zone, accelerator.AcceleratorType); err != nil {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("AcceleratorType %s not available in the zone %s : %v", accelerator.AcceleratorType, r.providerSpec.Zone, err))
	}

	metric, supported := supportedGpuTypes[accelerator.AcceleratorType]
	if !supported || metric == "" {
		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Unsupported accelerator type %s", accelerator.AcceleratorType))
	}
	// Preemptible instances have a separate quota.
	if r.providerSpec.Preemptible {
		metric = "PREEMPTIBLE_" + metric
	}

	// Look for the quota entry of the metric. Returning from inside the loop
	// on a match lets the fall-through below also cover an empty quota list,
	// which previously slipped through as success.
	for _, q := range region.Quotas {
		if q == nil || q.Metric != metric {
			continue
		}
		if int64(q.Usage)+accelerator.AcceleratorCount > int64(q.Limit) {
			return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Quota exceeded. Metric: %s. Usage: %v. Limit: %v.", metric, q.Usage, q.Limit))
		}
		return nil
	}
	return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("No quota found. Metric: %s.", metric))
}

func (r *Reconciler) validateGuestAccelerators() error {
if len(r.providerSpec.GuestAccelerators) == 0 && !strings.HasPrefix(r.providerSpec.MachineType, "a2-") {
// no accelerators to validate so return nil
return nil
JoelSpeed marked this conversation as resolved.
Show resolved Hide resolved
}
if len(r.providerSpec.GuestAccelerators) > 0 && strings.HasPrefix(r.providerSpec.MachineType, "a2-") {
return machinecontroller.InvalidMachineConfiguration("A2 Machine types have pre-attached guest accelerators. Adding additional guest accelerators is not supported")
}
if !strings.HasPrefix(r.providerSpec.MachineType, "n1-") && !strings.HasPrefix(r.providerSpec.MachineType, "a2-") {
return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s does not support accelerators. Only A2 and N1 machine type families support guest acceleartors.", r.providerSpec.MachineType))
}
a2MachineFamily, n1MachineFamily := r.computeService.GPUCompatibleMachineTypesList(r.providerSpec.ProjectID, r.providerSpec.Zone, r.Context)
machineType := r.providerSpec.MachineType
switch {
case a2MachineFamily[machineType] != 0:
// a2 family machine - has fixed type and count of GPUs
return r.checkQuota(a2MachineFamily[machineType])
case containsString(n1MachineFamily, machineType):
// n1 family machine
return r.checkQuota(0)
default:
// any other machine type
return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s is not available in the zone %s.", r.providerSpec.MachineType, r.providerSpec.Zone))
}
}

// Create creates machine if and only if machine exists, handled by cluster-api
func (r *Reconciler) create() error {
if err := validateMachine(*r.machine, *r.providerSpec); err != nil {
Expand All @@ -50,16 +143,35 @@ func (r *Reconciler) create() error {
CanIpForward: r.providerSpec.CanIPForward,
DeletionProtection: r.providerSpec.DeletionProtection,
Labels: r.providerSpec.Labels,
MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", zone, r.providerSpec.MachineType),
MachineType: fmt.Sprintf(machineTypeFmt, zone, r.providerSpec.MachineType),
Name: r.machine.Name,
Tags: &compute.Tags{
Items: r.providerSpec.Tags,
},
Scheduling: &compute.Scheduling{
Preemptible: r.providerSpec.Preemptible,
Preemptible: r.providerSpec.Preemptible,
AutomaticRestart: r.providerSpec.AutomaticRestart,
OnHostMaintenance: r.providerSpec.OnHostMaintenance,
},
}

var guestAccelerators = []*compute.AcceleratorConfig{}

if l := len(r.providerSpec.GuestAccelerators); l == 1 {
guestAccelerators = append(guestAccelerators, &compute.AcceleratorConfig{
AcceleratorType: fmt.Sprintf(acceleratorTypeFmt, zone, r.providerSpec.GuestAccelerators[0].AcceleratorType),
AcceleratorCount: r.providerSpec.GuestAccelerators[0].AcceleratorCount,
})
} else if l > 1 {
return machinecontroller.InvalidMachineConfiguration("More than one type of accelerator provided. Instances support only one accelerator type at a time.")
}

instance.GuestAccelerators = guestAccelerators

if err := r.validateGuestAccelerators(); err != nil {
return err
}

if instance.Labels == nil {
instance.Labels = map[string]string{}
}
Expand All @@ -70,7 +182,7 @@ func (r *Reconciler) create() error {
for _, disk := range r.providerSpec.Disks {
srcImage := disk.Image
if !strings.Contains(disk.Image, "/") {
// only image name provided therfore defaulting to the current project
// only image name provided therefore defaulting to the current project
srcImage = googleapi.ResolveRelative(r.computeService.BasePath(), fmt.Sprintf("%s/global/images/%s", r.projectID, disk.Image))
}

Expand Down
40 changes: 38 additions & 2 deletions pkg/cloud/gcp/actuators/services/compute/computeservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@ package computeservice

import (
"context"
"log"
"strings"

"github.com/openshift/cluster-api-provider-gcp/pkg/version"
"golang.org/x/oauth2/google"
"google.golang.org/api/compute/v1"
"google.golang.org/api/option"

"github.com/openshift/cluster-api-provider-gcp/pkg/version"
"google.golang.org/api/compute/v1"
)

// GCPComputeService is a pass through wrapper for google.golang.org/api/compute/v1/compute
Expand All @@ -22,6 +25,9 @@ type GCPComputeService interface {
TargetPoolsAddInstance(project string, region string, name string, instance string) (*compute.Operation, error)
TargetPoolsRemoveInstance(project string, region string, name string, instance string) (*compute.Operation, error)
MachineTypesGet(project string, machineType string, zone string) (*compute.MachineType, error)
RegionGet(project string, region string) (*compute.Region, error)
GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string)
AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error)
}

type computeService struct {
Expand Down Expand Up @@ -106,3 +112,33 @@ func (c *computeService) TargetPoolsRemoveInstance(project string, region string
// MachineTypesGet builds a MachineTypes.Get call for the given project, zone
// and machine type (in that argument order) on the wrapped compute service and
// returns the result of executing it.
func (c *computeService) MachineTypesGet(project string, zone string, machineType string) (*compute.MachineType, error) {
	getCall := c.service.MachineTypes.Get(project, zone, machineType)
	return getCall.Do()
}

// GPUCompatibleMachineTypesList lists the machine types available in the given
// project and zone and splits the GPU-capable ones into two results:
//
//   - a map from each machine type name starting with "a2" to the
//     GuestAcceleratorCount of its first accelerator entry, and
//   - a slice with the names of all machine types starting with "n1".
//
// Machine types with any other prefix are ignored. An "a2" machine type that
// reports no accelerators is skipped instead of being indexed blindly.
//
// NOTE: the signature has no error return, so a failure while paging through
// the list is reported with log.Fatal, which terminates the process.
func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
	var (
		a2MachineFamily = map[string]int64{}
		n1MachineFamily []string
	)

	// collect is invoked once per result page and sorts its items by family.
	collect := func(page *compute.MachineTypeList) error {
		for _, machineType := range page.Items {
			switch {
			case strings.HasPrefix(machineType.Name, "a2"):
				// Guard the index: without an accelerator entry there is no
				// count to record for this machine type.
				if len(machineType.Accelerators) == 0 || machineType.Accelerators[0] == nil {
					continue
				}
				a2MachineFamily[machineType.Name] = machineType.Accelerators[0].GuestAcceleratorCount
			case strings.HasPrefix(machineType.Name, "n1"):
				n1MachineFamily = append(n1MachineFamily, machineType.Name)
			}
		}
		return nil
	}

	if err := c.service.MachineTypes.List(project, zone).Pages(ctx, collect); err != nil {
		log.Fatal(err)
	}
	return a2MachineFamily, n1MachineFamily
}

// AcceleratorTypeGet builds an AcceleratorTypes.Get call for the given project,
// zone and accelerator type on the wrapped compute service and returns the
// result of executing it.
func (c *computeService) AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
	getCall := c.service.AcceleratorTypes.Get(project, zone, acceleratorType)
	return getCall.Do()
}

// RegionGet builds a Regions.Get call for the given project and region on the
// wrapped compute service and returns the result of executing it.
func (c *computeService) RegionGet(project string, region string) (*compute.Region, error) {
	getCall := c.service.Regions.Get(project, region)
	return getCall.Do()
}
13 changes: 13 additions & 0 deletions pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package computeservice

import (
"context"

compute "google.golang.org/api/compute/v1"
"google.golang.org/api/googleapi"
)
Expand Down Expand Up @@ -129,3 +131,14 @@ func MockBuilderFuncTypeNotFound(serviceAccountJSON string) (GCPComputeService,
}
return computeSvc, nil
}

// RegionGet is a stub: it ignores its arguments and always returns a nil
// region together with a nil error.
func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compute.Region, error) {
	var noRegion *compute.Region
	return noRegion, nil
}

// GPUCompatibleMachineTypesList is a stub: it ignores its arguments and always
// returns a nil map and a nil slice.
func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
	var (
		noA2Types map[string]int64
		noN1Types []string
	)
	return noA2Types, noN1Types
}
// AcceleratorTypeGet is a stub: it ignores its arguments and always returns a
// nil accelerator type together with a nil error.
func (c *GCPComputeServiceMock) AcceleratorTypeGet(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
	var noAcceleratorType *compute.AcceleratorType
	return noAcceleratorType, nil
}