diff --git a/README.md b/README.md index a6030d4..ebe124e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api) [简体中文](docs%2Fzh-cn.md) -> ⚠️注意:中文文档已经过期,请使用英文文档。 +> ⚠️注意:中文文档更新可能落后于英文文档,请以英文文档为准。 > # Overview @@ -116,7 +116,6 @@ Import [gpu-docker-api.openapi.json](api%2Fgpu-docker-api.openapi.json) to invok tutorial: [volume-size-scale-en.md](docs%2Fvolume%2Fvolume-size-scale-en.md) 4. Make sure your test environment has ETCD V3 installed, installation tutorial: [ETCD](https://github.com/etcd-io/etcd). -5. Clone and run [detect-gpu](https://github.com/mayooot/detect-gpu). ## Build From Source @@ -127,6 +126,7 @@ $ make build ~~~ ## Download From Release + [release](https://github.com/mayooot/gpu-docker-api/releases) ## Config File @@ -193,7 +193,7 @@ And workQueue asynchronous processing in Client-go. * gpuScheduler:A scheduler that allocates GPU resources and saves the used GPUs. * gpuStatusMap: - Maintain the GPU resources of the server, when the program starts for the first time, call detect-gpu to get all + Maintain the GPU resources of the server, when the program starts for the first time, call `nvidia-smi` to get all the GPU resources, and initialize gpuStatusMap, Key is the UUID of GPU, Value is the usage, 0 means used, 1 means unused. @@ -214,8 +214,6 @@ And workQueue asynchronous processing in Client-go. * /apis/v1/versions/containerVersionMapKey * /apis/v1/versions/volumeVersionMapKey -* detect-gpu:A simple HTTP server that calls [go-nvml](https://github.com/NVIDIA/go-nvml) to get the GPU of the host - computer. ## Architecture Diagram diff --git a/cmd/gpu-docker-api/main.go b/cmd/gpu-docker-api/main.go index 82af88c..2fbf49e 100644 --- a/cmd/gpu-docker-api/main.go +++ b/cmd/gpu-docker-api/main.go @@ -70,7 +70,7 @@ func (p *program) Init(svc.Environment) error { workQueue.InitWorkQueue() - if err = gpuscheduler.Init(p.cfg); err != nil { + if err = gpuscheduler.Init(); err != nil { return err } diff --git a/etc/config.toml b/etc/config.toml index 4489654..c3d6431 100644 --- a/etc/config.toml +++ b/etc/config.toml @@ -4,11 +4,6 @@ port = ":2378" # etcd addr etcd_addr = "127.0.0.1:2379" -# detect-gpu addr -detect_gpu_addr = "http://127.0.0.1:2376/api/v1/detect/gpu" -# host gpu count -available_gpu_nums = 8 - # available port range start_port = 40000 end_port = 65535 \ No newline at end of file diff --git a/internal/config/config.go b/internal/config/config.go index c661bc6..eb826e8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -7,12 +7,10 @@ import ( ) type Config struct { - Port string `toml:"port"` - EtcdAddr string `toml:"etcd_addr"` - DetectGPUAddr string `toml:"detect_gpu_addr"` - AvailableGpuNums int `toml:"available_gpu_nums"` - StartPort int `toml:"start_port"` - EndPort int `toml:"end_port"` + Port string `toml:"port"` + EtcdAddr string `toml:"etcd_addr"` + StartPort int `toml:"start_port"` + EndPort int `toml:"end_port"` } func NewConfigWithFile(name string) (*Config, error) { diff --git a/internal/etcd/client.go b/internal/etcd/client.go index c69bd3b..5b4e948 100644 --- a/internal/etcd/client.go +++ b/internal/etcd/client.go @@ -1,6 +1,7 @@ package etcd import ( + "github.com/pkg/errors" "time" clientv3 "go.etcd.io/etcd/client/v3" @@ -18,8 +19,10 @@ func InitEtcdClient(cfg *config.Config) error { DialTimeout: 2 * time.Second, DialOptions: []grpc.DialOption{grpc.WithBlock()}, }) - - return err + if err != nil { + return errors.Wrap(err, "failed to connect etcd") + } + return nil } func CloseEtcdClient() error { diff --git a/internal/scheduler/gpuscheduler/scheduler.go b/internal/scheduler/gpuscheduler/scheduler.go index 012010c..1f457f6 100644 --- a/internal/scheduler/gpuscheduler/scheduler.go +++ b/internal/scheduler/gpuscheduler/scheduler.go @@ -2,21 +2,20 @@ package gpuscheduler import ( "encoding/json" - "errors" - "io/ioutil" - "net/http" "strconv" + "strings" "sync" - "github.com/mayooot/gpu-docker-api/internal/config" + "github.com/commander-cli/cmd" + "github.com/pkg/errors" + "github.com/mayooot/gpu-docker-api/internal/etcd" - "github.com/mayooot/gpu-docker-api/internal/model" "github.com/mayooot/gpu-docker-api/internal/xerrors" ) const ( - // 默认的可用GPU 数量 - defaultAvailableGpuNums = 8 + // 执行命令获取 gpu 的 index 和 uuid + allGpuUUIDCommand = "nvidia-smi --query-gpu=index,uuid --format=csv,noheader,nounits" // gpuScheduler 存储在 etcd 中的 key gpuStatusMapKey = "gpuStatusMapKey" @@ -24,6 +23,11 @@ const ( var Scheduler *scheduler +type gpu struct { + Index int `json:"index"` + UUID *string `json:"uuid"` +} + type scheduler struct { sync.RWMutex @@ -31,26 +35,23 @@ type scheduler struct { GpuStatusMap map[string]byte } -func Init(cfg *config.Config) error { +func Init() error { var err error Scheduler, err = initFormEtcd() if err != nil { - return err + return errors.Wrap(err, "initFormEtcd failed") } if Scheduler.AvailableGpuNums == 0 || len(Scheduler.GpuStatusMap) == 0 { // 如果没有初始化过 - Scheduler.AvailableGpuNums = defaultAvailableGpuNums - if cfg.AvailableGpuNums >= 0 { - Scheduler.AvailableGpuNums = cfg.AvailableGpuNums - } - - gpus, err := getDetectGpus(cfg.DetectGPUAddr) + gpus, err := getAllGpuUUID() if err != nil { - return err + return errors.Wrap(err, "getAllGpuUUID failed") } + + Scheduler.AvailableGpuNums = len(gpus) for i := 0; i < len(gpus); i++ { - Scheduler.GpuStatusMap[gpus[i].UUID] = 0 + Scheduler.GpuStatusMap[*gpus[i].UUID] = 0 } } return nil @@ -139,20 +140,40 @@ func initFormEtcd() (s *scheduler, err error) { return s, err } -func getDetectGpus(addr string) (gpus []model.GpuInfo, err error) { - resp, err := http.Get(addr) +func getAllGpuUUID() ([]*gpu, error) { + c := cmd.NewCommand(allGpuUUIDCommand) + err := c.Execute() if err != nil { - return gpus, err + return nil, errors.Wrap(err, "cmd.Execute failed") } - defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + gpuList, err := parseOutput(c.Stdout()) if err != nil { - return gpus, err + return nil, errors.Wrap(err, "parseOutput failed") } + return gpuList, nil +} + +func parseOutput(output string) (gpuList []*gpu, err error) { + lines := strings.Split(output, "\n") + gpuList = make([]*gpu, 0, len(lines)) + for _, line := range lines { + if line == "" { + continue + } - if err = json.Unmarshal(body, &gpus); err != nil { - return gpus, err + fields := strings.Split(line, ", ") + if len(fields) == 2 { + index, err := strconv.Atoi(fields[0]) + if err != nil { + return gpuList, errors.Wrapf(err, "strconv.Atoi failed, index: %s", fields[0]) + } + uuid := fields[1] + gpuList = append(gpuList, &gpu{ + Index: index, + UUID: &uuid, + }) + } } - return gpus, err + return } diff --git a/internal/scheduler/portscheduler/scheduler.go b/internal/scheduler/portscheduler/scheduler.go index d490eb2..6f7b472 100644 --- a/internal/scheduler/portscheduler/scheduler.go +++ b/internal/scheduler/portscheduler/scheduler.go @@ -2,7 +2,7 @@ package portscheduler import ( "encoding/json" - "errors" + "github.com/pkg/errors" "sort" "strconv" "sync" @@ -59,7 +59,7 @@ func Init(cfg *config.Config) error { var err error Scheduler, err = initFormEtcd() if err != nil { - return err + return errors.Wrap(err, "initFormEtcd failed") } if Scheduler.StartPort == 0 || Scheduler.EndPort == 0 || Scheduler.AvailableCount == 0 {