Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)

[简体中文](docs%2Fzh-cn.md)
> ⚠️注意:中文文档已经过期,请使用英文文档
> ⚠️注意:中文文档更新可能落后于英文文档,请以英文文档为准
>

# Overview
Expand Down Expand Up @@ -116,7 +116,6 @@ Import [gpu-docker-api.openapi.json](api%2Fgpu-docker-api.openapi.json) to invok
tutorial: [volume-size-scale-en.md](docs%2Fvolume%2Fvolume-size-scale-en.md)
4. Make sure etcd v3 is installed in your test environment. Installation
tutorial: [ETCD](https://github.com/etcd-io/etcd).
5. Clone and run [detect-gpu](https://github.com/mayooot/detect-gpu).

## Build From Source

Expand All @@ -127,6 +126,7 @@ $ make build
~~~

## Download From Release

[release](https://github.com/mayooot/gpu-docker-api/releases)

## Config File
Expand Down Expand Up @@ -193,7 +193,7 @@ And workQueue asynchronous processing in Client-go.

* gpuScheduler: A scheduler that allocates GPU resources and records which GPUs are in use.
* gpuStatusMap:
Maintain the GPU resources of the server, when the program starts for the first time, call detect-gpu to get all
Tracks the GPU resources of the server. When the program starts for the first time, it calls `nvidia-smi` to get all
the GPU resources and initializes gpuStatusMap. The key is the UUID of the GPU and the value is its usage: 0 means used, 1 means
unused.

Expand All @@ -214,8 +214,6 @@ And workQueue asynchronous processing in Client-go.
* /apis/v1/versions/containerVersionMapKey
* /apis/v1/versions/volumeVersionMapKey

* detect-gpu:A simple HTTP server that calls [go-nvml](https://github.com/NVIDIA/go-nvml) to get the GPU of the host
computer.

## Architecture Diagram

Expand Down
2 changes: 1 addition & 1 deletion cmd/gpu-docker-api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func (p *program) Init(svc.Environment) error {

workQueue.InitWorkQueue()

if err = gpuscheduler.Init(p.cfg); err != nil {
if err = gpuscheduler.Init(); err != nil {
return err
}

Expand Down
5 changes: 0 additions & 5 deletions etc/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@ port = ":2378"
# etcd addr
etcd_addr = "127.0.0.1:2379"

# detect-gpu addr
detect_gpu_addr = "http://127.0.0.1:2376/api/v1/detect/gpu"
# host gpu count
available_gpu_nums = 8

# available port range
start_port = 40000
end_port = 65535
10 changes: 4 additions & 6 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@ import (
)

// Config holds the service settings. Each field is bound to the TOML key named in its
// `toml` tag: port, etcd_addr, start_port and end_port (plus detect_gpu_addr and
// available_gpu_nums where those fields are present).
// NOTE(review): presumably populated by NewConfigWithFile from the TOML config file — confirm.
type Config struct {
Port string `toml:"port"`
EtcdAddr string `toml:"etcd_addr"`
DetectGPUAddr string `toml:"detect_gpu_addr"`
AvailableGpuNums int `toml:"available_gpu_nums"`
StartPort int `toml:"start_port"`
EndPort int `toml:"end_port"`
Port string `toml:"port"`
EtcdAddr string `toml:"etcd_addr"`
StartPort int `toml:"start_port"`
EndPort int `toml:"end_port"`
}

func NewConfigWithFile(name string) (*Config, error) {
Expand Down
7 changes: 5 additions & 2 deletions internal/etcd/client.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package etcd

import (
"github.com/pkg/errors"
"time"

clientv3 "go.etcd.io/etcd/client/v3"
Expand All @@ -18,8 +19,10 @@ func InitEtcdClient(cfg *config.Config) error {
DialTimeout: 2 * time.Second,
DialOptions: []grpc.DialOption{grpc.WithBlock()},
})

return err
if err != nil {
return errors.Wrap(err, "failed to connect etcd")
}
return nil
}

func CloseEtcdClient() error {
Expand Down
73 changes: 47 additions & 26 deletions internal/scheduler/gpuscheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,56 @@ package gpuscheduler

import (
"encoding/json"
"errors"
"io/ioutil"
"net/http"
"strconv"
"strings"
"sync"

"github.com/mayooot/gpu-docker-api/internal/config"
"github.com/commander-cli/cmd"
"github.com/pkg/errors"

"github.com/mayooot/gpu-docker-api/internal/etcd"
"github.com/mayooot/gpu-docker-api/internal/model"
"github.com/mayooot/gpu-docker-api/internal/xerrors"
)

const (
// Default number of available GPUs.
defaultAvailableGpuNums = 8
// Command executed to obtain the index and UUID of every GPU.
allGpuUUIDCommand = "nvidia-smi --query-gpu=index,uuid --format=csv,noheader,nounits"

// Key under which the gpuScheduler is stored in etcd.
gpuStatusMapKey = "gpuStatusMapKey"
)

var Scheduler *scheduler

// gpu is one GPU entry as produced by parseOutput from the output of allGpuUUIDCommand.
type gpu struct {
// Index is the integer parsed from the first field of an output line.
Index int `json:"index"`
// UUID points to the second field of an output line.
// NOTE(review): Init dereferences this pointer without a nil check; parseOutput always
// sets it, but confirm no other producer of gpu values leaves it nil.
UUID *string `json:"uuid"`
}

// scheduler holds the GPU bookkeeping state. Init obtains it from initFormEtcd and, when it
// has not been populated yet, fills it from the GPUs found on the host.
type scheduler struct {
// Embedded read/write lock.
// NOTE(review): presumably guards the fields below — the locking code is not visible here, confirm.
sync.RWMutex

// Number of GPUs; Init sets it to the number of GPUs it detected.
AvailableGpuNums int
// Keyed by GPU UUID. Init stores 0 for every newly detected GPU.
// NOTE(review): the README states "0 means used, 1 means unused", yet freshly detected
// GPUs are seeded with 0 — confirm which meaning is correct.
GpuStatusMap map[string]byte
}

func Init(cfg *config.Config) error {
func Init() error {
var err error
Scheduler, err = initFormEtcd()
if err != nil {
return err
return errors.Wrap(err, "initFormEtcd failed")
}

if Scheduler.AvailableGpuNums == 0 || len(Scheduler.GpuStatusMap) == 0 {
// 如果没有初始化过
Scheduler.AvailableGpuNums = defaultAvailableGpuNums
if cfg.AvailableGpuNums >= 0 {
Scheduler.AvailableGpuNums = cfg.AvailableGpuNums
}

gpus, err := getDetectGpus(cfg.DetectGPUAddr)
gpus, err := getAllGpuUUID()
if err != nil {
return err
return errors.Wrap(err, "getAllGpuUUID failed")
}

Scheduler.AvailableGpuNums = len(gpus)
for i := 0; i < len(gpus); i++ {
Scheduler.GpuStatusMap[gpus[i].UUID] = 0
Scheduler.GpuStatusMap[*gpus[i].UUID] = 0
}
}
return nil
Expand Down Expand Up @@ -139,20 +140,40 @@ func initFormEtcd() (s *scheduler, err error) {
return s, err
}

func getDetectGpus(addr string) (gpus []model.GpuInfo, err error) {
resp, err := http.Get(addr)
// getAllGpuUUID runs allGpuUUIDCommand through the cmd package and hands the command's
// standard output to parseOutput.
//
// It returns the parsed GPU list, or nil and a wrapped error when executing the command
// or parsing its output fails.
//
// NOTE(review): only the error returned by Execute is checked; the command's exit code is
// not inspected here — confirm whether Execute reports a non-zero exit status as an error.
func getAllGpuUUID() ([]*gpu, error) {
c := cmd.NewCommand(allGpuUUIDCommand)
err := c.Execute()
if err != nil {
return gpus, err
return nil, errors.Wrap(err, "cmd.Execute failed")
}
defer resp.Body.Close()

body, err := ioutil.ReadAll(resp.Body)
// Parse the captured standard output into gpu entries.
gpuList, err := parseOutput(c.Stdout())
if err != nil {
return gpus, err
return nil, errors.Wrap(err, "parseOutput failed")
}
return gpuList, nil
}

// parseOutput converts the text printed by allGpuUUIDCommand into a slice of gpu values.
//
// output is split on newlines and empty lines are skipped. Each remaining line is split
// on ", "; when that yields exactly two fields, the first is parsed as the integer index
// and the second is stored as the UUID.
//
// If an index is not a valid integer, it returns the entries collected so far together
// with a wrapped error; otherwise it returns the full list.
//
// NOTE(review): a line that does not split into exactly two fields is dropped without an
// error, and fields are not trimmed, so trailing whitespace would remain in the UUID.
func parseOutput(output string) (gpuList []*gpu, err error) {
lines := strings.Split(output, "\n")
// The line count is an upper bound on the number of entries.
gpuList = make([]*gpu, 0, len(lines))
for _, line := range lines {
if line == "" {
continue
}

if err = json.Unmarshal(body, &gpus); err != nil {
return gpus, err
fields := strings.Split(line, ", ")
if len(fields) == 2 {
// index and err are new variables scoped to this if block; this err shadows the named result.
index, err := strconv.Atoi(fields[0])
if err != nil {
return gpuList, errors.Wrapf(err, "strconv.Atoi failed, index: %s", fields[0])
}
// uuid is declared inside the loop body, so each entry points at its own string.
uuid := fields[1]
gpuList = append(gpuList, &gpu{
Index: index,
UUID: &uuid,
})
}
}
return gpus, err
// Naked return: yields the named results gpuList and err.
return
}
4 changes: 2 additions & 2 deletions internal/scheduler/portscheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package portscheduler

import (
"encoding/json"
"errors"
"github.com/pkg/errors"
"sort"
"strconv"
"sync"
Expand Down Expand Up @@ -59,7 +59,7 @@ func Init(cfg *config.Config) error {
var err error
Scheduler, err = initFormEtcd()
if err != nil {
return err
return errors.Wrap(err, "initFormEtcd failed")
}

if Scheduler.StartPort == 0 || Scheduler.EndPort == 0 || Scheduler.AvailableCount == 0 {
Expand Down