-
Notifications
You must be signed in to change notification settings - Fork 2
/
training.go
80 lines (74 loc) · 3.02 KB
/
training.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
//
// Copyright 2019 EPAM Systems
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package config
import (
odahuflowv1alpha1 "github.com/odahu/odahu-flow/packages/operator/api/v1alpha1"
"os"
"time"
)
// Default resource quantities applied to model training pods, expressed as
// quantity strings ("m" suffix for CPU, "Mi" suffix for memory).
var (
	// Amounts requested for a training pod.
	defaultTrainingCPURequests    = "128m"
	defaultTrainingMemoryRequests = "128Mi"

	// Upper bounds a training pod may consume.
	defaultTrainingCPULimit    = "256m"
	defaultTrainingMemoryLimit = "256Mi"
)
// ModelTrainingConfig groups the settings for model training. Each field is
// serialized under the JSON key given in its struct tag.
type ModelTrainingConfig struct {
// Kubernetes namespace, where model trainings will be deployed
Namespace string `json:"namespace"`
// Namespace used for toolchain integrations
// NOTE(review): presumably where toolchain integration resources are stored - confirm
ToolchainIntegrationNamespace string `json:"toolchainIntegrationNamespace"`
// Enable training API/operator
// NOTE(review): this comment previously said "deployment", presumably a copy-paste slip - confirm
Enabled bool `json:"enabled"`
// Service account name
// NOTE(review): presumably the account training pods run under - confirm
ServiceAccount string `json:"serviceAccount"`
// ID of the output connection
// NOTE(review): presumably the destination for trained model artifacts - confirm
OutputConnectionID string `json:"outputConnectionID"`
// Node selector, as label key/value pairs
// NOTE(review): presumably applied to model trainings pods - confirm
NodeSelector map[string]string `json:"nodeSelector"`
// Kubernetes tolerations for model trainings pods
Toleration map[string]string `json:"toleration"`
// Node selector, as label key/value pairs
// NOTE(review): presumably applied to GPU model trainings pods - confirm
GPUNodeSelector map[string]string `json:"gpuNodeSelector"`
// Kubernetes tolerations for GPU model trainings pods
GPUToleration map[string]string `json:"gpuToleration"`
// Metric URL
// NOTE(review): presumably the endpoint training metrics are reported to - confirm
MetricURL string `json:"metricUrl"`
// Model trainer image; the default is read from the
// TRAINING_MODEL_TRAINER_IMAGE environment variable
ModelTrainerImage string `json:"modelTrainerImage"`
// Timeout for full training process
Timeout time.Duration `json:"timeout"`
// Default resources for training pods
DefaultResources odahuflowv1alpha1.ResourceRequirements `json:"defaultResources"`
// Storage backend for toolchain integrations. Available options:
// * kubernetes
// * postgres
ToolchainIntegrationRepositoryType RepositoryType `json:"toolchainIntegrationRepositoryType"`
}
// NewDefaultModelTrainingConfig returns a ModelTrainingConfig filled with the
// built-in defaults: training enabled, a four hour timeout, the default
// namespaces and service account, the default resource requests/limits and
// the Kubernetes toolchain integration repository.
//
// Fields that are not listed below keep their zero value.
func NewDefaultModelTrainingConfig() ModelTrainingConfig {
	// Give every returned config its own copies of the default quantities.
	// DefaultResources stores pointers, and pointing them straight at the
	// package-level variables would let any write through one config (for
	// example a decoder such as encoding/json, which reuses non-nil pointers)
	// silently change the defaults seen by every later caller.
	cpuRequests, memoryRequests := defaultTrainingCPURequests, defaultTrainingMemoryRequests
	cpuLimit, memoryLimit := defaultTrainingCPULimit, defaultTrainingMemoryLimit

	return ModelTrainingConfig{
		Namespace:                     "odahu-flow-training",
		ToolchainIntegrationNamespace: "odahu-flow",
		Enabled:                       true,
		Timeout:                       4 * time.Hour,
		ServiceAccount:                "odahu-flow-model-trainer",
		// workaround https://github.com/spf13/viper/issues/761
		// The image is taken from the environment; it is empty when the
		// variable is not set.
		ModelTrainerImage: os.Getenv("TRAINING_MODEL_TRAINER_IMAGE"),
		DefaultResources: odahuflowv1alpha1.ResourceRequirements{
			Requests: &odahuflowv1alpha1.ResourceList{
				CPU:    &cpuRequests,
				Memory: &memoryRequests,
			},
			Limits: &odahuflowv1alpha1.ResourceList{
				CPU:    &cpuLimit,
				Memory: &memoryLimit,
			},
		},
		ToolchainIntegrationRepositoryType: RepositoryKubernetesType,
	}
}