diff --git a/src/ClusterBootstrap/deploy.py b/src/ClusterBootstrap/deploy.py index 4f3bee96b..d0876d765 100755 --- a/src/ClusterBootstrap/deploy.py +++ b/src/ClusterBootstrap/deploy.py @@ -2831,8 +2831,8 @@ def start_one_kube_service(fname): pass if fname == "./deploy/services/jobmanager/jobmanager.yaml": - # recreate the configmap init-user-script - run_kubectl( ["create configmap init-user-script --from-file=../Jobs_Templete/init_user.sh -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] ) + # recreate the configmap dlws-scripts + run_kubectl( ["create configmap dlws-scripts --from-file=../Jobs_Templete/ -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] ) run_kubectl( ["create", "-f", fname ] ) diff --git a/src/ClusterBootstrap/params.py b/src/ClusterBootstrap/params.py index fa0bfd4f8..5fafa11a6 100755 --- a/src/ClusterBootstrap/params.py +++ b/src/ClusterBootstrap/params.py @@ -23,7 +23,7 @@ "job-exporter": { "port": 9102 }, "node-exporter": { "port": 9100 }, "watchdog": { "port": 9101 }, - "grafana": { "port": 3000 }, + "grafana": { "port": 3000, "prometheus-ip": "localhost" }, "alert-manager": { "port": 9093, "configured": False, @@ -31,6 +31,11 @@ # If want to deploy with alert-manager, should config # configured with True, and fill appropriate value to: # smtp_url, smtp_from, smtp_auth_username, smtp_auth_password and receiver + "reaper": { + "dry-run": True, + "port": "9500", + "restful-url": "http://localhost:5000", + } }, "mysql_port": "3306", diff --git a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml index caa7ae9d6..54ccf8c67 100755 --- a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml +++ b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml @@ -13,6 +13,9 @@ spec: labels: jobmanager-node: pod app: jobmanager + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" spec: {% if cnf["dnsPolicy"] %} dnsPolicy: {{cnf["dnsPolicy"]}} @@ -39,7 
+42,40 @@ spec: - mountPath: {{cnf["storage-mount-path"]}}/jobfiles name: dlwsdatajobfiles - mountPath: /var/log/dlworkspace - name: log + name: log + ports: + - containerPort: 9200 + hostPort: 9200 + name: job-mgr + protocol: TCP + - containerPort: 9201 + hostPort: 9201 + name: user-mgr + protocol: TCP + - containerPort: 9202 + hostPort: 9202 + name: node-mgr + protocol: TCP + - containerPort: 9203 + hostPort: 9203 + name: joblog-mgr + protocol: TCP + - containerPort: 9204 + hostPort: 9204 + name: cmd-mgr + protocol: TCP + - containerPort: 9205 + hostPort: 9205 + name: endpoint-mgr + protocol: TCP + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 30 + successThreshold: 1 + tcpSocket: + port: 9200 + timeoutSeconds: 10 volumes: - name: certs hostPath: diff --git a/src/ClusterBootstrap/services/monitor/alert-manager.yaml b/src/ClusterBootstrap/services/monitor/alert-manager.yaml index 8bc493e6d..a15534e3f 100644 --- a/src/ClusterBootstrap/services/monitor/alert-manager.yaml +++ b/src/ClusterBootstrap/services/monitor/alert-manager.yaml @@ -24,7 +24,7 @@ spec: hostNetwork: true containers: - name: alert-manager - image: prom/alertmanager:v0.15.1 + image: prom/alertmanager:v0.18.0 args: - '--config.file=/etc/alertmanager/config.yml' - '--storage.path=/alertmanager' @@ -40,6 +40,23 @@ spec: mountPath: /alertmanager - name: templates-volume mountPath: /etc/alertmanager/template + {% if cnf["alert-manager"]["reaper"] %} + - name: reaper + image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}reaper:{{cnf["dockertag"]}} + command: + - 'python' + - '/reaper/main.py' + - '--port' + - '{{ cnf["alert-manager"]["reaper"]["port"] }}' + - '--restful_url' + - '{{ cnf["alert-manager"]["reaper"]["restful-url"] }}' + {% if cnf["alert-manager"]["reaper"]["dry-run"] %} + - '--dry_run' + {% endif %} + ports: + - name: alert-manager + containerPort: {{ cnf["alert-manager"]["reaper"]["port"] }} + {% endif %} volumes: - name: config-volume configMap: 
@@ -80,14 +97,30 @@ data: receiver: alert-email group_wait: 30s group_interval: 5m - group_by: [alertname] + group_by: [alertname, cluster] routes: - - receiver: task_user + - receiver: idle_gpu_receiver repeat_interval: 4h group_by: [alertname, user_email, cluster] match_re: - type: user_alert + type: idle_gpu alertname: "zero-gpu-usage" + - receiver: job_state_change_receiver + group_by: [alertname, user_email, cluster, subject] + match_re: + type: user_alert + alertname: "job-state-changed" + - receiver: reaper + group_by: [alertname, user_email, job_name] + group_wait: 0s + match_re: + type: reaper + - receiver: kill_idle_job_email + group_by: [alertname, user_email, cluster] + group_wait: 0s + match_re: + type: kill_idle_job_email + alertname: "kill-idle-jobs-email" receivers: - name: "alert-email" email_configs: @@ -95,7 +128,7 @@ data: html: '{{ "{{" }} template "email.html" . {{ "}}" }}' headers: subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' - - name: "task_user" + - name: "idle_gpu_receiver" email_configs: {% if cnf["alert-manager"]["alert_users"] %} - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' @@ -109,4 +142,40 @@ data: CC: '{{ alert_info["receiver"] }}' {% endif %} subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' + - name: "job_state_change_receiver" + email_configs: + {% if cnf["alert-manager"]["alert_users"] %} + - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' + {% else %} + - to: '{{ alert_info["receiver"] }}' + {% endif %} + html: '{{ "{{" }} template "job_state.html" . {{ "}}" }}' + headers: + {% if cnf["alert-manager"]["alert_users"] %} + To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}' + CC: '{{ alert_info["receiver"] }}' + {% endif %} + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . 
{{ "}}" }}' + - name: "reaper" + {% if cnf["alert-manager"]["reaper"] %} + webhook_configs: + - send_resolved: False + url: 'http://localhost:{{ cnf["alert-manager"]["reaper"]["port"] }}/kill' + http_config: + bearer_token: 'shinigami' + - name: "kill_idle_job_email" + email_configs: + {% if cnf["alert-manager"]["alert_users"] %} + - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' + {% else %} + - to: '{{ alert_info["receiver"] }}' + {% endif %} + html: '{{ "{{" }} template "kill_idle.html" . {{ "}}" }}' + headers: + {% if cnf["alert-manager"]["alert_users"] %} + To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}' + CC: '{{ alert_info["receiver"] }}' + {% endif %} + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' + {% endif %} {% endif %} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl new file mode 100644 index 000000000..2da286cc1 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl @@ -0,0 +1,71 @@ +{{ define "job_state.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + +
+ + {{ range .Alerts.Firing }} + + + + {{ end }} +
+Your job + +{{.Labels.job_name}} + from cluster '{{.Labels.cluster}}' has changed to the state of {{.Labels.job_state}}. +
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl new file mode 100644 index 000000000..e29dc993c --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl @@ -0,0 +1,71 @@ +{{ define "kill_idle.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + +
+ + {{ range .Alerts.Firing }} + + + + {{ end }} +
+Your job + +{{.Labels.job_name}} + from cluster '{{.Labels.cluster}}' VC '{{.Labels.vc_name}}' was killed because it has been idle for too long. +
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alerting/jobs.rules b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules index 976263107..7a6160384 100644 --- a/src/ClusterBootstrap/services/monitor/alerting/jobs.rules +++ b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules @@ -5,4 +5,14 @@ groups: expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0 for: 4h labels: - type: user_alert + type: idle_gpu + - alert: kill-idle-jobs-email + expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0 + for: 8h + labels: + type: kill_idle_job_email + - alert: kill-idle-jobs + expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0 + for: 8h + labels: + type: reaper diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json new file mode 100644 index 000000000..60dd045da --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json @@ -0,0 +1,239 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "id": 1, + "legend": { + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(k8s_node_gpu_total) - sum(k8s_node_gpu_available) - sum(k8s_node_gpu_reserved)) / sum(k8s_node_gpu_total) * 100", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "Allocation Rate", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster wide GPU allocation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "id": 2, + "legend": { + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(task_gpu_percent)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Avg Util", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster wide avg util", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now/w", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Cluster GPU statistic", + "version": 0 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json new file mode 100644 index 000000000..c78fab7dd --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json @@ -0,0 +1,389 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": 
false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le, fn_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}} fn_name {{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Datahandler 90th percentile latency per function from jobmanager", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le, fn_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}} fn_name {{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "Datahandler 90th percentile latency per function from restfulapi", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Connection Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "90th percentile DB connection latency from jobmanager", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": 
null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Connection Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "90th percentile DB connection latency from restfulapi", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": 
"Performance dashboard", + "version": 0 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json index 467dfa70e..027e5ea6c 100644 --- a/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json +++ b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json @@ -1,6 +1,6 @@ { "name": "PM", - "url": "http://{{cnf['prometheus']['host']}}:9091/prometheus", + "url": "http://{{cnf['grafana']['prometheus-ip']}}:9091/prometheus", "basicAuth": false, "access": "proxy", "type": "prometheus", diff --git a/src/ClusterBootstrap/services/monitor/job-exporter.yaml b/src/ClusterBootstrap/services/monitor/job-exporter.yaml index 54f32eda9..90df9fd62 100644 --- a/src/ClusterBootstrap/services/monitor/job-exporter.yaml +++ b/src/ClusterBootstrap/services/monitor/job-exporter.yaml @@ -82,3 +82,5 @@ spec: operator: "Exists" - key: node.kubernetes.io/disk-pressure operator: "Exists" + - key: node-role.kubernetes.io/master + operator: "Exists" diff --git a/src/ClusterBootstrap/services/monitor/node-exporter.yaml b/src/ClusterBootstrap/services/monitor/node-exporter.yaml index cc86f3583..26fb8633e 100644 --- a/src/ClusterBootstrap/services/monitor/node-exporter.yaml +++ b/src/ClusterBootstrap/services/monitor/node-exporter.yaml @@ -78,3 +78,5 @@ spec: operator: "Exists" - key: node.kubernetes.io/disk-pressure operator: "Exists" + - key: node-role.kubernetes.io/master + operator: "Exists" diff --git a/src/ClusterBootstrap/services/monitor/prometheus.yaml b/src/ClusterBootstrap/services/monitor/prometheus.yaml index 9a5e08bd1..bd4c0a203 100644 --- a/src/ClusterBootstrap/services/monitor/prometheus.yaml +++ b/src/ClusterBootstrap/services/monitor/prometheus.yaml @@ -83,6 +83,13 @@ spec: nodeSelector: prometheus: active hostNetwork: true + initContainers: + - name: init + image: bash:4 + volumeMounts: + - name: prometheus-data + mountPath: 
/prometheus-data + command: ["chmod", "777", "/prometheus-data"] # newly create dir have permission 755, which makes prometheus container unable to write containers: - name: prometheus image: prom/prometheus:v2.1.0 @@ -96,6 +103,7 @@ spec: - '--web.listen-address=0.0.0.0:{{cnf["prometheus"]["port"]}}' - '--web.external-url=http://localhost:{{cnf["prometheus"]["port"]}}/prometheus/' - '--web.route-prefix=prometheus' + - '--storage.tsdb.path=/prometheus-data' - '--storage.tsdb.retention=31d' ports: - name: web @@ -105,6 +113,8 @@ spec: mountPath: /etc/prometheus - name: rules-volume mountPath: /etc/prometheus-alert + - name: prometheus-data + mountPath: /prometheus-data volumes: - name: config-volume configMap: @@ -112,6 +122,9 @@ spec: - name: rules-volume configMap: name: prometheus-alert + - name: prometheus-data + hostPath: + path: /data/prometheus/data tolerations: - key: node.kubernetes.io/memory-pressure operator: "Exists" diff --git a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml index e3d089c83..d1e5a6ce4 100755 --- a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml +++ b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml @@ -4,7 +4,7 @@ metadata: name: restfulapi namespace: default labels: - run: dlwsrestfulapi + run: dlwsrestfulapi spec: selector: matchLabels: @@ -15,13 +15,17 @@ spec: labels: restfulapi-node: pod app: restfulapi + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "5000" spec: - {% if cnf["dnsPolicy"] %} + {% if cnf["dnsPolicy"] %} dnsPolicy: {{cnf["dnsPolicy"]}} {% endif %} nodeSelector: restfulapi: active - hostNetwork: true + hostNetwork: true containers: - name: restfulapi image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}{{cnf["restfulapi"]}}:{{cnf["dockertag"]}} @@ -31,6 +35,10 @@ spec: name: apiconfig - mountPath: /var/log/apache2 name: log + ports: + - containerPort: 5000 + hostPort: 5000 
+ name: main {% if False %} {% for volume in cnf["mountpoints"] %} {% if cnf["mountpoints"][volume]["mountpoints"] is string and cnf["mountpoints"][volume]["mountpoints"]!="" %} @@ -42,7 +50,7 @@ spec: name: {{mp}} {% endfor %} {% endif %} - {% endfor %} + {% endfor %} {% endif %} volumes: - name: apiconfig @@ -60,14 +68,14 @@ spec: {% else %} {% for mp in cnf["mountpoints"][volume]["mountpoints"] %} - name: {{mp}} - hostPath: + hostPath: path: {{cnf["storage-mount-path"]}}/{{mp}} {% endfor %} {% endif %} - {% endfor %} + {% endfor %} {% endif %} tolerations: - key: CriticalAddonsOnly operator: Exists - key: node-role.kubernetes.io/master - effect: NoSchedule + effect: NoSchedule diff --git a/src/ClusterManager/cluster_manager.py b/src/ClusterManager/cluster_manager.py index ae8427d66..87d7cecad 100755 --- a/src/ClusterManager/cluster_manager.py +++ b/src/ClusterManager/cluster_manager.py @@ -1,88 +1,156 @@ -import json +import yaml +import subprocess32 import os -import time -import argparse -import uuid -import subprocess +import logging +import logging.config import sys +import time import datetime +import argparse +import threading +import traceback +import signal -import yaml -from jinja2 import Environment, FileSystemLoader, Template -import base64 +from prometheus_client.twisted import MetricsResource +from prometheus_client import Histogram -import re -import random +from twisted.web.server import Site +from twisted.web.resource import Resource +from twisted.internet import reactor + +logger = logging.getLogger(__name__) + +manager_iteration_histogram = Histogram("manager_iteration_latency_seconds", + "latency for manager to iterate", + buckets=(2.5, 5.0, 10.0, 20.0, 40.0, 80.0, 160.0, float("inf")), + labelnames=("name",)) -import textwrap -import logging -import logging.config -import job_manager -import user_manager -import node_manager -import joblog_manager -import command_manager -import endpoint_manager +class HealthResource(Resource): + def 
render_GET(self, request): + request.setHeader("Content-Type", "text/html; charset=utf-8") + return "Ok".encode("utf-8") -from multiprocessing import Process, Manager +def exporter_thread(port): + root = Resource() + root.putChild(b"metrics", MetricsResource()) + root.putChild(b"healthz", HealthResource()) + factory = Site(root) + reactor.listenTCP(port, factory) + reactor.run(installSignalHandlers=False) +def setup_exporter_thread(port): + t = threading.Thread(target=exporter_thread, args=(port,), + name="exporter") + t.start() + return t -def create_log(logdir='/var/log/dlworkspace'): +def create_log(logdir="/var/log/dlworkspace"): if not os.path.exists(logdir): os.system("mkdir -p " + logdir) - with open('logging.yaml') as f: + with open("logging.yaml") as f: logging_config = yaml.load(f) + logging_config["handlers"]["file"]["filename"] = logdir + "/clustermanager.log" + logging.config.dictConfig(logging_config) + +def dumpstacks(signal, frame): + id2name = dict([(th.ident, th.name) for th in threading.enumerate()]) + code = [] + for threadId, stack in sys._current_frames().items(): + code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""), threadId)) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) + if line: + code.append(" %s" % (line.strip())) + print "\n".join(code) + sys.stdout.flush() + sys.stderr.flush() + +def register_stack_trace_dump(): + signal.signal(signal.SIGTRAP, dumpstacks) + +def update_file_modification_time(path): + if not os.path.isfile(path): + f = open(path, "w") f.close() - logging_config["handlers"]["file"]["filename"] = logdir+"/clustermanager.log" - logging.config.dictConfig(logging_config) + mod_time = time.mktime(datetime.datetime.now().timetuple()) + os.utime(path, (mod_time, mod_time)) -def Run(): - create_log() - - logging.info("Starting job manager... 
") - proc_job = Process(target=job_manager.Run) - proc_job.start() - - logging.info("Starting user manager... ") - proc_user = Process(target=user_manager.Run) - proc_user.start() - - logging.info("Starting node manager... ") - proc_node = Process(target=node_manager.Run) - proc_node.start() - - logging.info("Starting joblogging manager... ") - proc_joblog = Process(target=joblog_manager.Run) - proc_joblog.start() - - logging.info("Starting command manager... ") - proc_command = Process(target=command_manager.Run) - proc_command.start() +def get_elapsed_seconds(path): + mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + return (datetime.datetime.now() - mtime).seconds - logging.info("Starting endpoint manager... ") - proc_endpoint = Process(target=endpoint_manager.Run) - proc_endpoint.start() - - proc_job.join() - proc_user.join() - proc_node.join() - proc_joblog.join() - proc_command.join() - proc_endpoint.join() - pass - - -if __name__ == '__main__': - - #parser = argparse.ArgumentParser( prog='cluster_manager.py', - # formatter_class=argparse.RawDescriptionHelpFormatter, - # description=textwrap.dedent('''\ - # ''') ) - #parser.add_argument("help", - # help = "Show the usage of this program" ) - - #args = parser.parse_args() +def Run(args): + register_stack_trace_dump() + create_log() - Run() + cwd = os.path.dirname(__file__) + cmds = { + "job_manager": + ["python", os.path.join(cwd, "job_manager.py"), "--port", str(args.j)], + "user_manager": + ["python", os.path.join(cwd, "user_manager.py"), "--port", str(args.u)], + "node_manager": + ["python", os.path.join(cwd, "node_manager.py"), "--port", str(args.n)], + "joblog_manager": + ["python", os.path.join(cwd, "joblog_manager.py"), "--port", str(args.l)], + "command_manager": + ["python", os.path.join(cwd, "command_manager.py"), "--port", str(args.c)], + "endpoint_manager": + ["python", os.path.join(cwd, "endpoint_manager.py"), "--port", str(args.e)], + } + + FNULL = open(os.devnull, "w") + + childs 
= {} + + while True: + try: + work(cmds, childs, FNULL) + except Exception as e: + logger.exception("caught exception while doing work") + time.sleep(60) + +def work(cmds, childs, FNULL): + for key, cmd in cmds.items(): + child = childs.get(key) + need_start = False + + if child is None or child.poll() is not None: + if child is not None: + logger.info("%s is dead restart it", cmd) + need_start = True + else: + sec = get_elapsed_seconds(key) + if sec <= args.tictoc: + continue + logger.info("%s did not update file for %d seconds, restart it", + key, sec) + child.send_signal(signal.SIGTRAP) # try to print their stacktrace + time.sleep(1) + child.kill() + sys.stdout.flush() + sys.stderr.flush() + need_start = True + + if need_start: + update_file_modification_time(key) + try: + childs[key] = subprocess32.Popen(cmd, stdin=FNULL) + except Exception as e: + logger.exception("caught exception when trying to start %s, ignore", cmd) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--tictoc", help="how many seconds to wait until kill subprocess", type=int, default=600) + parser.add_argument("-j", help="port of job_manager", type=int, default=9200) + parser.add_argument("-u", help="port of user_manager", type=int, default=9201) + parser.add_argument("-n", help="port of node_manager", type=int, default=9202) + parser.add_argument("-l", help="port of joblog_manager", type=int, default=9203) + parser.add_argument("-c", help="port of command_manager", type=int, default=9204) + parser.add_argument("-e", help="port of endpoint_manager", type=int, default=9205) + args = parser.parse_args() + + sys.exit(Run(args)) diff --git a/src/ClusterManager/command_manager.py b/src/ClusterManager/command_manager.py index 6038c86c3..86458001a 100755 --- a/src/ClusterManager/command_manager.py +++ b/src/ClusterManager/command_manager.py @@ -8,7 +8,6 @@ import datetime import copy - 
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -21,7 +20,6 @@ from jinja2 import Environment, FileSystemLoader, Template from config import config, GetStoragePath from DataHandler import DataHandler -from node_manager import create_log from node_manager import get_cluster_status import base64 @@ -32,8 +30,10 @@ import random import logging -import logging.config +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time + +logger = logging.getLogger(__name__) def RunCommand(command): dataHandler = DataHandler() @@ -42,21 +42,40 @@ def RunCommand(command): dataHandler.Close() return True +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir+"/command_manager.log" + logging.config.dictConfig(logging_config) def Run(): + register_stack_trace_dump() + create_log() + while True: - try: - dataHandler = DataHandler() - pendingCommands = dataHandler.GetPendingCommands() - for command in pendingCommands: - try: - print "Processing command: %s" % (command["id"]) - RunCommand(command) - except Exception as e: - print e - except Exception as e: - print e + update_file_modification_time("command_manager") + + with manager_iteration_histogram.labels("command_manager").time(): + try: + dataHandler = DataHandler() + pendingCommands = dataHandler.GetPendingCommands() + for command in pendingCommands: + try: + logger.info("Processing command: %s", command["id"]) + RunCommand(command) + except Exception as e: + logger.exception("run command failed") + except Exception as e: + logger.exception("getting command failed") time.sleep(1) if __name__ == '__main__': - Run() \ No newline 
at end of file + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9204) + args = parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/dist_pod_template.py b/src/ClusterManager/dist_pod_template.py new file mode 100644 index 000000000..493764ab9 --- /dev/null +++ b/src/ClusterManager/dist_pod_template.py @@ -0,0 +1,156 @@ +import os +import sys +import uuid +import datetime +import random +import json +import copy +import yaml +from jinja2 import Template +from job import Job + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config +from osUtils import mkdirsAsUser + + +class DistPodTemplate(): + def __init__(self, template, enable_custom_scheduler=False): + self.template = template + self.enable_custom_scheduler = enable_custom_scheduler + + @staticmethod + def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd): + # change ssh folder permission here because the setup permission + # script in launch_ps_job function may have race condition with init_user.sh script. 
+ # results in no such user error + + local_pod_path = os.path.join(config["storage-mount-path"], "work/", job_path, "{}-{}".format(dist_role, dist_role_idx)) + if not os.path.exists(local_pod_path): + mkdirsAsUser(local_pod_path, user_id) + file_name = "job_command.sh" + launch_script_file = os.path.join(local_pod_path, file_name) + with open(launch_script_file, 'w') as f: + f.write(cmd) + f.close() + + launchCMD = ["bash", "/pod/scripts/bootstrap.sh"] + return launchCMD + + def generate_pod(self, pod): + assert(isinstance(self.template, Template)) + + dist_id = pod["distId"] + job_id = pod["jobId"] + job_path = pod["jobPath"] + + pod["podName"] = "{}-{}".format(job_id, dist_id) + + random.seed(datetime.datetime.now()) + if "hostNetwork" in pod and pod["hostNetwork"]: + pod["sshPort"] = random.randint(40000, 49999) + else: + pod["sshPort"] = int(random.random() * 1000 + 3000) + + if (pod["distRole"] == "worker"): + pod["gpuLimit"] = pod["resourcegpu"] + else: + pod["gpuLimit"] = 0 + + if "envs" not in pod: + pod["envs"] = [] + pod["envs"].append({"name": "DLWS_ROLE_NAME", "value": pod["distRole"]}) + pod["envs"].append({"name": "DLWS_ROLE_IDX", "value": pod["distRoleIdx"]}) + + if "labels" not in pod: + pod["labels"] = [] + pod["labels"].append({"name": "distRole", "value": pod["distRole"]}) + pod["labels"].append({"name": "distRoleIdx", "value": pod["distRoleIdx"]}) + pod["labels"].append({"name": "sshPort", "value": pod["sshPort"]}) + + cmd = pod["cmd"] + pod["LaunchCMD"] = DistPodTemplate.generate_launch_script(pod["distRole"], pod["distRoleIdx"], pod["userId"], job_path, cmd) + + pod_yaml = self.template.render(job=pod) + return yaml.full_load(pod_yaml) + + def generate_pods(self, job): + """ + Return (pods, errors) + """ + assert(isinstance(job, Job)) + params = job.params + + if any(required_field not in params for required_field in + [ + "jobtrainingtype", + "jobName", + "jobPath", + "workPath", + "dataPath", + "cmd", + "userId", + "resourcegpu", + 
"userName", + ]): + return None, "Missing required parameters!" + assert(params["jobtrainingtype"] == "PSDistJob") + + job.job_path = params["jobPath"] + job.work_path = params["workPath"] + job.data_path = params["dataPath"] + # TODO user's mountpoints first, but should after 'job_path' + job.add_mountpoints(job.job_path_mountpoint()) + if "mountpoints" in params: + job.add_mountpoints(params["mountpoints"]) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + params["mountpoints"] = job.mountpoints + + params["user_email"] = params["userName"] + params["homeFolderHostpath"] = job.get_homefolder_hostpath() + params["pod_ip_range"] = job.get_pod_ip_range() + params["usefreeflow"] = job.is_freeflow_enabled() + params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum()) + params["rest-api"] = job.get_rest_api_url() + + if "nodeSelector" not in params: + params["nodeSelector"] = {} + if "gpuType" in params: + params["nodeSelector"]["gpuType"] = params["gpuType"] + assignedRack = job.get_rack() + if assignedRack is not None: + params["nodeSelector"]["rack"] = assignedRack + + params["numworker"] = int(params["numpsworker"]) + params["numps"] = int(params["numps"]) + + if "envs" not in params: + params["envs"] = [] + params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]}) + + if "hostNetwork" in params and params["hostNetwork"]: + params["envs"].append({"name": "DLWS_HOST_NETWORK", "value": "enable"}) + params["envs"].append({"name": "DLWS_WORKER_NUM", "value": params["numworker"]}) + + pods = [] + nums = {"ps": int(params["numps"]), "worker": int(params["numpsworker"])} + for role in ["ps", "worker"]: + for idx in range(nums[role]): + pod = copy.deepcopy(params) + pod["distRole"] = role + pod["distRoleIdx"] = idx + pod["distId"] = "%s%d" % (role, idx) + # mount /pod + local_pod_path = job.get_hostpath(job.job_path, "%s-%d" % (role, idx)) + 
pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": local_pod_path, "enabled": True}) + + + pods.append(pod) + + k8s_pods = [] + for pod in pods: + k8s_pod = self.generate_pod(pod) + k8s_pods.append(k8s_pod) + + return k8s_pods, None diff --git a/src/ClusterManager/endpoint_manager.py b/src/ClusterManager/endpoint_manager.py index 272b3bf69..15c8c2315 100755 --- a/src/ClusterManager/endpoint_manager.py +++ b/src/ClusterManager/endpoint_manager.py @@ -1,6 +1,4 @@ -from config import config, GetStoragePath, GetWorkPath -import k8sUtils -from DataHandler import DataHandler + import json import os import time @@ -11,8 +9,19 @@ import traceback import random import re +import logging +import yaml +import logging.config + +import argparse +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +import k8sUtils +from config import config, GetStoragePath, GetWorkPath +from DataHandler import DataHandler + +logger = logging.getLogger(__name__) def is_ssh_server_ready(pod_name): @@ -76,14 +85,14 @@ def generate_node_port_service(job_id, pod_name, endpoint_id, name, target_port) targetPort: {4} port: {4} """.format(job_id, pod_name, endpoint_id, name, target_port) - print("endpointDescription: %s" % endpoint_description) + logger.info("endpointDescription: %s", endpoint_description) return endpoint_description def create_node_port(endpoint): endpoint_description = generate_node_port_service(endpoint["jobId"], endpoint["podName"], endpoint["id"], endpoint["name"], endpoint["podPort"]) endpoint_description_path = os.path.join(config["storage-mount-path"], endpoint["endpointDescriptionPath"]) - print("endpointDescriptionPath: %s" % endpoint_description_path) + logger.info("endpointDescriptionPath: %s", endpoint_description_path) with open(endpoint_description_path, 'w') as f: 
f.write(endpoint_description) @@ -91,18 +100,18 @@ def create_node_port(endpoint): if result == "": raise Exception("Failed to create NodePort for ssh. JobId: %s " % endpoint["jobId"]) - print("Submitted endpoint %s to k8s, returned with status %s" % (endpoint["jobId"], result)) + logger.info("Submitted endpoint %s to k8s, returned with status %s", endpoint["jobId"], result) def setup_ssh_server(user_name, pod_name, host_network=False): '''Setup ssh server on pod and return the port''' # setup ssh server only is the ssh server is not up if not is_ssh_server_ready(pod_name): - print("Ssh server is not ready for pod: %s. Setup ..." % pod_name) + logger.info("Ssh server is not ready for pod: %s. Setup ...", pod_name) ssh_port = start_ssh_server(pod_name, user_name, host_network) else: ssh_port = query_ssh_port(pod_name) - print("Ssh server is ready for pod: %s. Ssh listen on %s" % (pod_name, ssh_port)) + logger.info("Ssh server is ready for pod: %s. Ssh listen on %s", pod_name, ssh_port) return ssh_port @@ -127,7 +136,7 @@ def setup_tensorboard(user_name, pod_name): def start_endpoint(endpoint): # pending, running, stopped - print("Starting endpoint: %s" % (endpoint)) + logger.info("Starting endpoint: %s", endpoint) # podName pod_name = endpoint["podName"] @@ -148,50 +157,45 @@ def start_endpoint(endpoint): create_node_port(endpoint) -def is_user_ready(pod_name): - bash_script = "bash -c 'ls /dlws/USER_READY'" - output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) - if output == "": - return False - return True - - def start_endpoints(): try: + data_handler = DataHandler() try: - data_handler = DataHandler() pending_endpoints = data_handler.GetPendingEndpoints() for endpoint_id, endpoint in pending_endpoints.items(): - job = data_handler.GetJob(jobId=endpoint["jobId"])[0] - if job["jobStatus"] != "running": - continue - if not is_user_ready(endpoint["podName"]): - continue - - # get endpointDescriptionPath - # job["jobDescriptionPath"] = 
"jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" - endpoint_description_dir = re.search("(.*/)[^/\.]+.yaml", job["jobDescriptionPath"]).group(1) - endpoint["endpointDescriptionPath"] = os.path.join(endpoint_description_dir, endpoint_id + ".yaml") - - print("\n\n\n\n\n\n----------------Begin to start endpoint %s" % endpoint["id"]) - output = get_k8s_endpoint(endpoint["endpointDescriptionPath"]) - if(output != ""): - endpoint_description = json.loads(output) - endpoint["endpointDescription"] = endpoint_description - endpoint["status"] = "running" - pod = k8sUtils.GetPod("podName=" + endpoint["podName"]) - if "items" in pod and len(pod["items"]) > 0: - endpoint["nodeName"] = pod["items"][0]["spec"]["nodeName"] - else: - start_endpoint(endpoint) - - endpoint["lastUpdated"] = datetime.datetime.now().isoformat() - data_handler.UpdateEndpoint(endpoint) + try: + job = data_handler.GetJob(jobId=endpoint["jobId"])[0] + if job["jobStatus"] != "running": + continue + + # get endpointDescriptionPath + # job["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" + endpoint_description_dir = re.search("(.*/)[^/\.]+.yaml", job["jobDescriptionPath"]).group(1) + endpoint["endpointDescriptionPath"] = os.path.join(endpoint_description_dir, endpoint_id + ".yaml") + + logger.info("\n\n\n\n\n\n----------------Begin to start endpoint %s", endpoint["id"]) + output = get_k8s_endpoint(endpoint["endpointDescriptionPath"]) + if(output != ""): + endpoint_description = json.loads(output) + endpoint["endpointDescription"] = endpoint_description + endpoint["status"] = "running" + pod = k8sUtils.GetPod("podName=" + endpoint["podName"]) + if "items" in pod and len(pod["items"]) > 0: + endpoint["nodeName"] = pod["items"][0]["spec"]["nodeName"] + else: + start_endpoint(endpoint) + + endpoint["lastUpdated"] = datetime.datetime.now().isoformat() + 
data_handler.UpdateEndpoint(endpoint) + except Exception as e: + logger.warning("Process endpoint failed {}".format(endpoint), exc_info=True) except Exception as e: - traceback.print_exc() + logger.exception("start endpoint failed") + finally: + data_handler.Close() except Exception as e: - traceback.print_exc() + logger.exception("close data handler failed") def cleanup_endpoints(): @@ -200,45 +204,69 @@ def cleanup_endpoints(): try: dead_endpoints = data_handler.GetDeadEndpoints() for endpoint_id, dead_endpoint in dead_endpoints.items(): - print("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s" % endpoint_id) - endpoint_description_path = os.path.join(config["storage-mount-path"], dead_endpoint["endpointDescriptionPath"]) - still_running = get_k8s_endpoint(endpoint_description_path) - # empty mean not existing - if still_running == "": - print("Endpoint already gone %s" % endpoint_id) - status = "stopped" - else: - output = k8sUtils.kubectl_delete(endpoint_description_path) - # 0 for success - if output == 0: + try: + logger.info("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s", endpoint_id) + endpoint_description_path = os.path.join(config["storage-mount-path"], dead_endpoint["endpointDescriptionPath"]) + still_running = get_k8s_endpoint(endpoint_description_path) + # empty mean not existing + if still_running == "": + logger.info("Endpoint already gone %s", endpoint_id) status = "stopped" - print("Succeed cleanup endpoint %s" % endpoint_id) else: - # TODO will need to clean it up eventually - status = "unknown" - print("Clean dead endpoint %s failed, endpoints: %s" % (endpoint_id, dead_endpoint)) - - dead_endpoint["status"] = status - dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat() - data_handler.UpdateEndpoint(dead_endpoint) + output = k8sUtils.kubectl_delete(endpoint_description_path) + # 0 for success + if output == 0: + status = "stopped" + logger.info("Succeed cleanup endpoint %s", endpoint_id) + else: + # TODO will 
need to clean it up eventually + status = "unknown" + logger.info("Clean dead endpoint %s failed, endpoints: %s", endpoint_id, dead_endpoint) + + # we are not changing status from "pending", "pending" endpoints are planed to setup later + if dead_endpoint["status"] != "pending": + dead_endpoint["status"] = status + dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat() + data_handler.UpdateEndpoint(dead_endpoint) + except Exception as e: + logger.warning("Clanup endpoint failed {}".format(dead_endpoint), exc_info=True) except Exception as e: - traceback.print_exc() + logger.exception("cleanup endpoint failed") finally: data_handler.Close() except Exception as e: - traceback.print_exc() + logger.exception("close data handler failed") + +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir+"/endpoint_manager.log" + logging.config.dictConfig(logging_config) def Run(): + register_stack_trace_dump() + create_log() + while True: - # start endpoints - start_endpoints() - time.sleep(1) + update_file_modification_time("endpoint_manager") - # clean up endpoints for jobs which is NOT running - cleanup_endpoints() - time.sleep(1) + with manager_iteration_histogram.labels("endpoint_manager").time(): + # start endpoints + start_endpoints() + time.sleep(1) + # clean up endpoints for jobs which is NOT running + cleanup_endpoints() + time.sleep(1) if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9205) + args = parser.parse_args() + setup_exporter_thread(args.port) + Run() diff --git a/src/ClusterManager/job.py b/src/ClusterManager/job.py new file mode 100644 index 000000000..993f35b58 --- /dev/null +++ b/src/ClusterManager/job.py @@ -0,0 +1,169 @@ +import sys +import os 
+import random +from datetime import date +from marshmallow import Schema, fields, pprint, post_load, validate +from jinja2 import Environment, FileSystemLoader, Template + +import logging +import logging.config + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from osUtils import mkdirsAsUser + + +# TODO remove it latter +def create_log(logdir='.'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir + "/jobmanager.log" + logging.config.dictConfig(logging_config) + + +class Job: + def __init__(self, + cluster, + job_id, + email, + mountpoints=None, + job_path="", + work_path="", + data_path="", + params=None, + ): + """ + job_id: an unique string for the job. + email: user's email. + cluster: cluster config. + job_path: relative path, on shared storage, for example "user_alias/jobs/date/job_id". + work_path: relative path, on shared storage, for example "user_alias". + """ + self.cluster = cluster + self.job_id = job_id + self.email = email + self.mountpoints = mountpoints + self.job_path = job_path + self.work_path = work_path + self.data_path = data_path + self.params = params + + def add_mountpoints(self, mountpoint): + ''' + 1. Silently skip if the name/hostPath/containerPath duplicates with an existing one. + 2. Name would be normalized. 
+ + Mountpoint example: + { + "enabled":true, + "containerPath":"/home/username", + "hostPath":"/dlwsdata/work/username", + "name":"homefolder" + } + ''' + if mountpoint is None: + return + if self.mountpoints is None: + self.mountpoints = [] + + # add each items in the list one by one + if isinstance(mountpoint, list): + for m in mountpoint: + self.add_mountpoints(m) + return + + # only allow alphanumeric in "name" + if "name" not in mountpoint or mountpoint["name"] == "": + mountpoint["name"] = mountpoint["containerPath"] + mountpoint["name"] = ''.join(c for c in mountpoint["name"] if c.isalnum()) + + # skip dulicate entry + for item in self.mountpoints: + if item["name"] == mountpoint["name"] or item["containerPath"] == mountpoint["containerPath"] or item["hostPath"] == mountpoint["hostPath"]: + logging.warn("Duplciate mountpoint: %s" % mountpoint) + return + + self.mountpoints.append(mountpoint) + + def get_alias(self): + return self.email.split("@")[0].strip() + + def get_hostpath(self, *path_relate_to_workpath): + """return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath)""" + return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath) + + def get_homefolder_hostpath(self): + return self.get_hostpath(self.get_alias()) + + def job_path_mountpoint(self): + assert(len(self.job_path) > 0) + job_host_path = self.get_hostpath(self.job_path) + return {"name": "job", "containerPath": "/job", "hostPath": job_host_path, "enabled": True} + + def work_path_mountpoint(self): + assert(len(self.work_path) > 0) + work_host_path = self.get_hostpath(self.work_path) + return {"name": "work", "containerPath": "/work", "hostPath": work_host_path, "enabled": True} + + def data_path_mountpoint(self): + assert(self.data_path is not None) + data_host_path = os.path.join(self.cluster["storage-mount-path"], "storage", self.data_path) + return {"name": "data", "containerPath": "/data", "hostPath": data_host_path, "enabled": 
True} + + def get_template(self): + """Return jinja template.""" + path = os.path.abspath(os.path.join(self.cluster["root-path"], "Jobs_Templete", "pod.yaml.template")) + ENV = Environment(loader=FileSystemLoader("/")) + template = ENV.get_template(path) + assert(isinstance(template, Template)) + return template + + def is_custom_scheduler_enabled(self): + return self._get_cluster_config("kube_custom_scheduler") + + def get_rest_api_url(self): + return self._get_cluster_config("rest-api") + + def get_pod_ip_range(self): + return self._get_cluster_config("pod_ip_range") + + def is_freeflow_enabled(self): + return self._get_cluster_config("usefreeflow") + + def get_rack(self): + racks = self._get_cluster_config("racks") + if racks is None or len(racks) == 0: + return None + # TODO why random.choice? + return random.choice(racks) + + def _get_cluster_config(self, key): + if key in self.cluster: + return self.cluster[key] + return None + + +class JobSchema(Schema): + cluster = fields.Dict(required=True) + job_id = fields.String(required=True, + # Correctly mappging the name + dump_to="jobId", load_from="jobId", + # We use the id as "name" in k8s object. + # By convention, the "names" of Kubernetes resources should be + # up to maximum length of 253 characters and consist of lower case + # alphanumeric characters, -, and ., + # but certain resources have more specific restrictions. 
+ validate=validate.Regexp(r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$', + error="'{input}' does not match expected pattern {regex}.")) + email = fields.Email(required=True, dump_to="userName", load_from="userName") + mountpoints = fields.Dict(required=False) + job_path = fields.String(required=False, dump_to="jobPath", load_from="jobPath") + work_path = fields.String(required=False, dump_to="workPath", load_from="workPath") + data_path = fields.String(required=False, dump_to="dataPath", load_from="dataPath") + params = fields.Dict(required=False) + + @post_load + def make_user(self, data, **kwargs): + return Job(**data) diff --git a/src/ClusterManager/job_deployer.py b/src/ClusterManager/job_deployer.py new file mode 100644 index 000000000..ac80c22a1 --- /dev/null +++ b/src/ClusterManager/job_deployer.py @@ -0,0 +1,200 @@ +import yaml +import os +import logging +import logging.config +import timeit +import functools + +from kubernetes import client, config +from kubernetes.client.rest import ApiException +from kubernetes.stream import stream +from kubernetes.stream.ws_client import ERROR_CHANNEL, STDERR_CHANNEL, STDOUT_CHANNEL + +from prometheus_client import Histogram + +job_deployer_fn_histogram = Histogram("job_deployer_fn_latency_seconds", + "latency for executing job deployer (seconds)", + buckets=(.05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, + 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, float("inf")), + labelnames=("fn_name",)) + +def record(fn): + @functools.wraps(fn) + def wrapped(*args, **kwargs): + start = timeit.default_timer() + try: + return fn(*args, **kwargs) + finally: + elapsed = timeit.default_timer() - start + job_deployer_fn_histogram.labels(fn.__name__).observe(elapsed) + return wrapped + + +# The config will be loaded from default location. 
+config.load_kube_config() +k8s_client = client.CoreV1Api() + + +class JobDeployer: + + def __init__(self): + self.v1 = k8s_client + self.namespace = "default" + self.pretty = "pretty_example" + + @record + def create_pod(self, body, dry_run=None): + api_response = self.v1.create_namespaced_pod( + namespace=self.namespace, + body=body, + pretty=self.pretty, + dry_run=dry_run, + ) + return api_response + + @record + def delete_pod(self, name, grace_period_seconds=None, dry_run=None): + body = client.V1DeleteOptions() + body.grace_period_seconds = grace_period_seconds + body.dry_run = dry_run + api_response = self.v1.delete_namespaced_pod( + name=name, + namespace=self.namespace, + pretty=self.pretty, + body=body, + grace_period_seconds=grace_period_seconds, + dry_run=dry_run, + ) + return api_response + + @record + def create_service(self, body, dry_run=None): + api_response = self.v1.create_namespaced_service( + namespace=self.namespace, + body=body, + pretty=self.pretty, + dry_run=dry_run, + ) + return api_response + + @record + def delete_service(self, name, dry_run=None): + api_response = self.v1.delete_namespaced_service( + name=name, + namespace=self.namespace, + pretty=self.pretty, + body=client.V1DeleteOptions(), + dry_run=dry_run, + ) + return api_response + + @record + def cleanup_pods(self, pod_names, force=False): + errors = [] + grace_period_seconds = 0 if force else None + for pod_name in pod_names: + try: + self.delete_pod(pod_name, grace_period_seconds) + except Exception as e: + if isinstance(e, ApiException) and 404 == e.status: + return [] + message = "Delete pod failed: {}".format(pod_name) + logging.warning(message, exc_info=True) + errors.append({"message": message, "exception": e}) + return errors + + @record + def cleanup_services(self, services): + errors = [] + for service in services: + assert(isinstance(service, client.V1Service)) + try: + service_name = service.metadata.name + self.delete_service(service_name) + except ApiException as e: 
+ message = "Delete service failed: {}".format(service_name) + logging.warning(message, exc_info=True) + errors.append({"message": message, "exception": e}) + return errors + + @record + def create_pods(self, pods): + # TODO instead of delete, we could check update existiong ones. During refactoring, keeping the old way. + pod_names = [pod["metadata"]["name"] for pod in pods] + self.cleanup_pods(pod_names) + created = [] + for pod in pods: + created_pod = self.create_pod(pod) + created.append(created_pod) + logging.info("Create pod succeed: %s" % created_pod.metadata.name) + return created + + @record + def get_pods(self, field_selector="", label_selector=""): + api_response = self.v1.list_namespaced_pod( + namespace=self.namespace, + pretty=self.pretty, + field_selector=field_selector, + label_selector=label_selector, + ) + logging.debug("Get pods: {}".format(api_response)) + return api_response.items + + @record + def get_services_by_label(self, label_selector): + api_response = self.v1.list_namespaced_service( + namespace=self.namespace, + pretty=self.pretty, + label_selector=label_selector, + ) + return api_response.items + + @record + def delete_job(self, job_id, force=False): + label_selector = "run={}".format(job_id) + + # query pods then delete + pods = self.get_pods(label_selector=label_selector) + pod_names = [pod.metadata.name for pod in pods] + pod_errors = self.cleanup_pods(pod_names, force) + + # query services then delete + services = self.get_services_by_label(label_selector) + service_errors = self.cleanup_services(services) + + errors = pod_errors + service_errors + return errors + + @record + def pod_exec(self, pod_name, exec_command, timeout=60): + """work as the command (with timeout): kubectl exec 'pod_name' 'exec_command'""" + try: + logging.info("Exec on pod {}: {}".format(pod_name, exec_command)) + client = stream( + self.v1.connect_get_namespaced_pod_exec, + name=pod_name, + namespace=self.namespace, + command=exec_command, + stderr=True, 
+ stdin=False, + stdout=True, + tty=False, + _preload_content=False, + ) + client.run_forever(timeout=timeout) + + err = yaml.full_load(client.read_channel(ERROR_CHANNEL)) + if err is None: + return [-1, "Timeout"] + + if err["status"] == "Success": + status_code = 0 + else: + logging.debug("Exec on pod {} failed. cmd: {}, err: {}.".format(pod_name, exec_command, err)) + status_code = int(err["details"]["causes"][0]["message"]) + output = client.read_all() + logging.info("Exec on pod {}, status: {}, cmd: {}, output: {}".format(pod_name, status_code, exec_command, output)) + return [status_code, output] + except ApiException as err: + logging.error("Exec on pod {} error. cmd: {}, err: {}.".format(pod_name, exec_command, err), exc_info=True) + return [-1, err.message] diff --git a/src/ClusterManager/job_manager.py b/src/ClusterManager/job_manager.py index 616922800..e18720c7a 100755 --- a/src/ClusterManager/job_manager.py +++ b/src/ClusterManager/job_manager.py @@ -7,7 +7,7 @@ import sys import datetime import copy - +import traceback sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -17,12 +17,12 @@ import k8sUtils import joblog_manager from osUtils import mkdirsAsUser +import notify import yaml from jinja2 import Environment, FileSystemLoader, Template from config import config, GetStoragePath, GetWorkPath from DataHandler import DataHandler -from node_manager import create_log from node_manager import get_cluster_status import base64 from ResourceInfo import ResourceInfo @@ -35,816 +35,262 @@ import logging import logging.config +from job import Job, JobSchema +from pod_template import PodTemplate +from dist_pod_template import DistPodTemplate +from job_deployer import JobDeployer +from job_role import JobRole - -nvidiaDriverPath = config["nvidiaDriverPath"] - - - -def printlog(msg): - print("%s - %s" % 
(datetime.datetime.utcnow().strftime("%x %X"),msg)) - -def LoadJobParams(jobParamsJsonStr): - return json.loads(jobParamsJsonStr) - -def cmd_exec(cmdStr): - try: - output = subprocess.check_output(["bash","-c", cmdStr]) - except Exception as e: - print(e) - output = "" - return output - - +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time +def all_pods_not_existing(job_id): + job_deployer = JobDeployer() + job_roles = JobRole.get_job_roles(job_id) + statuses = [job_role.status() for job_role in job_roles] + logging.info("Job: {}, status: {}".format(job_id, statuses)) + return all([status == "NotFound" for status in statuses]) def SubmitJob(job): - jobParams = json.loads(base64.b64decode(job["jobParams"])) - if jobParams["jobtrainingtype"] == "RegularJob": - SubmitRegularJob(job) - elif jobParams["jobtrainingtype"] == "PSDistJob": - SubmitPSDistJob(job) - -def CheckMountPoints(mplist, mp): - ret = True - for item in mplist: - if item["name"] == mp["name"] or item["containerPath"] == mp["containerPath"] or item["hostPath"] == mp["hostPath"]: - ret = False - return ret + # check if existing any pod with label: run=job_id + assert("jobId" in job) + job_id = job["jobId"] + if not all_pods_not_existing(job_id): + logging.warning("Waiting until previously pods are cleaned up! 
Job {}".format(job_id)) + job_deployer = JobDeployer() + errors = job_deployer.delete_job(job_id, force=True) + if errors: + logging.warning("Force delete job {}: {}".format(job_id, errors)) + return -def SubmitRegularJob(job): ret = {} dataHandler = DataHandler() try: - jobParams = json.loads(base64.b64decode(job["jobParams"])) - - jobParams["pvc_job"] = "jobs-" + jobParams["jobId"] - jobParams["pvc_work"] = "work-" + jobParams["jobId"] - jobParams["pvc_data"] = "storage-" + jobParams["jobId"] - - - if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0: - dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist") + # TODO refine later + # before resubmit the job, reset the endpoints + # update all endpoint to status 'pending', so it would restart when job is ready + endpoints = dataHandler.GetJobEndpoints(job_id) + for endpoint_id, endpoint in endpoints.items(): + endpoint["status"] = "pending" + logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id)) + dataHandler.UpdateEndpoint(endpoint) + + job["cluster"] = config + job_object, errors = JobSchema().load(job) + # TODO assert job_object is a Job + assert(isinstance(job_object, Job)) + + job_object.params = json.loads(base64.b64decode(job["jobParams"])) + + # inject gid, uid and user + # TODO it should return only one entry + user_info = dataHandler.GetIdentityInfo(job_object.params["userName"])[0] + job_object.params["gid"] = user_info["gid"] + job_object.params["uid"] = user_info["uid"] + job_object.params["user"] = job_object.get_alias() + + enable_custom_scheduler = job_object.is_custom_scheduler_enabled() + if job_object.params["jobtrainingtype"] == "RegularJob": + pod_template = PodTemplate(job_object.get_template(), enable_custom_scheduler) + elif job_object.params["jobtrainingtype"] == "PSDistJob": + pod_template = DistPodTemplate(job_object.get_template()) + else: + dataHandler.SetJobError(job_object.job_id, "ERROR: invalid jobtrainingtype: %s" % 
job_object.params["jobtrainingtype"]) return False - if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: - dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist") + pods, error = pod_template.generate_pods(job_object) + if error: + dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error) return False - #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: - # dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist") - # return False - - - jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"]) - - - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - - if not os.path.exists(localJobPath): - if "userId" in jobParams: - mkdirsAsUser(localJobPath,jobParams["userId"]) - mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"]) - else: - mkdirsAsUser(localJobPath,"0") - mkdirsAsUser(os.path.join(localJobPath,"models"),"0") - - jobParams["LaunchCMD"] = "" - if "cmd" not in jobParams: - jobParams["cmd"] = "" - - if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "": - launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"]) - with open(launchScriptPath, 'w') as f: - f.write("#!/bin/bash -x\n") - f.write("mkdir /opt; \n") - f.write("echo 'localhost slots=%s' | tee -a /opt/hostfile; \n" % jobParams["resourcegpu"]) - # TODO refine it later - f.write("bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c '%s'\n" % jobParams["cmd"]) - f.close() - if "userId" in jobParams: - os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath)) - jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"] - - - jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" - - jobParams["jobNameLabel"] = ''.join(e for e in 
jobParams["jobName"] if e.isalnum()) - - ENV = Environment(loader=FileSystemLoader("/")) - - jobTempDir = os.path.join(config["root-path"],"Jobs_Templete") - jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template") - - jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath) - jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath) - jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath) - jobParams["nvidiaDriverPath"] = nvidiaDriverPath - - - jobParams["rest-api"] = config["rest-api"] - - if "mountpoints" not in jobParams: - jobParams["mountpoints"] = [] - for onemount in jobParams["mountpoints"]: - onemount["name"] = onemount["containerPath"].replace("/","").lower() - - # mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True} - # if CheckMountPoints(jobParams["mountpoints"],mp): - # jobParams["mountpoints"].append(mp) - - mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - userAlias = getAlias(jobParams["userName"]) - jobParams["user_email"] = jobParams["userName"] - jobParams["homeFolderHostpath"] = os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)) - - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - for idx in range(len(jobParams["mountpoints"])): - if "name" not in jobParams["mountpoints"][idx]: - jobParams["mountpoints"][idx]["name"] = 
str(uuid.uuid4()).replace("-","") + job_description = "\n---\n".join([yaml.dump(pod) for pod in pods]) + job_description_path = "jobfiles/" + time.strftime("%y%m%d") + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml" + local_jobDescriptionPath = os.path.realpath(os.path.join(config["storage-mount-path"], job_description_path)) + if not os.path.exists(os.path.dirname(local_jobDescriptionPath)): + os.makedirs(os.path.dirname(local_jobDescriptionPath)) + with open(local_jobDescriptionPath, 'w') as f: + f.write(job_description) + job_deployer = JobDeployer() + try: + pods = job_deployer.create_pods(pods) + ret["output"] = "Created pods: {}".format([pod.metadata.name for pod in pods]) + except Exception as e: + ret["output"] = "Error: %s" % e.message + logging.error(e, exc_info=True) - jobParams["pod_ip_range"] = config["pod_ip_range"] - if "usefreeflow" in config: - jobParams["usefreeflow"] = config["usefreeflow"] - else: - jobParams["usefreeflow"] = False - - print ("Render Job: %s" % jobParams) - jobDescriptionList = [] - - pods = [] - if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams: - i = int(jobParams["hyperparameterstartvalue"]) - end = int(jobParams["hyperparameterendvalue"]) - step = int(jobParams["hyperparameterstep"]) - c = 0 - while (i <= end): - pod = {} - pod["podName"] = jobParams["jobId"]+"-pod-"+str(c) - pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}] - i += step - c += 1 - pods.append(pod) - else: - pod = {} - pod["podName"] = jobParams["jobId"] - pod["envs"] = [] - pods.append(pod) - - if "env" not in jobParams: - jobParams["env"] = [] - jobParams["commonenv"] = copy.copy(jobParams["env"]) - - - for pod in pods: - jobParams["podName"] = pod["podName"] - jobParams["env"] = jobParams["commonenv"] + pod["envs"] - - if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]: - container = {} - 
container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])} - podInfo = {} - podInfo["podname"] = jobParams["podName"] - if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]: - # add topology constraints explicitly - for testing - # if (jobParams["resourcegpu"] >= 2): - # # both cards in same inner group - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1 - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1 - # if (jobParams["resourcegpu"] >= 3): - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1 - # if (jobParams["resourcegpu"] >= 4): - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1 - # if (jobParams["resourcegpu"] >= 5): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1 - # if (jobParams["resourcegpu"] >= 6): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1 - # if (jobParams["resourcegpu"] >= 7): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1 - # if (jobParams["resourcegpu"] >= 8): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1 - podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1} - else: - # for cases when desired topology is explictly given or not desired - podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0} - podInfo["runningcontainer"] = {jobParams["podName"] : container} - - if "annotations" not in jobParams: - jobParams["annotations"] = {} - jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'" - jobParams["resourcegpu"] = 0 # gpu requests specified through annotation - - if "gpuType" in jobParams: - if "nodeSelector" not in jobParams: - jobParams["nodeSelector"] = {} - jobParams["nodeSelector"]["gpuType"] = jobParams["gpuType"] - - # inject gid, uid and user - # TODO it should return only one entry - user_info = 
dataHandler.GetIdentityInfo(jobParams["userName"])[0] - jobParams["gid"] = user_info["gid"] - jobParams["uid"] = user_info["uid"] - jobParams["user"] = userAlias - - template = ENV.get_template(os.path.abspath(jobTemp)) - job_description = template.render(job=jobParams) - jobDescriptionList.append(job_description) - - jobDescription = "\n---\n".join(jobDescriptionList) - - jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"]) - if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))): - os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath))) - if os.path.isfile(jobDescriptionPath): - output = k8sUtils.kubectl_delete(jobDescriptionPath) - - with open(jobDescriptionPath, 'w') as f: - f.write(jobDescription) - - output = k8sUtils.kubectl_create(jobDescriptionPath) - logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output)) - - ret["output"] = output - - ret["jobId"] = jobParams["jobId"] - - - if "userName" not in jobParams: - jobParams["userName"] = "" - - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling") - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"]) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription)) + ret["jobId"] = job_object.job_id + dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus", "scheduling") + dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath", job_description_path) + dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription", base64.b64encode(job_description)) + dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated", datetime.datetime.now().isoformat()) jobMeta = {} - jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["workPath"] = jobParams["workPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - 
jobMeta["LaunchCMD"] = jobParams["LaunchCMD"] + jobMeta["jobDescriptionPath"] = job_description_path + jobMeta["jobPath"] = job_object.job_path + jobMeta["workPath"] = job_object.work_path + # the command of the first container + jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command jobMetaStr = base64.b64encode(json.dumps(jobMeta)) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr) + dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta", jobMetaStr) except Exception as e: - print(e) + logging.error("Submit job failed: %s" % job, exc_info=True) ret["error"] = str(e) - retries = dataHandler.AddandGetJobRetries(jobParams["jobId"]) + retries = dataHandler.AddandGetJobRetries(job["jobId"]) if retries >= 5: - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e)) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error") + dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "Cannot submit job!" 
+ str(e)) dataHandler.Close() return ret - -def SubmitPSDistJob(job): - ret = {} - dataHandler = DataHandler() - - try: - jobParams = json.loads(base64.b64decode(job["jobParams"])) - jobParams["rest-api"] = config["rest-api"] - distJobParams = {} - distJobParams["ps"] = [] - distJobParams["worker"] = [] - assignedRack = None - if len(config["racks"]) > 0: - assignedRack = random.choice(config["racks"]) - - userAlias = getAlias(jobParams["userName"]) - jobParams["user_email"] = jobParams["userName"] - - jobParams["homeFolderHostpath"] = os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)) - - if jobParams["jobtrainingtype"] == "PSDistJob": - jobDescriptionList = [] - nums = {"ps":int(jobParams["numps"]),"worker":int(jobParams["numpsworker"])} - for role in ["ps","worker"]: - for i in range(nums[role]): - distJobParam=copy.deepcopy(jobParams) - distJobParam["distId"] = "%s%d" % (role,i) - distJobParam["distRole"] = role - distJobParam["distRoleIdx"] = i - - if "jobPath" not in distJobParam or len(distJobParam["jobPath"].strip()) == 0: - dataHandler.SetJobError(distJobParam["jobId"],"ERROR: job-path does not exist") - return False - if "workPath" not in distJobParam or len(distJobParam["workPath"].strip()) == 0: - dataHandler.SetJobError(distJobParam["jobId"],"ERROR: work-path does not exist") - return False - #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0: - # dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist") - # return False - distJobParam["distJobPath"] = os.path.join(distJobParam["jobPath"],distJobParam["distId"]) - jobPath,workPath,dataPath = GetStoragePath(distJobParam["distJobPath"],distJobParam["workPath"],distJobParam["dataPath"]) - - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - if not os.path.exists(localJobPath): - if "userId" in distJobParam: - mkdirsAsUser(localJobPath,distJobParam["userId"]) - else: - mkdirsAsUser(localJobPath,0) - - # TODO ??? 
- if "cmd" not in distJobParam: - distJobParam["cmd"] = "" - -#change ssh folder permission here because the setup permission script in launch_ps_job function may have race condition with init_user.sh script. results in no such user error - if role == "ps": - launchCMD = """ -#!/bin/bash -echo "[DLWorkspace System]: Waiting for all containers are ready..." -while [ ! -f /opt/run_dist_job ]; do - sleep 3 -done - -sudo chmod 600 -R /home/%s/.ssh &>/dev/null; -sudo chmod 700 /home/%s/.ssh &>/dev/null; -sudo chown -R %s /home/%s/.ssh &>/dev/null; - -sudo mkdir -p /root/.ssh &>/dev/null ; -sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null; -sudo mkdir -p /opt &>/dev/null; -sudo ln -s /job/hostfile /opt/hostfile &>/dev/null; - -JOB_DIR='/home/%s' -WORKER_NUM=%s -echo $JOB_DIR $WORKER_NUM - -all_workers_ready=false -while [ "$all_workers_ready" != true ] -do - # update it to false if any woker is not ready - all_workers_ready=true - - for i in $(seq 0 $(( ${WORKER_NUM} - 1)) ) - do - worker="worker${i}" - file="$JOB_DIR/${worker}/WORKER_READY" - #echo $file - - if [ ! -f $file ]; then - echo "${worker} not ready!" - all_workers_ready=false - sleep 10 - fi - done -done - -echo "[DLWorkspace System]: All containers are ready, launching training job..." -%s -""" % (userAlias,userAlias,userAlias,userAlias,userAlias,distJobParam["jobPath"],jobParams["numpsworker"],distJobParam["cmd"]) - else: - launchCMD = """ -while [ ! 
-f /opt/run_dist_job ]; do - sleep 3 -done -sudo chmod 600 -R /home/%s/.ssh &>/dev/null; -sudo chmod 700 /home/%s/.ssh &>/dev/null; -sudo chown -R %s /home/%s/.ssh &>/dev/null; -sudo mkdir -p /root/.ssh &>/dev/null; -sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null; -sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile &>/dev/null; - -# TODO mark the worker as 'READY', better to change to '/pod/READY' later -sudo touch /job/WORKER_READY - -sleep infinity -""" % (userAlias,userAlias,userAlias,userAlias,userAlias) - - - launchScriptPath = os.path.join(localJobPath,"launch-%s-%s%d.sh" % (distJobParam["jobId"],role,i)) - # TODO need to set up user for distribute jobs - with open(launchScriptPath, 'w') as f: - f.write(launchCMD) - f.close() - - - launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (distJobParam["jobId"],role,i) - - distJobParam["LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer - - distJobParam["jobNameLabel"] = ''.join(e for e in distJobParam["jobName"] if e.isalnum()) - ENV = Environment(loader=FileSystemLoader("/")) - - jobTempDir = os.path.join(config["root-path"],"Jobs_Templete") - jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template") - - distJobParam["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath) - distJobParam["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath) - distJobParam["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath) - distJobParam["nvidiaDriverPath"] = nvidiaDriverPath - - if "mountpoints" not in distJobParam: - distJobParam["mountpoints"] = [] - - # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath}) - distJobParam["mountpoints"].append({"name":"job","containerPath":"/job","hostPath":distJobParam["hostjobPath"]}) - 
distJobParam["mountpoints"].append({"name":"work","containerPath":"/work","hostPath":distJobParam["hostworkPath"]}) - distJobParam["mountpoints"].append({"name":"data","containerPath":"/data","hostPath":distJobParam["hostdataPath"]}) - - for idx in range(len(distJobParam["mountpoints"])): - if "name" not in distJobParam["mountpoints"][idx]: - distJobParam["mountpoints"][idx]["name"] = str(uuid.uuid4()).replace("-","") - - - distJobParam["pod_ip_range"] = config["pod_ip_range"] - if "usefreeflow" in config: - distJobParam["usefreeflow"] = config["usefreeflow"] - else: - distJobParam["usefreeflow"] = False - - distJobParam["numworker"] = int(jobParams["numpsworker"]) - distJobParam["numps"] = int(jobParams["numps"]) - - - - random.seed(datetime.datetime.now()) - if "hostNetwork" in jobParams and jobParams["hostNetwork"]: - distJobParam["containerPort"] = random.randint(40000, 49999) - else: - distJobParam["containerPort"] = int(random.random()*1000+3000) - - if assignedRack is not None: - if "nodeSelector" not in distJobParam: - distJobParam["nodeSelector"] = {} - distJobParam["nodeSelector"]["rack"] = assignedRack - - if "gpuType" in distJobParam: - if "nodeSelector" not in distJobParam: - distJobParam["nodeSelector"] = {} - distJobParam["nodeSelector"]["gpuType"] = distJobParam["gpuType"] - - # inject gid, uid and user - # TODO it should return only one entry - user_info = dataHandler.GetIdentityInfo(jobParams["userName"])[0] - distJobParam["gid"] = user_info["gid"] - distJobParam["uid"] = user_info["uid"] - distJobParam["user"] = userAlias - - template = ENV.get_template(os.path.abspath(jobTemp)) - job_description = template.render(job=distJobParam) - - jobDescriptionList.append(job_description) - - distJobParams[role].append(distJobParam) - - - jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" - jobDescription = "\n---\n".join(jobDescriptionList) - - - jobDescriptionPath = 
os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"]) - if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))): - os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath))) - if os.path.isfile(jobDescriptionPath): - output = k8sUtils.kubectl_delete(jobDescriptionPath) - - with open(jobDescriptionPath, 'w') as f: - f.write(jobDescription) - - output = k8sUtils.kubectl_create(jobDescriptionPath) - - ret["output"] = output - - ret["jobId"] = jobParams["jobId"] - - - if "userName" not in jobParams: - jobParams["userName"] = "" - - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling") - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"]) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription)) - - - jobMeta = {} - jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["workPath"] = jobParams["workPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["LaunchCMD"] = jobParams["cmd"] - jobMeta["distJobParams"] = distJobParams - - jobMetaStr = base64.b64encode(json.dumps(jobMeta)) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr) - except Exception as e: - import traceback - traceback.print_exc() - print(e) - ret["error"] = str(e) - retries = dataHandler.AddandGetJobRetries(jobParams["jobId"]) - if retries >= 5: - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" 
+ str(e)) - dataHandler.Close() - return ret - -def KillJob(job, desiredState="killed"): +def KillJob(job_id, desiredState="killed"): dataHandler = DataHandler() - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail))) - logging.info("Killing job %s, with status %s, %s" %(job["jobId"], result,detail)) - if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None: - jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) - if os.path.isfile(jobDescriptionPath): - if k8sUtils.kubectl_delete(jobDescriptionPath) == 0: - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus", desiredState) - return True - else: - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot delete job from Kubernetes Cluster!") + result, detail = k8sUtils.GetJobStatus(job_id) + dataHandler.UpdateJobTextField(job_id, "jobStatusDetail", base64.b64encode(json.dumps(detail))) + logging.info("Killing job %s, with status %s, %s" % (job_id, result, detail)) + + job_deployer = JobDeployer() + errors = job_deployer.delete_job(job_id, force=True) + + if len(errors) == 0: + dataHandler.UpdateJobTextField(job_id, "jobStatus", desiredState) + dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat()) + dataHandler.Close() + return True else: - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot find job description file!") - - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error") - dataHandler.Close() - return False - - -def getAlias(username): - if "@" in username: - username = username.split("@")[0].strip() - - if "/" in username: - username = username.split("/")[1].strip() - - return username + dataHandler.UpdateJobTextField(job_id, "jobStatus", "error") + dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat()) + dataHandler.Close() + logging.error("Kill job 
failed with errors: {}".format(errors)) + return False -def ApproveJob(job): +def ApproveJob(job_id): dataHandler = DataHandler() - dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "queued") + dataHandler.UpdateJobTextField(job_id, "jobStatus", "queued") dataHandler.Close() return True -def AutoApproveJob(job): - # TODO: All jobs are currently auto-approved. We need to allow - # configuring different policies for different VC. - ApproveJob(job) - - # This block is kept here for reference of the original code. - # cluster_status = get_cluster_status() - # jobUser = getAlias(job["userName"]) - # jobParams = json.loads(base64.b64decode(job["jobParams"])) - # jobGPU = GetJobTotalGpu(jobParams) - # - # currentGPU = 0 - # for user in cluster_status["user_status"]: - # if user["userName"] == jobUser: - # currentGPU = int(user["userGPU"]) - # - # if True or currentGPU == 0 or currentGPU + jobGPU <= 4: - # ApproveJob(job) - - UnusualJobs = {} -def UpdateJobStatus(job): +def UpdateJobStatus(job, notifier=None): + assert(job["jobStatus"] == "scheduling" or job["jobStatus"] == "running") dataHandler = DataHandler() jobParams = json.loads(base64.b64decode(job["jobParams"])) - if job["jobStatus"] == "scheduling" and jobParams["jobtrainingtype"] == "PSDistJob": - # launch user command only all pods are ready - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - if result in ["Failed", "Succeeded"]: - # TODO shoudn't be here, update status - dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result) - pass - else: - # previously status is 'scheduling', and now all pods are ready - # TODO check all pods are ready - if k8sUtils.all_pod_ready(job["jobId"]): - try: - launch_ps_dist_job(jobParams) - except Exception as e: - print(e) - return - - jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"]) - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - logPath = 
os.path.join(localJobPath,"logs/joblog.txt") - + result = check_job_status(job["jobId"]) + logging.info("++++++++ Job status: {} {}".format(job["jobId"], result)) - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail))) - - logging.info("job %s status: %s,%s" % (job["jobId"], result, json.dumps(detail))) + jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"]) + localJobPath = os.path.join(config["storage-mount-path"], jobPath) + logPath = os.path.join(localJobPath, "logs/joblog.txt") jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None if "userId" not in jobParams: - jobParams["userId"] = "0" - if result.strip() == "Succeeded": - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","finished") + jobParams["userId"] = "0" + + if result == "Succeeded": + joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished") if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): k8sUtils.kubectl_delete(jobDescriptionPath) - elif result.strip() == "Running": + + if notifier is not None: + notifier.notify(notify.new_job_state_change_message( + job["userName"], job["jobId"], result.strip())) + elif result == "Running": if job["jobStatus"] != "running": - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","running") + started_at = datetime.datetime.now().isoformat() + detail = [{"startedAt": started_at, "message": "started at: {}".format(started_at)}] + dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail", base64.b64encode(json.dumps(detail))) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "running") + + elif result == "Failed": + 
logging.warning("Job %s fails, cleaning...", job["jobId"]) + + if notifier is not None: + notifier.notify(notify.new_job_state_change_message( + job["userName"], job["jobId"], result.strip())) + + joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed") + dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed") - elif result.strip() == "Failed": - printlog("Job %s fails, cleaning..." % job["jobId"]) - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","failed") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg",detail) if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): k8sUtils.kubectl_delete(jobDescriptionPath) - elif result.strip() == "Unknown": + elif result == "Unknown" or result == "NotFound": if job["jobId"] not in UnusualJobs: + logging.warning("!!! Job status ---{}---, job: {}".format(result, job["jobId"])) UnusualJobs[job["jobId"]] = datetime.datetime.now() - elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 300: + # TODO + # 1) May need to reduce the timeout. + # It takes minutes before pod turns into "Unknown", we may don't need to wait so long. + # 2) If node resume before we resubmit the job, the job will end in status 'NotFound'. + elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 30: del UnusualJobs[job["jobId"]] - retries = dataHandler.AddandGetJobRetries(job["jobId"]) - if retries >= 5: - printlog("Job %s fails for more than 5 times, abort" % job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","cannot launch the job.") - if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) - else: - printlog("Job %s fails in Kubernetes, delete and re-submit the job. 
Retries %d" % (job["jobId"] , retries)) - SubmitJob(job) - elif result.strip() == "PendingHostPort": - printlog("Cannot find host ports for job :%s, re-launch the job with different host ports " % (job["jobId"])) - SubmitJob(job) + # TODO refine later + # before resubmit the job, reset the endpoints + # update all endpoint to status 'pending', so it would restart when job is ready + endpoints = dataHandler.GetJobEndpoints(job["jobId"]) + for endpoint_id, endpoint in endpoints.items(): + endpoint["status"] = "pending" + logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id)) + dataHandler.UpdateEndpoint(endpoint) - if result.strip() != "Unknown" and job["jobId"] in UnusualJobs: + logging.warning("Job {} fails in Kubernetes as {}, delete and re-submit.".format(job["jobId"], result)) + KillJob(job["jobId"], "queued") + + if result != "Unknown" and result != "NotFound" and job["jobId"] in UnusualJobs: del UnusualJobs[job["jobId"]] dataHandler.Close() -def run_dist_cmd_on_pod(podId, cmd, outputfile): - remotecmd = "exec %s -- %s" % (podId,cmd) - print(remotecmd) - k8sUtils.kubectl_exec_output_to_file(remotecmd,outputfile) - +# TODO refine later +def check_job_status(job_id): + job_deployer = JobDeployer() + job_roles = JobRole.get_job_roles(job_id) -class Kube_RemoteCMD_Thread(threading.Thread): - def __init__(self, jobId, podId, cmd, outputfile): - threading.Thread.__init__(self) - self.jobId = jobId - self.podId = podId - self.cmd = cmd - self.outputfile = outputfile - def run(self): - run_dist_cmd_on_pod(self.podId, self.cmd, self.outputfile) + if len(job_roles) < 1: + return "NotFound" - -# TODO remove duplicate code later -def is_ssh_server_ready(pod_name): - bash_script = "sudo service ssh status" - output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) - if output == "": - return False - return True - -# TODO remove duplicate code later -def query_ssh_port(pod_name): - bash_script = "grep ^Port /etc/ssh/sshd_config | 
cut -d' ' -f2" - ssh_port = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) - return int(ssh_port) - -# TODO remove duplicate code later -def start_ssh_server(pod_name, user_name, host_network=False, ssh_port=22): - '''Setup the ssh server in container, and return the listening port.''' - bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'" - - # ssh_port = 22 - - # modify the script for HostNewtork - if host_network: - # if the ssh_port is default value 22, randomly choose one - if ssh_port == 22: - ssh_port = random.randint(40000, 49999) - # bash_script = "sed -i '/^Port 22/c Port "+str(ssh_port)+"' /etc/ssh/sshd_config && "+bash_script - # TODO refine the script later - bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && sed -i \"s/^Port 22/Port " + str(ssh_port) + "/\" /etc/ssh/sshd_config && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'" - - # TODO setup reasonable timeout - # output = k8sUtils.kubectl_exec("exec %s %s" % (jobId, " -- " + bash_script), 1) - output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) - if output == "": - raise Exception("Failed to setup ssh server in container. 
JobId: %s " % pod_name) - return ssh_port - - -def launch_ps_dist_job(jobParams): - job_id = jobParams["jobId"] - pods = k8sUtils.GetPod("run=" + job_id) - - # if any pod is not up, return - if "items" not in pods or len(pods["items"]) != (int(jobParams["numpsworker"]) + int(jobParams["numps"])): - return - # if any pod is not ready, return - pod_status = [k8sUtils.check_pod_status(pod) for pod in pods["items"]] - if any([status != "Running" for status in pod_status]): - return - - user_name = getAlias(jobParams["userName"]) - if "hostNetwork" in jobParams and jobParams["hostNetwork"]: - host_network = True - else: - host_network = False - - # setup ssh server - for [idx, pod] in enumerate(pods["items"]): - pod_name = pod["metadata"]["name"] - dist_port = pod["metadata"]["labels"]["distPort"] - # quit if can't setup ssh server - ssh_port = start_ssh_server(pod_name, user_name, host_network, dist_port) - - # generate ssh config - ssh_config = """ -Host %s - HostName %s - Port %s - User %s - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - """ - sshconfigstr = "" - for [idx, pod] in enumerate(pods["items"]): - pod_ip = pod["status"]["podIP"] - dist_port = pod["metadata"]["labels"]["distPort"] - role = pod["metadata"]["labels"]["distRole"] - role_idx = pod["metadata"]["labels"]["distRoleIdx"] - - # TODO hostNetwork - if host_network: - sshconfigstr += (ssh_config % (role + "-"+str(role_idx), pod_ip, str(dist_port), user_name) + "\n") - else: - sshconfigstr += (ssh_config % (role + "-"+str(role_idx), pod_ip, 22, user_name) + "\n") - - # config ssh client - for [idx, pod] in enumerate(pods["items"]): - pod_name = pod["metadata"]["name"] - bash_script = "cat > /home/" + user_name + "/.ssh/config < WORKER_READY -> JOB_READY (then the job finally in "Running" status.) 
+ """ + # pod-phase: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase + # node condition: https://kubernetes.io/docs/concepts/architecture/nodes/#condition + deployer = JobDeployer() + pods = deployer.get_pods(field_selector="metadata.name={}".format(self.pod_name)) + logging.debug("Pods: {}".format(pods)) + if(len(pods) < 1): + return "NotFound" + + assert(len(pods) == 1) + pod = pods[0] + phase = pod.status.phase + + # !!! Pod is running, doesn't mean "Role" is ready and running. + if(phase == "Running"): + # Found that phase won't turn into "Unkonwn" even when we get 'unknown' from kubectl + if pod.status.reason == "NodeLost": + return "Unknown" + + # Check if the user command had been ran. + if not self.isRoleReady(): + return "Pending" + + return phase + + def isFileExisting(self, file): + deployer = JobDeployer() + status_code, _ = deployer.pod_exec(self.pod_name, ["/bin/sh", "-c", "ls -lrt {}".format(file)]) + return status_code == 0 + + def isRoleReady(self): + return self.isFileExisting(JobRole.MARK_ROLE_READY_FILE) diff --git a/src/ClusterManager/job_status.pdf b/src/ClusterManager/job_status.pdf new file mode 100644 index 000000000..c9756f120 Binary files /dev/null and b/src/ClusterManager/job_status.pdf differ diff --git a/src/ClusterManager/joblog_manager.py b/src/ClusterManager/joblog_manager.py index b10630c1e..b43232a3b 100755 --- a/src/ClusterManager/joblog_manager.py +++ b/src/ClusterManager/joblog_manager.py @@ -23,8 +23,6 @@ from multiprocessing import Process, Manager - - sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -34,10 +32,13 @@ from config import config, GetStoragePath from DataHandler import DataHandler +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = 
'/var/log/dlworkspace' ): +logger = logging.getLogger(__name__) + +def create_log(logdir = '/var/log/dlworkspace'): if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -109,7 +110,7 @@ def extract_job_log(jobId,logPath,userId): f.close() os.system("chown -R %s %s" % (userId, containerLogPath)) except Exception as e: - print e + logger.exception("write container log failed") if len(trimlogstr.strip()) > 0: @@ -149,15 +150,24 @@ def update_job_logs(): def Run(): + register_stack_trace_dump() create_log() logging.info("start to update job logs ...") while True: - try: - update_job_logs() - except Exception as e: - print e + update_file_modification_time("joblog_manager") + + with manager_iteration_histogram.labels("joblog_manager").time(): + try: + update_job_logs() + except Exception as e: + logger.exception("update job logs failed") time.sleep(1) if __name__ == '__main__': - Run() \ No newline at end of file + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9203) + args = parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/logging.yaml b/src/ClusterManager/logging.yaml index b276c6d8d..a486bc5aa 100755 --- a/src/ClusterManager/logging.yaml +++ b/src/ClusterManager/logging.yaml @@ -1,26 +1,27 @@ -version: 1 -formatters: - simple: - format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' -handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - file: - class : logging.handlers.RotatingFileHandler - formatter: simple - filename: /var/log/dlworkspace/clustermanager.log - # roll over at 10MB - maxBytes: 10240000 - # At most 10 logging files - backupCount: 10 -loggers: - basic: - level: DEBUG - handlers: ['console','file'] - propagate: no -root: - level: DEBUG - handlers: 
['console','file'] \ No newline at end of file +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + file: + class : logging.handlers.RotatingFileHandler + formatter: simple + filename: /var/log/dlworkspace/clustermanager.log + # roll over at 10MB + maxBytes: 10240000 + # At most 10 logging files + backupCount: 10 +loggers: + basic: + level: INFO + handlers: ['console','file'] + propagate: no +root: + level: INFO + handlers: ['console','file'] diff --git a/src/ClusterManager/node_manager.py b/src/ClusterManager/node_manager.py index 326a6a2ba..fb0de3193 100755 --- a/src/ClusterManager/node_manager.py +++ b/src/ClusterManager/node_manager.py @@ -39,11 +39,12 @@ from config import config from DataHandler import DataHandler +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -139,7 +140,7 @@ def get_cluster_status(): node_status["unschedulable"] = False if "status" in node and "conditions" in node["status"]: - for condi in node["status"]: + for condi in node["status"]["conditions"]: if "type" in condi and condi["type"] == "Ready" and "status" in condi and condi["status"] == "Unknown": node_status["unschedulable"] = True @@ -203,12 +204,13 @@ def get_cluster_status(): for node_name, node_status in nodes_status.iteritems(): if node_status["unschedulable"]: gpu_unschedulable.Add(ResourceInfo(node_status["gpu_capacity"])) + 
gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_used"]))) else: gpu_avaliable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_allocatable"]), ResourceInfo(node_status["gpu_used"]))) gpu_schedulable.Add(ResourceInfo(node_status["gpu_capacity"])) gpu_unschedulable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"]))) + gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"]))) - gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"]))) gpu_used.Add(ResourceInfo(node_status["gpu_used"])) gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"])) @@ -224,7 +226,7 @@ def get_cluster_status(): cluster_status["node_status"] = [node_status for node_name, node_status in nodes_status.iteritems()] except Exception as e: - print(e) + logging.exception("get cluster status") dataHandler = DataHandler() cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount() @@ -241,16 +243,25 @@ def get_cluster_status(): def Run(): + register_stack_trace_dump() create_log() logging.info("start to update nodes usage information ...") config["cluster_status"] = None + while True: - try: - get_cluster_status() - except Exception as e: - print e - logging.info(str(e)) + update_file_modification_time("node_manager") + + with manager_iteration_histogram.labels("node_manager").time(): + try: + get_cluster_status() + except Exception as e: + logging.exception("get cluster status failed") time.sleep(30) if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9202) + args = parser.parse_args() + setup_exporter_thread(args.port) + Run() diff --git a/src/ClusterManager/pod_template.py b/src/ClusterManager/pod_template.py new 
file mode 100644 index 000000000..0de62e5e4 --- /dev/null +++ b/src/ClusterManager/pod_template.py @@ -0,0 +1,144 @@ +import os +import sys +import json +import yaml +from jinja2 import Template +from job import Job + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from osUtils import mkdirsAsUser + + +class PodTemplate(): + def __init__(self, template, enable_custom_scheduler=False): + self.template = template + self.enable_custom_scheduler = enable_custom_scheduler + + @staticmethod + def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script): + if not os.path.exists(path_to_save): + mkdirsAsUser(path_to_save, user_id) + + file_name = "job_command.sh" + launch_script_file = os.path.join(path_to_save, file_name) + with open(launch_script_file, 'w') as f: + f.write(user_script) + os.system("sudo chown %s %s" % (user_id, launch_script_file)) + luanch_cmd = ["bash", "/pod/scripts/bootstrap.sh"] + return luanch_cmd + + def generate_pod(self, pod): + assert(isinstance(self.template, Template)) + if self.enable_custom_scheduler: + if "useGPUTopology" in pod and pod["useGPUTopology"]: + gpu_topology_flag = 1 + else: + # for cases when desired topology is explictly given or not desired + gpu_topology_flag = 0 + pod_name = pod["podName"] + request_gpu = int(pod["gpuLimit"]) + + podInfo = { + "podname": pod_name, + "requests": { + "alpha.gpu/gpu-generate-topology": gpu_topology_flag + }, + "runningcontainer": { + pod_name: { + "requests": {"alpha.gpu/numgpu": request_gpu} + }, + }, + } + + if "annotations" not in pod: + pod["annotations"] = {} + pod["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'" + # gpu requests specified through annotation + pod["gpuLimit"] = 0 + + pod_yaml = self.template.render(job=pod) + return yaml.full_load(pod_yaml) + + def generate_pods(self, job): + """ + Return (pods, errors) + """ + + assert(isinstance(job, Job)) + params = job.params + if 
any(required_field not in params for required_field in + [ + "jobtrainingtype", + "jobName", + "jobPath", + "workPath", + "dataPath", + "cmd", + "userId", + "resourcegpu", + "userName", + ]): + return None, "Missing required parameters!" + + job.job_path = params["jobPath"] + job.work_path = params["workPath"] + job.data_path = params["dataPath"] + # TODO user's mountpoints first, but should after 'job_path' + job.add_mountpoints(job.job_path_mountpoint()) + if "mountpoints" in params: + job.add_mountpoints(params["mountpoints"]) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + params["mountpoints"] = job.mountpoints + + params["user_email"] = params["userName"] + params["homeFolderHostpath"] = job.get_homefolder_hostpath() + params["pod_ip_range"] = job.get_pod_ip_range() + params["usefreeflow"] = job.is_freeflow_enabled() + params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum()) + params["rest-api"] = job.get_rest_api_url() + + if "nodeSelector" not in params: + params["nodeSelector"] = {} + if "gpuType" in params: + params["nodeSelector"]["gpuType"] = params["gpuType"] + + local_pod_path = job.get_hostpath(job.job_path, "master") + params["LaunchCMD"] = PodTemplate.generate_launch_script(params["jobId"], local_pod_path, params["userId"], params["resourcegpu"], params["cmd"]) + + if "envs" not in params: + params["envs"] =[] + params["envs"].append({"name": "DLWS_ROLE_NAME", "value": "master"}) + params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]}) + + pods = [] + if all(hyper_parameter in params for hyper_parameter in ["hyperparametername", "hyperparameterstartvalue", "hyperparameterendvalue", "hyperparameterstep"]): + env_name = params["hyperparametername"] + start = int(params["hyperparameterstartvalue"]) + end = int(params["hyperparameterendvalue"]) + step = int(params["hyperparameterstep"]) + + for idx, val in enumerate(range(start, end, step)): + 
pod = params.copy() + pod["podName"] = "{0}-pod-{1}".format(job.job_id, idx) + pod["envs"].append({"name": env_name, "value": val}) + pods.append(pod) + else: + pod = params.copy() + pod["podName"] = job.job_id + pods.append(pod) + + k8s_pods = [] + for pod in pods: + pod["numps"] = 0 + pod["numworker"] = 1 + pod["fragmentGpuJob"] = True + pod["gpuLimit"] = pod["resourcegpu"] + + # mount /pod + pod_path = job.get_hostpath(job.job_path, "master") + pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": pod_path, "enabled": True}) + + k8s_pod = self.generate_pod(pod) + k8s_pods.append(k8s_pod) + return k8s_pods, None diff --git a/src/ClusterManager/requirements.txt b/src/ClusterManager/requirements.txt new file mode 100644 index 000000000..f1363b2db --- /dev/null +++ b/src/ClusterManager/requirements.txt @@ -0,0 +1,5 @@ +marshmallow==2.19.5 +kubernetes==9.0.0 +PyYAML>=5.1.1 +prometheus-client==0.7.1 +twisted==19.2.1 diff --git a/src/ClusterManager/test_job.py b/src/ClusterManager/test_job.py new file mode 100644 index 000000000..76fa8e299 --- /dev/null +++ b/src/ClusterManager/test_job.py @@ -0,0 +1,176 @@ +import unittest +import json +import sys +import os +from job import Job, JobSchema + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config + + +VALID_JOB_ATTRIBUTES = { + "cluster": config, + "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c", + "userName": "user@foo.com", + "jobPath": "user_alias/jobs/date/job_id", +} + + +class TestJobSchema(unittest.TestCase): + + def test_loads(self): + job_json = json.dumps(VALID_JOB_ATTRIBUTES) + + job, errors = JobSchema().loads(job_json) + self.assertFalse(errors) + self.assertEqual(job.job_id, VALID_JOB_ATTRIBUTES["jobId"]) + self.assertEqual(job.email, VALID_JOB_ATTRIBUTES["userName"]) + + def test_job_id_schema(self): + job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) + self.assertFalse(errors) + + # uppercase + attrs = 
VALID_JOB_ATTRIBUTES.copy() + attrs.update({"jobId": "First-job"}) + job, errors = JobSchema().load(attrs) + self.assertTrue("jobId" in errors) + + # space + attrs = VALID_JOB_ATTRIBUTES.copy() + attrs.update({"jobId": "first job"}) + job, errors = JobSchema().load(attrs) + self.assertTrue("jobId" in errors) + + def test_dump(self): + job = Job( + cluster=config, + job_id="test-job", + email="user@foo.com" + ) + + result, errors = JobSchema().dump(job) + + self.assertFalse(errors) + self.assertEqual(result["jobId"], "test-job") + self.assertEqual(result["userName"], "user@foo.com") + + +class TestJob(unittest.TestCase): + + def create_a_job(self): + job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) + self.assertFalse(errors) + return job + + def test_add_mountpoints_with_none(self): + job = self.create_a_job() + job.add_mountpoints(None) + + def test_add_mountpoints_without_name(self): + job = self.create_a_job() + + # add one mountpoint without "name" + mountpoint1 = { + "enabled": True, + "containerPath": "/home/username", + "hostPath": "/dlwsdata/work/username", + } + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + def test_add_mountpoints(self): + job = self.create_a_job() + + # add one mountpoint + mountpoint1 = { + "enabled": True, + "containerPath": "/home/username", + "hostPath": "/dlwsdata/work/username", + "name": "homefolder" + } + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + # would silently skip + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + # name would be normalized, only allow alphanumeric, so it would be a duplicate + mountpoint1a = { + "enabled": True, + "containerPath": "/home/path", + "hostPath": "/dlwsdata/work/path", + "name": "homefolder-" + } + job.add_mountpoints(mountpoint1a) + self.assertEqual(1, len(job.mountpoints)) + + # add another mountpoint + mountpoint2 = { + "enabled": True, + "containerPath": "/home/path1", + "hostPath": 
"/dlwsdata/work/path1", + "name": "homepath1" + } + job.add_mountpoints(mountpoint2) + self.assertEqual(2, len(job.mountpoints)) + + # add a list + mountpoints = [{ + "enabled": True, + "containerPath": "/home/path2", + "hostPath": "/dlwsdata/work/path2", + "name": "homepath2" + }] + job.add_mountpoints(mountpoints) + self.assertEqual(3, len(job.mountpoints)) + + def test_get_homefolder_hostpath(self): + job = self.create_a_job() + self.assertEqual("/dlwsdata/work/user", job.get_homefolder_hostpath()) + + def test_get_hostpath(self): + job = self.create_a_job() + self.assertEqual("user_alias/jobs/date/job_id", job.job_path) + self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.get_hostpath(job.job_path)) + + def test_job_work_data_mountpoints(self): + job = self.create_a_job() + + job.job_path = "user_alias/jobs/date/job_id" + job.work_path = "user_alias" + job.data_path = "" + + self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.job_path_mountpoint()["hostPath"]) + self.assertEqual("/dlwsdata/work/user_alias", job.work_path_mountpoint()["hostPath"]) + self.assertEqual("/dlwsdata/storage/", job.data_path_mountpoint()["hostPath"]) + + job.add_mountpoints(job.job_path_mountpoint()) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + self.assertEquals(3, len(job.mountpoints)) + + def test_get_template(self): + job = self.create_a_job() + + self.assertIsNotNone(job.get_template()) + + def test_is_custom_scheduler_enabled(self): + job = self.create_a_job() + + self.assertFalse(job.is_custom_scheduler_enabled()) + + # TODO !!! 
notice, it would change all the 'cluster' settings + job.cluster["kube_custom_scheduler"] = True + self.assertTrue(job.is_custom_scheduler_enabled()) + + def test_get_rest_api_url(self): + job = self.create_a_job() + + self.assertEqual("http://faked.uri/", job.get_rest_api_url()) + + def test_get_rack(self): + job = self.create_a_job() + + self.assertEqual(None, job.get_rack()) diff --git a/src/ClusterManager/test_job_deployer.py b/src/ClusterManager/test_job_deployer.py new file mode 100644 index 000000000..b5f0ff9df --- /dev/null +++ b/src/ClusterManager/test_job_deployer.py @@ -0,0 +1,135 @@ +import unittest +import kubernetes +import yaml +import string +import random +import time +from kubernetes.client.rest import ApiException + +from job_deployer import JobDeployer + +import logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s", + handlers=[ + logging.StreamHandler() + ] +) + + +class TestJobDeployer(unittest.TestCase): + + def create_job_deployer(self): + job_deployer = JobDeployer() + self.assertIsNotNone(job_deployer) + return job_deployer + + def create_pod(self, pod_name): + job_deployer = self.create_job_deployer() + raw_yaml = """ +apiVersion: v1 +kind: Pod +metadata: + name: {} +spec: + containers: + - name: busybox + image: busybox + args: + - sleep + - "1000000" + """.format(pod_name) + body = yaml.full_load(raw_yaml) + + # with self.assertRaises(ApiException): + job_deployer.create_pod(body) + + def test_delete_pod(self): + pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + self.create_pod(pod_name) + + job_deployer = self.create_job_deployer() + + job_deployer.delete_pod(pod_name) + + def test_cleanup_pods(self): + job_deployer = self.create_job_deployer() + pod_names = ["pod-1", "pod-2"] + + job_deployer.cleanup_pods(pod_names) + + def test_get_pod_by_label(self): + job_deployer = self.create_job_deployer() + label_selector = 
"run=some_job_id" + + pods = job_deployer.get_pods(label_selector=label_selector) + + self.assertEqual(0, len(pods)) + + def test_get_services_by_label(self): + job_deployer = self.create_job_deployer() + label_selector = "run=some_job_id" + + services = job_deployer.get_services_by_label(label_selector) + + self.assertEqual(0, len(services)) + + def test_create_endpoint(self): + job_deployer = self.create_job_deployer() + raw_yaml = """ +apiVersion: v1 +kind: Service +metadata: + name: test-service +spec: + selector: + app: MyApp + ports: + - protocol: TCP + port: 80 + targetPort: 9376 + """ + body = yaml.full_load(raw_yaml) + + # with self.assertRaises(ApiException): + job_deployer.create_service(body) + + def test_delete_service(self): + job_deployer = self.create_job_deployer() + + job_deployer.delete_service("test-service") + + def test_pod_exec(self): + job_deployer = self.create_job_deployer() + + pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + self.create_pod(pod_name) + time.sleep(3) + + exec_command = [ + '/bin/sh', + '-c', + 'echo This message goes to stderr >&2 && echo This message goes to stdout' + ] + + status_code, ouput = job_deployer.pod_exec(pod_name, exec_command) + self.assertEqual(0, status_code) + + bad_command = [ + '/bin/sh', + '-c', + 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8' + ] + status_code, ouput = job_deployer.pod_exec(pod_name, bad_command) + self.assertEqual(8, status_code) + + bad_command = [ + '/bin/sh', + '-c', + 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8' + ] + status_code, ouput = job_deployer.pod_exec(pod_name, bad_command, 1) + self.assertEqual(-1, status_code) + + job_deployer.delete_pod(pod_name) diff --git a/src/ClusterManager/test_job_role.py b/src/ClusterManager/test_job_role.py new file mode 100644 index 000000000..919332053 --- /dev/null +++ 
b/src/ClusterManager/test_job_role.py @@ -0,0 +1,34 @@ +import unittest +from job_role import JobRole + + +class TestJobRole(unittest.TestCase): + + def test_status_Running(self): + job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0") + + role_status = job_role.status() + self.assertEqual("Running", role_status) + + def test_status_NotFound(self): + job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0-not-found") + + role_status = job_role.status() + self.assertEqual("NotFound", role_status) + + def test_status_Pending(self): + # Pod is running, but mark file not existing: JobRole.MARK_POD_READY_FILE + job_role = JobRole("master", "nginx-cm7kf") + + role_status = job_role.status() + self.assertEqual("Pending", role_status) + + def test_get_job_roles_dist_job(self): + job_roles = JobRole.get_job_roles("bd3d090a-53b6-4616-9b6c-fe4a86fd68ea") + + self.assertEqual(3, len(job_roles)) + + def test_get_job_roles_regular_job(self): + job_roles = JobRole.get_job_roles("8ca7fcdf-c4e7-4687-a3fa-1eeea97415c4") + + self.assertEqual(1, len(job_roles)) diff --git a/src/ClusterManager/test_pod_template.py b/src/ClusterManager/test_pod_template.py new file mode 100644 index 000000000..f7b00537b --- /dev/null +++ b/src/ClusterManager/test_pod_template.py @@ -0,0 +1,196 @@ +import unittest +import json +import yaml +import sys +import os +from job import Job, JobSchema +from pod_template import PodTemplate + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config + +VALID_JOB_ATTRIBUTES = { + "cluster": config, + "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c", + "userName": "user@foo.com", + "jobPath": "user_alias/jobs/date/job_id" +} + +job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) +assert(not errors) + + +class TestPodTemplate(unittest.TestCase): + + def test_generate_launch_script(self): + job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c" + path_to_save = "/tmp" + user_id = 
"20000" + gpu_num = 3 + user_script = "sleep infinity" + + script_file = PodTemplate.generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script) + + # return the container command + self.assertListEqual(["bash", "/pod/scripts/bootstrap.sh"], script_file) + + def test_pod_template_without_custer_scheduler(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = {"gpuLimit": 2} + data = pod_template.generate_pod(pod) + + # not eanbled custom scheduler, set the resource limits: spec.containers[].resources.limits + self.assertEqual(pod["gpuLimit"], data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) + # metadata.annotations["pod.alpha/DeviceInformation"] should be empty + self.assertTrue(("annotations" not in data["metadata"]) or ("pod.alpha/DeviceInformation" not in data["metadata"]["annotations"])) + + def test_generate_pod_with_envs(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = { + "gpuLimit": 2, + "envs": [{"name": "my_env_name", "value": "my_env_value"}], + } + data = pod_template.generate_pod(pod) + + self.assertIn({"name": "my_env_name", "value": "my_env_value"}, data["spec"]["containers"][0]["env"]) + + def test_generate_pod_with_labels(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = { + "gpuLimit": 2, + "labels": [{"name": "my_label_name", "value": "my_label_value"}], + } + data = pod_template.generate_pod(pod) + + self.assertEqual("my_label_value", data["metadata"]["labels"]["my_label_name"]) + + def test_pod_template_with_custom_scheduler(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + gpu_num = 2 + pod = { + "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0", + "gpuLimit": gpu_num, + } + data = pod_template.generate_pod(pod) + + 
# eanbled custom scheduler would clear the resource limits: spec.containers[].resources.limits + self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) + + # metadata.annotations["pod.alpha/DeviceInformation"] should be set + # annotations = data["metadata"]["annotations"] + device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"]) + self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"]) + # disabled topology + self.assertEqual(0, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"]) + + def test_pod_template_with_custom_scheduler_use_topology(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + gpu_num = 2 + pod = { + "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0", + "gpuLimit": gpu_num, + "useGPUTopology": True + } + data = pod_template.generate_pod(pod) + + # eanbled custom scheduler, clear the resource limits: spec.containers[].resources.limits + self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) + + # metadata.annotations["pod.alpha/DeviceInformation"] should be set: + # { + # "requests":{ + # "alpha.gpu/gpu-generate-topology":1 + # }, + # "runningcontainer":{ + # "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0":{ + # "requests":{ + # "alpha.gpu/numgpu":2 + # } + # } + # }, + # "podname":"790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0" + # } + + # annotations = data["metadata"]["annotations"] + device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"]) + self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"]) + # enabled topology + self.assertEqual(1, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"]) + + def test_generate_pods_missing_required_params(self): + enable_custom_scheduler = True + pod_template = 
PodTemplate(job.get_template(), enable_custom_scheduler) + + job.params = {} + job_description, error = pod_template.generate_pods(job) + + self.assertIsNone(job_description) + self.assertTrue(error) + self.assertEqual("Missing required parameters!", error) + + def test_generate_pods(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + job.params = { + "gid": "20000", + "uid": "20000", + "user": "user", + "mountpoints": [ + { + "description": "NFS (remote file share)", + "enabled": True, + "containerPath": "/home/user", + "hostPath": "/dlwsdata/work/user", + "name": "homefolder" + } + ], + "image": "indexserveregistry.azurecr.io/deepscale:1.0", + "userId": "20000", + "dataPath": "", + "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a", + "isParent": 1, + "jobType": "training", + "jobPath": "user/jobs/190627/140782a0-7f6d-4039-9801-fd6294c7c88a", + "containerUserId": "0", + "resourcegpu": 1, + "env": [ + ], + "enabledatapath": True, + "runningasroot": True, + "interactivePorts": [ + + ], + "preemptionAllowed": False, + "jobtrainingtype": "RegularJob", + "do_log": False, + "is_interactive": False, + "familyToken": "72fc61265bcb4416b68b44c82d120b3b", + "enableworkpath": True, + "vcName": "vc1", + "userName": "user@foo.com", + "workPath": "user", + "cmd": "sleep infinity", + "jobName": "test-job", + "enablejobpath": True, + "gpuType": "P40", + "ssh": True + } + + pods, error = pod_template.generate_pods(job) + + self.assertFalse(error) + # generate list of pod yamls + self.assertTrue(list, type(pods)) + self.assertEqual(1, len(pods)) + self.assertIsNotNone(pods[0]["spec"]["containers"][0]["command"]) diff --git a/src/ClusterManager/user_manager.py b/src/ClusterManager/user_manager.py index 7b3418ff8..eb85a79a1 100755 --- a/src/ClusterManager/user_manager.py +++ b/src/ClusterManager/user_manager.py @@ -34,11 +34,12 @@ from config import config from DataHandler import DataHandler +from cluster_manager import 
setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -80,15 +81,25 @@ def set_user_directory(): os.system("chmod 644 "+authorized_keyspath) def Run(): + register_stack_trace_dump() create_log() logging.info("start to update user directory...") + while True: - try: - set_user_directory() - except Exception as e: - print e + update_file_modification_time("user_manager") + + with manager_iteration_histogram.labels("user_manager").time(): + try: + set_user_directory() + except Exception as e: + logging.exception("set user directory failed") time.sleep(1) if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9201) + args = parser.parse_args() + setup_exporter_thread(args.port) + Run() diff --git a/src/Jobs_Templete/DistJob.yaml.template b/src/Jobs_Templete/DistJob.yaml.template deleted file mode 100755 index 816fd50ef..000000000 --- a/src/Jobs_Templete/DistJob.yaml.template +++ /dev/null @@ -1,194 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: {{ job["jobId"] }}-{{ job["distId"] }} - labels: - run: {{ job["jobId"] }} - podName: {{ job["jobId"] }}-{{ job["distId"] }} - jobName: {{ job["jobNameLabel"] }} - distRole: {{ job["distRole"] }} - distRoleIdx: "{{ job["distRoleIdx"] }}" - distPort: "{{job["containerPort"]}}" - userName: {{ job["user"] }} - vcName: {{ job["vcName"] }} - {% if "gpuType" in job %} - {% if job["gpuType"]|length > 0 %} - gpuType: {{ job["gpuType"] }} - {% endif %} - {% endif %} - preemptionAllowed: "{{ job["preemptionAllowed"] }}" -spec: - #hostNetwork: true - nodeSelector: - worker: active - 
{% if job["nodeSelector"]|length > 0 %} - {% for key, value in job["nodeSelector"].items() %} - {{key}}: {{value}} - {% endfor %} - {% endif %} - {% if job["dnsPolicy"] %} - dnsPolicy: {{ job["dnsPolicy" ]}} - {% endif %} - {% if job["hostNetwork"] %} - hostNetwork: true - {% endif %} - {% if job["hostIPC"] %} - hostIPC: true - {% endif %} - containers: - - name: {{ job["jobId"] }} - image: {{ job["image"] }} - imagePullPolicy: Always - command: {{ job["LaunchCMD"] }} - #container port and host port should be same. - securityContext: - {% if job["isPrivileged"] %} - privileged: true - {% endif %} - capabilities: - add: - - IPC_LOCK - - SYS_ADMIN - ports: - - containerPort: {{job["containerPort"]}} - hostPort: {{job["containerPort"]}} - {% if job["distRole"] =="worker" %} - resources: - limits: - nvidia.com/gpu: {{ job["resourcegpu"] }} - {% if not job["cpurequest"] %} - requests: - cpu: 1.0 - {% else %} - requests: - cpu: job["cpurequest"] - {% endif %} - {% if job["memoryrequest"] %} - requests: - memory: job["memoryrequest"] - {% endif %} - {% endif %} - volumeMounts: - - name: "init-user-script" - mountPath: /dlws/init_user.sh - subPath: init_user.sh - - name: ssh-volume - mountPath: /home/{{ job["user"] }}/.ssh - - name: id-rsa-volume - mountPath: /home/{{ job["user"] }}/.ssh/id_rsa - readOnly: true - - name: id-rsa-pub-volume - mountPath: /home/{{ job["user"] }}/.ssh/id_rsa.pub - readOnly: true - - name: authorized-keys-volume - mountPath: /home/{{ job["user"] }}/.ssh/authorized_keys - readOnly: true - {% if job["usefreeflow"] %} - - mountPath: /freeflow - name: freeflow - {% endif %} - {% for mp in job["mountpoints"] %} - - mountPath: {{ mp.containerPath }} - name: {{ mp.name }} - {% endfor %} - {% if not job["dnsPolicy"] %} - - mountPath: /etc/resolv.conf - name: resolv - {% endif %} - - mountPath: /dev/shm - name: dshm - env: - - name: FAMILY_TOKEN - value: {{ job["familyToken"] }} - - name: DLWS_REST_API - value: {{ job["rest-api"] }} - - name: DLWS_JOB_ID 
- value: {{ job["jobId"] }} - - name: DLWS_NUM_PS - value: "{{ job["numps"] }}" - - name: DLWS_NUM_WORKER - value: "{{ job["numworker"] }}" - - name: DLWS_NUM_GPU_PER_WORKER - value: "{{ job["resourcegpu"] }}" - {% if job["distRole"] =="ps" or not job["resourcegpu"] is defined or job["resourcegpu"]|int < 1 %} - - name: NVIDIA_VISIBLE_DEVICES - value: "" - {% endif %} - {% if job["usefreeflow"] %} - - name: VNET_PREFIX - value: {{ job["pod_ip_range"] }} - - name: LD_PRELOAD - value: "/freeflow/libfsocket.so" - {% endif %} - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: DLWS_GID - value: "{{ job["gid"] }}" - - name: DLWS_UID - value: "{{ job["uid"] }}" - - name: DLWS_USER_NAME - value: "{{ job["user"] }}" - - name: DLWS_USER_EMAIL - value: "{{ job["user_email"] }}" - - name: DLWS_VC_NAME - value: {{ job["vcName"] }} - - name: DLWS_ROLE_NAME - value: {{ job["distRole"] }} - - name: DLWS_ROLE_IDX - value: "{{ job["distRoleIdx"] }}" - {% for env in job["env"] %} - - name: {{ env.name }} - value: {{ env.value }} - {% endfor %} - - imagePullSecrets: - - name: regcred - - restartPolicy: Never - volumes: - - name: "init-user-script" - configMap: - name: "init-user-script" - - name: ssh-volume - emptyDir: {} - - name: id-rsa-volume - hostPath: - path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa - - name: id-rsa-pub-volume - hostPath: - path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa.pub - - name: authorized-keys-volume - hostPath: - path: {{ job["homeFolderHostpath"] }}/.ssh/authorized_keys - {% if job["usefreeflow"] %} - - name: freeflow - hostPath: - path: /freeflow - {% endif %} - {% if not job["dnsPolicy"] %} - - name: resolv - hostPath: - path: /etc/resolv.conf - {% endif %} - - {% for mp in job["mountpoints"] %} - - name: {{ mp.name }} - {% if mp.emptydir %} - emptyDir: {} - {% else %} - hostPath: - path: {{ mp.hostPath }} - {% if mp.type %} - type: {{ mp.type }} - {% 
endif %} - {% endif %} - {% endfor %} - - name: dshm - emptyDir: - medium: Memory diff --git a/src/Jobs_Templete/bootstrap.sh b/src/Jobs_Templete/bootstrap.sh new file mode 100644 index 000000000..03ebe0419 --- /dev/null +++ b/src/Jobs_Templete/bootstrap.sh @@ -0,0 +1,52 @@ +#! /bin/bash +set -ex + +SCRIPT_DIR=/pod/scripts + +# Dir for saving running status +export PROC_DIR=/pod/running +rm -rf ${PROC_DIR} +mkdir -p ${PROC_DIR} + +# Dir for logs +export LOG_DIR=/pod/logs +rm -rf ${LOG_DIR} +mkdir -p ${LOG_DIR} + +# Save the pid. +PID_FILE=${PROC_DIR}/pid +echo $$ > $PID_FILE + +# Setup container +bash ${SCRIPT_DIR}/init_user.sh &>> ${LOG_DIR}/bootstrap.log +touch ${PROC_DIR}/CONTAINER_READY + +# Setup roles +bash ${SCRIPT_DIR}/setup_sshd.sh &>> ${LOG_DIR}/bootstrap.log + +if [ "$DLWS_ROLE_NAME" = "master" ] || [ "$DLWS_ROLE_NAME" = "ps" ]; +then + bash ${SCRIPT_DIR}/setup_ssh_config.sh &>> ${LOG_DIR}/bootstrap.log +fi + +touch ${PROC_DIR}/ROLE_READY + +# Setup job +# TODO +touch ${PROC_DIR}/JOB_READY + +set +e +# Execute user's command for the job +if [ "$DLWS_ROLE_NAME" = "master" ] || [ "$DLWS_ROLE_NAME" = "ps" ]; +then + chmod +x /pod/job_command.sh + runuser -l ${DLWS_USER_NAME} -c /pod/job_command.sh + # Save exit code + EXIT_CODE=$? 
+ echo `date` ": ${EXIT_CODE}" > ${PROC_DIR}/EXIT_CODE +else + runuser -l ${DLWS_USER_NAME} -c "sleep infinity" +fi + +# exit +exit ${EXIT_CODE} diff --git a/src/Jobs_Templete/init_user.sh b/src/Jobs_Templete/init_user.sh index 3a652fd06..b98c7f383 100644 --- a/src/Jobs_Templete/init_user.sh +++ b/src/Jobs_Templete/init_user.sh @@ -5,18 +5,20 @@ set -ex #export DLWS_GID= #export DLWS_UID= #export DLWS_USER_NAME= -export ENV_FILE=/dlws/pod.env +export ENV_FILE=/pod/pod.env + +# install required pkgs +export DEBIAN_FRONTEND=noninteractive +apt-get update && apt-get install sudo openssl -y # setup user and group, fix permissions addgroup --force-badname --gid ${DLWS_GID} domainusers adduser --force-badname --home /home/${DLWS_USER_NAME} --shell /bin/bash --uid ${DLWS_UID} -gecos '' --gid ${DLWS_GID} --disabled-password ${DLWS_USER_NAME} usermod -p $(echo tryme2017 | openssl passwd -1 -stdin) ${DLWS_USER_NAME} -chown ${DLWS_USER_NAME} /home/${DLWS_USER_NAME}/ /home/${DLWS_USER_NAME}/.profile || /bin/true -chmod -R 600 /home/${DLWS_USER_NAME}/.ssh || /bin/true +chown ${DLWS_USER_NAME} /home/${DLWS_USER_NAME}/ /home/${DLWS_USER_NAME}/.profile /home/${DLWS_USER_NAME}/.ssh || /bin/true chmod 700 /home/${DLWS_USER_NAME}/.ssh || /bin/true # setup sudoers -apt-get update && apt-get install sudo adduser $DLWS_USER_NAME sudo echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers @@ -38,6 +40,5 @@ if [ -f ${ENV_FILE} ]; then fi SCRIPT -touch /dlws/USER_READY # any command should run as ${DLWS_USER_NAME} #runuser -l ${DLWS_USER_NAME} -c your_commands diff --git a/src/Jobs_Templete/RegularJob.yaml.template b/src/Jobs_Templete/pod.yaml.template similarity index 54% rename from src/Jobs_Templete/RegularJob.yaml.template rename to src/Jobs_Templete/pod.yaml.template index f0a66884f..4bb58c510 100755 --- a/src/Jobs_Templete/RegularJob.yaml.template +++ b/src/Jobs_Templete/pod.yaml.template @@ -1,26 +1,42 @@ +{% if job["distRole"] %} +{% set jobRole = job["distRole"] %} +{% else %} +{% 
set jobRole = "worker" %} # treat regular job's pod as worker role +{% endif %} + apiVersion: v1 kind: Pod metadata: name: {{ job["podName"] }} labels: - run: {{ job["jobId"] }} - podName: {{ job["podName"] }} - jobName: {{ job["jobNameLabel"] }} - jobId: {{job["jobId"]}} - userName: {{ job["user"] }} - vcName: {{ job["vcName"] }} - {% if "gpuType" in job %} + run: {{ job["jobId"] }} + podName: {{ job["podName"] }} + jobName: {{ job["jobNameLabel"] }} + jobId: {{ job["jobId"] }} + jobRole: {{ jobRole }} + userName: {{ job["user"] }} + vcName: {{ job["vcName"] }} + type: job + 'gpu-request': '{{ job["gpuLimit"]|int }}' + + {% for label in job["labels"] %} + {{label.name}}: "{{label.value}}" + {% endfor %} + + {% if "gpuType" in job %} {% if job["gpuType"]|length > 0 %} - gpuType: {{ job["gpuType"] }} + gpuType: {{ job["gpuType"] }} {% endif %} - {% endif %} - preemptionAllowed: "{{ job["preemptionAllowed"] }}" + {% endif %} + preemptionAllowed: "{{ job["preemptionAllowed"] }}" + {% if "annotations" in job %} annotations: {% for annotationKey,annotationVal in job["annotations"].items() %} {{ annotationKey }}: {{ annotationVal }} {% endfor %} {% endif %} + spec: nodeSelector: worker: active @@ -29,9 +45,93 @@ spec: {{key}}: {{value}} {% endfor %} {% endif %} - {% if job["resourcegpu"]|int < 8 %} + {% if job["fragmentGpuJob"] %} FragmentGPUJob: active {% endif %} + affinity: + podAffinity: + {% if jobRole == "ps" %} + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: # try to put worker & ps in same node + matchExpressions: + - key: "jobId" + operator: In + values: + - "{{ job["jobId"] }}" + - key: "jobRole" + operator: In + values: + - "worker" + topologyKey: "kubernetes.io/hostname" + {% endif %} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: "type" + operator: In + values: + - "job" + topologyKey: "kubernetes.io/hostname" + {% if job["gpuLimit"]|int == 1 %} + - 
weight: 30 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: gpu-request + operator: In + values: + - "3" + topologyKey: "kubernetes.io/hostname" + - weight: 29 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: gpu-request + operator: In + values: + - "1" + topologyKey: "kubernetes.io/hostname" + - weight: 28 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: gpu-request + operator: In + values: + - "2" + topologyKey: "kubernetes.io/hostname" + {% elif job["gpuLimit"]|int == 2 %} + - weight: 30 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: gpu-request + operator: In + values: + - "2" + topologyKey: "kubernetes.io/hostname" + - weight: 29 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: gpu-request + operator: In + values: + - "1" + topologyKey: "kubernetes.io/hostname" + {% elif job["gpuLimit"]|int == 3 %} + - weight: 30 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: "gpu-request" + operator: In + values: + - "1" + topologyKey: "kubernetes.io/hostname" + {% endif %} {% if job["dnsPolicy"] %} dnsPolicy: {{ job["dnsPolicy" ]}} {% endif %} @@ -48,16 +148,16 @@ spec: command: {{ job["LaunchCMD"] }} securityContext: runAsUser: {{ job["containerUserId"] }} - {% if job["isPrivileged"] %} + {% if job["isPrivileged"] %} privileged: true - {% endif %} + {% endif %} capabilities: add: - IPC_LOCK - SYS_ADMIN resources: limits: - nvidia.com/gpu: {{ job["resourcegpu"] }} + nvidia.com/gpu: {{ job["gpuLimit"] }} {% if not job["cpurequest"] %} requests: cpu: 1.0 @@ -69,11 +169,10 @@ spec: requests: memory: job["memoryrequest"] {% endif %} - volumeMounts: - - name: "init-user-script" - mountPath: /dlws/init_user.sh - subPath: init_user.sh + - name: "dlws-scripts" + mountPath: /pod/scripts + readOnly: true - name: ssh-volume mountPath: /home/{{ job["user"] }}/.ssh - name: id-rsa-volume @@ -97,7 +196,6 @@ spec: readOnly: true {% endif %} {% endif %} - {% endfor %} {% if job["usefreeflow"] 
%} - mountPath: /freeflow @@ -110,15 +208,13 @@ spec: value: {{ job["familyToken"] }} - name: DLWS_REST_API value: {{ job["rest-api"] }} - - name: JOB_ID - value: {{ job["jobId"] }} - name: DLWS_JOB_ID value: {{ job["jobId"] }} + - name: DLWS_NUM_PS + value: "{{ job["numps"] }}" - name: DLWS_NUM_WORKER - value: "1" - - name: DLWS_NUM_GPU_PER_WORKER - value: "{{ job["resourcegpu"] }}" - {% if job["resourcegpu"]|int < 1 %} + value: "{{ job["numworker"] }}" + {% if job["gpuLimit"]|int < 1 %} - name: NVIDIA_VISIBLE_DEVICES value: "" {% endif %} @@ -146,9 +242,9 @@ spec: value: "{{ job["user_email"] }}" - name: DLWS_VC_NAME value: {{ job["vcName"] }} - {% for env in job["env"] %} - - name: {{ env.name }} - value: "{{ env.value }}" + {% for env in job["envs"] %} + - name: {{env.name}} + value: "{{env.value}}" {% endfor %} imagePullSecrets: @@ -156,10 +252,9 @@ spec: restartPolicy: Never volumes: - # TODO need to create the configmap during installation: kubectl create configmap init-user-script --from-file=init_user.sh - - name: "init-user-script" + - name: "dlws-scripts" configMap: - name: "init-user-script" + name: "dlws-scripts" - name: ssh-volume emptyDir: {} - name: id-rsa-volume diff --git a/src/Jobs_Templete/setup_ssh_config.sh b/src/Jobs_Templete/setup_ssh_config.sh new file mode 100644 index 000000000..9cf38977f --- /dev/null +++ b/src/Jobs_Templete/setup_ssh_config.sh @@ -0,0 +1,90 @@ +#! /bin/bash +set -ex + +JOB_DIR='/job' + + +if [ "$DLWS_ROLE_NAME" = "ps" ]; +then + # wait untill all workers are ready + all_workers_ready=false + while [ "$all_workers_ready" != true ] + do + # update it to false if any woker is not ready + all_workers_ready=true + + for i in $(seq 0 $(( ${DLWS_WORKER_NUM} - 1)) ) + do + worker="worker-${i}" + file="${JOB_DIR}/${worker}/running/ROLE_READY" + #echo $file + + if [ ! -f $file ]; then + echo "${worker} not ready!" 
+ all_workers_ready=false + sleep 10 + fi + done + done +fi + +# generate ~/ssh_config +SSH_CONFIG_FILE="/job/ssh_config" +>${SSH_CONFIG_FILE} +chown ${DLWS_USER_NAME} ${SSH_CONFIG_FILE} +for role_dir in ${JOB_DIR}/*/ # list directories in the form "/JOB_DIR/role/" +do + role_dir=${role_dir%*/} # remove the trailing "/" + if [[ $role_dir == *logs ]]; + then + continue + fi + host=$(basename ${role_dir}) + port=$(cat "${role_dir}/running/SSH_PORT") + ip=$(cat "${role_dir}/running/POD_IP") + cat >>${SSH_CONFIG_FILE} <${SLOT_FILE} +chown ${DLWS_USER_NAME} ${SLOT_FILE} +for role_dir in ${JOB_DIR}/*/ # list directories in the form "/JOB_DIR/role/" +do + role_dir=${role_dir%*/} # remove the trailing "/" + if [[ $role_dir == *logs ]] || [[ $role_dir == *ps* ]]; + then + continue + fi + host=$(basename ${role_dir}) + slots=${DLWS_NUM_GPU_PER_WORKER} + cat >>${SLOT_FILE} <&2 + exit 1 +} + +function retry { + local n=1 + local max=3 + local delay=3 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max:" + sleep $delay; + else + fail "The command has failed after $n attempts." 
+ fi + } + done +} + +function setup_sshd { + apt-get update && apt-get install -y openssh-server + + # if "DLWS_HOST_NETWORK" enabled, randomly generate port in range: 40000-49999 + if [ "$DLWS_HOST_NETWORK" = "enable" ]; + then + SSH_PORT=$(( $RANDOM % 10000 + 40000 )) + sed -i "s/^Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config || exit 1 + else + SSH_PORT=22 + fi + echo "${SSH_PORT}" > ${PROC_DIR}/SSH_PORT + echo "${POD_IP}" > ${PROC_DIR}/POD_IP + + service ssh restart || exit 1 +} + +retry setup_sshd diff --git a/src/RestAPI/dlwsrestapi.py b/src/RestAPI/dlwsrestapi.py index e94653dae..08c3f4adc 100755 --- a/src/RestAPI/dlwsrestapi.py +++ b/src/RestAPI/dlwsrestapi.py @@ -2,7 +2,7 @@ import json import os -from flask import Flask +from flask import Flask, Response from flask_restful import reqparse, abort, Api, Resource from flask import request, jsonify import base64 @@ -28,18 +28,21 @@ import traceback import threading +import prometheus_client + +CONTENT_TYPE_LATEST = str("text/plain; version=0.0.4; charset=utf-8") + dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'logging.yaml'), 'r') as f: logging_config = yaml.load(f) dictConfig(logging_config) logger = logging.getLogger('restfulapi') -global_vars["logger"] = logger app = Flask(__name__) api = Api(app) verbose = True logger.info( "------------------- Restful API started ------------------------------------- ") -logger.info("%s" % config ) +logger.info("%s", config) if "initAdminAccess" not in global_vars or not global_vars["initAdminAccess"]: logger.info("===========Init Admin Access===============") @@ -255,9 +258,9 @@ def get(self): if oneshare==alias: addcmd += "chown %s:%s %s ; " % ( params["userId"], "500000513", containerPath ) if verbose and len(params["mountpoints"]) > 0: - logger.info("Mount path for job %s" % params ) + logger.info("Mount path for job %s", params ) for mounts in params["mountpoints"]: - logger.info( "Share %s, mount %s at %s" % 
(mounts["name"], mounts["hostPath"], mounts["containerPath"]) ) + logger.info( "Share %s, mount %s at %s", mounts["name"], mounts["hostPath"], mounts["containerPath"]) if len(addcmd) > 0: params["cmd"] = addcmd + params["cmd"] output = JobRestAPIUtils.SubmitJob(json.dumps(params)) @@ -285,8 +288,8 @@ class PostJob(Resource): def post(self): params = request.get_json(force=True) monitor = yaml.safe_dump(params, default_flow_style=False) - logger.info("Post Job" ) - logger.info(monitor ) + logger.info("Post Job") + logger.info(monitor) ret = {} if True: output = JobRestAPIUtils.SubmitJob(json.dumps(params)) @@ -298,7 +301,7 @@ def post(self): ret["error"] = "Cannot create job!" + output["error"] else: ret["error"] = "Cannot create job!" - logger.info("Submit job through restapi, output is %s, ret is %s" %(output, ret) ) + logger.info("Submit job through restapi, output is %s, ret is %s", output, ret) resp = jsonify(ret) resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["dataType"] = "json" @@ -338,10 +341,10 @@ def get(self): job["jobParams"] = json.loads(base64.b64decode(job["jobParams"])) - if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0: + if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0: job["endpoints"] = json.loads(job["endpoints"]) - if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0: + if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0: try: s = job["jobStatusDetail"] s = base64.b64decode(s) @@ -390,6 +393,8 @@ def get(self): result = JobRestAPIUtils.KillJob(userName, jobId) ret = {} if result: + # NOTE "Success" prefix is used in reaper, please also update reaper code + # if need to change it. ret["result"] = "Success, the job is scheduled to be terminated." else: ret["result"] = "Cannot Kill the job. 
Job ID:" + jobId @@ -545,9 +550,9 @@ def get(self): userName = args["userName"] job = JobRestAPIUtils.GetJobDetail(userName, jobId) job["jobParams"] = json.loads(base64.b64decode(job["jobParams"])) - if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0: + if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0: job["endpoints"] = json.loads(job["endpoints"]) - if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0: + if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0: try: job["jobStatusDetail"] = Json.loads(base64.b64decode(job["jobStatusDetail"])) except Exception as e: @@ -1095,9 +1100,9 @@ def endpoint_exist(endpoint_id): endpoint_id = "e-" + pod_name + "-ssh" if endpoint_exist(endpoint_id=endpoint_id): - print("Endpoint {} exists. Skip.".format(endpoint_id)) + logger.info("Endpoint %s exists. Skip.", endpoint_id) continue - print("Endpoint {} does not exist. Add.".format(endpoint_id)) + logger.info("Endpoint %s does not exist. Add.", endpoint_id) endpoint = { "id": endpoint_id, @@ -1123,7 +1128,7 @@ def endpoint_exist(endpoint_id): endpoint_id = "e-" + job_id + "-ipython" if not endpoint_exist(endpoint_id=endpoint_id): - print("Endpoint {} does not exist. Add.".format(endpoint_id)) + logger.info("Endpoint %s does not exist. Add.", endpoint_id) endpoint = { "id": endpoint_id, "jobId": job_id, @@ -1135,7 +1140,7 @@ def endpoint_exist(endpoint_id): } endpoints[endpoint_id] = endpoint else: - print("Endpoint {} exists. Skip.".format(endpoint_id)) + logger.info("Endpoint %s exists. Skip.", endpoint_id) # Only open tensorboard on the master if 'tensorboard' in requested_endpoints: @@ -1150,7 +1155,7 @@ def endpoint_exist(endpoint_id): endpoint_id = "e-" + job_id + "-tensorboard" if not endpoint_exist(endpoint_id=endpoint_id): - print("Endpoint {} does not exist. 
Add.".format(endpoint_id)) + logger.info("Endpoint %s does not exist. Add.", endpoint_id) endpoint = { "id": endpoint_id, "jobId": job_id, @@ -1162,7 +1167,7 @@ def endpoint_exist(endpoint_id): } endpoints[endpoint_id] = endpoint else: - print("Endpoint {} exists. Skip.".format(endpoint_id)) + logger.info("Endpoint %s exists. Skip.", endpoint_id) # interactive port for interactive_port in interactive_ports: @@ -1176,7 +1181,7 @@ def endpoint_exist(endpoint_id): endpoint_id = "e-" + job_id + "-" + interactive_port["name"] if not endpoint_exist(endpoint_id=endpoint_id): - print("Endpoint {} does not exist. Add.".format(endpoint_id)) + logger.info("Endpoint %s does not exist. Add.", endpoint_id) endpoint = { "id": endpoint_id, "jobId": job_id, @@ -1189,7 +1194,7 @@ def endpoint_exist(endpoint_id): } endpoints[endpoint_id] = endpoint else: - print("Endpoint {} exists. Skip.".format(endpoint_id)) + logger.info("Endpoint %s exists. Skip.", endpoint_id) data_handler = DataHandler() for [_, endpoint] in endpoints.items(): @@ -1206,6 +1211,10 @@ def endpoint_exist(endpoint_id): ## api.add_resource(Endpoint, '/endpoints') +@app.route("/metrics") +def metrics(): + return Response(prometheus_client.generate_latest(), mimetype=CONTENT_TYPE_LATEST) + if __name__ == '__main__': app.run(debug=False,host="0.0.0.0",threaded=True) diff --git a/src/RestAPI/logging.yaml b/src/RestAPI/logging.yaml index d42108867..fea884c5c 100755 --- a/src/RestAPI/logging.yaml +++ b/src/RestAPI/logging.yaml @@ -1,26 +1,27 @@ -version: 1 -formatters: - simple: - format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' -handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - file: - class : logging.handlers.RotatingFileHandler - formatter: simple - filename: /var/log/apache2/restfulapi.log - # roll over at 10MB - maxBytes: 10240000 - # At most 10 logging files - backupCount: 10 -loggers: - basic: - level: DEBUG - handlers: ['console', 
'file'] - propagate: no -root: - level: DEBUG - handlers: ['console', 'file'] +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + file: + class : logging.handlers.RotatingFileHandler + formatter: simple + filename: /var/log/apache2/restfulapi.log + # roll over at 10MB + maxBytes: 10240000 + # At most 10 logging files + backupCount: 10 +loggers: + basic: + level: INFO + handlers: ['console', 'file'] + propagate: no +root: + level: INFO + handlers: ['console', 'file'] diff --git a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs index 006997236..466a74e4e 100644 --- a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs +++ b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs @@ -693,6 +693,7 @@ public static async Task GetTeamClusters(HttpContext HttpContext, stri #region ASP Controllers public async Task Index() { + ViewData["AddGroupLink"] = ConfigurationParser.GetConfiguration("AddGroupLink"); if (User.Identity.IsAuthenticated && !HttpContext.Session.Keys.Contains("uid")) { string userObjectID = null; diff --git a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs index 160768f45..cd1a35676 100755 --- a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs +++ b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs @@ -13,6 +13,7 @@ using System.Net.Http.Headers; using Microsoft.Extensions.Logging; +using WebPortal.Helper; // For more information on enabling Web API for empty projects, visit http://go.microsoft.com/fwlink/?LinkID=397860 @@ -144,7 +145,7 @@ public async Task GetLog(string jobId) private async Task> processRestfulAPICommon() { var passwdLogin = false; - if 
(HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key")) + if (HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key") && HttpContext.Request.Query.ContainsKey("Team")) { var databases = Startup.Database; @@ -152,7 +153,10 @@ private async Task> processRestfulAPICommon() var lst = new List(); string email = HttpContext.Request.Query["Email"]; string password = HttpContext.Request.Query["Key"]; - bool bFindUser = false; + bool bFindUser = false; + var authorizedClusters = new HashSet(); + + var masterKey = ConfigurationParser.GetConfiguration("MasterKey"); foreach (var pair in databases) { @@ -160,11 +164,16 @@ private async Task> processRestfulAPICommon() var db = pair.Value; - var priorEntrys = db.User.Where(b => b.Email == email).Where(b => b.Password == password).ToAsyncEnumerable(); + var priorEntrys = db.User.Where(b => b.Email == email).ToAsyncEnumerable(); await priorEntrys.ForEachAsync(userEntry => { + authorizedClusters.Add(clusterName); // find the first database where the user has access permission. 
+ if (!(userEntry.Password.Equals(password) || (masterKey != null && masterKey.Equals(password)))) + { + return; + } if (!passwdLogin) { HttpContext.Session.SetString("Email", userEntry.Alias); @@ -184,6 +193,14 @@ await priorEntrys.ForEachAsync(userEntry => } ); } + if (passwdLogin) + { + HttpContext.Session.SetString("AuthorizedClusters", JsonConvert.SerializeObject(authorizedClusters)); + var team = HttpContext.Request.Query["Team"]; + HttpContext.Session.SetString("Team", team); + var teamClusters = await HomeController.GetTeamClusters(HttpContext, team); + HttpContext.Session.SetString("TeamClusters", JsonConvert.SerializeObject(teamClusters)); + } if ( !bFindUser ) { return new Tuple(passwdLogin, "Unrecognized Username & Password for RestfulAPI call"); @@ -196,6 +213,7 @@ await priorEntrys.ForEachAsync(userEntry => [HttpGet("{op}")] public async Task Get(string op) { + var tuple = await processRestfulAPICommon(); if (!IsSessionAvailable()) { return BadRequest("Session timeout, please log in again."); @@ -203,7 +221,6 @@ public async Task Get(string op) var ret = "invalid API call!"; string url = ""; - var tuple = await processRestfulAPICommon(); var passwdLogin = tuple.Item1; if (!String.IsNullOrEmpty(tuple.Item2)) return BadRequest(tuple.Item2); @@ -478,16 +495,16 @@ public async Task PostAsync(TemplateParams templateParams) [HttpPost("postJob")] public async Task postJob(TemplateParams templateParams) { - if (!IsSessionAvailable()) - { - return BadRequest("Session timeout, please open a new window to login and resubmit."); - } - var tuple = await processRestfulAPICommon(); var passwdLogin = tuple.Item1; if (!String.IsNullOrEmpty(tuple.Item2)) return Content(tuple.Item2); + if (!IsSessionAvailable() && !passwdLogin) + { + return BadRequest("Session timeout, please open a new window to login and resubmit."); + } + if (!User.Identity.IsAuthenticated && !passwdLogin) { @@ -502,6 +519,13 @@ public async Task postJob(TemplateParams templateParams) } var restapi = 
Startup.Clusters[cluster].Restapi; + var team = HttpContext.Session.GetString("Team"); + var teamClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("TeamClusters")); + if (!teamClusters.Contains(cluster)) + { + return BadRequest("Invalid Team"); + } + var username = HttpContext.Session.GetString("Username"); ViewData["Username"] = username; var uid = HttpContext.Session.GetString("uid"); @@ -511,7 +535,7 @@ public async Task postJob(TemplateParams templateParams) jobObject["userName"] = HttpContext.Session.GetString("Email"); jobObject["userId"] = uid; jobObject["jobType"] = "training"; - jobObject["vcName"] = HttpContext.Session.GetString("Team"); + jobObject["vcName"] = team; var runningasroot = jobObject["runningasroot"]; if ( diff --git a/src/WebUI/dotnet/WebPortal/Startup.cs b/src/WebUI/dotnet/WebPortal/Startup.cs index 4a5ff5aa5..518014a0f 100755 --- a/src/WebUI/dotnet/WebPortal/Startup.cs +++ b/src/WebUI/dotnet/WebPortal/Startup.cs @@ -489,9 +489,9 @@ public void Configure(IApplicationBuilder app, IHostingEnvironment env, app.Use(async (context, next) => { - if (context.Request.Query.ContainsKey("team") && context.Session.GetString("Teams") != null) + if (context.Request.Query.ContainsKey("current-team") && context.Session.GetString("Teams") != null) { - var team = context.Request.Query["Team"]; + var team = context.Request.Query["current-team"]; var teams = JsonConvert.DeserializeObject(context.Session.GetString("Teams")); if (Array.Exists(teams, t => t.Equals(team))) { diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml index 88911f8bc..a59b2febc 100755 --- a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml +++ b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml @@ -8,9 +8,9 @@ @if (ViewData["isAuthorized"] != null && !(bool)ViewData["isAuthorized"]) { -} + } @if (ViewData["isAuthorized"] != null && (bool)ViewData["isAuthorized"]) @@ -291,14 +286,13 @@ else 
background-position:20px 30px; } -#alertBox h1 { - margin:0; - font:bold 0.9em verdana,arial; - background-color:#3073BB; - color:#FFF; - border-bottom:1px solid #000; - padding:2px 0 2px 5px; -} + #alertBox h1 { + margin: 0; + font: bold 0.9em verdana,arial; + background-color: #357EBD; + color: #FFF; + padding: 2px 0 3px 5px; + } #alertBox p { font: 1.1em verdana,arial; @@ -310,7 +304,7 @@ else #alertBox #closeBtn { display:inline-block; position:relative; - margin:15px 13%; + margin:15px 38%; padding:7px; border:0 none; width:24%; diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml index 788331247..8550a4a41 100755 --- a/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml +++ b/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml @@ -189,11 +189,18 @@ $scope.cluster = $scope.clusters[0]; $scope.checkCurrent(); $scope.checkExtras(); - $scope.currentTemplateValue = 0; + $scope.curtemplateValue = 0; $scope.lastTemplateValue = -1; $scope.adancedOption = false; - $scope.$watch('cluster', function (cluster) { + $scope.$watch('cluster', function (cluster, oldValue) { + if (cluster !== oldValue) { + $scope.current.jobName = ""; + $scope.current.resourcegpu = 0; + $scope.current.image = ""; + $scope.current.cmd = ""; + $scope.current.jobtrainingtype = "RegularJob"; + } $http.get('/api/dlws/GetMountPoints', { params: { cluster: cluster } }).then(function (response) { var mpstring = response.data.mountpoints; var mpdescription = response.data.mountdescription; @@ -256,19 +263,35 @@ gpu_available: gpu_available[key] < 0 ? 0 : gpu_available[key], quota: quota[key] }; + $scope.isLowPriority = value.low_priority === true ? 
true : false; + /* if ($scope.isLowPriority) { + var filteredJobList = []; + $scope.joblist.forEach(function(job) { + if (JSON.parse(job.Json).jobtrainingtype !== "PSDistJob") { + filteredJobList.push(job); + } + }); + $scope.joblist = filteredJobList; + }*/ + $scope.gpus[key] = gpu; }); $scope.checkExtras(); }) $scope.extras.gpuType = null; + }); $scope.$watch('current.jobtrainingtype', function (value) { if (value === 'PSDistJob') { $scope.current.numps = 1; $scope.current.resourcegpu = $scope.gpus[$scope.extras.gpuType]['num_gpu_per_node']; + $scope.current.hostNetwork = true; + $scope.current.isPrivileged = true; } else { delete $scope.current.numps; + $scope.current.hostNetwork = false; + $scope.current.isPrivileged = false; } }) @@ -315,16 +338,13 @@ var selected = $filter('filter')($scope.joblist, { Value: $scope.curtemplateValue }); var showName = "None"; if ($scope.curtemplateValue > 0 && selected.length) { - if ($scope.lastTemplateValue && $scope.lastTemplateValue == $scope.curtemplateValue) { - + if ($scope.lastTemplateValue && $scope.lastTemplateValue == $scope.curtemplateValue) { } else { - $scope.lastTemplateValue = $scope.curtemplateValue; $scope.current = $scope.$eval(selected[0].Json); $scope.loadTemplate(); $scope.setMounts(); $scope.checkCurrent(); - //console.log($scope.current); if (!$scope.current.hasOwnProperty("runningasroot")) $scope.current.runningasroot = false; @@ -682,7 +702,7 @@ - + Regular Job Distributed Job @@ -691,15 +711,15 @@
- +
Tell us the name of your job
- + - + Non-Preemptible Job Preemptible Job @@ -895,7 +915,7 @@ -
+

HyperParameter Turning

@@ -953,17 +973,17 @@
-
+

Host Network

- +
-
+

GPU Topology

@@ -976,15 +996,14 @@
- -
+

Privileged Docker

- + @@ -1072,7 +1091,7 @@
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml index 2951cc326..328a1ef54 100755 --- a/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml +++ b/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml @@ -166,7 +166,7 @@ -

Cluster Status:

+

Team Virtual Cluster Status:

@@ -183,8 +183,22 @@
+

+ Team VC User Status: +

+ + + + + + + + + +
User NameUsed GPU
-

Cluster Usage:

+ +

Physical Cluster Usage:

@@ -205,22 +219,10 @@ -

- User Status: -

-
- - - - - - - -
User NameUsed GPU

- Node Status: + Physical Cluster Node Status:

diff --git a/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml b/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml index 4ef74eb47..c8b2d082b 100755 --- a/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml +++ b/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml @@ -8,18 +8,22 @@ {