diff --git a/src/ClusterBootstrap/deploy.py b/src/ClusterBootstrap/deploy.py
index 4f3bee96b..d0876d765 100755
--- a/src/ClusterBootstrap/deploy.py
+++ b/src/ClusterBootstrap/deploy.py
@@ -2831,8 +2831,8 @@ def start_one_kube_service(fname):
pass
if fname == "./deploy/services/jobmanager/jobmanager.yaml":
- # recreate the configmap init-user-script
- run_kubectl( ["create configmap init-user-script --from-file=../Jobs_Templete/init_user.sh -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] )
+ # recreate the configmap dlws-scripts
+ run_kubectl( ["create configmap dlws-scripts --from-file=../Jobs_Templete/ -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] )
run_kubectl( ["create", "-f", fname ] )
diff --git a/src/ClusterBootstrap/params.py b/src/ClusterBootstrap/params.py
index fa0bfd4f8..5fafa11a6 100755
--- a/src/ClusterBootstrap/params.py
+++ b/src/ClusterBootstrap/params.py
@@ -23,7 +23,7 @@
"job-exporter": { "port": 9102 },
"node-exporter": { "port": 9100 },
"watchdog": { "port": 9101 },
- "grafana": { "port": 3000 },
+ "grafana": { "port": 3000, "prometheus-ip": "localhost" },
"alert-manager": {
"port": 9093,
"configured": False,
@@ -31,6 +31,11 @@
# If want to deploy with alert-manager, should config
# configured with True, and fill appropriate value to:
# smtp_url, smtp_from, smtp_auth_username, smtp_auth_password and receiver
+ "reaper": {
+ "dry-run": True,
+ "port": "9500",
+ "restful-url": "http://localhost:5000",
+ }
},
"mysql_port": "3306",
diff --git a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml
index caa7ae9d6..54ccf8c67 100755
--- a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml
+++ b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml
@@ -13,6 +13,9 @@ spec:
labels:
jobmanager-node: pod
app: jobmanager
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/path: "/metrics"
spec:
{% if cnf["dnsPolicy"] %}
dnsPolicy: {{cnf["dnsPolicy"]}}
@@ -39,7 +42,40 @@ spec:
- mountPath: {{cnf["storage-mount-path"]}}/jobfiles
name: dlwsdatajobfiles
- mountPath: /var/log/dlworkspace
- name: log
+ name: log
+ ports:
+ - containerPort: 9200
+ hostPort: 9200
+ name: job-mgr
+ protocol: TCP
+ - containerPort: 9201
+ hostPort: 9201
+ name: user-mgr
+ protocol: TCP
+ - containerPort: 9202
+ hostPort: 9202
+ name: node-mgr
+ protocol: TCP
+ - containerPort: 9203
+ hostPort: 9203
+ name: joblog-mgr
+ protocol: TCP
+ - containerPort: 9204
+ hostPort: 9204
+ name: cmd-mgr
+ protocol: TCP
+ - containerPort: 9205
+ hostPort: 9205
+ name: endpoint-mgr
+ protocol: TCP
+ readinessProbe:
+ failureThreshold: 3
+ initialDelaySeconds: 3
+ periodSeconds: 30
+ successThreshold: 1
+ tcpSocket:
+ port: 9200
+ timeoutSeconds: 10
volumes:
- name: certs
hostPath:
diff --git a/src/ClusterBootstrap/services/monitor/alert-manager.yaml b/src/ClusterBootstrap/services/monitor/alert-manager.yaml
index 8bc493e6d..a15534e3f 100644
--- a/src/ClusterBootstrap/services/monitor/alert-manager.yaml
+++ b/src/ClusterBootstrap/services/monitor/alert-manager.yaml
@@ -24,7 +24,7 @@ spec:
hostNetwork: true
containers:
- name: alert-manager
- image: prom/alertmanager:v0.15.1
+ image: prom/alertmanager:v0.18.0
args:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
@@ -40,6 +40,23 @@ spec:
mountPath: /alertmanager
- name: templates-volume
mountPath: /etc/alertmanager/template
+ {% if cnf["alert-manager"]["reaper"] %}
+ - name: reaper
+ image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}reaper:{{cnf["dockertag"]}}
+ command:
+ - 'python'
+ - '/reaper/main.py'
+ - '--port'
+ - '{{ cnf["alert-manager"]["reaper"]["port"] }}'
+ - '--restful_url'
+ - '{{ cnf["alert-manager"]["reaper"]["restful-url"] }}'
+ {% if cnf["alert-manager"]["reaper"]["dry-run"] %}
+ - '--dry_run'
+ {% endif %}
+ ports:
+ - name: alert-manager
+ containerPort: {{ cnf["alert-manager"]["reaper"]["port"] }}
+ {% endif %}
volumes:
- name: config-volume
configMap:
@@ -80,14 +97,30 @@ data:
receiver: alert-email
group_wait: 30s
group_interval: 5m
- group_by: [alertname]
+ group_by: [alertname, cluster]
routes:
- - receiver: task_user
+ - receiver: idle_gpu_receiver
repeat_interval: 4h
group_by: [alertname, user_email, cluster]
match_re:
- type: user_alert
+ type: idle_gpu
alertname: "zero-gpu-usage"
+ - receiver: job_state_change_receiver
+ group_by: [alertname, user_email, cluster, subject]
+ match_re:
+ type: user_alert
+ alertname: "job-state-changed"
+ - receiver: reaper
+ group_by: [alertname, user_email, job_name]
+ group_wait: 0s
+ match_re:
+ type: reaper
+ - receiver: kill_idle_job_email
+ group_by: [alertname, user_email, cluster]
+ group_wait: 0s
+ match_re:
+ type: kill_idle_job_email
+ alertname: "kill-idle-jobs-email"
receivers:
- name: "alert-email"
email_configs:
@@ -95,7 +128,7 @@ data:
html: '{{ "{{" }} template "email.html" . {{ "}}" }}'
headers:
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
- - name: "task_user"
+ - name: "idle_gpu_receiver"
email_configs:
{% if cnf["alert-manager"]["alert_users"] %}
- to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
@@ -109,4 +142,40 @@ data:
CC: '{{ alert_info["receiver"] }}'
{% endif %}
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
+ - name: "job_state_change_receiver"
+ email_configs:
+ {% if cnf["alert-manager"]["alert_users"] %}
+ - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
+ {% else %}
+ - to: '{{ alert_info["receiver"] }}'
+ {% endif %}
+ html: '{{ "{{" }} template "job_state.html" . {{ "}}" }}'
+ headers:
+ {% if cnf["alert-manager"]["alert_users"] %}
+ To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
+ CC: '{{ alert_info["receiver"] }}'
+ {% endif %}
+ subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
+ - name: "reaper"
+ {% if cnf["alert-manager"]["reaper"] %}
+ webhook_configs:
+ - send_resolved: False
+ url: 'http://localhost:{{ cnf["alert-manager"]["reaper"]["port"] }}/kill'
+ http_config:
+ bearer_token: 'shinigami'
+ - name: "kill_idle_job_email"
+ email_configs:
+ {% if cnf["alert-manager"]["alert_users"] %}
+ - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
+ {% else %}
+ - to: '{{ alert_info["receiver"] }}'
+ {% endif %}
+ html: '{{ "{{" }} template "kill_idle.html" . {{ "}}" }}'
+ headers:
+ {% if cnf["alert-manager"]["alert_users"] %}
+ To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
+ CC: '{{ alert_info["receiver"] }}'
+ {% endif %}
+ subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
+ {% endif %}
{% endif %}
diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl
new file mode 100644
index 000000000..2da286cc1
--- /dev/null
+++ b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl
@@ -0,0 +1,71 @@
+{{ define "job_state.html" }}
+
+
+
+
+
+
+{{ template "__subject" . }}
+
+
+
+
+
+
+
+ |
+
+
+
+
+
+
+ {{ range .Alerts.Firing }}
+
+ |
+Your job
+
+{{.Labels.job_name}}
+ from cluster '{{.Labels.cluster}}' has changed to the state of {{.Labels.job_state}}.
+ |
+
+ {{ end }}
+
+ |
+
+
+
+
+ |
+ |
+
+
+
+
+
+{{ end }}
diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl
new file mode 100644
index 000000000..e29dc993c
--- /dev/null
+++ b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl
@@ -0,0 +1,71 @@
+{{ define "kill_idle.html" }}
+
+
+
+
+
+
+{{ template "__subject" . }}
+
+
+
+
+
+
+
+ |
+
+
+
+
+
+
+ {{ range .Alerts.Firing }}
+
+ |
+Your job
+
+{{.Labels.job_name}}
+ from cluster '{{.Labels.cluster}}' VC '{{.Labels.vc_name}}' was killed because it had been idle for too long.
+ |
+
+ {{ end }}
+
+ |
+
+
+
+
+ |
+ |
+
+
+
+
+
+{{ end }}
diff --git a/src/ClusterBootstrap/services/monitor/alerting/jobs.rules b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules
index 976263107..7a6160384 100644
--- a/src/ClusterBootstrap/services/monitor/alerting/jobs.rules
+++ b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules
@@ -5,4 +5,14 @@ groups:
expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
for: 4h
labels:
- type: user_alert
+ type: idle_gpu
+ - alert: kill-idle-jobs-email
+ expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
+ for: 8h
+ labels:
+ type: kill_idle_job_email
+ - alert: kill-idle-jobs
+ expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
+ for: 8h
+ labels:
+ type: reaper
diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json
new file mode 100644
index 000000000..60dd045da
--- /dev/null
+++ b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json
@@ -0,0 +1,239 @@
+{
+ "dashboard": {
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [],
+ "refresh": "30s",
+ "rows": [
+ {
+ "collapse": false,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 1,
+ "legend": {
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "connected",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(sum(k8s_node_gpu_total) - sum(k8s_node_gpu_available) - sum(k8s_node_gpu_reserved)) / sum(k8s_node_gpu_total) * 100",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 2,
+ "legendFormat": "Allocation Rate",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster wide GPU allocation rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 2,
+ "legend": {
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "connected",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(task_gpu_percent)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Avg Util",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster wide avg util",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now/w",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Cluster GPU statistic",
+ "version": 0
+ }
+}
diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json
new file mode 100644
index 000000000..c78fab7dd
--- /dev/null
+++ b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json
@@ -0,0 +1,389 @@
+{
+ "dashboard": {
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [],
+ "refresh": "30s",
+ "rows": [
+ {
+ "collapse": false,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le, fn_name))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{'{{'}} fn_name {{'}}'}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Datahandler 90th percentile latency per function from jobmanager",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le, fn_name))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{'{{'}} fn_name {{'}}'}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Datahandler 90th percentile latency per function from restfulapi",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le))",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Connection Latency",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "90th percentile DB connection latency from jobmanager",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 0,
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Connection Latency",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "90th percentile DB connection latency from restfulapi",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Performance dashboard",
+ "version": 0
+ }
+}
diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json
index 467dfa70e..027e5ea6c 100644
--- a/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json
+++ b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json
@@ -1,6 +1,6 @@
{
"name": "PM",
- "url": "http://{{cnf['prometheus']['host']}}:9091/prometheus",
+ "url": "http://{{cnf['grafana']['prometheus-ip']}}:9091/prometheus",
"basicAuth": false,
"access": "proxy",
"type": "prometheus",
diff --git a/src/ClusterBootstrap/services/monitor/job-exporter.yaml b/src/ClusterBootstrap/services/monitor/job-exporter.yaml
index 54f32eda9..90df9fd62 100644
--- a/src/ClusterBootstrap/services/monitor/job-exporter.yaml
+++ b/src/ClusterBootstrap/services/monitor/job-exporter.yaml
@@ -82,3 +82,5 @@ spec:
operator: "Exists"
- key: node.kubernetes.io/disk-pressure
operator: "Exists"
+ - key: node-role.kubernetes.io/master
+ operator: "Exists"
diff --git a/src/ClusterBootstrap/services/monitor/node-exporter.yaml b/src/ClusterBootstrap/services/monitor/node-exporter.yaml
index cc86f3583..26fb8633e 100644
--- a/src/ClusterBootstrap/services/monitor/node-exporter.yaml
+++ b/src/ClusterBootstrap/services/monitor/node-exporter.yaml
@@ -78,3 +78,5 @@ spec:
operator: "Exists"
- key: node.kubernetes.io/disk-pressure
operator: "Exists"
+ - key: node-role.kubernetes.io/master
+ operator: "Exists"
diff --git a/src/ClusterBootstrap/services/monitor/prometheus.yaml b/src/ClusterBootstrap/services/monitor/prometheus.yaml
index 9a5e08bd1..bd4c0a203 100644
--- a/src/ClusterBootstrap/services/monitor/prometheus.yaml
+++ b/src/ClusterBootstrap/services/monitor/prometheus.yaml
@@ -83,6 +83,13 @@ spec:
nodeSelector:
prometheus: active
hostNetwork: true
+ initContainers:
+ - name: init
+ image: bash:4
+ volumeMounts:
+ - name: prometheus-data
+ mountPath: /prometheus-data
+      command: ["chmod", "777", "/prometheus-data"] # newly created dirs have permission 755, which prevents the prometheus container from writing
containers:
- name: prometheus
image: prom/prometheus:v2.1.0
@@ -96,6 +103,7 @@ spec:
- '--web.listen-address=0.0.0.0:{{cnf["prometheus"]["port"]}}'
- '--web.external-url=http://localhost:{{cnf["prometheus"]["port"]}}/prometheus/'
- '--web.route-prefix=prometheus'
+ - '--storage.tsdb.path=/prometheus-data'
- '--storage.tsdb.retention=31d'
ports:
- name: web
@@ -105,6 +113,8 @@ spec:
mountPath: /etc/prometheus
- name: rules-volume
mountPath: /etc/prometheus-alert
+ - name: prometheus-data
+ mountPath: /prometheus-data
volumes:
- name: config-volume
configMap:
@@ -112,6 +122,9 @@ spec:
- name: rules-volume
configMap:
name: prometheus-alert
+ - name: prometheus-data
+ hostPath:
+ path: /data/prometheus/data
tolerations:
- key: node.kubernetes.io/memory-pressure
operator: "Exists"
diff --git a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml
index e3d089c83..d1e5a6ce4 100755
--- a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml
+++ b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml
@@ -4,7 +4,7 @@ metadata:
name: restfulapi
namespace: default
labels:
- run: dlwsrestfulapi
+ run: dlwsrestfulapi
spec:
selector:
matchLabels:
@@ -15,13 +15,17 @@ spec:
labels:
restfulapi-node: pod
app: restfulapi
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/path: "/metrics"
+ prometheus.io/port: "5000"
spec:
- {% if cnf["dnsPolicy"] %}
+ {% if cnf["dnsPolicy"] %}
dnsPolicy: {{cnf["dnsPolicy"]}}
{% endif %}
nodeSelector:
restfulapi: active
- hostNetwork: true
+ hostNetwork: true
containers:
- name: restfulapi
image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}{{cnf["restfulapi"]}}:{{cnf["dockertag"]}}
@@ -31,6 +35,10 @@ spec:
name: apiconfig
- mountPath: /var/log/apache2
name: log
+ ports:
+ - containerPort: 5000
+ hostPort: 5000
+ name: main
{% if False %}
{% for volume in cnf["mountpoints"] %}
{% if cnf["mountpoints"][volume]["mountpoints"] is string and cnf["mountpoints"][volume]["mountpoints"]!="" %}
@@ -42,7 +50,7 @@ spec:
name: {{mp}}
{% endfor %}
{% endif %}
- {% endfor %}
+ {% endfor %}
{% endif %}
volumes:
- name: apiconfig
@@ -60,14 +68,14 @@ spec:
{% else %}
{% for mp in cnf["mountpoints"][volume]["mountpoints"] %}
- name: {{mp}}
- hostPath:
+ hostPath:
path: {{cnf["storage-mount-path"]}}/{{mp}}
{% endfor %}
{% endif %}
- {% endfor %}
+ {% endfor %}
{% endif %}
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: node-role.kubernetes.io/master
- effect: NoSchedule
+ effect: NoSchedule
diff --git a/src/ClusterManager/cluster_manager.py b/src/ClusterManager/cluster_manager.py
index ae8427d66..87d7cecad 100755
--- a/src/ClusterManager/cluster_manager.py
+++ b/src/ClusterManager/cluster_manager.py
@@ -1,88 +1,156 @@
-import json
+import yaml
+import subprocess32
import os
-import time
-import argparse
-import uuid
-import subprocess
+import logging
+import logging.config
import sys
+import time
import datetime
+import argparse
+import threading
+import traceback
+import signal
-import yaml
-from jinja2 import Environment, FileSystemLoader, Template
-import base64
+from prometheus_client.twisted import MetricsResource
+from prometheus_client import Histogram
-import re
-import random
+from twisted.web.server import Site
+from twisted.web.resource import Resource
+from twisted.internet import reactor
+
+logger = logging.getLogger(__name__)
+
+manager_iteration_histogram = Histogram("manager_iteration_latency_seconds",
+ "latency for manager to iterate",
+ buckets=(2.5, 5.0, 10.0, 20.0, 40.0, 80.0, 160.0, float("inf")),
+ labelnames=("name",))
-import textwrap
-import logging
-import logging.config
-import job_manager
-import user_manager
-import node_manager
-import joblog_manager
-import command_manager
-import endpoint_manager
+class HealthResource(Resource):
+ def render_GET(self, request):
+ request.setHeader("Content-Type", "text/html; charset=utf-8")
+ return "Ok".encode("utf-8")
-from multiprocessing import Process, Manager
+def exporter_thread(port):
+ root = Resource()
+ root.putChild(b"metrics", MetricsResource())
+ root.putChild(b"healthz", HealthResource())
+ factory = Site(root)
+ reactor.listenTCP(port, factory)
+ reactor.run(installSignalHandlers=False)
+def setup_exporter_thread(port):
+ t = threading.Thread(target=exporter_thread, args=(port,),
+ name="exporter")
+ t.start()
+ return t
-def create_log(logdir='/var/log/dlworkspace'):
+def create_log(logdir="/var/log/dlworkspace"):
if not os.path.exists(logdir):
os.system("mkdir -p " + logdir)
- with open('logging.yaml') as f:
+ with open("logging.yaml") as f:
logging_config = yaml.load(f)
+ logging_config["handlers"]["file"]["filename"] = logdir + "/clustermanager.log"
+ logging.config.dictConfig(logging_config)
+
+def dumpstacks(signal, frame):
+ id2name = dict([(th.ident, th.name) for th in threading.enumerate()])
+ code = []
+ for threadId, stack in sys._current_frames().items():
+ code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""), threadId))
+ for filename, lineno, name, line in traceback.extract_stack(stack):
+ code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
+ if line:
+ code.append(" %s" % (line.strip()))
+ print "\n".join(code)
+ sys.stdout.flush()
+ sys.stderr.flush()
+
+def register_stack_trace_dump():
+ signal.signal(signal.SIGTRAP, dumpstacks)
+
+def update_file_modification_time(path):
+ if not os.path.isfile(path):
+ f = open(path, "w")
f.close()
- logging_config["handlers"]["file"]["filename"] = logdir+"/clustermanager.log"
- logging.config.dictConfig(logging_config)
+ mod_time = time.mktime(datetime.datetime.now().timetuple())
+ os.utime(path, (mod_time, mod_time))
-def Run():
- create_log()
-
- logging.info("Starting job manager... ")
- proc_job = Process(target=job_manager.Run)
- proc_job.start()
-
- logging.info("Starting user manager... ")
- proc_user = Process(target=user_manager.Run)
- proc_user.start()
-
- logging.info("Starting node manager... ")
- proc_node = Process(target=node_manager.Run)
- proc_node.start()
-
- logging.info("Starting joblogging manager... ")
- proc_joblog = Process(target=joblog_manager.Run)
- proc_joblog.start()
-
- logging.info("Starting command manager... ")
- proc_command = Process(target=command_manager.Run)
- proc_command.start()
+def get_elapsed_seconds(path):
+ mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path))
+ return (datetime.datetime.now() - mtime).seconds
- logging.info("Starting endpoint manager... ")
- proc_endpoint = Process(target=endpoint_manager.Run)
- proc_endpoint.start()
-
- proc_job.join()
- proc_user.join()
- proc_node.join()
- proc_joblog.join()
- proc_command.join()
- proc_endpoint.join()
- pass
-
-
-if __name__ == '__main__':
-
- #parser = argparse.ArgumentParser( prog='cluster_manager.py',
- # formatter_class=argparse.RawDescriptionHelpFormatter,
- # description=textwrap.dedent('''\
- # ''') )
- #parser.add_argument("help",
- # help = "Show the usage of this program" )
-
- #args = parser.parse_args()
+def Run(args):
+ register_stack_trace_dump()
+ create_log()
- Run()
+ cwd = os.path.dirname(__file__)
+ cmds = {
+ "job_manager":
+ ["python", os.path.join(cwd, "job_manager.py"), "--port", str(args.j)],
+ "user_manager":
+ ["python", os.path.join(cwd, "user_manager.py"), "--port", str(args.u)],
+ "node_manager":
+ ["python", os.path.join(cwd, "node_manager.py"), "--port", str(args.n)],
+ "joblog_manager":
+ ["python", os.path.join(cwd, "joblog_manager.py"), "--port", str(args.l)],
+ "command_manager":
+ ["python", os.path.join(cwd, "command_manager.py"), "--port", str(args.c)],
+ "endpoint_manager":
+ ["python", os.path.join(cwd, "endpoint_manager.py"), "--port", str(args.e)],
+ }
+
+ FNULL = open(os.devnull, "w")
+
+ childs = {}
+
+ while True:
+ try:
+ work(cmds, childs, FNULL)
+ except Exception as e:
+ logger.exception("caught exception while doing work")
+ time.sleep(60)
+
+def work(cmds, childs, FNULL):
+ for key, cmd in cmds.items():
+ child = childs.get(key)
+ need_start = False
+
+ if child is None or child.poll() is not None:
+ if child is not None:
+ logger.info("%s is dead restart it", cmd)
+ need_start = True
+ else:
+ sec = get_elapsed_seconds(key)
+ if sec <= args.tictoc:
+ continue
+ logger.info("%s did not update file for %d seconds, restart it",
+ key, sec)
+ child.send_signal(signal.SIGTRAP) # try to print their stacktrace
+ time.sleep(1)
+ child.kill()
+ sys.stdout.flush()
+ sys.stderr.flush()
+ need_start = True
+
+ if need_start:
+ update_file_modification_time(key)
+ try:
+ childs[key] = subprocess32.Popen(cmd, stdin=FNULL)
+ except Exception as e:
+ logger.exception("caught exception when trying to start %s, ignore", cmd)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--tictoc", help="how many seconds to wait until kill subprocess", type=int, default=600)
+ parser.add_argument("-j", help="port of job_manager", type=int, default=9200)
+ parser.add_argument("-u", help="port of user_manager", type=int, default=9201)
+ parser.add_argument("-n", help="port of node_manager", type=int, default=9202)
+ parser.add_argument("-l", help="port of joblog_manager", type=int, default=9203)
+ parser.add_argument("-c", help="port of command_manager", type=int, default=9204)
+ parser.add_argument("-e", help="port of endpoint_manager", type=int, default=9205)
+ args = parser.parse_args()
+
+ sys.exit(Run(args))
diff --git a/src/ClusterManager/command_manager.py b/src/ClusterManager/command_manager.py
index 6038c86c3..86458001a 100755
--- a/src/ClusterManager/command_manager.py
+++ b/src/ClusterManager/command_manager.py
@@ -8,7 +8,6 @@
import datetime
import copy
-
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage"))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils"))
@@ -21,7 +20,6 @@
from jinja2 import Environment, FileSystemLoader, Template
from config import config, GetStoragePath
from DataHandler import DataHandler
-from node_manager import create_log
from node_manager import get_cluster_status
import base64
@@ -32,8 +30,10 @@
import random
import logging
-import logging.config
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
+
+logger = logging.getLogger(__name__)
def RunCommand(command):
dataHandler = DataHandler()
@@ -42,21 +42,40 @@ def RunCommand(command):
dataHandler.Close()
return True
+def create_log(logdir = '/var/log/dlworkspace'):
+ if not os.path.exists(logdir):
+ os.system("mkdir -p " + logdir)
+ with open('logging.yaml') as f:
+ logging_config = yaml.full_load(f)
+ f.close()
+ logging_config["handlers"]["file"]["filename"] = logdir+"/command_manager.log"
+ logging.config.dictConfig(logging_config)
def Run():
+ register_stack_trace_dump()
+ create_log()
+
while True:
- try:
- dataHandler = DataHandler()
- pendingCommands = dataHandler.GetPendingCommands()
- for command in pendingCommands:
- try:
- print "Processing command: %s" % (command["id"])
- RunCommand(command)
- except Exception as e:
- print e
- except Exception as e:
- print e
+ update_file_modification_time("command_manager")
+
+ with manager_iteration_histogram.labels("command_manager").time():
+ try:
+ dataHandler = DataHandler()
+ pendingCommands = dataHandler.GetPendingCommands()
+ for command in pendingCommands:
+ try:
+ logger.info("Processing command: %s", command["id"])
+ RunCommand(command)
+ except Exception as e:
+ logger.exception("run command failed")
+ except Exception as e:
+ logger.exception("getting command failed")
time.sleep(1)
if __name__ == '__main__':
- Run()
\ No newline at end of file
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9204)
+ args = parser.parse_args()
+ setup_exporter_thread(args.port)
+
+ Run()
diff --git a/src/ClusterManager/dist_pod_template.py b/src/ClusterManager/dist_pod_template.py
new file mode 100644
index 000000000..493764ab9
--- /dev/null
+++ b/src/ClusterManager/dist_pod_template.py
@@ -0,0 +1,156 @@
+import os
+import sys
+import uuid
+import datetime
+import random
+import json
+import copy
+import yaml
+from jinja2 import Template
+from job import Job
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+from config import config
+from osUtils import mkdirsAsUser
+
+
+class DistPodTemplate():
+ def __init__(self, template, enable_custom_scheduler=False):
+ self.template = template
+ self.enable_custom_scheduler = enable_custom_scheduler
+
+ @staticmethod
+ def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd):
+ # change ssh folder permission here because the setup permission
+ # script in launch_ps_job function may have race condition with init_user.sh script.
+ # results in no such user error
+
+ local_pod_path = os.path.join(config["storage-mount-path"], "work/", job_path, "{}-{}".format(dist_role, dist_role_idx))
+ if not os.path.exists(local_pod_path):
+ mkdirsAsUser(local_pod_path, user_id)
+ file_name = "job_command.sh"
+ launch_script_file = os.path.join(local_pod_path, file_name)
+ with open(launch_script_file, 'w') as f:
+ f.write(cmd)
+ f.close()
+
+ launchCMD = ["bash", "/pod/scripts/bootstrap.sh"]
+ return launchCMD
+
+ def generate_pod(self, pod):
+ assert(isinstance(self.template, Template))
+
+ dist_id = pod["distId"]
+ job_id = pod["jobId"]
+ job_path = pod["jobPath"]
+
+ pod["podName"] = "{}-{}".format(job_id, dist_id)
+
+ random.seed(datetime.datetime.now())
+ if "hostNetwork" in pod and pod["hostNetwork"]:
+ pod["sshPort"] = random.randint(40000, 49999)
+ else:
+ pod["sshPort"] = int(random.random() * 1000 + 3000)
+
+ if (pod["distRole"] == "worker"):
+ pod["gpuLimit"] = pod["resourcegpu"]
+ else:
+ pod["gpuLimit"] = 0
+
+ if "envs" not in pod:
+ pod["envs"] = []
+ pod["envs"].append({"name": "DLWS_ROLE_NAME", "value": pod["distRole"]})
+ pod["envs"].append({"name": "DLWS_ROLE_IDX", "value": pod["distRoleIdx"]})
+
+ if "labels" not in pod:
+ pod["labels"] = []
+ pod["labels"].append({"name": "distRole", "value": pod["distRole"]})
+ pod["labels"].append({"name": "distRoleIdx", "value": pod["distRoleIdx"]})
+ pod["labels"].append({"name": "sshPort", "value": pod["sshPort"]})
+
+ cmd = pod["cmd"]
+ pod["LaunchCMD"] = DistPodTemplate.generate_launch_script(pod["distRole"], pod["distRoleIdx"], pod["userId"], job_path, cmd)
+
+ pod_yaml = self.template.render(job=pod)
+ return yaml.full_load(pod_yaml)
+
+ def generate_pods(self, job):
+ """
+ Return (pods, errors)
+ """
+ assert(isinstance(job, Job))
+ params = job.params
+
+ if any(required_field not in params for required_field in
+ [
+ "jobtrainingtype",
+ "jobName",
+ "jobPath",
+ "workPath",
+ "dataPath",
+ "cmd",
+ "userId",
+ "resourcegpu",
+ "userName",
+ ]):
+ return None, "Missing required parameters!"
+ assert(params["jobtrainingtype"] == "PSDistJob")
+
+ job.job_path = params["jobPath"]
+ job.work_path = params["workPath"]
+ job.data_path = params["dataPath"]
+ # TODO user's mountpoints first, but should after 'job_path'
+ job.add_mountpoints(job.job_path_mountpoint())
+ if "mountpoints" in params:
+ job.add_mountpoints(params["mountpoints"])
+ job.add_mountpoints(job.work_path_mountpoint())
+ job.add_mountpoints(job.data_path_mountpoint())
+ params["mountpoints"] = job.mountpoints
+
+ params["user_email"] = params["userName"]
+ params["homeFolderHostpath"] = job.get_homefolder_hostpath()
+ params["pod_ip_range"] = job.get_pod_ip_range()
+ params["usefreeflow"] = job.is_freeflow_enabled()
+ params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum())
+ params["rest-api"] = job.get_rest_api_url()
+
+ if "nodeSelector" not in params:
+ params["nodeSelector"] = {}
+ if "gpuType" in params:
+ params["nodeSelector"]["gpuType"] = params["gpuType"]
+ assignedRack = job.get_rack()
+ if assignedRack is not None:
+ params["nodeSelector"]["rack"] = assignedRack
+
+ params["numworker"] = int(params["numpsworker"])
+ params["numps"] = int(params["numps"])
+
+ if "envs" not in params:
+ params["envs"] = []
+ params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]})
+
+ if "hostNetwork" in params and params["hostNetwork"]:
+ params["envs"].append({"name": "DLWS_HOST_NETWORK", "value": "enable"})
+ params["envs"].append({"name": "DLWS_WORKER_NUM", "value": params["numworker"]})
+
+ pods = []
+ nums = {"ps": int(params["numps"]), "worker": int(params["numpsworker"])}
+ for role in ["ps", "worker"]:
+ for idx in range(nums[role]):
+ pod = copy.deepcopy(params)
+ pod["distRole"] = role
+ pod["distRoleIdx"] = idx
+ pod["distId"] = "%s%d" % (role, idx)
+ # mount /pod
+ local_pod_path = job.get_hostpath(job.job_path, "%s-%d" % (role, idx))
+ pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": local_pod_path, "enabled": True})
+
+
+ pods.append(pod)
+
+ k8s_pods = []
+ for pod in pods:
+ k8s_pod = self.generate_pod(pod)
+ k8s_pods.append(k8s_pod)
+
+ return k8s_pods, None
diff --git a/src/ClusterManager/endpoint_manager.py b/src/ClusterManager/endpoint_manager.py
index 272b3bf69..15c8c2315 100755
--- a/src/ClusterManager/endpoint_manager.py
+++ b/src/ClusterManager/endpoint_manager.py
@@ -1,6 +1,4 @@
-from config import config, GetStoragePath, GetWorkPath
-import k8sUtils
-from DataHandler import DataHandler
+
import json
import os
import time
@@ -11,8 +9,19 @@
import traceback
import random
import re
+import logging
+import yaml
+import logging.config
+
+import argparse
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+import k8sUtils
+from config import config, GetStoragePath, GetWorkPath
+from DataHandler import DataHandler
+
+logger = logging.getLogger(__name__)
def is_ssh_server_ready(pod_name):
@@ -76,14 +85,14 @@ def generate_node_port_service(job_id, pod_name, endpoint_id, name, target_port)
targetPort: {4}
port: {4}
""".format(job_id, pod_name, endpoint_id, name, target_port)
- print("endpointDescription: %s" % endpoint_description)
+ logger.info("endpointDescription: %s", endpoint_description)
return endpoint_description
def create_node_port(endpoint):
endpoint_description = generate_node_port_service(endpoint["jobId"], endpoint["podName"], endpoint["id"], endpoint["name"], endpoint["podPort"])
endpoint_description_path = os.path.join(config["storage-mount-path"], endpoint["endpointDescriptionPath"])
- print("endpointDescriptionPath: %s" % endpoint_description_path)
+ logger.info("endpointDescriptionPath: %s", endpoint_description_path)
with open(endpoint_description_path, 'w') as f:
f.write(endpoint_description)
@@ -91,18 +100,18 @@ def create_node_port(endpoint):
if result == "":
raise Exception("Failed to create NodePort for ssh. JobId: %s " % endpoint["jobId"])
- print("Submitted endpoint %s to k8s, returned with status %s" % (endpoint["jobId"], result))
+ logger.info("Submitted endpoint %s to k8s, returned with status %s", endpoint["jobId"], result)
def setup_ssh_server(user_name, pod_name, host_network=False):
'''Setup ssh server on pod and return the port'''
# setup ssh server only is the ssh server is not up
if not is_ssh_server_ready(pod_name):
- print("Ssh server is not ready for pod: %s. Setup ..." % pod_name)
+ logger.info("Ssh server is not ready for pod: %s. Setup ...", pod_name)
ssh_port = start_ssh_server(pod_name, user_name, host_network)
else:
ssh_port = query_ssh_port(pod_name)
- print("Ssh server is ready for pod: %s. Ssh listen on %s" % (pod_name, ssh_port))
+ logger.info("Ssh server is ready for pod: %s. Ssh listen on %s", pod_name, ssh_port)
return ssh_port
@@ -127,7 +136,7 @@ def setup_tensorboard(user_name, pod_name):
def start_endpoint(endpoint):
# pending, running, stopped
- print("Starting endpoint: %s" % (endpoint))
+ logger.info("Starting endpoint: %s", endpoint)
# podName
pod_name = endpoint["podName"]
@@ -148,50 +157,45 @@ def start_endpoint(endpoint):
create_node_port(endpoint)
-def is_user_ready(pod_name):
- bash_script = "bash -c 'ls /dlws/USER_READY'"
- output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script))
- if output == "":
- return False
- return True
-
-
def start_endpoints():
try:
+ data_handler = DataHandler()
try:
- data_handler = DataHandler()
pending_endpoints = data_handler.GetPendingEndpoints()
for endpoint_id, endpoint in pending_endpoints.items():
- job = data_handler.GetJob(jobId=endpoint["jobId"])[0]
- if job["jobStatus"] != "running":
- continue
- if not is_user_ready(endpoint["podName"]):
- continue
-
- # get endpointDescriptionPath
- # job["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
- endpoint_description_dir = re.search("(.*/)[^/\.]+.yaml", job["jobDescriptionPath"]).group(1)
- endpoint["endpointDescriptionPath"] = os.path.join(endpoint_description_dir, endpoint_id + ".yaml")
-
- print("\n\n\n\n\n\n----------------Begin to start endpoint %s" % endpoint["id"])
- output = get_k8s_endpoint(endpoint["endpointDescriptionPath"])
- if(output != ""):
- endpoint_description = json.loads(output)
- endpoint["endpointDescription"] = endpoint_description
- endpoint["status"] = "running"
- pod = k8sUtils.GetPod("podName=" + endpoint["podName"])
- if "items" in pod and len(pod["items"]) > 0:
- endpoint["nodeName"] = pod["items"][0]["spec"]["nodeName"]
- else:
- start_endpoint(endpoint)
-
- endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
- data_handler.UpdateEndpoint(endpoint)
+ try:
+ job = data_handler.GetJob(jobId=endpoint["jobId"])[0]
+ if job["jobStatus"] != "running":
+ continue
+
+ # get endpointDescriptionPath
+ # job["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
+ endpoint_description_dir = re.search("(.*/)[^/\.]+.yaml", job["jobDescriptionPath"]).group(1)
+ endpoint["endpointDescriptionPath"] = os.path.join(endpoint_description_dir, endpoint_id + ".yaml")
+
+ logger.info("\n\n\n\n\n\n----------------Begin to start endpoint %s", endpoint["id"])
+ output = get_k8s_endpoint(endpoint["endpointDescriptionPath"])
+ if(output != ""):
+ endpoint_description = json.loads(output)
+ endpoint["endpointDescription"] = endpoint_description
+ endpoint["status"] = "running"
+ pod = k8sUtils.GetPod("podName=" + endpoint["podName"])
+ if "items" in pod and len(pod["items"]) > 0:
+ endpoint["nodeName"] = pod["items"][0]["spec"]["nodeName"]
+ else:
+ start_endpoint(endpoint)
+
+ endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
+ data_handler.UpdateEndpoint(endpoint)
+ except Exception as e:
+ logger.warning("Process endpoint failed {}".format(endpoint), exc_info=True)
except Exception as e:
- traceback.print_exc()
+ logger.exception("start endpoint failed")
+ finally:
+ data_handler.Close()
except Exception as e:
- traceback.print_exc()
+ logger.exception("close data handler failed")
def cleanup_endpoints():
@@ -200,45 +204,69 @@ def cleanup_endpoints():
try:
dead_endpoints = data_handler.GetDeadEndpoints()
for endpoint_id, dead_endpoint in dead_endpoints.items():
- print("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s" % endpoint_id)
- endpoint_description_path = os.path.join(config["storage-mount-path"], dead_endpoint["endpointDescriptionPath"])
- still_running = get_k8s_endpoint(endpoint_description_path)
- # empty mean not existing
- if still_running == "":
- print("Endpoint already gone %s" % endpoint_id)
- status = "stopped"
- else:
- output = k8sUtils.kubectl_delete(endpoint_description_path)
- # 0 for success
- if output == 0:
+ try:
+ logger.info("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s", endpoint_id)
+ endpoint_description_path = os.path.join(config["storage-mount-path"], dead_endpoint["endpointDescriptionPath"])
+ still_running = get_k8s_endpoint(endpoint_description_path)
+ # empty mean not existing
+ if still_running == "":
+ logger.info("Endpoint already gone %s", endpoint_id)
status = "stopped"
- print("Succeed cleanup endpoint %s" % endpoint_id)
else:
- # TODO will need to clean it up eventually
- status = "unknown"
- print("Clean dead endpoint %s failed, endpoints: %s" % (endpoint_id, dead_endpoint))
-
- dead_endpoint["status"] = status
- dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
- data_handler.UpdateEndpoint(dead_endpoint)
+ output = k8sUtils.kubectl_delete(endpoint_description_path)
+ # 0 for success
+ if output == 0:
+ status = "stopped"
+ logger.info("Succeed cleanup endpoint %s", endpoint_id)
+ else:
+ # TODO will need to clean it up eventually
+ status = "unknown"
+ logger.info("Clean dead endpoint %s failed, endpoints: %s", endpoint_id, dead_endpoint)
+
+                        # we are not changing status from "pending", "pending" endpoints are planned to be set up later
+ if dead_endpoint["status"] != "pending":
+ dead_endpoint["status"] = status
+ dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
+ data_handler.UpdateEndpoint(dead_endpoint)
+ except Exception as e:
+ logger.warning("Clanup endpoint failed {}".format(dead_endpoint), exc_info=True)
except Exception as e:
- traceback.print_exc()
+ logger.exception("cleanup endpoint failed")
finally:
data_handler.Close()
except Exception as e:
- traceback.print_exc()
+ logger.exception("close data handler failed")
+
+def create_log(logdir = '/var/log/dlworkspace'):
+ if not os.path.exists(logdir):
+ os.system("mkdir -p " + logdir)
+ with open('logging.yaml') as f:
+ logging_config = yaml.full_load(f)
+ f.close()
+ logging_config["handlers"]["file"]["filename"] = logdir+"/endpoint_manager.log"
+ logging.config.dictConfig(logging_config)
def Run():
+ register_stack_trace_dump()
+ create_log()
+
while True:
- # start endpoints
- start_endpoints()
- time.sleep(1)
+ update_file_modification_time("endpoint_manager")
- # clean up endpoints for jobs which is NOT running
- cleanup_endpoints()
- time.sleep(1)
+ with manager_iteration_histogram.labels("endpoint_manager").time():
+ # start endpoints
+ start_endpoints()
+ time.sleep(1)
+ # clean up endpoints for jobs which is NOT running
+ cleanup_endpoints()
+ time.sleep(1)
if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9205)
+ args = parser.parse_args()
+ setup_exporter_thread(args.port)
+
Run()
diff --git a/src/ClusterManager/job.py b/src/ClusterManager/job.py
new file mode 100644
index 000000000..993f35b58
--- /dev/null
+++ b/src/ClusterManager/job.py
@@ -0,0 +1,169 @@
+import sys
+import os
+import random
+from datetime import date
+from marshmallow import Schema, fields, pprint, post_load, validate
+from jinja2 import Environment, FileSystemLoader, Template
+
+import logging
+import logging.config
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+from osUtils import mkdirsAsUser
+
+
+# TODO remove it latter
+def create_log(logdir='.'):
+ if not os.path.exists(logdir):
+ os.system("mkdir -p " + logdir)
+ with open('logging.yaml') as f:
+ logging_config = yaml.full_load(f)
+ f.close()
+ logging_config["handlers"]["file"]["filename"] = logdir + "/jobmanager.log"
+ logging.config.dictConfig(logging_config)
+
+
+class Job:
+ def __init__(self,
+ cluster,
+ job_id,
+ email,
+ mountpoints=None,
+ job_path="",
+ work_path="",
+ data_path="",
+ params=None,
+ ):
+ """
+ job_id: an unique string for the job.
+ email: user's email.
+ cluster: cluster config.
+ job_path: relative path, on shared storage, for example "user_alias/jobs/date/job_id".
+ work_path: relative path, on shared storage, for example "user_alias".
+ """
+ self.cluster = cluster
+ self.job_id = job_id
+ self.email = email
+ self.mountpoints = mountpoints
+ self.job_path = job_path
+ self.work_path = work_path
+ self.data_path = data_path
+ self.params = params
+
+ def add_mountpoints(self, mountpoint):
+ '''
+ 1. Silently skip if the name/hostPath/containerPath duplicates with an existing one.
+ 2. Name would be normalized.
+
+ Mountpoint example:
+ {
+ "enabled":true,
+ "containerPath":"/home/username",
+ "hostPath":"/dlwsdata/work/username",
+ "name":"homefolder"
+ }
+ '''
+ if mountpoint is None:
+ return
+ if self.mountpoints is None:
+ self.mountpoints = []
+
+ # add each items in the list one by one
+ if isinstance(mountpoint, list):
+ for m in mountpoint:
+ self.add_mountpoints(m)
+ return
+
+ # only allow alphanumeric in "name"
+ if "name" not in mountpoint or mountpoint["name"] == "":
+ mountpoint["name"] = mountpoint["containerPath"]
+ mountpoint["name"] = ''.join(c for c in mountpoint["name"] if c.isalnum())
+
+        # skip duplicate entry
+ for item in self.mountpoints:
+ if item["name"] == mountpoint["name"] or item["containerPath"] == mountpoint["containerPath"] or item["hostPath"] == mountpoint["hostPath"]:
+ logging.warn("Duplciate mountpoint: %s" % mountpoint)
+ return
+
+ self.mountpoints.append(mountpoint)
+
+ def get_alias(self):
+ return self.email.split("@")[0].strip()
+
+ def get_hostpath(self, *path_relate_to_workpath):
+ """return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath)"""
+ return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath)
+
+ def get_homefolder_hostpath(self):
+ return self.get_hostpath(self.get_alias())
+
+ def job_path_mountpoint(self):
+ assert(len(self.job_path) > 0)
+ job_host_path = self.get_hostpath(self.job_path)
+ return {"name": "job", "containerPath": "/job", "hostPath": job_host_path, "enabled": True}
+
+ def work_path_mountpoint(self):
+ assert(len(self.work_path) > 0)
+ work_host_path = self.get_hostpath(self.work_path)
+ return {"name": "work", "containerPath": "/work", "hostPath": work_host_path, "enabled": True}
+
+ def data_path_mountpoint(self):
+ assert(self.data_path is not None)
+ data_host_path = os.path.join(self.cluster["storage-mount-path"], "storage", self.data_path)
+ return {"name": "data", "containerPath": "/data", "hostPath": data_host_path, "enabled": True}
+
+ def get_template(self):
+ """Return jinja template."""
+ path = os.path.abspath(os.path.join(self.cluster["root-path"], "Jobs_Templete", "pod.yaml.template"))
+ ENV = Environment(loader=FileSystemLoader("/"))
+ template = ENV.get_template(path)
+ assert(isinstance(template, Template))
+ return template
+
+ def is_custom_scheduler_enabled(self):
+ return self._get_cluster_config("kube_custom_scheduler")
+
+ def get_rest_api_url(self):
+ return self._get_cluster_config("rest-api")
+
+ def get_pod_ip_range(self):
+ return self._get_cluster_config("pod_ip_range")
+
+ def is_freeflow_enabled(self):
+ return self._get_cluster_config("usefreeflow")
+
+ def get_rack(self):
+ racks = self._get_cluster_config("racks")
+ if racks is None or len(racks) == 0:
+ return None
+ # TODO why random.choice?
+ return random.choice(racks)
+
+ def _get_cluster_config(self, key):
+ if key in self.cluster:
+ return self.cluster[key]
+ return None
+
+
+class JobSchema(Schema):
+ cluster = fields.Dict(required=True)
+ job_id = fields.String(required=True,
+                           # Correctly mapping the name
+ dump_to="jobId", load_from="jobId",
+ # We use the id as "name" in k8s object.
+ # By convention, the "names" of Kubernetes resources should be
+ # up to maximum length of 253 characters and consist of lower case
+ # alphanumeric characters, -, and .,
+ # but certain resources have more specific restrictions.
+ validate=validate.Regexp(r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$',
+ error="'{input}' does not match expected pattern {regex}."))
+ email = fields.Email(required=True, dump_to="userName", load_from="userName")
+ mountpoints = fields.Dict(required=False)
+ job_path = fields.String(required=False, dump_to="jobPath", load_from="jobPath")
+ work_path = fields.String(required=False, dump_to="workPath", load_from="workPath")
+ data_path = fields.String(required=False, dump_to="dataPath", load_from="dataPath")
+ params = fields.Dict(required=False)
+
+ @post_load
+ def make_user(self, data, **kwargs):
+ return Job(**data)
diff --git a/src/ClusterManager/job_deployer.py b/src/ClusterManager/job_deployer.py
new file mode 100644
index 000000000..ac80c22a1
--- /dev/null
+++ b/src/ClusterManager/job_deployer.py
@@ -0,0 +1,200 @@
+import yaml
+import os
+import logging
+import logging.config
+import timeit
+import functools
+
+from kubernetes import client, config
+from kubernetes.client.rest import ApiException
+from kubernetes.stream import stream
+from kubernetes.stream.ws_client import ERROR_CHANNEL, STDERR_CHANNEL, STDOUT_CHANNEL
+
+from prometheus_client import Histogram
+
+job_deployer_fn_histogram = Histogram("job_deployer_fn_latency_seconds",
+ "latency for executing job deployer (seconds)",
+ buckets=(.05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0,
+ 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, float("inf")),
+ labelnames=("fn_name",))
+
+def record(fn):
+ @functools.wraps(fn)
+ def wrapped(*args, **kwargs):
+ start = timeit.default_timer()
+ try:
+ return fn(*args, **kwargs)
+ finally:
+ elapsed = timeit.default_timer() - start
+ job_deployer_fn_histogram.labels(fn.__name__).observe(elapsed)
+ return wrapped
+
+
+# The config will be loaded from default location.
+config.load_kube_config()
+k8s_client = client.CoreV1Api()
+
+
+class JobDeployer:
+
+ def __init__(self):
+ self.v1 = k8s_client
+ self.namespace = "default"
+ self.pretty = "pretty_example"
+
+ @record
+ def create_pod(self, body, dry_run=None):
+ api_response = self.v1.create_namespaced_pod(
+ namespace=self.namespace,
+ body=body,
+ pretty=self.pretty,
+ dry_run=dry_run,
+ )
+ return api_response
+
+ @record
+ def delete_pod(self, name, grace_period_seconds=None, dry_run=None):
+ body = client.V1DeleteOptions()
+ body.grace_period_seconds = grace_period_seconds
+ body.dry_run = dry_run
+ api_response = self.v1.delete_namespaced_pod(
+ name=name,
+ namespace=self.namespace,
+ pretty=self.pretty,
+ body=body,
+ grace_period_seconds=grace_period_seconds,
+ dry_run=dry_run,
+ )
+ return api_response
+
+ @record
+ def create_service(self, body, dry_run=None):
+ api_response = self.v1.create_namespaced_service(
+ namespace=self.namespace,
+ body=body,
+ pretty=self.pretty,
+ dry_run=dry_run,
+ )
+ return api_response
+
+ @record
+ def delete_service(self, name, dry_run=None):
+ api_response = self.v1.delete_namespaced_service(
+ name=name,
+ namespace=self.namespace,
+ pretty=self.pretty,
+ body=client.V1DeleteOptions(),
+ dry_run=dry_run,
+ )
+ return api_response
+
+ @record
+ def cleanup_pods(self, pod_names, force=False):
+ errors = []
+ grace_period_seconds = 0 if force else None
+ for pod_name in pod_names:
+ try:
+ self.delete_pod(pod_name, grace_period_seconds)
+ except Exception as e:
+ if isinstance(e, ApiException) and 404 == e.status:
+ return []
+ message = "Delete pod failed: {}".format(pod_name)
+ logging.warning(message, exc_info=True)
+ errors.append({"message": message, "exception": e})
+ return errors
+
+ @record
+ def cleanup_services(self, services):
+ errors = []
+ for service in services:
+ assert(isinstance(service, client.V1Service))
+ try:
+ service_name = service.metadata.name
+ self.delete_service(service_name)
+ except ApiException as e:
+ message = "Delete service failed: {}".format(service_name)
+ logging.warning(message, exc_info=True)
+ errors.append({"message": message, "exception": e})
+ return errors
+
+ @record
+ def create_pods(self, pods):
+        # TODO instead of delete, we could check and update existing ones. During refactoring, keeping the old way.
+ pod_names = [pod["metadata"]["name"] for pod in pods]
+ self.cleanup_pods(pod_names)
+ created = []
+ for pod in pods:
+ created_pod = self.create_pod(pod)
+ created.append(created_pod)
+ logging.info("Create pod succeed: %s" % created_pod.metadata.name)
+ return created
+
+ @record
+ def get_pods(self, field_selector="", label_selector=""):
+ api_response = self.v1.list_namespaced_pod(
+ namespace=self.namespace,
+ pretty=self.pretty,
+ field_selector=field_selector,
+ label_selector=label_selector,
+ )
+ logging.debug("Get pods: {}".format(api_response))
+ return api_response.items
+
+ @record
+ def get_services_by_label(self, label_selector):
+ api_response = self.v1.list_namespaced_service(
+ namespace=self.namespace,
+ pretty=self.pretty,
+ label_selector=label_selector,
+ )
+ return api_response.items
+
+ @record
+ def delete_job(self, job_id, force=False):
+ label_selector = "run={}".format(job_id)
+
+ # query pods then delete
+ pods = self.get_pods(label_selector=label_selector)
+ pod_names = [pod.metadata.name for pod in pods]
+ pod_errors = self.cleanup_pods(pod_names, force)
+
+ # query services then delete
+ services = self.get_services_by_label(label_selector)
+ service_errors = self.cleanup_services(services)
+
+ errors = pod_errors + service_errors
+ return errors
+
+ @record
+ def pod_exec(self, pod_name, exec_command, timeout=60):
+ """work as the command (with timeout): kubectl exec 'pod_name' 'exec_command'"""
+ try:
+ logging.info("Exec on pod {}: {}".format(pod_name, exec_command))
+ client = stream(
+ self.v1.connect_get_namespaced_pod_exec,
+ name=pod_name,
+ namespace=self.namespace,
+ command=exec_command,
+ stderr=True,
+ stdin=False,
+ stdout=True,
+ tty=False,
+ _preload_content=False,
+ )
+ client.run_forever(timeout=timeout)
+
+ err = yaml.full_load(client.read_channel(ERROR_CHANNEL))
+ if err is None:
+ return [-1, "Timeout"]
+
+ if err["status"] == "Success":
+ status_code = 0
+ else:
+ logging.debug("Exec on pod {} failed. cmd: {}, err: {}.".format(pod_name, exec_command, err))
+ status_code = int(err["details"]["causes"][0]["message"])
+ output = client.read_all()
+ logging.info("Exec on pod {}, status: {}, cmd: {}, output: {}".format(pod_name, status_code, exec_command, output))
+ return [status_code, output]
+ except ApiException as err:
+ logging.error("Exec on pod {} error. cmd: {}, err: {}.".format(pod_name, exec_command, err), exc_info=True)
+ return [-1, err.message]
diff --git a/src/ClusterManager/job_manager.py b/src/ClusterManager/job_manager.py
index 616922800..e18720c7a 100755
--- a/src/ClusterManager/job_manager.py
+++ b/src/ClusterManager/job_manager.py
@@ -7,7 +7,7 @@
import sys
import datetime
import copy
-
+import traceback
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage"))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils"))
@@ -17,12 +17,12 @@
import k8sUtils
import joblog_manager
from osUtils import mkdirsAsUser
+import notify
import yaml
from jinja2 import Environment, FileSystemLoader, Template
from config import config, GetStoragePath, GetWorkPath
from DataHandler import DataHandler
-from node_manager import create_log
from node_manager import get_cluster_status
import base64
from ResourceInfo import ResourceInfo
@@ -35,816 +35,262 @@
import logging
import logging.config
+from job import Job, JobSchema
+from pod_template import PodTemplate
+from dist_pod_template import DistPodTemplate
+from job_deployer import JobDeployer
+from job_role import JobRole
-
-nvidiaDriverPath = config["nvidiaDriverPath"]
-
-
-
-def printlog(msg):
- print("%s - %s" % (datetime.datetime.utcnow().strftime("%x %X"),msg))
-
-def LoadJobParams(jobParamsJsonStr):
- return json.loads(jobParamsJsonStr)
-
-def cmd_exec(cmdStr):
- try:
- output = subprocess.check_output(["bash","-c", cmdStr])
- except Exception as e:
- print(e)
- output = ""
- return output
-
-
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
+def all_pods_not_existing(job_id):
+ job_deployer = JobDeployer()
+ job_roles = JobRole.get_job_roles(job_id)
+ statuses = [job_role.status() for job_role in job_roles]
+ logging.info("Job: {}, status: {}".format(job_id, statuses))
+ return all([status == "NotFound" for status in statuses])
def SubmitJob(job):
- jobParams = json.loads(base64.b64decode(job["jobParams"]))
- if jobParams["jobtrainingtype"] == "RegularJob":
- SubmitRegularJob(job)
- elif jobParams["jobtrainingtype"] == "PSDistJob":
- SubmitPSDistJob(job)
-
-def CheckMountPoints(mplist, mp):
- ret = True
- for item in mplist:
- if item["name"] == mp["name"] or item["containerPath"] == mp["containerPath"] or item["hostPath"] == mp["hostPath"]:
- ret = False
- return ret
+ # check if existing any pod with label: run=job_id
+ assert("jobId" in job)
+ job_id = job["jobId"]
+ if not all_pods_not_existing(job_id):
+ logging.warning("Waiting until previously pods are cleaned up! Job {}".format(job_id))
+ job_deployer = JobDeployer()
+ errors = job_deployer.delete_job(job_id, force=True)
+ if errors:
+ logging.warning("Force delete job {}: {}".format(job_id, errors))
+ return
-def SubmitRegularJob(job):
ret = {}
dataHandler = DataHandler()
try:
- jobParams = json.loads(base64.b64decode(job["jobParams"]))
-
- jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
- jobParams["pvc_work"] = "work-" + jobParams["jobId"]
- jobParams["pvc_data"] = "storage-" + jobParams["jobId"]
-
-
- if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0:
- dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
+ # TODO refine later
+ # before resubmit the job, reset the endpoints
+ # update all endpoint to status 'pending', so it would restart when job is ready
+ endpoints = dataHandler.GetJobEndpoints(job_id)
+ for endpoint_id, endpoint in endpoints.items():
+ endpoint["status"] = "pending"
+ logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id))
+ dataHandler.UpdateEndpoint(endpoint)
+
+ job["cluster"] = config
+ job_object, errors = JobSchema().load(job)
+ # TODO assert job_object is a Job
+ assert(isinstance(job_object, Job))
+
+ job_object.params = json.loads(base64.b64decode(job["jobParams"]))
+
+ # inject gid, uid and user
+ # TODO it should return only one entry
+ user_info = dataHandler.GetIdentityInfo(job_object.params["userName"])[0]
+ job_object.params["gid"] = user_info["gid"]
+ job_object.params["uid"] = user_info["uid"]
+ job_object.params["user"] = job_object.get_alias()
+
+ enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
+ if job_object.params["jobtrainingtype"] == "RegularJob":
+ pod_template = PodTemplate(job_object.get_template(), enable_custom_scheduler)
+ elif job_object.params["jobtrainingtype"] == "PSDistJob":
+ pod_template = DistPodTemplate(job_object.get_template())
+ else:
+ dataHandler.SetJobError(job_object.job_id, "ERROR: invalid jobtrainingtype: %s" % job_object.params["jobtrainingtype"])
return False
- if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0:
- dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
+ pods, error = pod_template.generate_pods(job_object)
+ if error:
+ dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
return False
- #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
- # dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
- # return False
-
-
- jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])
-
-
- localJobPath = os.path.join(config["storage-mount-path"],jobPath)
-
- if not os.path.exists(localJobPath):
- if "userId" in jobParams:
- mkdirsAsUser(localJobPath,jobParams["userId"])
- mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
- else:
- mkdirsAsUser(localJobPath,"0")
- mkdirsAsUser(os.path.join(localJobPath,"models"),"0")
-
- jobParams["LaunchCMD"] = ""
- if "cmd" not in jobParams:
- jobParams["cmd"] = ""
-
- if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
- launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
- with open(launchScriptPath, 'w') as f:
- f.write("#!/bin/bash -x\n")
- f.write("mkdir /opt; \n")
- f.write("echo 'localhost slots=%s' | tee -a /opt/hostfile; \n" % jobParams["resourcegpu"])
- # TODO refine it later
- f.write("bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c '%s'\n" % jobParams["cmd"])
- f.close()
- if "userId" in jobParams:
- os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
- jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]
-
-
- jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
-
- jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())
-
- ENV = Environment(loader=FileSystemLoader("/"))
-
- jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
- jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")
-
- jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
- jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
- jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
- jobParams["nvidiaDriverPath"] = nvidiaDriverPath
-
-
- jobParams["rest-api"] = config["rest-api"]
-
- if "mountpoints" not in jobParams:
- jobParams["mountpoints"] = []
- for onemount in jobParams["mountpoints"]:
- onemount["name"] = onemount["containerPath"].replace("/","").lower()
-
- # mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
- # if CheckMountPoints(jobParams["mountpoints"],mp):
- # jobParams["mountpoints"].append(mp)
-
- mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
- if CheckMountPoints(jobParams["mountpoints"],mp):
- jobParams["mountpoints"].append(mp)
-
- mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
- if CheckMountPoints(jobParams["mountpoints"],mp):
- jobParams["mountpoints"].append(mp)
-
- mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
- if CheckMountPoints(jobParams["mountpoints"],mp):
- jobParams["mountpoints"].append(mp)
-
- userAlias = getAlias(jobParams["userName"])
- jobParams["user_email"] = jobParams["userName"]
- jobParams["homeFolderHostpath"] = os.path.join(config["storage-mount-path"], GetWorkPath(userAlias))
-
- if CheckMountPoints(jobParams["mountpoints"],mp):
- jobParams["mountpoints"].append(mp)
-
- for idx in range(len(jobParams["mountpoints"])):
- if "name" not in jobParams["mountpoints"][idx]:
- jobParams["mountpoints"][idx]["name"] = str(uuid.uuid4()).replace("-","")
+ job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
+ job_description_path = "jobfiles/" + time.strftime("%y%m%d") + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
+ local_jobDescriptionPath = os.path.realpath(os.path.join(config["storage-mount-path"], job_description_path))
+ if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
+ os.makedirs(os.path.dirname(local_jobDescriptionPath))
+ with open(local_jobDescriptionPath, 'w') as f:
+ f.write(job_description)
+ job_deployer = JobDeployer()
+ try:
+ pods = job_deployer.create_pods(pods)
+ ret["output"] = "Created pods: {}".format([pod.metadata.name for pod in pods])
+ except Exception as e:
+ ret["output"] = "Error: %s" % e.message
+ logging.error(e, exc_info=True)
- jobParams["pod_ip_range"] = config["pod_ip_range"]
- if "usefreeflow" in config:
- jobParams["usefreeflow"] = config["usefreeflow"]
- else:
- jobParams["usefreeflow"] = False
-
- print ("Render Job: %s" % jobParams)
- jobDescriptionList = []
-
- pods = []
- if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
- i = int(jobParams["hyperparameterstartvalue"])
- end = int(jobParams["hyperparameterendvalue"])
- step = int(jobParams["hyperparameterstep"])
- c = 0
- while (i <= end):
- pod = {}
- pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
- pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
- i += step
- c += 1
- pods.append(pod)
- else:
- pod = {}
- pod["podName"] = jobParams["jobId"]
- pod["envs"] = []
- pods.append(pod)
-
- if "env" not in jobParams:
- jobParams["env"] = []
- jobParams["commonenv"] = copy.copy(jobParams["env"])
-
-
- for pod in pods:
- jobParams["podName"] = pod["podName"]
- jobParams["env"] = jobParams["commonenv"] + pod["envs"]
-
- if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
- container = {}
- container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
- podInfo = {}
- podInfo["podname"] = jobParams["podName"]
- if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
- # add topology constraints explicitly - for testing
- # if (jobParams["resourcegpu"] >= 2):
- # # both cards in same inner group
- # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
- # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
- # if (jobParams["resourcegpu"] >= 3):
- # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
- # if (jobParams["resourcegpu"] >= 4):
- # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
- # if (jobParams["resourcegpu"] >= 5):
- # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
- # if (jobParams["resourcegpu"] >= 6):
- # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
- # if (jobParams["resourcegpu"] >= 7):
- # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
- # if (jobParams["resourcegpu"] >= 8):
- # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
- podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
- else:
- # for cases when desired topology is explictly given or not desired
- podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
- podInfo["runningcontainer"] = {jobParams["podName"] : container}
-
- if "annotations" not in jobParams:
- jobParams["annotations"] = {}
- jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
- jobParams["resourcegpu"] = 0 # gpu requests specified through annotation
-
- if "gpuType" in jobParams:
- if "nodeSelector" not in jobParams:
- jobParams["nodeSelector"] = {}
- jobParams["nodeSelector"]["gpuType"] = jobParams["gpuType"]
-
- # inject gid, uid and user
- # TODO it should return only one entry
- user_info = dataHandler.GetIdentityInfo(jobParams["userName"])[0]
- jobParams["gid"] = user_info["gid"]
- jobParams["uid"] = user_info["uid"]
- jobParams["user"] = userAlias
-
- template = ENV.get_template(os.path.abspath(jobTemp))
- job_description = template.render(job=jobParams)
- jobDescriptionList.append(job_description)
-
- jobDescription = "\n---\n".join(jobDescriptionList)
-
- jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
- if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
- os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
- if os.path.isfile(jobDescriptionPath):
- output = k8sUtils.kubectl_delete(jobDescriptionPath)
-
- with open(jobDescriptionPath, 'w') as f:
- f.write(jobDescription)
-
- output = k8sUtils.kubectl_create(jobDescriptionPath)
- logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))
-
- ret["output"] = output
-
- ret["jobId"] = jobParams["jobId"]
-
-
- if "userName" not in jobParams:
- jobParams["userName"] = ""
-
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))
+ ret["jobId"] = job_object.job_id
+ dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus", "scheduling")
+ dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath", job_description_path)
+ dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription", base64.b64encode(job_description))
+ dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated", datetime.datetime.now().isoformat())
jobMeta = {}
- jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
- jobMeta["jobPath"] = jobParams["jobPath"]
- jobMeta["workPath"] = jobParams["workPath"]
- jobMeta["jobPath"] = jobParams["jobPath"]
- jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
+ jobMeta["jobDescriptionPath"] = job_description_path
+ jobMeta["jobPath"] = job_object.job_path
+ jobMeta["workPath"] = job_object.work_path
+ # the command of the first container
+ jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command
jobMetaStr = base64.b64encode(json.dumps(jobMeta))
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
+ dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta", jobMetaStr)
except Exception as e:
- print(e)
+ logging.error("Submit job failed: %s" % job, exc_info=True)
ret["error"] = str(e)
- retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
+ retries = dataHandler.AddandGetJobRetries(job["jobId"])
if retries >= 5:
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
- dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))
+ dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
+ dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "Cannot submit job!" + str(e))
dataHandler.Close()
return ret
-
-def SubmitPSDistJob(job):
- ret = {}
- dataHandler = DataHandler()
-
- try:
- jobParams = json.loads(base64.b64decode(job["jobParams"]))
- jobParams["rest-api"] = config["rest-api"]
- distJobParams = {}
- distJobParams["ps"] = []
- distJobParams["worker"] = []
- assignedRack = None
- if len(config["racks"]) > 0:
- assignedRack = random.choice(config["racks"])
-
- userAlias = getAlias(jobParams["userName"])
- jobParams["user_email"] = jobParams["userName"]
-
- jobParams["homeFolderHostpath"] = os.path.join(config["storage-mount-path"], GetWorkPath(userAlias))
-
- if jobParams["jobtrainingtype"] == "PSDistJob":
- jobDescriptionList = []
- nums = {"ps":int(jobParams["numps"]),"worker":int(jobParams["numpsworker"])}
- for role in ["ps","worker"]:
- for i in range(nums[role]):
- distJobParam=copy.deepcopy(jobParams)
- distJobParam["distId"] = "%s%d" % (role,i)
- distJobParam["distRole"] = role
- distJobParam["distRoleIdx"] = i
-
- if "jobPath" not in distJobParam or len(distJobParam["jobPath"].strip()) == 0:
- dataHandler.SetJobError(distJobParam["jobId"],"ERROR: job-path does not exist")
- return False
- if "workPath" not in distJobParam or len(distJobParam["workPath"].strip()) == 0:
- dataHandler.SetJobError(distJobParam["jobId"],"ERROR: work-path does not exist")
- return False
- #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
- # dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
- # return False
- distJobParam["distJobPath"] = os.path.join(distJobParam["jobPath"],distJobParam["distId"])
- jobPath,workPath,dataPath = GetStoragePath(distJobParam["distJobPath"],distJobParam["workPath"],distJobParam["dataPath"])
-
- localJobPath = os.path.join(config["storage-mount-path"],jobPath)
- if not os.path.exists(localJobPath):
- if "userId" in distJobParam:
- mkdirsAsUser(localJobPath,distJobParam["userId"])
- else:
- mkdirsAsUser(localJobPath,0)
-
- # TODO ???
- if "cmd" not in distJobParam:
- distJobParam["cmd"] = ""
-
-#change ssh folder permission here because the setup permission script in launch_ps_job function may have race condition with init_user.sh script. results in no such user error
- if role == "ps":
- launchCMD = """
-#!/bin/bash
-echo "[DLWorkspace System]: Waiting for all containers are ready..."
-while [ ! -f /opt/run_dist_job ]; do
- sleep 3
-done
-
-sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
-sudo chmod 700 /home/%s/.ssh &>/dev/null;
-sudo chown -R %s /home/%s/.ssh &>/dev/null;
-
-sudo mkdir -p /root/.ssh &>/dev/null ;
-sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
-sudo mkdir -p /opt &>/dev/null;
-sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;
-
-JOB_DIR='/home/%s'
-WORKER_NUM=%s
-echo $JOB_DIR $WORKER_NUM
-
-all_workers_ready=false
-while [ "$all_workers_ready" != true ]
-do
- # update it to false if any woker is not ready
- all_workers_ready=true
-
- for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
- do
- worker="worker${i}"
- file="$JOB_DIR/${worker}/WORKER_READY"
- #echo $file
-
- if [ ! -f $file ]; then
- echo "${worker} not ready!"
- all_workers_ready=false
- sleep 10
- fi
- done
-done
-
-echo "[DLWorkspace System]: All containers are ready, launching training job..."
-%s
-""" % (userAlias,userAlias,userAlias,userAlias,userAlias,distJobParam["jobPath"],jobParams["numpsworker"],distJobParam["cmd"])
- else:
- launchCMD = """
-while [ ! -f /opt/run_dist_job ]; do
- sleep 3
-done
-sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
-sudo chmod 700 /home/%s/.ssh &>/dev/null;
-sudo chown -R %s /home/%s/.ssh &>/dev/null;
-sudo mkdir -p /root/.ssh &>/dev/null;
-sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
-sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;
-
-# TODO mark the worker as 'READY', better to change to '/pod/READY' later
-sudo touch /job/WORKER_READY
-
-sleep infinity
-""" % (userAlias,userAlias,userAlias,userAlias,userAlias)
-
-
- launchScriptPath = os.path.join(localJobPath,"launch-%s-%s%d.sh" % (distJobParam["jobId"],role,i))
- # TODO need to set up user for distribute jobs
- with open(launchScriptPath, 'w') as f:
- f.write(launchCMD)
- f.close()
-
-
- launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (distJobParam["jobId"],role,i)
-
- distJobParam["LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer
-
- distJobParam["jobNameLabel"] = ''.join(e for e in distJobParam["jobName"] if e.isalnum())
- ENV = Environment(loader=FileSystemLoader("/"))
-
- jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
- jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")
-
- distJobParam["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
- distJobParam["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
- distJobParam["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
- distJobParam["nvidiaDriverPath"] = nvidiaDriverPath
-
- if "mountpoints" not in distJobParam:
- distJobParam["mountpoints"] = []
-
- # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
- distJobParam["mountpoints"].append({"name":"job","containerPath":"/job","hostPath":distJobParam["hostjobPath"]})
- distJobParam["mountpoints"].append({"name":"work","containerPath":"/work","hostPath":distJobParam["hostworkPath"]})
- distJobParam["mountpoints"].append({"name":"data","containerPath":"/data","hostPath":distJobParam["hostdataPath"]})
-
- for idx in range(len(distJobParam["mountpoints"])):
- if "name" not in distJobParam["mountpoints"][idx]:
- distJobParam["mountpoints"][idx]["name"] = str(uuid.uuid4()).replace("-","")
-
-
- distJobParam["pod_ip_range"] = config["pod_ip_range"]
- if "usefreeflow" in config:
- distJobParam["usefreeflow"] = config["usefreeflow"]
- else:
- distJobParam["usefreeflow"] = False
-
- distJobParam["numworker"] = int(jobParams["numpsworker"])
- distJobParam["numps"] = int(jobParams["numps"])
-
-
-
- random.seed(datetime.datetime.now())
- if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
- distJobParam["containerPort"] = random.randint(40000, 49999)
- else:
- distJobParam["containerPort"] = int(random.random()*1000+3000)
-
- if assignedRack is not None:
- if "nodeSelector" not in distJobParam:
- distJobParam["nodeSelector"] = {}
- distJobParam["nodeSelector"]["rack"] = assignedRack
-
- if "gpuType" in distJobParam:
- if "nodeSelector" not in distJobParam:
- distJobParam["nodeSelector"] = {}
- distJobParam["nodeSelector"]["gpuType"] = distJobParam["gpuType"]
-
- # inject gid, uid and user
- # TODO it should return only one entry
- user_info = dataHandler.GetIdentityInfo(jobParams["userName"])[0]
- distJobParam["gid"] = user_info["gid"]
- distJobParam["uid"] = user_info["uid"]
- distJobParam["user"] = userAlias
-
- template = ENV.get_template(os.path.abspath(jobTemp))
- job_description = template.render(job=distJobParam)
-
- jobDescriptionList.append(job_description)
-
- distJobParams[role].append(distJobParam)
-
-
- jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
- jobDescription = "\n---\n".join(jobDescriptionList)
-
-
- jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
- if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
- os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
- if os.path.isfile(jobDescriptionPath):
- output = k8sUtils.kubectl_delete(jobDescriptionPath)
-
- with open(jobDescriptionPath, 'w') as f:
- f.write(jobDescription)
-
- output = k8sUtils.kubectl_create(jobDescriptionPath)
-
- ret["output"] = output
-
- ret["jobId"] = jobParams["jobId"]
-
-
- if "userName" not in jobParams:
- jobParams["userName"] = ""
-
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))
-
-
- jobMeta = {}
- jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
- jobMeta["jobPath"] = jobParams["jobPath"]
- jobMeta["workPath"] = jobParams["workPath"]
- jobMeta["jobPath"] = jobParams["jobPath"]
- jobMeta["LaunchCMD"] = jobParams["cmd"]
- jobMeta["distJobParams"] = distJobParams
-
- jobMetaStr = base64.b64encode(json.dumps(jobMeta))
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
- except Exception as e:
- import traceback
- traceback.print_exc()
- print(e)
- ret["error"] = str(e)
- retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
- if retries >= 5:
- dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
- dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))
- dataHandler.Close()
- return ret
-
-def KillJob(job, desiredState="killed"):
+def KillJob(job_id, desiredState="killed"):
dataHandler = DataHandler()
- result, detail = k8sUtils.GetJobStatus(job["jobId"])
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail)))
- logging.info("Killing job %s, with status %s, %s" %(job["jobId"], result,detail))
- if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None:
- jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"])
- if os.path.isfile(jobDescriptionPath):
- if k8sUtils.kubectl_delete(jobDescriptionPath) == 0:
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus", desiredState)
- return True
- else:
- dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot delete job from Kubernetes Cluster!")
+ result, detail = k8sUtils.GetJobStatus(job_id)
+ dataHandler.UpdateJobTextField(job_id, "jobStatusDetail", base64.b64encode(json.dumps(detail)))
+ logging.info("Killing job %s, with status %s, %s" % (job_id, result, detail))
+
+ job_deployer = JobDeployer()
+ errors = job_deployer.delete_job(job_id, force=True)
+
+ if len(errors) == 0:
+ dataHandler.UpdateJobTextField(job_id, "jobStatus", desiredState)
+ dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat())
+ dataHandler.Close()
+ return True
else:
- dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot find job description file!")
-
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error")
- dataHandler.Close()
- return False
-
-
-def getAlias(username):
- if "@" in username:
- username = username.split("@")[0].strip()
-
- if "/" in username:
- username = username.split("/")[1].strip()
-
- return username
+ dataHandler.UpdateJobTextField(job_id, "jobStatus", "error")
+ dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat())
+ dataHandler.Close()
+ logging.error("Kill job failed with errors: {}".format(errors))
+ return False
-def ApproveJob(job):
+def ApproveJob(job_id):
dataHandler = DataHandler()
- dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "queued")
+ dataHandler.UpdateJobTextField(job_id, "jobStatus", "queued")
dataHandler.Close()
return True
-def AutoApproveJob(job):
- # TODO: All jobs are currently auto-approved. We need to allow
- # configuring different policies for different VC.
- ApproveJob(job)
-
- # This block is kept here for reference of the original code.
- # cluster_status = get_cluster_status()
- # jobUser = getAlias(job["userName"])
- # jobParams = json.loads(base64.b64decode(job["jobParams"]))
- # jobGPU = GetJobTotalGpu(jobParams)
- #
- # currentGPU = 0
- # for user in cluster_status["user_status"]:
- # if user["userName"] == jobUser:
- # currentGPU = int(user["userGPU"])
- #
- # if True or currentGPU == 0 or currentGPU + jobGPU <= 4:
- # ApproveJob(job)
-
-
UnusualJobs = {}
-def UpdateJobStatus(job):
+def UpdateJobStatus(job, notifier=None):
+ assert(job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
dataHandler = DataHandler()
jobParams = json.loads(base64.b64decode(job["jobParams"]))
- if job["jobStatus"] == "scheduling" and jobParams["jobtrainingtype"] == "PSDistJob":
- # launch user command only all pods are ready
- result, detail = k8sUtils.GetJobStatus(job["jobId"])
- if result in ["Failed", "Succeeded"]:
- # TODO shoudn't be here, update status
- dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result)
- pass
- else:
- # previously status is 'scheduling', and now all pods are ready
- # TODO check all pods are ready
- if k8sUtils.all_pod_ready(job["jobId"]):
- try:
- launch_ps_dist_job(jobParams)
- except Exception as e:
- print(e)
- return
-
- jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])
- localJobPath = os.path.join(config["storage-mount-path"],jobPath)
- logPath = os.path.join(localJobPath,"logs/joblog.txt")
-
+ result = check_job_status(job["jobId"])
+ logging.info("++++++++ Job status: {} {}".format(job["jobId"], result))
- result, detail = k8sUtils.GetJobStatus(job["jobId"])
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail)))
-
- logging.info("job %s status: %s,%s" % (job["jobId"], result, json.dumps(detail)))
+ jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"])
+ localJobPath = os.path.join(config["storage-mount-path"], jobPath)
+ logPath = os.path.join(localJobPath, "logs/joblog.txt")
jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
if "userId" not in jobParams:
- jobParams["userId"] = "0"
- if result.strip() == "Succeeded":
- joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"])
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","finished")
+ jobParams["userId"] = "0"
+
+ if result == "Succeeded":
+ joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"])
+ dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath):
k8sUtils.kubectl_delete(jobDescriptionPath)
- elif result.strip() == "Running":
+
+ if notifier is not None:
+ notifier.notify(notify.new_job_state_change_message(
+ job["userName"], job["jobId"], result.strip()))
+ elif result == "Running":
if job["jobStatus"] != "running":
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","running")
+ started_at = datetime.datetime.now().isoformat()
+ detail = [{"startedAt": started_at, "message": "started at: {}".format(started_at)}]
+ dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail", base64.b64encode(json.dumps(detail)))
+ dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "running")
+
+ elif result == "Failed":
+ logging.warning("Job %s fails, cleaning...", job["jobId"])
+
+ if notifier is not None:
+ notifier.notify(notify.new_job_state_change_message(
+ job["userName"], job["jobId"], result.strip()))
+
+ joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"])
+ dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
+ dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed")
- elif result.strip() == "Failed":
- printlog("Job %s fails, cleaning..." % job["jobId"])
- joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"])
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","failed")
- dataHandler.UpdateJobTextField(job["jobId"],"errorMsg",detail)
if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath):
k8sUtils.kubectl_delete(jobDescriptionPath)
- elif result.strip() == "Unknown":
+ elif result == "Unknown" or result == "NotFound":
if job["jobId"] not in UnusualJobs:
+ logging.warning("!!! Job status ---{}---, job: {}".format(result, job["jobId"]))
UnusualJobs[job["jobId"]] = datetime.datetime.now()
- elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 300:
+ # TODO
+ # 1) May need to reduce the timeout.
+ # It takes minutes before pod turns into "Unknown", we may don't need to wait so long.
+ # 2) If node resume before we resubmit the job, the job will end in status 'NotFound'.
+ elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 30:
del UnusualJobs[job["jobId"]]
- retries = dataHandler.AddandGetJobRetries(job["jobId"])
- if retries >= 5:
- printlog("Job %s fails for more than 5 times, abort" % job["jobId"])
- dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error")
- dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","cannot launch the job.")
- if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath):
- k8sUtils.kubectl_delete(jobDescriptionPath)
- else:
- printlog("Job %s fails in Kubernetes, delete and re-submit the job. Retries %d" % (job["jobId"] , retries))
- SubmitJob(job)
- elif result.strip() == "PendingHostPort":
- printlog("Cannot find host ports for job :%s, re-launch the job with different host ports " % (job["jobId"]))
- SubmitJob(job)
+ # TODO refine later
+ # before resubmit the job, reset the endpoints
+ # update all endpoint to status 'pending', so it would restart when job is ready
+ endpoints = dataHandler.GetJobEndpoints(job["jobId"])
+ for endpoint_id, endpoint in endpoints.items():
+ endpoint["status"] = "pending"
+ logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id))
+ dataHandler.UpdateEndpoint(endpoint)
- if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
+ logging.warning("Job {} fails in Kubernetes as {}, delete and re-submit.".format(job["jobId"], result))
+ KillJob(job["jobId"], "queued")
+
+ if result != "Unknown" and result != "NotFound" and job["jobId"] in UnusualJobs:
del UnusualJobs[job["jobId"]]
dataHandler.Close()
-def run_dist_cmd_on_pod(podId, cmd, outputfile):
- remotecmd = "exec %s -- %s" % (podId,cmd)
- print(remotecmd)
- k8sUtils.kubectl_exec_output_to_file(remotecmd,outputfile)
-
+# TODO refine later
+def check_job_status(job_id):
+ job_deployer = JobDeployer()
+ job_roles = JobRole.get_job_roles(job_id)
-class Kube_RemoteCMD_Thread(threading.Thread):
- def __init__(self, jobId, podId, cmd, outputfile):
- threading.Thread.__init__(self)
- self.jobId = jobId
- self.podId = podId
- self.cmd = cmd
- self.outputfile = outputfile
- def run(self):
- run_dist_cmd_on_pod(self.podId, self.cmd, self.outputfile)
+ if len(job_roles) < 1:
+ return "NotFound"
-
-# TODO remove duplicate code later
-def is_ssh_server_ready(pod_name):
- bash_script = "sudo service ssh status"
- output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script))
- if output == "":
- return False
- return True
-
-# TODO remove duplicate code later
-def query_ssh_port(pod_name):
- bash_script = "grep ^Port /etc/ssh/sshd_config | cut -d' ' -f2"
- ssh_port = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script))
- return int(ssh_port)
-
-# TODO remove duplicate code later
-def start_ssh_server(pod_name, user_name, host_network=False, ssh_port=22):
- '''Setup the ssh server in container, and return the listening port.'''
- bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'"
-
- # ssh_port = 22
-
- # modify the script for HostNewtork
- if host_network:
- # if the ssh_port is default value 22, randomly choose one
- if ssh_port == 22:
- ssh_port = random.randint(40000, 49999)
- # bash_script = "sed -i '/^Port 22/c Port "+str(ssh_port)+"' /etc/ssh/sshd_config && "+bash_script
- # TODO refine the script later
- bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && sed -i \"s/^Port 22/Port " + str(ssh_port) + "/\" /etc/ssh/sshd_config && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'"
-
- # TODO setup reasonable timeout
- # output = k8sUtils.kubectl_exec("exec %s %s" % (jobId, " -- " + bash_script), 1)
- output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script))
- if output == "":
- raise Exception("Failed to setup ssh server in container. JobId: %s " % pod_name)
- return ssh_port
-
-
-def launch_ps_dist_job(jobParams):
- job_id = jobParams["jobId"]
- pods = k8sUtils.GetPod("run=" + job_id)
-
- # if any pod is not up, return
- if "items" not in pods or len(pods["items"]) != (int(jobParams["numpsworker"]) + int(jobParams["numps"])):
- return
- # if any pod is not ready, return
- pod_status = [k8sUtils.check_pod_status(pod) for pod in pods["items"]]
- if any([status != "Running" for status in pod_status]):
- return
-
- user_name = getAlias(jobParams["userName"])
- if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
- host_network = True
- else:
- host_network = False
-
- # setup ssh server
- for [idx, pod] in enumerate(pods["items"]):
- pod_name = pod["metadata"]["name"]
- dist_port = pod["metadata"]["labels"]["distPort"]
- # quit if can't setup ssh server
- ssh_port = start_ssh_server(pod_name, user_name, host_network, dist_port)
-
- # generate ssh config
- ssh_config = """
-Host %s
- HostName %s
- Port %s
- User %s
- StrictHostKeyChecking no
- UserKnownHostsFile /dev/null
- """
- sshconfigstr = ""
- for [idx, pod] in enumerate(pods["items"]):
- pod_ip = pod["status"]["podIP"]
- dist_port = pod["metadata"]["labels"]["distPort"]
- role = pod["metadata"]["labels"]["distRole"]
- role_idx = pod["metadata"]["labels"]["distRoleIdx"]
-
- # TODO hostNetwork
- if host_network:
- sshconfigstr += (ssh_config % (role + "-"+str(role_idx), pod_ip, str(dist_port), user_name) + "\n")
- else:
- sshconfigstr += (ssh_config % (role + "-"+str(role_idx), pod_ip, 22, user_name) + "\n")
-
- # config ssh client
- for [idx, pod] in enumerate(pods["items"]):
- pod_name = pod["metadata"]["name"]
- bash_script = "cat > /home/" + user_name + "/.ssh/config < WORKER_READY -> JOB_READY (then the job finally in "Running" status.)
+ """
+ # pod-phase: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+ # node condition: https://kubernetes.io/docs/concepts/architecture/nodes/#condition
+ deployer = JobDeployer()
+ pods = deployer.get_pods(field_selector="metadata.name={}".format(self.pod_name))
+ logging.debug("Pods: {}".format(pods))
+ if(len(pods) < 1):
+ return "NotFound"
+
+ assert(len(pods) == 1)
+ pod = pods[0]
+ phase = pod.status.phase
+
+ # !!! Pod is running, doesn't mean "Role" is ready and running.
+ if(phase == "Running"):
+ # Found that phase won't turn into "Unknown" even when we get 'unknown' from kubectl
+ if pod.status.reason == "NodeLost":
+ return "Unknown"
+
+ # Check if the user command has been run.
+ if not self.isRoleReady():
+ return "Pending"
+
+ return phase
+
+ def isFileExisting(self, file):
+ deployer = JobDeployer()
+ status_code, _ = deployer.pod_exec(self.pod_name, ["/bin/sh", "-c", "ls -lrt {}".format(file)])
+ return status_code == 0
+
+ def isRoleReady(self):
+ return self.isFileExisting(JobRole.MARK_ROLE_READY_FILE)
diff --git a/src/ClusterManager/job_status.pdf b/src/ClusterManager/job_status.pdf
new file mode 100644
index 000000000..c9756f120
Binary files /dev/null and b/src/ClusterManager/job_status.pdf differ
diff --git a/src/ClusterManager/joblog_manager.py b/src/ClusterManager/joblog_manager.py
index b10630c1e..b43232a3b 100755
--- a/src/ClusterManager/joblog_manager.py
+++ b/src/ClusterManager/joblog_manager.py
@@ -23,8 +23,6 @@
from multiprocessing import Process, Manager
-
-
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage"))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils"))
@@ -34,10 +32,13 @@
from config import config, GetStoragePath
from DataHandler import DataHandler
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
-def create_log( logdir = '/var/log/dlworkspace' ):
+logger = logging.getLogger(__name__)
+
+def create_log(logdir = '/var/log/dlworkspace'):
if not os.path.exists( logdir ):
- os.system("mkdir -p " + logdir )
+ os.system("mkdir -p " + logdir)
with open('logging.yaml') as f:
logging_config = yaml.load(f)
f.close()
@@ -109,7 +110,7 @@ def extract_job_log(jobId,logPath,userId):
f.close()
os.system("chown -R %s %s" % (userId, containerLogPath))
except Exception as e:
- print e
+ logger.exception("write container log failed")
if len(trimlogstr.strip()) > 0:
@@ -149,15 +150,24 @@ def update_job_logs():
def Run():
+ register_stack_trace_dump()
create_log()
logging.info("start to update job logs ...")
while True:
- try:
- update_job_logs()
- except Exception as e:
- print e
+ update_file_modification_time("joblog_manager")
+
+ with manager_iteration_histogram.labels("joblog_manager").time():
+ try:
+ update_job_logs()
+ except Exception as e:
+ logger.exception("update job logs failed")
time.sleep(1)
if __name__ == '__main__':
- Run()
\ No newline at end of file
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9203)
+ args = parser.parse_args()
+ setup_exporter_thread(args.port)
+
+ Run()
diff --git a/src/ClusterManager/logging.yaml b/src/ClusterManager/logging.yaml
index b276c6d8d..a486bc5aa 100755
--- a/src/ClusterManager/logging.yaml
+++ b/src/ClusterManager/logging.yaml
@@ -1,26 +1,27 @@
-version: 1
-formatters:
- simple:
- format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-handlers:
- console:
- class: logging.StreamHandler
- level: DEBUG
- formatter: simple
- stream: ext://sys.stdout
- file:
- class : logging.handlers.RotatingFileHandler
- formatter: simple
- filename: /var/log/dlworkspace/clustermanager.log
- # roll over at 10MB
- maxBytes: 10240000
- # At most 10 logging files
- backupCount: 10
-loggers:
- basic:
- level: DEBUG
- handlers: ['console','file']
- propagate: no
-root:
- level: DEBUG
- handlers: ['console','file']
\ No newline at end of file
+version: 1
+disable_existing_loggers: False
+formatters:
+ simple:
+ format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
+handlers:
+ console:
+ class: logging.StreamHandler
+ level: INFO
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class : logging.handlers.RotatingFileHandler
+ formatter: simple
+ filename: /var/log/dlworkspace/clustermanager.log
+ # roll over at 10MB
+ maxBytes: 10240000
+ # At most 10 logging files
+ backupCount: 10
+loggers:
+ basic:
+ level: INFO
+ handlers: ['console','file']
+ propagate: no
+root:
+ level: INFO
+ handlers: ['console','file']
diff --git a/src/ClusterManager/node_manager.py b/src/ClusterManager/node_manager.py
index 326a6a2ba..fb0de3193 100755
--- a/src/ClusterManager/node_manager.py
+++ b/src/ClusterManager/node_manager.py
@@ -39,11 +39,12 @@
from config import config
from DataHandler import DataHandler
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
-def create_log( logdir = '/var/log/dlworkspace' ):
- if not os.path.exists( logdir ):
- os.system("mkdir -p " + logdir )
+def create_log(logdir = '/var/log/dlworkspace'):
+ if not os.path.exists(logdir):
+ os.system("mkdir -p " + logdir)
with open('logging.yaml') as f:
logging_config = yaml.load(f)
f.close()
@@ -139,7 +140,7 @@ def get_cluster_status():
node_status["unschedulable"] = False
if "status" in node and "conditions" in node["status"]:
- for condi in node["status"]:
+ for condi in node["status"]["conditions"]:
if "type" in condi and condi["type"] == "Ready" and "status" in condi and condi["status"] == "Unknown":
node_status["unschedulable"] = True
@@ -203,12 +204,13 @@ def get_cluster_status():
for node_name, node_status in nodes_status.iteritems():
if node_status["unschedulable"]:
gpu_unschedulable.Add(ResourceInfo(node_status["gpu_capacity"]))
+ gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_used"])))
else:
gpu_avaliable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_allocatable"]), ResourceInfo(node_status["gpu_used"])))
gpu_schedulable.Add(ResourceInfo(node_status["gpu_capacity"]))
gpu_unschedulable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"])))
+ gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"])))
- gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"])))
gpu_used.Add(ResourceInfo(node_status["gpu_used"]))
gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"]))
@@ -224,7 +226,7 @@ def get_cluster_status():
cluster_status["node_status"] = [node_status for node_name, node_status in nodes_status.iteritems()]
except Exception as e:
- print(e)
+ logging.exception("get cluster status")
dataHandler = DataHandler()
cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
@@ -241,16 +243,25 @@ def get_cluster_status():
def Run():
+ register_stack_trace_dump()
create_log()
logging.info("start to update nodes usage information ...")
config["cluster_status"] = None
+
while True:
- try:
- get_cluster_status()
- except Exception as e:
- print e
- logging.info(str(e))
+ update_file_modification_time("node_manager")
+
+ with manager_iteration_histogram.labels("node_manager").time():
+ try:
+ get_cluster_status()
+ except Exception as e:
+ logging.exception("get cluster status failed")
time.sleep(30)
if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9202)
+ args = parser.parse_args()
+ setup_exporter_thread(args.port)
+
Run()
diff --git a/src/ClusterManager/pod_template.py b/src/ClusterManager/pod_template.py
new file mode 100644
index 000000000..0de62e5e4
--- /dev/null
+++ b/src/ClusterManager/pod_template.py
@@ -0,0 +1,144 @@
+import os
+import sys
+import json
+import yaml
+from jinja2 import Template
+from job import Job
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+from osUtils import mkdirsAsUser
+
+
+class PodTemplate():
+ def __init__(self, template, enable_custom_scheduler=False):
+ self.template = template
+ self.enable_custom_scheduler = enable_custom_scheduler
+
+ @staticmethod
+ def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script):
+ if not os.path.exists(path_to_save):
+ mkdirsAsUser(path_to_save, user_id)
+
+ file_name = "job_command.sh"
+ launch_script_file = os.path.join(path_to_save, file_name)
+ with open(launch_script_file, 'w') as f:
+ f.write(user_script)
+ os.system("sudo chown %s %s" % (user_id, launch_script_file))
+ luanch_cmd = ["bash", "/pod/scripts/bootstrap.sh"]
+ return luanch_cmd
+
+ def generate_pod(self, pod):
+ assert(isinstance(self.template, Template))
+ if self.enable_custom_scheduler:
+ if "useGPUTopology" in pod and pod["useGPUTopology"]:
+ gpu_topology_flag = 1
+ else:
+ # for cases when desired topology is explicitly given or not desired
+ gpu_topology_flag = 0
+ pod_name = pod["podName"]
+ request_gpu = int(pod["gpuLimit"])
+
+ podInfo = {
+ "podname": pod_name,
+ "requests": {
+ "alpha.gpu/gpu-generate-topology": gpu_topology_flag
+ },
+ "runningcontainer": {
+ pod_name: {
+ "requests": {"alpha.gpu/numgpu": request_gpu}
+ },
+ },
+ }
+
+ if "annotations" not in pod:
+ pod["annotations"] = {}
+ pod["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
+ # gpu requests specified through annotation
+ pod["gpuLimit"] = 0
+
+ pod_yaml = self.template.render(job=pod)
+ return yaml.full_load(pod_yaml)
+
+ def generate_pods(self, job):
+ """
+ Return (pods, errors)
+ """
+
+ assert(isinstance(job, Job))
+ params = job.params
+ if any(required_field not in params for required_field in
+ [
+ "jobtrainingtype",
+ "jobName",
+ "jobPath",
+ "workPath",
+ "dataPath",
+ "cmd",
+ "userId",
+ "resourcegpu",
+ "userName",
+ ]):
+ return None, "Missing required parameters!"
+
+ job.job_path = params["jobPath"]
+ job.work_path = params["workPath"]
+ job.data_path = params["dataPath"]
+ # TODO user's mountpoints first, but should after 'job_path'
+ job.add_mountpoints(job.job_path_mountpoint())
+ if "mountpoints" in params:
+ job.add_mountpoints(params["mountpoints"])
+ job.add_mountpoints(job.work_path_mountpoint())
+ job.add_mountpoints(job.data_path_mountpoint())
+ params["mountpoints"] = job.mountpoints
+
+ params["user_email"] = params["userName"]
+ params["homeFolderHostpath"] = job.get_homefolder_hostpath()
+ params["pod_ip_range"] = job.get_pod_ip_range()
+ params["usefreeflow"] = job.is_freeflow_enabled()
+ params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum())
+ params["rest-api"] = job.get_rest_api_url()
+
+ if "nodeSelector" not in params:
+ params["nodeSelector"] = {}
+ if "gpuType" in params:
+ params["nodeSelector"]["gpuType"] = params["gpuType"]
+
+ local_pod_path = job.get_hostpath(job.job_path, "master")
+ params["LaunchCMD"] = PodTemplate.generate_launch_script(params["jobId"], local_pod_path, params["userId"], params["resourcegpu"], params["cmd"])
+
+ if "envs" not in params:
+ params["envs"] =[]
+ params["envs"].append({"name": "DLWS_ROLE_NAME", "value": "master"})
+ params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]})
+
+ pods = []
+ if all(hyper_parameter in params for hyper_parameter in ["hyperparametername", "hyperparameterstartvalue", "hyperparameterendvalue", "hyperparameterstep"]):
+ env_name = params["hyperparametername"]
+ start = int(params["hyperparameterstartvalue"])
+ end = int(params["hyperparameterendvalue"])
+ step = int(params["hyperparameterstep"])
+
+ for idx, val in enumerate(range(start, end, step)):
+ pod = params.copy()
+ pod["podName"] = "{0}-pod-{1}".format(job.job_id, idx)
+ pod["envs"].append({"name": env_name, "value": val})
+ pods.append(pod)
+ else:
+ pod = params.copy()
+ pod["podName"] = job.job_id
+ pods.append(pod)
+
+ k8s_pods = []
+ for pod in pods:
+ pod["numps"] = 0
+ pod["numworker"] = 1
+ pod["fragmentGpuJob"] = True
+ pod["gpuLimit"] = pod["resourcegpu"]
+
+ # mount /pod
+ pod_path = job.get_hostpath(job.job_path, "master")
+ pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": pod_path, "enabled": True})
+
+ k8s_pod = self.generate_pod(pod)
+ k8s_pods.append(k8s_pod)
+ return k8s_pods, None
diff --git a/src/ClusterManager/requirements.txt b/src/ClusterManager/requirements.txt
new file mode 100644
index 000000000..f1363b2db
--- /dev/null
+++ b/src/ClusterManager/requirements.txt
@@ -0,0 +1,5 @@
+marshmallow==2.19.5
+kubernetes==9.0.0
+PyYAML>=5.1.1
+prometheus-client==0.7.1
+twisted==19.2.1
diff --git a/src/ClusterManager/test_job.py b/src/ClusterManager/test_job.py
new file mode 100644
index 000000000..76fa8e299
--- /dev/null
+++ b/src/ClusterManager/test_job.py
@@ -0,0 +1,176 @@
+import unittest
+import json
+import sys
+import os
+from job import Job, JobSchema
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+from config import config
+
+
+VALID_JOB_ATTRIBUTES = {
+ "cluster": config,
+ "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c",
+ "userName": "user@foo.com",
+ "jobPath": "user_alias/jobs/date/job_id",
+}
+
+
+class TestJobSchema(unittest.TestCase):
+
+ def test_loads(self):
+ job_json = json.dumps(VALID_JOB_ATTRIBUTES)
+
+ job, errors = JobSchema().loads(job_json)
+ self.assertFalse(errors)
+ self.assertEqual(job.job_id, VALID_JOB_ATTRIBUTES["jobId"])
+ self.assertEqual(job.email, VALID_JOB_ATTRIBUTES["userName"])
+
+ def test_job_id_schema(self):
+ job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
+ self.assertFalse(errors)
+
+ # uppercase
+ attrs = VALID_JOB_ATTRIBUTES.copy()
+ attrs.update({"jobId": "First-job"})
+ job, errors = JobSchema().load(attrs)
+ self.assertTrue("jobId" in errors)
+
+ # space
+ attrs = VALID_JOB_ATTRIBUTES.copy()
+ attrs.update({"jobId": "first job"})
+ job, errors = JobSchema().load(attrs)
+ self.assertTrue("jobId" in errors)
+
+ def test_dump(self):
+ job = Job(
+ cluster=config,
+ job_id="test-job",
+ email="user@foo.com"
+ )
+
+ result, errors = JobSchema().dump(job)
+
+ self.assertFalse(errors)
+ self.assertEqual(result["jobId"], "test-job")
+ self.assertEqual(result["userName"], "user@foo.com")
+
+
+class TestJob(unittest.TestCase):
+
+ def create_a_job(self):
+ job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
+ self.assertFalse(errors)
+ return job
+
+ def test_add_mountpoints_with_none(self):
+ job = self.create_a_job()
+ job.add_mountpoints(None)
+
+ def test_add_mountpoints_without_name(self):
+ job = self.create_a_job()
+
+ # add one mountpoint without "name"
+ mountpoint1 = {
+ "enabled": True,
+ "containerPath": "/home/username",
+ "hostPath": "/dlwsdata/work/username",
+ }
+ job.add_mountpoints(mountpoint1)
+ self.assertEqual(1, len(job.mountpoints))
+
+ def test_add_mountpoints(self):
+ job = self.create_a_job()
+
+ # add one mountpoint
+ mountpoint1 = {
+ "enabled": True,
+ "containerPath": "/home/username",
+ "hostPath": "/dlwsdata/work/username",
+ "name": "homefolder"
+ }
+ job.add_mountpoints(mountpoint1)
+ self.assertEqual(1, len(job.mountpoints))
+
+ # would silently skip
+ job.add_mountpoints(mountpoint1)
+ self.assertEqual(1, len(job.mountpoints))
+
+ # name would be normalized, only allow alphanumeric, so it would be a duplicate
+ mountpoint1a = {
+ "enabled": True,
+ "containerPath": "/home/path",
+ "hostPath": "/dlwsdata/work/path",
+ "name": "homefolder-"
+ }
+ job.add_mountpoints(mountpoint1a)
+ self.assertEqual(1, len(job.mountpoints))
+
+ # add another mountpoint
+ mountpoint2 = {
+ "enabled": True,
+ "containerPath": "/home/path1",
+ "hostPath": "/dlwsdata/work/path1",
+ "name": "homepath1"
+ }
+ job.add_mountpoints(mountpoint2)
+ self.assertEqual(2, len(job.mountpoints))
+
+ # add a list
+ mountpoints = [{
+ "enabled": True,
+ "containerPath": "/home/path2",
+ "hostPath": "/dlwsdata/work/path2",
+ "name": "homepath2"
+ }]
+ job.add_mountpoints(mountpoints)
+ self.assertEqual(3, len(job.mountpoints))
+
+ def test_get_homefolder_hostpath(self):
+ job = self.create_a_job()
+ self.assertEqual("/dlwsdata/work/user", job.get_homefolder_hostpath())
+
+ def test_get_hostpath(self):
+ job = self.create_a_job()
+ self.assertEqual("user_alias/jobs/date/job_id", job.job_path)
+ self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.get_hostpath(job.job_path))
+
+ def test_job_work_data_mountpoints(self):
+ job = self.create_a_job()
+
+ job.job_path = "user_alias/jobs/date/job_id"
+ job.work_path = "user_alias"
+ job.data_path = ""
+
+ self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.job_path_mountpoint()["hostPath"])
+ self.assertEqual("/dlwsdata/work/user_alias", job.work_path_mountpoint()["hostPath"])
+ self.assertEqual("/dlwsdata/storage/", job.data_path_mountpoint()["hostPath"])
+
+ job.add_mountpoints(job.job_path_mountpoint())
+ job.add_mountpoints(job.work_path_mountpoint())
+ job.add_mountpoints(job.data_path_mountpoint())
+ self.assertEquals(3, len(job.mountpoints))
+
+ def test_get_template(self):
+ job = self.create_a_job()
+
+ self.assertIsNotNone(job.get_template())
+
+ def test_is_custom_scheduler_enabled(self):
+ job = self.create_a_job()
+
+ self.assertFalse(job.is_custom_scheduler_enabled())
+
+ # TODO !!! notice, it would change all the 'cluster' settings
+ job.cluster["kube_custom_scheduler"] = True
+ self.assertTrue(job.is_custom_scheduler_enabled())
+
+ def test_get_rest_api_url(self):
+ job = self.create_a_job()
+
+ self.assertEqual("http://faked.uri/", job.get_rest_api_url())
+
+ def test_get_rack(self):
+ job = self.create_a_job()
+
+ self.assertEqual(None, job.get_rack())
diff --git a/src/ClusterManager/test_job_deployer.py b/src/ClusterManager/test_job_deployer.py
new file mode 100644
index 000000000..b5f0ff9df
--- /dev/null
+++ b/src/ClusterManager/test_job_deployer.py
@@ -0,0 +1,135 @@
+import unittest
+import kubernetes
+import yaml
+import string
+import random
+import time
+from kubernetes.client.rest import ApiException
+
+from job_deployer import JobDeployer
+
+import logging
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s",
+ handlers=[
+ logging.StreamHandler()
+ ]
+)
+
+
+class TestJobDeployer(unittest.TestCase):
+
+ def create_job_deployer(self):
+ job_deployer = JobDeployer()
+ self.assertIsNotNone(job_deployer)
+ return job_deployer
+
+ def create_pod(self, pod_name):
+ job_deployer = self.create_job_deployer()
+ raw_yaml = """
+apiVersion: v1
+kind: Pod
+metadata:
+ name: {}
+spec:
+ containers:
+ - name: busybox
+ image: busybox
+ args:
+ - sleep
+ - "1000000"
+ """.format(pod_name)
+ body = yaml.full_load(raw_yaml)
+
+ # with self.assertRaises(ApiException):
+ job_deployer.create_pod(body)
+
+ def test_delete_pod(self):
+ pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
+ self.create_pod(pod_name)
+
+ job_deployer = self.create_job_deployer()
+
+ job_deployer.delete_pod(pod_name)
+
+ def test_cleanup_pods(self):
+ job_deployer = self.create_job_deployer()
+ pod_names = ["pod-1", "pod-2"]
+
+ job_deployer.cleanup_pods(pod_names)
+
+ def test_get_pod_by_label(self):
+ job_deployer = self.create_job_deployer()
+ label_selector = "run=some_job_id"
+
+ pods = job_deployer.get_pods(label_selector=label_selector)
+
+ self.assertEqual(0, len(pods))
+
+ def test_get_services_by_label(self):
+ job_deployer = self.create_job_deployer()
+ label_selector = "run=some_job_id"
+
+ services = job_deployer.get_services_by_label(label_selector)
+
+ self.assertEqual(0, len(services))
+
+ def test_create_endpoint(self):
+ job_deployer = self.create_job_deployer()
+ raw_yaml = """
+apiVersion: v1
+kind: Service
+metadata:
+ name: test-service
+spec:
+ selector:
+ app: MyApp
+ ports:
+ - protocol: TCP
+ port: 80
+ targetPort: 9376
+ """
+ body = yaml.full_load(raw_yaml)
+
+ # with self.assertRaises(ApiException):
+ job_deployer.create_service(body)
+
+ def test_delete_service(self):
+ job_deployer = self.create_job_deployer()
+
+ job_deployer.delete_service("test-service")
+
+ def test_pod_exec(self):
+ job_deployer = self.create_job_deployer()
+
+ pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
+ self.create_pod(pod_name)
+ time.sleep(3)
+
+ exec_command = [
+ '/bin/sh',
+ '-c',
+ 'echo This message goes to stderr >&2 && echo This message goes to stdout'
+ ]
+
+ status_code, ouput = job_deployer.pod_exec(pod_name, exec_command)
+ self.assertEqual(0, status_code)
+
+ bad_command = [
+ '/bin/sh',
+ '-c',
+ 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8'
+ ]
+ status_code, ouput = job_deployer.pod_exec(pod_name, bad_command)
+ self.assertEqual(8, status_code)
+
+ bad_command = [
+ '/bin/sh',
+ '-c',
+ 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8'
+ ]
+ status_code, ouput = job_deployer.pod_exec(pod_name, bad_command, 1)
+ self.assertEqual(-1, status_code)
+
+ job_deployer.delete_pod(pod_name)
diff --git a/src/ClusterManager/test_job_role.py b/src/ClusterManager/test_job_role.py
new file mode 100644
index 000000000..919332053
--- /dev/null
+++ b/src/ClusterManager/test_job_role.py
@@ -0,0 +1,34 @@
+import unittest
+from job_role import JobRole
+
+
+class TestJobRole(unittest.TestCase):
+
+ def test_status_Running(self):
+ job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0")
+
+ role_status = job_role.status()
+ self.assertEqual("Running", role_status)
+
+ def test_status_NotFound(self):
+ job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0-not-found")
+
+ role_status = job_role.status()
+ self.assertEqual("NotFound", role_status)
+
+ def test_status_Pending(self):
+ # Pod is running, but mark file not existing: JobRole.MARK_ROLE_READY_FILE
+ job_role = JobRole("master", "nginx-cm7kf")
+
+ role_status = job_role.status()
+ self.assertEqual("Pending", role_status)
+
+ def test_get_job_roles_dist_job(self):
+ job_roles = JobRole.get_job_roles("bd3d090a-53b6-4616-9b6c-fe4a86fd68ea")
+
+ self.assertEqual(3, len(job_roles))
+
+ def test_get_job_roles_regular_job(self):
+ job_roles = JobRole.get_job_roles("8ca7fcdf-c4e7-4687-a3fa-1eeea97415c4")
+
+ self.assertEqual(1, len(job_roles))
diff --git a/src/ClusterManager/test_pod_template.py b/src/ClusterManager/test_pod_template.py
new file mode 100644
index 000000000..f7b00537b
--- /dev/null
+++ b/src/ClusterManager/test_pod_template.py
@@ -0,0 +1,196 @@
+import unittest
+import json
+import yaml
+import sys
+import os
+from job import Job, JobSchema
+from pod_template import PodTemplate
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
+from config import config
+
+VALID_JOB_ATTRIBUTES = {
+ "cluster": config,
+ "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c",
+ "userName": "user@foo.com",
+ "jobPath": "user_alias/jobs/date/job_id"
+}
+
+job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
+assert(not errors)
+
+
+class TestPodTemplate(unittest.TestCase):
+
+ def test_generate_launch_script(self):
+ job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c"
+ path_to_save = "/tmp"
+ user_id = "20000"
+ gpu_num = 3
+ user_script = "sleep infinity"
+
+ script_file = PodTemplate.generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script)
+
+ # return the container command
+ self.assertListEqual(["bash", "/pod/scripts/bootstrap.sh"], script_file)
+
+ def test_pod_template_without_custer_scheduler(self):
+ enable_custom_scheduler = False
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ pod = {"gpuLimit": 2}
+ data = pod_template.generate_pod(pod)
+
+ # not enabled custom scheduler, set the resource limits: spec.containers[].resources.limits
+ self.assertEqual(pod["gpuLimit"], data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"])
+ # metadata.annotations["pod.alpha/DeviceInformation"] should be empty
+ self.assertTrue(("annotations" not in data["metadata"]) or ("pod.alpha/DeviceInformation" not in data["metadata"]["annotations"]))
+
+ def test_generate_pod_with_envs(self):
+ enable_custom_scheduler = False
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ pod = {
+ "gpuLimit": 2,
+ "envs": [{"name": "my_env_name", "value": "my_env_value"}],
+ }
+ data = pod_template.generate_pod(pod)
+
+ self.assertIn({"name": "my_env_name", "value": "my_env_value"}, data["spec"]["containers"][0]["env"])
+
+ def test_generate_pod_with_labels(self):
+ enable_custom_scheduler = False
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ pod = {
+ "gpuLimit": 2,
+ "labels": [{"name": "my_label_name", "value": "my_label_value"}],
+ }
+ data = pod_template.generate_pod(pod)
+
+ self.assertEqual("my_label_value", data["metadata"]["labels"]["my_label_name"])
+
+ def test_pod_template_with_custom_scheduler(self):
+ enable_custom_scheduler = True
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ gpu_num = 2
+ pod = {
+ "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0",
+ "gpuLimit": gpu_num,
+ }
+ data = pod_template.generate_pod(pod)
+
+ # enabled custom scheduler would clear the resource limits: spec.containers[].resources.limits
+ self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"])
+
+ # metadata.annotations["pod.alpha/DeviceInformation"] should be set
+ # annotations = data["metadata"]["annotations"]
+ device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"])
+ self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"])
+ # disabled topology
+ self.assertEqual(0, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"])
+
+ def test_pod_template_with_custom_scheduler_use_topology(self):
+ enable_custom_scheduler = True
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ gpu_num = 2
+ pod = {
+ "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0",
+ "gpuLimit": gpu_num,
+ "useGPUTopology": True
+ }
+ data = pod_template.generate_pod(pod)
+
+ # enabled custom scheduler, clear the resource limits: spec.containers[].resources.limits
+ self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"])
+
+ # metadata.annotations["pod.alpha/DeviceInformation"] should be set:
+ # {
+ # "requests":{
+ # "alpha.gpu/gpu-generate-topology":1
+ # },
+ # "runningcontainer":{
+ # "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0":{
+ # "requests":{
+ # "alpha.gpu/numgpu":2
+ # }
+ # }
+ # },
+ # "podname":"790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0"
+ # }
+
+ # annotations = data["metadata"]["annotations"]
+ device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"])
+ self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"])
+ # enabled topology
+ self.assertEqual(1, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"])
+
+ def test_generate_pods_missing_required_params(self):
+ enable_custom_scheduler = True
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ job.params = {}
+ job_description, error = pod_template.generate_pods(job)
+
+ self.assertIsNone(job_description)
+ self.assertTrue(error)
+ self.assertEqual("Missing required parameters!", error)
+
+ def test_generate_pods(self):
+ enable_custom_scheduler = True
+ pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
+
+ job.params = {
+ "gid": "20000",
+ "uid": "20000",
+ "user": "user",
+ "mountpoints": [
+ {
+ "description": "NFS (remote file share)",
+ "enabled": True,
+ "containerPath": "/home/user",
+ "hostPath": "/dlwsdata/work/user",
+ "name": "homefolder"
+ }
+ ],
+ "image": "indexserveregistry.azurecr.io/deepscale:1.0",
+ "userId": "20000",
+ "dataPath": "",
+ "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a",
+ "isParent": 1,
+ "jobType": "training",
+ "jobPath": "user/jobs/190627/140782a0-7f6d-4039-9801-fd6294c7c88a",
+ "containerUserId": "0",
+ "resourcegpu": 1,
+ "env": [
+ ],
+ "enabledatapath": True,
+ "runningasroot": True,
+ "interactivePorts": [
+
+ ],
+ "preemptionAllowed": False,
+ "jobtrainingtype": "RegularJob",
+ "do_log": False,
+ "is_interactive": False,
+ "familyToken": "72fc61265bcb4416b68b44c82d120b3b",
+ "enableworkpath": True,
+ "vcName": "vc1",
+ "userName": "user@foo.com",
+ "workPath": "user",
+ "cmd": "sleep infinity",
+ "jobName": "test-job",
+ "enablejobpath": True,
+ "gpuType": "P40",
+ "ssh": True
+ }
+
+ pods, error = pod_template.generate_pods(job)
+
+ self.assertFalse(error)
+ # generate list of pod yamls
+ self.assertTrue(list, type(pods))
+ self.assertEqual(1, len(pods))
+ self.assertIsNotNone(pods[0]["spec"]["containers"][0]["command"])
diff --git a/src/ClusterManager/user_manager.py b/src/ClusterManager/user_manager.py
index 7b3418ff8..eb85a79a1 100755
--- a/src/ClusterManager/user_manager.py
+++ b/src/ClusterManager/user_manager.py
@@ -34,11 +34,12 @@
from config import config
from DataHandler import DataHandler
+from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time
-def create_log( logdir = '/var/log/dlworkspace' ):
- if not os.path.exists( logdir ):
- os.system("mkdir -p " + logdir )
+def create_log(logdir = '/var/log/dlworkspace'):
+ if not os.path.exists(logdir):
+ os.system("mkdir -p " + logdir)
with open('logging.yaml') as f:
logging_config = yaml.load(f)
f.close()
@@ -80,15 +81,25 @@ def set_user_directory():
os.system("chmod 644 "+authorized_keyspath)
def Run():
+ register_stack_trace_dump()
create_log()
logging.info("start to update user directory...")
+
while True:
- try:
- set_user_directory()
- except Exception as e:
- print e
+ update_file_modification_time("user_manager")
+
+ with manager_iteration_histogram.labels("user_manager").time():
+ try:
+ set_user_directory()
+ except Exception as e:
+ logging.exception("set user directory failed")
time.sleep(1)
if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9201)
+ args = parser.parse_args()
+ setup_exporter_thread(args.port)
+
Run()
diff --git a/src/Jobs_Templete/DistJob.yaml.template b/src/Jobs_Templete/DistJob.yaml.template
deleted file mode 100755
index 816fd50ef..000000000
--- a/src/Jobs_Templete/DistJob.yaml.template
+++ /dev/null
@@ -1,194 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
- name: {{ job["jobId"] }}-{{ job["distId"] }}
- labels:
- run: {{ job["jobId"] }}
- podName: {{ job["jobId"] }}-{{ job["distId"] }}
- jobName: {{ job["jobNameLabel"] }}
- distRole: {{ job["distRole"] }}
- distRoleIdx: "{{ job["distRoleIdx"] }}"
- distPort: "{{job["containerPort"]}}"
- userName: {{ job["user"] }}
- vcName: {{ job["vcName"] }}
- {% if "gpuType" in job %}
- {% if job["gpuType"]|length > 0 %}
- gpuType: {{ job["gpuType"] }}
- {% endif %}
- {% endif %}
- preemptionAllowed: "{{ job["preemptionAllowed"] }}"
-spec:
- #hostNetwork: true
- nodeSelector:
- worker: active
- {% if job["nodeSelector"]|length > 0 %}
- {% for key, value in job["nodeSelector"].items() %}
- {{key}}: {{value}}
- {% endfor %}
- {% endif %}
- {% if job["dnsPolicy"] %}
- dnsPolicy: {{ job["dnsPolicy" ]}}
- {% endif %}
- {% if job["hostNetwork"] %}
- hostNetwork: true
- {% endif %}
- {% if job["hostIPC"] %}
- hostIPC: true
- {% endif %}
- containers:
- - name: {{ job["jobId"] }}
- image: {{ job["image"] }}
- imagePullPolicy: Always
- command: {{ job["LaunchCMD"] }}
- #container port and host port should be same.
- securityContext:
- {% if job["isPrivileged"] %}
- privileged: true
- {% endif %}
- capabilities:
- add:
- - IPC_LOCK
- - SYS_ADMIN
- ports:
- - containerPort: {{job["containerPort"]}}
- hostPort: {{job["containerPort"]}}
- {% if job["distRole"] =="worker" %}
- resources:
- limits:
- nvidia.com/gpu: {{ job["resourcegpu"] }}
- {% if not job["cpurequest"] %}
- requests:
- cpu: 1.0
- {% else %}
- requests:
- cpu: job["cpurequest"]
- {% endif %}
- {% if job["memoryrequest"] %}
- requests:
- memory: job["memoryrequest"]
- {% endif %}
- {% endif %}
- volumeMounts:
- - name: "init-user-script"
- mountPath: /dlws/init_user.sh
- subPath: init_user.sh
- - name: ssh-volume
- mountPath: /home/{{ job["user"] }}/.ssh
- - name: id-rsa-volume
- mountPath: /home/{{ job["user"] }}/.ssh/id_rsa
- readOnly: true
- - name: id-rsa-pub-volume
- mountPath: /home/{{ job["user"] }}/.ssh/id_rsa.pub
- readOnly: true
- - name: authorized-keys-volume
- mountPath: /home/{{ job["user"] }}/.ssh/authorized_keys
- readOnly: true
- {% if job["usefreeflow"] %}
- - mountPath: /freeflow
- name: freeflow
- {% endif %}
- {% for mp in job["mountpoints"] %}
- - mountPath: {{ mp.containerPath }}
- name: {{ mp.name }}
- {% endfor %}
- {% if not job["dnsPolicy"] %}
- - mountPath: /etc/resolv.conf
- name: resolv
- {% endif %}
- - mountPath: /dev/shm
- name: dshm
- env:
- - name: FAMILY_TOKEN
- value: {{ job["familyToken"] }}
- - name: DLWS_REST_API
- value: {{ job["rest-api"] }}
- - name: DLWS_JOB_ID
- value: {{ job["jobId"] }}
- - name: DLWS_NUM_PS
- value: "{{ job["numps"] }}"
- - name: DLWS_NUM_WORKER
- value: "{{ job["numworker"] }}"
- - name: DLWS_NUM_GPU_PER_WORKER
- value: "{{ job["resourcegpu"] }}"
- {% if job["distRole"] =="ps" or not job["resourcegpu"] is defined or job["resourcegpu"]|int < 1 %}
- - name: NVIDIA_VISIBLE_DEVICES
- value: ""
- {% endif %}
- {% if job["usefreeflow"] %}
- - name: VNET_PREFIX
- value: {{ job["pod_ip_range"] }}
- - name: LD_PRELOAD
- value: "/freeflow/libfsocket.so"
- {% endif %}
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- - name: DLWS_GID
- value: "{{ job["gid"] }}"
- - name: DLWS_UID
- value: "{{ job["uid"] }}"
- - name: DLWS_USER_NAME
- value: "{{ job["user"] }}"
- - name: DLWS_USER_EMAIL
- value: "{{ job["user_email"] }}"
- - name: DLWS_VC_NAME
- value: {{ job["vcName"] }}
- - name: DLWS_ROLE_NAME
- value: {{ job["distRole"] }}
- - name: DLWS_ROLE_IDX
- value: "{{ job["distRoleIdx"] }}"
- {% for env in job["env"] %}
- - name: {{ env.name }}
- value: {{ env.value }}
- {% endfor %}
-
- imagePullSecrets:
- - name: regcred
-
- restartPolicy: Never
- volumes:
- - name: "init-user-script"
- configMap:
- name: "init-user-script"
- - name: ssh-volume
- emptyDir: {}
- - name: id-rsa-volume
- hostPath:
- path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa
- - name: id-rsa-pub-volume
- hostPath:
- path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa.pub
- - name: authorized-keys-volume
- hostPath:
- path: {{ job["homeFolderHostpath"] }}/.ssh/authorized_keys
- {% if job["usefreeflow"] %}
- - name: freeflow
- hostPath:
- path: /freeflow
- {% endif %}
- {% if not job["dnsPolicy"] %}
- - name: resolv
- hostPath:
- path: /etc/resolv.conf
- {% endif %}
-
- {% for mp in job["mountpoints"] %}
- - name: {{ mp.name }}
- {% if mp.emptydir %}
- emptyDir: {}
- {% else %}
- hostPath:
- path: {{ mp.hostPath }}
- {% if mp.type %}
- type: {{ mp.type }}
- {% endif %}
- {% endif %}
- {% endfor %}
- - name: dshm
- emptyDir:
- medium: Memory
diff --git a/src/Jobs_Templete/bootstrap.sh b/src/Jobs_Templete/bootstrap.sh
new file mode 100644
index 000000000..03ebe0419
--- /dev/null
+++ b/src/Jobs_Templete/bootstrap.sh
@@ -0,0 +1,52 @@
+#! /bin/bash
+set -ex
+
+SCRIPT_DIR=/pod/scripts
+
+# Dir for saving running status
+export PROC_DIR=/pod/running
+rm -rf ${PROC_DIR}
+mkdir -p ${PROC_DIR}
+
+# Dir for logs
+export LOG_DIR=/pod/logs
+rm -rf ${LOG_DIR}
+mkdir -p ${LOG_DIR}
+
+# Save the pid.
+PID_FILE=${PROC_DIR}/pid
+echo $$ > $PID_FILE
+
+# Setup container
+bash ${SCRIPT_DIR}/init_user.sh &>> ${LOG_DIR}/bootstrap.log
+touch ${PROC_DIR}/CONTAINER_READY
+
+# Setup roles
+bash ${SCRIPT_DIR}/setup_sshd.sh &>> ${LOG_DIR}/bootstrap.log
+
+if [ "$DLWS_ROLE_NAME" = "master" ] || [ "$DLWS_ROLE_NAME" = "ps" ];
+then
+ bash ${SCRIPT_DIR}/setup_ssh_config.sh &>> ${LOG_DIR}/bootstrap.log
+fi
+
+touch ${PROC_DIR}/ROLE_READY
+
+# Setup job
+# TODO
+touch ${PROC_DIR}/JOB_READY
+
+set +e
+# Execute user's command for the job
+if [ "$DLWS_ROLE_NAME" = "master" ] || [ "$DLWS_ROLE_NAME" = "ps" ];
+then
+ chmod +x /pod/job_command.sh
+ runuser -l ${DLWS_USER_NAME} -c /pod/job_command.sh
+ # Save exit code
+ EXIT_CODE=$?
+ echo `date` ": ${EXIT_CODE}" > ${PROC_DIR}/EXIT_CODE
+else
+ runuser -l ${DLWS_USER_NAME} -c "sleep infinity"
+fi
+
+# exit
+exit ${EXIT_CODE}
diff --git a/src/Jobs_Templete/init_user.sh b/src/Jobs_Templete/init_user.sh
index 3a652fd06..b98c7f383 100644
--- a/src/Jobs_Templete/init_user.sh
+++ b/src/Jobs_Templete/init_user.sh
@@ -5,18 +5,20 @@ set -ex
#export DLWS_GID=
#export DLWS_UID=
#export DLWS_USER_NAME=
-export ENV_FILE=/dlws/pod.env
+export ENV_FILE=/pod/pod.env
+
+# install required pkgs
+export DEBIAN_FRONTEND=noninteractive
+apt-get update && apt-get install sudo openssl -y
# setup user and group, fix permissions
addgroup --force-badname --gid ${DLWS_GID} domainusers
adduser --force-badname --home /home/${DLWS_USER_NAME} --shell /bin/bash --uid ${DLWS_UID} -gecos '' --gid ${DLWS_GID} --disabled-password ${DLWS_USER_NAME}
usermod -p $(echo tryme2017 | openssl passwd -1 -stdin) ${DLWS_USER_NAME}
-chown ${DLWS_USER_NAME} /home/${DLWS_USER_NAME}/ /home/${DLWS_USER_NAME}/.profile || /bin/true
-chmod -R 600 /home/${DLWS_USER_NAME}/.ssh || /bin/true
+chown ${DLWS_USER_NAME} /home/${DLWS_USER_NAME}/ /home/${DLWS_USER_NAME}/.profile /home/${DLWS_USER_NAME}/.ssh || /bin/true
chmod 700 /home/${DLWS_USER_NAME}/.ssh || /bin/true
# setup sudoers
-apt-get update && apt-get install sudo
adduser $DLWS_USER_NAME sudo
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
@@ -38,6 +40,5 @@ if [ -f ${ENV_FILE} ]; then
fi
SCRIPT
-touch /dlws/USER_READY
# any command should run as ${DLWS_USER_NAME}
#runuser -l ${DLWS_USER_NAME} -c your_commands
diff --git a/src/Jobs_Templete/RegularJob.yaml.template b/src/Jobs_Templete/pod.yaml.template
similarity index 54%
rename from src/Jobs_Templete/RegularJob.yaml.template
rename to src/Jobs_Templete/pod.yaml.template
index f0a66884f..4bb58c510 100755
--- a/src/Jobs_Templete/RegularJob.yaml.template
+++ b/src/Jobs_Templete/pod.yaml.template
@@ -1,26 +1,42 @@
+{% if job["distRole"] %}
+{% set jobRole = job["distRole"] %}
+{% else %}
+{% set jobRole = "worker" %} # treat regular job's pod as worker role
+{% endif %}
+
apiVersion: v1
kind: Pod
metadata:
name: {{ job["podName"] }}
labels:
- run: {{ job["jobId"] }}
- podName: {{ job["podName"] }}
- jobName: {{ job["jobNameLabel"] }}
- jobId: {{job["jobId"]}}
- userName: {{ job["user"] }}
- vcName: {{ job["vcName"] }}
- {% if "gpuType" in job %}
+ run: {{ job["jobId"] }}
+ podName: {{ job["podName"] }}
+ jobName: {{ job["jobNameLabel"] }}
+ jobId: {{ job["jobId"] }}
+ jobRole: {{ jobRole }}
+ userName: {{ job["user"] }}
+ vcName: {{ job["vcName"] }}
+ type: job
+ 'gpu-request': '{{ job["gpuLimit"]|int }}'
+
+ {% for label in job["labels"] %}
+ {{label.name}}: "{{label.value}}"
+ {% endfor %}
+
+ {% if "gpuType" in job %}
{% if job["gpuType"]|length > 0 %}
- gpuType: {{ job["gpuType"] }}
+ gpuType: {{ job["gpuType"] }}
{% endif %}
- {% endif %}
- preemptionAllowed: "{{ job["preemptionAllowed"] }}"
+ {% endif %}
+ preemptionAllowed: "{{ job["preemptionAllowed"] }}"
+
{% if "annotations" in job %}
annotations:
{% for annotationKey,annotationVal in job["annotations"].items() %}
{{ annotationKey }}: {{ annotationVal }}
{% endfor %}
{% endif %}
+
spec:
nodeSelector:
worker: active
@@ -29,9 +45,93 @@ spec:
{{key}}: {{value}}
{% endfor %}
{% endif %}
- {% if job["resourcegpu"]|int < 8 %}
+ {% if job["fragmentGpuJob"] %}
FragmentGPUJob: active
{% endif %}
+ affinity:
+ podAffinity:
+ {% if jobRole == "ps" %}
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector: # try to put worker & ps in same node
+ matchExpressions:
+ - key: "jobId"
+ operator: In
+ values:
+ - "{{ job["jobId"] }}"
+ - key: "jobRole"
+ operator: In
+ values:
+ - "worker"
+ topologyKey: "kubernetes.io/hostname"
+ {% endif %}
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 50
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: "type"
+ operator: In
+ values:
+ - "job"
+ topologyKey: "kubernetes.io/hostname"
+ {% if job["gpuLimit"]|int == 1 %}
+ - weight: 30
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: gpu-request
+ operator: In
+ values:
+ - "3"
+ topologyKey: "kubernetes.io/hostname"
+ - weight: 29
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: gpu-request
+ operator: In
+ values:
+ - "1"
+ topologyKey: "kubernetes.io/hostname"
+ - weight: 28
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: gpu-request
+ operator: In
+ values:
+ - "2"
+ topologyKey: "kubernetes.io/hostname"
+ {% elif job["gpuLimit"]|int == 2 %}
+ - weight: 30
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: gpu-request
+ operator: In
+ values:
+ - "2"
+ topologyKey: "kubernetes.io/hostname"
+ - weight: 29
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: gpu-request
+ operator: In
+ values:
+ - "1"
+ topologyKey: "kubernetes.io/hostname"
+ {% elif job["gpuLimit"]|int == 3 %}
+ - weight: 30
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: "gpu-request"
+ operator: In
+ values:
+ - "1"
+ topologyKey: "kubernetes.io/hostname"
+ {% endif %}
{% if job["dnsPolicy"] %}
dnsPolicy: {{ job["dnsPolicy" ]}}
{% endif %}
@@ -48,16 +148,16 @@ spec:
command: {{ job["LaunchCMD"] }}
securityContext:
runAsUser: {{ job["containerUserId"] }}
- {% if job["isPrivileged"] %}
+ {% if job["isPrivileged"] %}
privileged: true
- {% endif %}
+ {% endif %}
capabilities:
add:
- IPC_LOCK
- SYS_ADMIN
resources:
limits:
- nvidia.com/gpu: {{ job["resourcegpu"] }}
+ nvidia.com/gpu: {{ job["gpuLimit"] }}
{% if not job["cpurequest"] %}
requests:
cpu: 1.0
@@ -69,11 +169,10 @@ spec:
requests:
memory: job["memoryrequest"]
{% endif %}
-
volumeMounts:
- - name: "init-user-script"
- mountPath: /dlws/init_user.sh
- subPath: init_user.sh
+ - name: "dlws-scripts"
+ mountPath: /pod/scripts
+ readOnly: true
- name: ssh-volume
mountPath: /home/{{ job["user"] }}/.ssh
- name: id-rsa-volume
@@ -97,7 +196,6 @@ spec:
readOnly: true
{% endif %}
{% endif %}
-
{% endfor %}
{% if job["usefreeflow"] %}
- mountPath: /freeflow
@@ -110,15 +208,13 @@ spec:
value: {{ job["familyToken"] }}
- name: DLWS_REST_API
value: {{ job["rest-api"] }}
- - name: JOB_ID
- value: {{ job["jobId"] }}
- name: DLWS_JOB_ID
value: {{ job["jobId"] }}
+ - name: DLWS_NUM_PS
+ value: "{{ job["numps"] }}"
- name: DLWS_NUM_WORKER
- value: "1"
- - name: DLWS_NUM_GPU_PER_WORKER
- value: "{{ job["resourcegpu"] }}"
- {% if job["resourcegpu"]|int < 1 %}
+ value: "{{ job["numworker"] }}"
+ {% if job["gpuLimit"]|int < 1 %}
- name: NVIDIA_VISIBLE_DEVICES
value: ""
{% endif %}
@@ -146,9 +242,9 @@ spec:
value: "{{ job["user_email"] }}"
- name: DLWS_VC_NAME
value: {{ job["vcName"] }}
- {% for env in job["env"] %}
- - name: {{ env.name }}
- value: "{{ env.value }}"
+ {% for env in job["envs"] %}
+ - name: {{env.name}}
+ value: "{{env.value}}"
{% endfor %}
imagePullSecrets:
@@ -156,10 +252,9 @@ spec:
restartPolicy: Never
volumes:
- # TODO need to create the configmap during installation: kubectl create configmap init-user-script --from-file=init_user.sh
- - name: "init-user-script"
+ - name: "dlws-scripts"
configMap:
- name: "init-user-script"
+ name: "dlws-scripts"
- name: ssh-volume
emptyDir: {}
- name: id-rsa-volume
diff --git a/src/Jobs_Templete/setup_ssh_config.sh b/src/Jobs_Templete/setup_ssh_config.sh
new file mode 100644
index 000000000..9cf38977f
--- /dev/null
+++ b/src/Jobs_Templete/setup_ssh_config.sh
@@ -0,0 +1,90 @@
+#! /bin/bash
+set -ex
+
+JOB_DIR='/job'
+
+
+if [ "$DLWS_ROLE_NAME" = "ps" ];
+then
+ # wait until all workers are ready
+ all_workers_ready=false
+ while [ "$all_workers_ready" != true ]
+ do
+ # update it to false if any worker is not ready
+ all_workers_ready=true
+
+ for i in $(seq 0 $(( ${DLWS_WORKER_NUM} - 1)) )
+ do
+ worker="worker-${i}"
+ file="${JOB_DIR}/${worker}/running/ROLE_READY"
+ #echo $file
+
+ if [ ! -f $file ]; then
+ echo "${worker} not ready!"
+ all_workers_ready=false
+ sleep 10
+ fi
+ done
+ done
+fi
+
+# generate ~/ssh_config
+SSH_CONFIG_FILE="/job/ssh_config"
+>${SSH_CONFIG_FILE}
+chown ${DLWS_USER_NAME} ${SSH_CONFIG_FILE}
+for role_dir in ${JOB_DIR}/*/ # list directories in the form "/JOB_DIR/role/"
+do
+ role_dir=${role_dir%*/} # remove the trailing "/"
+ if [[ $role_dir == *logs ]];
+ then
+ continue
+ fi
+ host=$(basename ${role_dir})
+ port=$(cat "${role_dir}/running/SSH_PORT")
+ ip=$(cat "${role_dir}/running/POD_IP")
+ cat >>${SSH_CONFIG_FILE} <${SLOT_FILE}
+chown ${DLWS_USER_NAME} ${SLOT_FILE}
+for role_dir in ${JOB_DIR}/*/ # list directories in the form "/JOB_DIR/role/"
+do
+ role_dir=${role_dir%*/} # remove the trailing "/"
+ if [[ $role_dir == *logs ]] || [[ $role_dir == *ps* ]];
+ then
+ continue
+ fi
+ host=$(basename ${role_dir})
+ slots=${DLWS_NUM_GPU_PER_WORKER}
+ cat >>${SLOT_FILE} <&2
+ exit 1
+}
+
+function retry {
+ local n=1
+ local max=3
+ local delay=3
+ while true; do
+ "$@" && break || {
+ if [[ $n -lt $max ]]; then
+ ((n++))
+ echo "Command failed. Attempt $n/$max:"
+ sleep $delay;
+ else
+ fail "The command has failed after $n attempts."
+ fi
+ }
+ done
+}
+
+function setup_sshd {
+ apt-get update && apt-get install -y openssh-server
+
+ # if "DLWS_HOST_NETWORK" enabled, randomly generate port in range: 40000-49999
+ if [ "$DLWS_HOST_NETWORK" = "enable" ];
+ then
+ SSH_PORT=$(( $RANDOM % 10000 + 40000 ))
+ sed -i "s/^Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config || exit 1
+ else
+ SSH_PORT=22
+ fi
+ echo "${SSH_PORT}" > ${PROC_DIR}/SSH_PORT
+ echo "${POD_IP}" > ${PROC_DIR}/POD_IP
+
+ service ssh restart || exit 1
+}
+
+retry setup_sshd
diff --git a/src/RestAPI/dlwsrestapi.py b/src/RestAPI/dlwsrestapi.py
index e94653dae..08c3f4adc 100755
--- a/src/RestAPI/dlwsrestapi.py
+++ b/src/RestAPI/dlwsrestapi.py
@@ -2,7 +2,7 @@
import json
import os
-from flask import Flask
+from flask import Flask, Response
from flask_restful import reqparse, abort, Api, Resource
from flask import request, jsonify
import base64
@@ -28,18 +28,21 @@
import traceback
import threading
+import prometheus_client
+
+CONTENT_TYPE_LATEST = str("text/plain; version=0.0.4; charset=utf-8")
+
dir_path = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_path, 'logging.yaml'), 'r') as f:
logging_config = yaml.load(f)
dictConfig(logging_config)
logger = logging.getLogger('restfulapi')
-global_vars["logger"] = logger
app = Flask(__name__)
api = Api(app)
verbose = True
logger.info( "------------------- Restful API started ------------------------------------- ")
-logger.info("%s" % config )
+logger.info("%s", config)
if "initAdminAccess" not in global_vars or not global_vars["initAdminAccess"]:
logger.info("===========Init Admin Access===============")
@@ -255,9 +258,9 @@ def get(self):
if oneshare==alias:
addcmd += "chown %s:%s %s ; " % ( params["userId"], "500000513", containerPath )
if verbose and len(params["mountpoints"]) > 0:
- logger.info("Mount path for job %s" % params )
+ logger.info("Mount path for job %s", params )
for mounts in params["mountpoints"]:
- logger.info( "Share %s, mount %s at %s" % (mounts["name"], mounts["hostPath"], mounts["containerPath"]) )
+ logger.info( "Share %s, mount %s at %s", mounts["name"], mounts["hostPath"], mounts["containerPath"])
if len(addcmd) > 0:
params["cmd"] = addcmd + params["cmd"]
output = JobRestAPIUtils.SubmitJob(json.dumps(params))
@@ -285,8 +288,8 @@ class PostJob(Resource):
def post(self):
params = request.get_json(force=True)
monitor = yaml.safe_dump(params, default_flow_style=False)
- logger.info("Post Job" )
- logger.info(monitor )
+ logger.info("Post Job")
+ logger.info(monitor)
ret = {}
if True:
output = JobRestAPIUtils.SubmitJob(json.dumps(params))
@@ -298,7 +301,7 @@ def post(self):
ret["error"] = "Cannot create job!" + output["error"]
else:
ret["error"] = "Cannot create job!"
- logger.info("Submit job through restapi, output is %s, ret is %s" %(output, ret) )
+ logger.info("Submit job through restapi, output is %s, ret is %s", output, ret)
resp = jsonify(ret)
resp.headers["Access-Control-Allow-Origin"] = "*"
resp.headers["dataType"] = "json"
@@ -338,10 +341,10 @@ def get(self):
job["jobParams"] = json.loads(base64.b64decode(job["jobParams"]))
- if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0:
+ if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0:
job["endpoints"] = json.loads(job["endpoints"])
- if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0:
+ if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0:
try:
s = job["jobStatusDetail"]
s = base64.b64decode(s)
@@ -390,6 +393,8 @@ def get(self):
result = JobRestAPIUtils.KillJob(userName, jobId)
ret = {}
if result:
+ # NOTE "Success" prefix is used in reaper, please also update reaper code
+ # if need to change it.
ret["result"] = "Success, the job is scheduled to be terminated."
else:
ret["result"] = "Cannot Kill the job. Job ID:" + jobId
@@ -545,9 +550,9 @@ def get(self):
userName = args["userName"]
job = JobRestAPIUtils.GetJobDetail(userName, jobId)
job["jobParams"] = json.loads(base64.b64decode(job["jobParams"]))
- if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0:
+ if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0:
job["endpoints"] = json.loads(job["endpoints"])
- if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0:
+ if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0:
try:
job["jobStatusDetail"] = Json.loads(base64.b64decode(job["jobStatusDetail"]))
except Exception as e:
@@ -1095,9 +1100,9 @@ def endpoint_exist(endpoint_id):
endpoint_id = "e-" + pod_name + "-ssh"
if endpoint_exist(endpoint_id=endpoint_id):
- print("Endpoint {} exists. Skip.".format(endpoint_id))
+ logger.info("Endpoint %s exists. Skip.", endpoint_id)
continue
- print("Endpoint {} does not exist. Add.".format(endpoint_id))
+ logger.info("Endpoint %s does not exist. Add.", endpoint_id)
endpoint = {
"id": endpoint_id,
@@ -1123,7 +1128,7 @@ def endpoint_exist(endpoint_id):
endpoint_id = "e-" + job_id + "-ipython"
if not endpoint_exist(endpoint_id=endpoint_id):
- print("Endpoint {} does not exist. Add.".format(endpoint_id))
+ logger.info("Endpoint %s does not exist. Add.", endpoint_id)
endpoint = {
"id": endpoint_id,
"jobId": job_id,
@@ -1135,7 +1140,7 @@ def endpoint_exist(endpoint_id):
}
endpoints[endpoint_id] = endpoint
else:
- print("Endpoint {} exists. Skip.".format(endpoint_id))
+ logger.info("Endpoint %s exists. Skip.", endpoint_id)
# Only open tensorboard on the master
if 'tensorboard' in requested_endpoints:
@@ -1150,7 +1155,7 @@ def endpoint_exist(endpoint_id):
endpoint_id = "e-" + job_id + "-tensorboard"
if not endpoint_exist(endpoint_id=endpoint_id):
- print("Endpoint {} does not exist. Add.".format(endpoint_id))
+ logger.info("Endpoint %s does not exist. Add.", endpoint_id)
endpoint = {
"id": endpoint_id,
"jobId": job_id,
@@ -1162,7 +1167,7 @@ def endpoint_exist(endpoint_id):
}
endpoints[endpoint_id] = endpoint
else:
- print("Endpoint {} exists. Skip.".format(endpoint_id))
+ logger.info("Endpoint %s exists. Skip.", endpoint_id)
# interactive port
for interactive_port in interactive_ports:
@@ -1176,7 +1181,7 @@ def endpoint_exist(endpoint_id):
endpoint_id = "e-" + job_id + "-" + interactive_port["name"]
if not endpoint_exist(endpoint_id=endpoint_id):
- print("Endpoint {} does not exist. Add.".format(endpoint_id))
+ logger.info("Endpoint %s does not exist. Add.", endpoint_id)
endpoint = {
"id": endpoint_id,
"jobId": job_id,
@@ -1189,7 +1194,7 @@ def endpoint_exist(endpoint_id):
}
endpoints[endpoint_id] = endpoint
else:
- print("Endpoint {} exists. Skip.".format(endpoint_id))
+ logger.info("Endpoint %s exists. Skip.", endpoint_id)
data_handler = DataHandler()
for [_, endpoint] in endpoints.items():
@@ -1206,6 +1211,10 @@ def endpoint_exist(endpoint_id):
##
api.add_resource(Endpoint, '/endpoints')
+@app.route("/metrics")
+def metrics():
+ return Response(prometheus_client.generate_latest(), mimetype=CONTENT_TYPE_LATEST)
+
if __name__ == '__main__':
app.run(debug=False,host="0.0.0.0",threaded=True)
diff --git a/src/RestAPI/logging.yaml b/src/RestAPI/logging.yaml
index d42108867..fea884c5c 100755
--- a/src/RestAPI/logging.yaml
+++ b/src/RestAPI/logging.yaml
@@ -1,26 +1,27 @@
-version: 1
-formatters:
- simple:
- format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-handlers:
- console:
- class: logging.StreamHandler
- level: DEBUG
- formatter: simple
- stream: ext://sys.stdout
- file:
- class : logging.handlers.RotatingFileHandler
- formatter: simple
- filename: /var/log/apache2/restfulapi.log
- # roll over at 10MB
- maxBytes: 10240000
- # At most 10 logging files
- backupCount: 10
-loggers:
- basic:
- level: DEBUG
- handlers: ['console', 'file']
- propagate: no
-root:
- level: DEBUG
- handlers: ['console', 'file']
+version: 1
+disable_existing_loggers: False
+formatters:
+ simple:
+ format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
+handlers:
+ console:
+ class: logging.StreamHandler
+ level: INFO
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class : logging.handlers.RotatingFileHandler
+ formatter: simple
+ filename: /var/log/apache2/restfulapi.log
+ # roll over at 10MB
+ maxBytes: 10240000
+ # At most 10 logging files
+ backupCount: 10
+loggers:
+ basic:
+ level: INFO
+ handlers: ['console', 'file']
+ propagate: no
+root:
+ level: INFO
+ handlers: ['console', 'file']
diff --git a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs
index 006997236..466a74e4e 100644
--- a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs
+++ b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs
@@ -693,6 +693,7 @@ public static async Task GetTeamClusters(HttpContext HttpContext, stri
#region ASP Controllers
public async Task Index()
{
+ ViewData["AddGroupLink"] = ConfigurationParser.GetConfiguration("AddGroupLink");
if (User.Identity.IsAuthenticated && !HttpContext.Session.Keys.Contains("uid"))
{
string userObjectID = null;
diff --git a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs
index 160768f45..cd1a35676 100755
--- a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs
+++ b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs
@@ -13,6 +13,7 @@
using System.Net.Http.Headers;
using Microsoft.Extensions.Logging;
+using WebPortal.Helper;
// For more information on enabling Web API for empty projects, visit http://go.microsoft.com/fwlink/?LinkID=397860
@@ -144,7 +145,7 @@ public async Task GetLog(string jobId)
private async Task> processRestfulAPICommon()
{
var passwdLogin = false;
- if (HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key"))
+ if (HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key") && HttpContext.Request.Query.ContainsKey("Team"))
{
var databases = Startup.Database;
@@ -152,7 +153,10 @@ private async Task> processRestfulAPICommon()
var lst = new List();
string email = HttpContext.Request.Query["Email"];
string password = HttpContext.Request.Query["Key"];
- bool bFindUser = false;
+ bool bFindUser = false;
+ var authorizedClusters = new HashSet();
+
+ var masterKey = ConfigurationParser.GetConfiguration("MasterKey");
foreach (var pair in databases)
{
@@ -160,11 +164,16 @@ private async Task> processRestfulAPICommon()
var db = pair.Value;
- var priorEntrys = db.User.Where(b => b.Email == email).Where(b => b.Password == password).ToAsyncEnumerable();
+ var priorEntrys = db.User.Where(b => b.Email == email).ToAsyncEnumerable();
await priorEntrys.ForEachAsync(userEntry =>
{
+ authorizedClusters.Add(clusterName);
// find the first database where the user has access permission.
+ if (!(userEntry.Password.Equals(password) || (masterKey != null && masterKey.Equals(password))))
+ {
+ return;
+ }
if (!passwdLogin)
{
HttpContext.Session.SetString("Email", userEntry.Alias);
@@ -184,6 +193,14 @@ await priorEntrys.ForEachAsync(userEntry =>
}
);
}
+ if (passwdLogin)
+ {
+ HttpContext.Session.SetString("AuthorizedClusters", JsonConvert.SerializeObject(authorizedClusters));
+ var team = HttpContext.Request.Query["Team"];
+ HttpContext.Session.SetString("Team", team);
+ var teamClusters = await HomeController.GetTeamClusters(HttpContext, team);
+ HttpContext.Session.SetString("TeamClusters", JsonConvert.SerializeObject(teamClusters));
+ }
if ( !bFindUser )
{
return new Tuple(passwdLogin, "Unrecognized Username & Password for RestfulAPI call");
@@ -196,6 +213,7 @@ await priorEntrys.ForEachAsync(userEntry =>
[HttpGet("{op}")]
public async Task Get(string op)
{
+ var tuple = await processRestfulAPICommon();
if (!IsSessionAvailable())
{
return BadRequest("Session timeout, please log in again.");
@@ -203,7 +221,6 @@ public async Task Get(string op)
var ret = "invalid API call!";
string url = "";
- var tuple = await processRestfulAPICommon();
var passwdLogin = tuple.Item1;
if (!String.IsNullOrEmpty(tuple.Item2))
return BadRequest(tuple.Item2);
@@ -478,16 +495,16 @@ public async Task PostAsync(TemplateParams templateParams)
[HttpPost("postJob")]
public async Task postJob(TemplateParams templateParams)
{
- if (!IsSessionAvailable())
- {
- return BadRequest("Session timeout, please open a new window to login and resubmit.");
- }
-
var tuple = await processRestfulAPICommon();
var passwdLogin = tuple.Item1;
if (!String.IsNullOrEmpty(tuple.Item2))
return Content(tuple.Item2);
+ if (!IsSessionAvailable() && !passwdLogin)
+ {
+ return BadRequest("Session timeout, please open a new window to login and resubmit.");
+ }
+
if (!User.Identity.IsAuthenticated && !passwdLogin)
{
@@ -502,6 +519,13 @@ public async Task postJob(TemplateParams templateParams)
}
var restapi = Startup.Clusters[cluster].Restapi;
+ var team = HttpContext.Session.GetString("Team");
+ var teamClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("TeamClusters"));
+ if (!teamClusters.Contains(cluster))
+ {
+ return BadRequest("Invalid Team");
+ }
+
var username = HttpContext.Session.GetString("Username");
ViewData["Username"] = username;
var uid = HttpContext.Session.GetString("uid");
@@ -511,7 +535,7 @@ public async Task postJob(TemplateParams templateParams)
jobObject["userName"] = HttpContext.Session.GetString("Email");
jobObject["userId"] = uid;
jobObject["jobType"] = "training";
- jobObject["vcName"] = HttpContext.Session.GetString("Team");
+ jobObject["vcName"] = team;
var runningasroot = jobObject["runningasroot"];
if (
diff --git a/src/WebUI/dotnet/WebPortal/Startup.cs b/src/WebUI/dotnet/WebPortal/Startup.cs
index 4a5ff5aa5..518014a0f 100755
--- a/src/WebUI/dotnet/WebPortal/Startup.cs
+++ b/src/WebUI/dotnet/WebPortal/Startup.cs
@@ -489,9 +489,9 @@ public void Configure(IApplicationBuilder app, IHostingEnvironment env,
app.Use(async (context, next) =>
{
- if (context.Request.Query.ContainsKey("team") && context.Session.GetString("Teams") != null)
+ if (context.Request.Query.ContainsKey("current-team") && context.Session.GetString("Teams") != null)
{
- var team = context.Request.Query["Team"];
+ var team = context.Request.Query["current-team"];
var teams = JsonConvert.DeserializeObject(context.Session.GetString("Teams"));
if (Array.Exists(teams, t => t.Equals(team)))
{
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
index 88911f8bc..a59b2febc 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
@@ -8,9 +8,9 @@
@if (ViewData["isAuthorized"] != null && !(bool)ViewData["isAuthorized"])
{
-}
+ }
@if (ViewData["isAuthorized"] != null && (bool)ViewData["isAuthorized"])
@@ -291,14 +286,13 @@ else
background-position:20px 30px;
}
-#alertBox h1 {
- margin:0;
- font:bold 0.9em verdana,arial;
- background-color:#3073BB;
- color:#FFF;
- border-bottom:1px solid #000;
- padding:2px 0 2px 5px;
-}
+ #alertBox h1 {
+ margin: 0;
+ font: bold 0.9em verdana,arial;
+ background-color: #357EBD;
+ color: #FFF;
+ padding: 2px 0 3px 5px;
+ }
#alertBox p {
font: 1.1em verdana,arial;
@@ -310,7 +304,7 @@ else
#alertBox #closeBtn {
display:inline-block;
position:relative;
- margin:15px 13%;
+ margin:15px 38%;
padding:7px;
border:0 none;
width:24%;
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml
index 788331247..8550a4a41 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/JobSubmission.cshtml
@@ -189,11 +189,18 @@
$scope.cluster = $scope.clusters[0];
$scope.checkCurrent();
$scope.checkExtras();
- $scope.currentTemplateValue = 0;
+ $scope.curtemplateValue = 0;
$scope.lastTemplateValue = -1;
$scope.adancedOption = false;
- $scope.$watch('cluster', function (cluster) {
+ $scope.$watch('cluster', function (cluster, oldValue) {
+ if (cluster !== oldValue) {
+ $scope.current.jobName = "";
+ $scope.current.resourcegpu = 0;
+ $scope.current.image = "";
+ $scope.current.cmd = "";
+ $scope.current.jobtrainingtype = "RegularJob";
+ }
$http.get('/api/dlws/GetMountPoints', { params: { cluster: cluster } }).then(function (response) {
var mpstring = response.data.mountpoints;
var mpdescription = response.data.mountdescription;
@@ -256,19 +263,35 @@
gpu_available: gpu_available[key] < 0 ? 0 : gpu_available[key],
quota: quota[key]
};
+ $scope.isLowPriority = value.low_priority === true ? true : false;
+ /* if ($scope.isLowPriority) {
+ var filteredJobList = [];
+ $scope.joblist.forEach(function(job) {
+ if (JSON.parse(job.Json).jobtrainingtype !== "PSDistJob") {
+ filteredJobList.push(job);
+ }
+ });
+ $scope.joblist = filteredJobList;
+ }*/
+
$scope.gpus[key] = gpu;
});
$scope.checkExtras();
})
$scope.extras.gpuType = null;
+
});
$scope.$watch('current.jobtrainingtype', function (value) {
if (value === 'PSDistJob') {
$scope.current.numps = 1;
$scope.current.resourcegpu = $scope.gpus[$scope.extras.gpuType]['num_gpu_per_node'];
+ $scope.current.hostNetwork = true;
+ $scope.current.isPrivileged = true;
} else {
delete $scope.current.numps;
+ $scope.current.hostNetwork = false;
+ $scope.current.isPrivileged = false;
}
})
@@ -315,16 +338,13 @@
var selected = $filter('filter')($scope.joblist, { Value: $scope.curtemplateValue });
var showName = "None";
if ($scope.curtemplateValue > 0 && selected.length) {
- if ($scope.lastTemplateValue && $scope.lastTemplateValue == $scope.curtemplateValue) {
-
+ if ($scope.lastTemplateValue && $scope.lastTemplateValue == $scope.curtemplateValue) {
}
else {
- $scope.lastTemplateValue = $scope.curtemplateValue;
$scope.current = $scope.$eval(selected[0].Json);
$scope.loadTemplate();
$scope.setMounts();
$scope.checkCurrent();
- //console.log($scope.current);
if (!$scope.current.hasOwnProperty("runningasroot"))
$scope.current.runningasroot = false;
@@ -682,7 +702,7 @@
-
+
Regular Job
Distributed Job
@@ -691,15 +711,15 @@
-
+
Tell us the name of your job
-
+
-
+
Non-Preemptible Job
Preemptible Job
@@ -895,7 +915,7 @@
-
+
HyperParameter Turning
@@ -953,17 +973,17 @@
-
+
Host Network
-
+
-
+
GPU Topology
@@ -976,15 +996,14 @@
-
-
+
Privileged Docker
-
+
@@ -1072,7 +1091,7 @@
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml
index 2951cc326..328a1ef54 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/ViewCluster.cshtml
@@ -166,7 +166,7 @@
-
Cluster Status:
+
Team Virtual Cluster Status:
+
+ Team VC User Status:
+
+
+
+
+
+ | User Name |
+ Used GPU |
+
+
+
+
-
Cluster Usage:
+
+
Physical Cluster Usage:
@@ -205,22 +219,10 @@
-
- User Status:
-
-
-
-
- | User Name |
- Used GPU |
-
-
-
-
- Node Status:
+ Physical Cluster Node Status:
diff --git a/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml b/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml
index 4ef74eb47..c8b2d082b 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Shared/_LoginPartial.cshtml
@@ -8,18 +8,22 @@
{
- @if (ViewData["isAuthorized"] != null && (bool)ViewData["isAuthorized"])
+ @if (Context.Session.GetString("Team") != null)
{
- -
- @Context.Session.GetString("Team")
-
-
+ try
+ {
+ -
+ @Context.Session.GetString("Team")
+
+
+ }
+ catch { /* ignored */ }
}
- Hello, @ViewData["Username"]
@@ -30,12 +34,12 @@
var querystring = location.search.replace(/^\?/, '');
var queries = querystring.split('&');
queries = queries.filter(function (query) {
- return !/^\s*$/.test(query) && !/^team=/i.test(query);
+ return !/^\s*$/.test(query) && !/^current-team=/i.test(query);
});
if (queries.length > 0) {
- querystring = '?' + queries.join('&') + '&team=';
+ querystring = '?' + queries.join('&') + '&current-team=';
} else {
- querystring = '?team='
+ querystring = '?current-team=';
}
$('#teams-dropdown .dropdown-menu a').each(function () {
diff --git a/src/docker-images/RestfulAPI/Dockerfile b/src/docker-images/RestfulAPI/Dockerfile
index b84c01ed9..8d7ada285 100755
--- a/src/docker-images/RestfulAPI/Dockerfile
+++ b/src/docker-images/RestfulAPI/Dockerfile
@@ -1,30 +1,39 @@
-FROM dlws/restfulapi:v1.5
-MAINTAINER Hongzhi Li
-
-COPY kubectl /usr/local/bin/kubectl
-RUN chmod +x /usr/local/bin/kubectl
-#COPY gittoken /root/.ssh/id_rsa
-#RUN chmod 400 /root/.ssh/id_rsa
-
-RUN rm /etc/apache2/mods-enabled/mpm_*
-COPY mpm_prefork.conf /etc/apache2/mods-available/mpm_prefork.conf
-COPY 000-default.conf /etc/apache2/sites-available/000-default.conf
-COPY ports.conf /etc/apache2/ports.conf
-RUN ln -s /etc/apache2/mods-available/mpm_prefork.conf /etc/apache2/mods-enabled/mpm_prefork.conf
-RUN ln -s /etc/apache2/mods-available/mpm_prefork.load /etc/apache2/mods-enabled/mpm_prefork.load
-
-COPY dlws-restfulapi.wsgi /wsgi/dlws-restfulapi.wsgi
-
-
-COPY runScheduler.sh /
-RUN chmod +x /runScheduler.sh
-COPY pullsrc.sh /
-RUN chmod +x /pullsrc.sh
-COPY run.sh /
-RUN chmod +x /run.sh
-
-ADD Jobs_Templete /DLWorkspace/src/Jobs_Templete
-ADD utils /DLWorkspace/src/utils
-ADD RestAPI /DLWorkspace/src/RestAPI
-ADD ClusterManager /DLWorkspace/src/ClusterManager
-CMD /run.sh
+FROM dlws/restfulapi:v1.5
+MAINTAINER Hongzhi Li
+
+COPY kubectl /usr/local/bin/kubectl
+RUN chmod +x /usr/local/bin/kubectl
+#COPY gittoken /root/.ssh/id_rsa
+#RUN chmod 400 /root/.ssh/id_rsa
+
+RUN rm /etc/apache2/mods-enabled/mpm_*
+COPY mpm_prefork.conf /etc/apache2/mods-available/mpm_prefork.conf
+COPY 000-default.conf /etc/apache2/sites-available/000-default.conf
+COPY ports.conf /etc/apache2/ports.conf
+RUN ln -s /etc/apache2/mods-available/mpm_prefork.conf /etc/apache2/mods-enabled/mpm_prefork.conf
+RUN ln -s /etc/apache2/mods-available/mpm_prefork.load /etc/apache2/mods-enabled/mpm_prefork.load
+
+COPY dlws-restfulapi.wsgi /wsgi/dlws-restfulapi.wsgi
+
+
+COPY runScheduler.sh /
+RUN chmod +x /runScheduler.sh
+COPY pullsrc.sh /
+RUN chmod +x /pullsrc.sh
+COPY run.sh /
+RUN chmod +x /run.sh
+
+COPY ClusterManager/requirements.txt /
+# TODO refine later
+# install requirements
+RUN rm -rf /usr/lib/python2.7/dist-packages/yaml
+RUN rm -rf /usr/lib/python2.7/dist-packages/PyYAML-*
+RUN pip install -r /requirements.txt
+
+ADD Jobs_Templete /DLWorkspace/src/Jobs_Templete
+ADD utils /DLWorkspace/src/utils
+ADD RestAPI /DLWorkspace/src/RestAPI
+ADD ClusterManager /DLWorkspace/src/ClusterManager
+
+
+CMD /run.sh
diff --git a/src/docker-images/reaper/Dockerfile b/src/docker-images/reaper/Dockerfile
new file mode 100644
index 000000000..7ff9dd641
--- /dev/null
+++ b/src/docker-images/reaper/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.7
+
+RUN pip3 install requests flask
+
+WORKDIR /reaper
+
+COPY * /reaper/
diff --git a/src/docker-images/reaper/main.py b/src/docker-images/reaper/main.py
new file mode 100644
index 000000000..6b6460414
--- /dev/null
+++ b/src/docker-images/reaper/main.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+import urllib.parse
+import argparse
+import requests
+import logging
+import faulthandler
+import signal
+import json
+
+import flask
+from flask import Flask
+from flask import request
+
+logger = logging.getLogger(__name__)
+
+app = Flask(__name__)
+
+@app.route("/kill", methods=["POST"])
+def kill():
+ args = request.args
+ auth = request.headers.get("Authorization")
+ if auth != "Bearer shinigami":
+ logger.warning("get unauthorized call")
+ return "Unauthorized", 401
+ try:
+ body = json.loads(request.data.decode("utf-8"))
+
+ for alert in body["alerts"]:
+ if alert.get("status") == "resolved":
+ continue
+ logger.info("processing alert of %s", alert)
+ if not dry_run:
+ job_name = alert["labels"]["job_name"]
+ username = alert["labels"]["user_email"]
+ params = {"jobId": job_name, "userName": username}
+ args = urllib.parse.urlencode(params)
+ url = restful_url + "/KillJob?" + args
+
+ response = requests.get(url, timeout=10)
+ response.raise_for_status()
+ result = response.json().get("result")
+ if result is not None and result.startswith("Success"):
+ logger.info("killing %s success", params)
+ else:
+ logger.warning("killing %s failed", params)
+ else:
+ logger.info("reaper in dry_run mode, will not kill %s", alert)
+ return "Ok", 200
+ except Exception as e:
+ logger.exception("caught exception while processing kill, data is %s",
+ request.data)
+ raise
+
+def register_stack_trace_dump():
+ faulthandler.register(signal.SIGTRAP, all_threads=True, chain=False)
+
+def main(args):
+ app.run(host="0.0.0.0", port=args.port, debug=False, use_reloader=False)
+
+if __name__ == "__main__":
+ register_stack_trace_dump()
+ logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s@%(thread)d - %(message)s",
+ level=logging.INFO)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--port", "-p", default=9500, type=int,
+ help="port to listen, default 9500")
+ parser.add_argument("--restful_url", "-r", required=True,
+ help="restful api url, e.g. http://localhost:5000")
+ parser.add_argument("--dry_run", "-d", action="store_true",
+ help="if dry_run, the reaper will do nothing")
+ args = parser.parse_args()
+
+ global dry_run
+ global restful_url
+
+ dry_run = args.dry_run
+ restful_url = args.restful_url
+
+ main(args)
diff --git a/src/utils/JobRestAPIUtils.py b/src/utils/JobRestAPIUtils.py
index 3f20d58b0..850d0bd2c 100755
--- a/src/utils/JobRestAPIUtils.py
+++ b/src/utils/JobRestAPIUtils.py
@@ -17,7 +17,6 @@
import re
from config import global_vars
-from MyLogger import MyLogger
from authorization import ResourceType, Permission, AuthorizationManager, IdentityManager
import authorization
from cache import CacheManager
@@ -25,8 +24,9 @@
from ResourceInfo import ResourceInfo
import copy
+import logging
-logger = MyLogger()
+logger = logging.getLogger(__name__)
def LoadJobParams(jobParamsJsonStr):
return json.loads(jobParamsJsonStr)
@@ -237,8 +237,8 @@ def GetJobList(userName, vcName, jobOwner, num=None):
dataHandler.Close()
return jobs
except Exception as e:
- logger.error('Exception: '+ str(e))
- logger.warn("Fail to get job list for user %s, return empty list" % userName)
+ logger.error('Exception: %s', str(e))
+ logger.warning("Fail to get job list for user %s, return empty list", userName)
return []
@@ -498,7 +498,7 @@ def GetVC(userName, vcName):
clusterStatus, dummy = DataManager.GetClusterStatus()
clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
- clusterReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"])
+ clusterReservedRes = ResourceInfo(clusterStatus["gpu_reserved"])
user_status = {}
diff --git a/src/utils/MyLogger.py b/src/utils/MyLogger.py
deleted file mode 100755
index ebcd48226..000000000
--- a/src/utils/MyLogger.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from config import global_vars
-import logging
-from logging.config import dictConfig
-import thread
-
-
-class MyLogger:
-
- def init(self):
- if self.logger is None and "logger" in global_vars and global_vars["logger"] is not None:
- self.logger = global_vars["logger"]
-
- def __init__(self):
- self.logger = None
- self.init()
-
- def info(self, msg):
- self.init()
- txt = str(thread.get_ident()) + " : " + msg
- #print txt
- if self.logger is not None:
- self.logger.info(txt)
-
- def error(self, msg):
- self.init()
- txt = str(thread.get_ident()) + " : " + msg
- #print msg
-
- if self.logger is not None:
- self.logger.error(txt)
-
- def warn(self, msg):
- self.init()
- txt = str(thread.get_ident()) + " : " + msg
- #print msg
-
- if self.logger is not None:
- self.logger.warn(txt)
-
- def debug(self, msg):
- self.init()
- print msg
-
- if self.logger is not None:
- self.logger.debug(msg)
-
- def exception(self, msg):
- self.init()
- print msg
-
- if self.logger is not None:
- self.logger.exception(msg)
diff --git a/src/utils/MySQLDataHandler.py b/src/utils/MySQLDataHandler.py
index 03de3baa3..2496fc571 100755
--- a/src/utils/MySQLDataHandler.py
+++ b/src/utils/MySQLDataHandler.py
@@ -1,8 +1,9 @@
-# from config import config
import mysql.connector
import json
import base64
import os
+import logging
+import functools
import timeit
@@ -10,14 +11,36 @@
from config import config
from config import global_vars
-from MyLogger import MyLogger
-logger = MyLogger()
+from prometheus_client import Histogram
+logger = logging.getLogger(__name__)
-class DataHandler:
+data_handler_fn_histogram = Histogram("datahandler_fn_latency_seconds",
+ "latency for executing data handler function (seconds)",
+ buckets=(.05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0,
+ 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, float("inf")),
+ labelnames=("fn_name",))
+db_connect_histogram = Histogram("db_connect_latency_seconds",
+ "latency for connecting to db (seconds)",
+ buckets=(.05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, float("inf")))
+
+def record(fn):
+ @functools.wraps(fn)
+ def wrapped(*args, **kwargs):
+ start = timeit.default_timer()
+ try:
+ return fn(*args, **kwargs)
+ finally:
+ elapsed = timeit.default_timer() - start
+ logger.info("DataHandler: %s, time elapsed %.2fs", fn.__name__, elapsed)
+ data_handler_fn_histogram.labels(fn.__name__).observe(elapsed)
+ return wrapped
+
+
+class DataHandler(object):
def __init__(self):
start_time = timeit.default_timer()
self.database = "DLWSCluster-%s" % config["clusterId"]
@@ -32,18 +55,17 @@ def __init__(self):
username = config["mysql"]["username"]
password = config["mysql"]["password"]
+ self.CreateDatabase()
- self.conn = mysql.connector.connect(user=username, password=password,
- host=server, database=self.database)
+ with db_connect_histogram.time():
+ self.conn = mysql.connector.connect(user=username, password=password,
+ host=server, database=self.database)
- self.CreateDatabase()
self.CreateTable()
elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler initialization, time elapsed %f s" % elapsed)
-
-
+ logger.info("DataHandler initialization, time elapsed %f s", elapsed)
def CreateDatabase(self):
if "initSQLDB" not in global_vars or not global_vars["initSQLDB"]:
@@ -218,41 +240,34 @@ def CreateTable(self):
self.conn.commit()
cursor.close()
-
+ @record
def AddStorage(self, vcName, url, storageType, metadata, defaultMountPath):
try:
- start_time = timeit.default_timer()
sql = "INSERT INTO `"+self.storagetablename+"` (storageType, url, metadata, vcName, defaultMountPath) VALUES (%s,%s,%s,%s,%s)"
cursor = self.conn.cursor()
cursor.execute(sql, (storageType, url, metadata, vcName, defaultMountPath))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: AddStorage to DB: url : %s, vcName: %s , time elapsed %f s" % (url, vcName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
-
+ @record
def DeleteStorage(self, vcName, url):
try:
- start_time = timeit.default_timer()
sql = "DELETE FROM `%s` WHERE url = '%s' and vcName = '%s'" % (self.storagetablename, url, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteStorage: url:%s, vcName:%s, time elapsed %f s" % (url, vcName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
-
+ @record
def ListStorages(self, vcName):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `storageType`,`url`,`metadata`,`vcName`,`defaultMountPath` FROM `%s` WHERE vcName = '%s' " % (self.storagetablename, vcName)
ret = []
@@ -267,49 +282,43 @@ def ListStorages(self, vcName):
record["defaultMountPath"] = defaultMountPath
ret.append(record)
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: ListStorages time elapsed %f s" % (elapsed))
return ret
+ @record
def UpdateStorage(self, vcName, url, storageType, metadata, defaultMountPath):
try:
- start_time = timeit.default_timer()
sql = """update `%s` set storageType = '%s', metadata = '%s', defaultMountPath = '%s' where vcName = '%s' and url = '%s' """ % (self.storagetablename, storageType, metadata, defaultMountPath, vcName, url)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateStorage: vcName: %s, url: %s, time elapsed %f s" % (vcName, url, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def AddVC(self, vcName, quota, metadata):
try:
- start_time = timeit.default_timer()
sql = "INSERT INTO `"+self.vctablename+"` (vcName, quota, metadata) VALUES (%s,%s,%s)"
cursor = self.conn.cursor()
cursor.execute(sql, (vcName, quota, metadata))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: AddVC to DB: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def ListVCs(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `vcName`,`quota`,`metadata` FROM `%s`" % (self.vctablename)
ret = []
@@ -322,49 +331,43 @@ def ListVCs(self):
record["metadata"] = metadata
ret.append(record)
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: ListVCs time elapsed %f s" % (elapsed))
return ret
+ @record
def DeleteVC(self, vcName):
try:
- start_time = timeit.default_timer()
sql = "DELETE FROM `%s` WHERE vcName = '%s'" % (self.vctablename, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteVC: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def UpdateVC(self, vcName, quota, metadata):
try:
- start_time = timeit.default_timer()
sql = """update `%s` set quota = '%s', metadata = '%s' where vcName = '%s' """ % (self.vctablename, quota, metadata, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateVC: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetIdentityInfo(self, identityName):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `identityName`,`uid`,`gid`,`groups` FROM `%s` where `identityName` = '%s'" % (self.identitytablename, identityName)
ret = []
@@ -378,23 +381,20 @@ def GetIdentityInfo(self, identityName):
record["groups"] = json.loads(groups)
ret.append(record)
except Exception as e:
- logger.error('GetIdentityInfo Exception: '+ str(e))
- pass
+ logger.error('GetIdentityInfo Exception: %s', str(e))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetIdentityInfo time elapsed %f s" % (elapsed))
return ret
+ @record
def UpdateIdentityInfo(self, identityName, uid, gid, groups):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
if (isinstance(groups, list)):
groups = json.dumps(groups)
-
+
if len(self.GetIdentityInfo(identityName)) == 0:
sql = "INSERT INTO `"+self.identitytablename+"` (identityName,uid,gid,groups) VALUES (%s,%s,%s,%s)"
cursor.execute(sql, (identityName, uid, gid, groups))
@@ -404,16 +404,14 @@ def UpdateIdentityInfo(self, identityName, uid, gid, groups):
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateIdentityInfo %s to database , time elapsed %f s" % (identityName, elapsed))
return True
except Exception as e:
- logger.error('UpdateIdentityInfo Exception: '+ str(e))
+ logger.error('UpdateIdentityInfo Exception: %s', str(e))
return False
+ @record
def GetAceCount(self, identityName, resource):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM `%s` where `identityName` = '%s' and `resource` = '%s'" % (self.acltablename,identityName, resource)
cursor.execute(query)
@@ -422,14 +420,12 @@ def GetAceCount(self, identityName, resource):
ret = c[0]
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetAceCount time elapsed %f s" % ( elapsed))
return ret
+ @record
def UpdateAce(self, identityName, identityId, resource, permissions, isDeny):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
existingAceCount = self.GetAceCount(identityName, resource)
logger.info(existingAceCount)
@@ -443,34 +439,30 @@ def UpdateAce(self, identityName, identityId, resource, permissions, isDeny):
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateAce %s - %s to database , time elapsed %f s" % (identityName, resource, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def UpdateAclIdentityId(self, identityName, identityId):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
sql = """update `%s` set identityId = '%s' where `identityName` = '%s' """ % (self.acltablename, identityId, identityName)
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateAclIdentityId %s - %s to database , time elapsed %f s" % (identityName, identityId, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def DeleteResourceAcl(self, resource):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
sql = "DELETE FROM `%s` WHERE `resource` = '%s'" % (self.acltablename, resource)
@@ -478,17 +470,15 @@ def DeleteResourceAcl(self, resource):
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteResourceAcl %s, time elapsed %f s" % (resource, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def DeleteAce(self, identityName, resource):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
sql = "DELETE FROM `%s` WHERE `identityName` = '%s' and `resource` = '%s'" % (self.acltablename, identityName, resource)
@@ -496,16 +486,14 @@ def DeleteAce(self, identityName, resource):
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteAce %s : %s time elapsed %f s" % (resource, identityName, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetAcl(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `identityName`,`identityId`,`resource`,`permissions`,`isDeny` FROM `%s`" % (self.acltablename)
ret = []
@@ -520,17 +508,15 @@ def GetAcl(self):
record["isDeny"] = isDeny
ret.append(record)
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetAcl time elapsed %f s" % ( elapsed))
return ret
+ @record
def GetResourceAcl(self, resource):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `identityName`,`identityId`,`resource`,`permissions`,`isDeny` FROM `%s` where `resource` = '%s'" % (self.acltablename, resource)
ret = []
@@ -545,34 +531,29 @@ def GetResourceAcl(self, resource):
record["isDeny"] = isDeny
ret.append(record)
except Exception as e:
- logger.error('Exception: '+ str(e))
- pass
+ logger.error('Exception: %s', str(e))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetResourceAcl time elapsed %f s" % ( elapsed))
return ret
+ @record
def AddJob(self, jobParams):
try:
- start_time = timeit.default_timer()
sql = "INSERT INTO `"+self.jobtablename+"` (jobId, familyToken, isParent, jobName, userName, vcName, jobType,jobParams ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
cursor = self.conn.cursor()
jobParam = base64.b64encode(json.dumps(jobParams))
cursor.execute(sql, (jobParams["jobId"], jobParams["familyToken"], jobParams["isParent"], jobParams["jobName"], jobParams["userName"], jobParams["vcName"], jobParams["jobType"],jobParam))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: added job %s to database, time elapsed %f s" % (jobParams["jobId"], elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
- return False
+ logger.error('Exception: %s', str(e))
+ return False
+ @record
def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or")):
- start_time = timeit.default_timer()
ret = []
cursor = self.conn.cursor()
try:
@@ -592,13 +573,12 @@ def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or"
if num is not None:
query += " limit %s " % str(num)
- start_time1 = timeit.default_timer()
cursor.execute(query)
- elapsed1 = timeit.default_timer() - start_time1
- start_time2 = timeit.default_timer()
+
+ fetch_start_time = timeit.default_timer()
data = cursor.fetchall()
- elapsed2 = timeit.default_timer() - start_time2
- logger.info ("(fetchall time: %f)" % (elapsed2))
+ fetch_elapsed = timeit.default_timer() - fetch_start_time
+ logger.info("(fetchall time: %f)", fetch_elapsed)
for (jobId,jobName,userName, vcName, jobStatus,jobStatusDetail, jobType, jobDescriptionPath, jobDescription, jobTime, endpoints, jobParams,errorMsg, jobMeta) in data:
record = {}
record["jobId"] = jobId
@@ -617,16 +597,13 @@ def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or"
record["jobMeta"] = jobMeta
ret.append(record)
except Exception as e:
- logger.error('Exception: '+ str(e))
- pass
+ logger.error('Exception: %s', str(e))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get job list for user %s , time elapsed %f s (SQL query time: %f)" % (userName, elapsed, elapsed1))
return ret
+ @record
def GetJob(self, **kwargs):
- start_time = timeit.default_timer()
valid_keys = ["jobId", "familyToken", "isParent", "jobName", "userName", "vcName", "jobStatus", "jobType", "jobTime"]
if len(kwargs) != 1: return []
key, expected = kwargs.popitem()
@@ -640,27 +617,23 @@ def GetJob(self, **kwargs):
ret = [dict(zip(columns, row)) for row in cursor.fetchall()]
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get job details with query %s=%s , time elapsed %f s" % (key, expected, elapsed))
return ret
+ @record
def AddCommand(self, jobId, command):
try:
- start_time = timeit.default_timer()
sql = "INSERT INTO `"+self.commandtablename+"` (jobId, command) VALUES (%s,%s)"
cursor = self.conn.cursor()
cursor.execute(sql, (jobId, command))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: add command to database, jobId: %s , time elapsed %f s" % (jobId, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetPendingCommands(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `id`, `jobId`, `command` FROM `%s` WHERE `status` = 'pending' order by `time`" % (self.commandtablename)
cursor.execute(query)
@@ -673,27 +646,23 @@ def GetPendingCommands(self):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get pending command , time elapsed %f s" % (elapsed))
return ret
+ @record
def FinishCommand(self, commandId):
try:
- start_time = timeit.default_timer()
sql = """update `%s` set status = 'run' where `id` = '%s' """ % (self.commandtablename, commandId)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: set command %s as finished , time elapsed %f s" % (commandId, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetCommands(self, jobId):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `time`, `command`, `status`, `output` FROM `%s` WHERE `jobId` = '%s' order by `time`" % (self.commandtablename, jobId)
cursor.execute(query)
@@ -707,8 +676,6 @@ def GetCommands(self, jobId):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get command list for job %s , time elapsed %f s" % (jobId, elapsed))
return ret
def load_json(self, raw_str):
@@ -719,9 +686,9 @@ def load_json(self, raw_str):
except:
return {}
+ @record
def GetPendingEndpoints(self):
try:
- start_time = timeit.default_timer()
jobs = self.GetJob(jobStatus="running")
# [ {endpoint1:{},endpoint2:{}}, {endpoint3:{}, ... }, ... ]
@@ -730,19 +697,33 @@ def GetPendingEndpoints(self):
# endpoint["status"] == "pending"
pendingEndpoints = {k: v for d in endpoints for k, v in d.items() if v["status"] == "pending"}
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get pending endpoints %d, time elapsed %f s" % (len(pendingEndpoints), elapsed))
return pendingEndpoints
except Exception as e:
logger.exception("Query pending endpoints failed!")
return {}
+ @record
+ def GetJobEndpoints(self, job_id):
+ try:
+ jobs = self.GetJob(jobId=job_id)
+
+ # [ {endpoint1:{},endpoint2:{}}, {endpoint3:{}, ... }, ... ]
+ endpoints = map(lambda job: self.load_json(job["endpoints"]), jobs)
+ # {endpoint1: {}, endpoint2: {}, ... }
+ # endpoint["status"] == "pending"
+ endpoints = {k: v for d in endpoints for k, v in d.items()}
+
+ return endpoints
+ except Exception as e:
+ logger.warning("Query job endpoints failed! Job %s", job_id, exc_info=True)
+ return {}
+
+ @record
def GetDeadEndpoints(self):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
# TODO we need job["lastUpdated"] for filtering
- query = "SELECT `endpoints` FROM jobs WHERE `jobStatus` <> 'running' order by `jobTime` DESC"
+ query = "SELECT `endpoints` FROM jobs WHERE `jobStatus` <> 'running' and `jobStatus` <> 'pending' and `jobStatus` <> 'queued' and `jobStatus` <> 'scheduling' order by `jobTime` DESC"
cursor.execute(query)
dead_endpoints = {}
for [endpoints] in cursor:
@@ -750,18 +731,14 @@ def GetDeadEndpoints(self):
dead_endpoints.update(endpoint_list)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get dead endpoints %d , time elapsed %f s" % (len(dead_endpoints), elapsed))
return dead_endpoints
except Exception as e:
- import traceback
- traceback.print_exc()
logger.exception("Query dead endpoints failed!")
return {}
+ @record
def UpdateEndpoint(self, endpoint):
try:
- start_time = timeit.default_timer()
job_id = endpoint["jobId"]
job = self.GetJob(jobId=job_id)[0]
job_endpoints = self.load_json(job["endpoints"])
@@ -774,15 +751,13 @@ def UpdateEndpoint(self, endpoint):
cursor.execute(sql, (json.dumps(job_endpoints), job_id))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: update endpoints to database, endpointId: %s , time elapsed %f s" % (endpoint["id"], elapsed))
return True
except Exception as e:
- logger.exception("Update endpoints failed!")
+ logger.exception("Update endpoints failed! Endpoints: {}".format(endpoint))
return False
+ @record
def GetPendingJobs(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `jobId`,`jobName`,`userName`, `vcName`, `jobStatus`, `jobType`, `jobDescriptionPath`, `jobDescription`, `jobTime`, `endpoints`, `jobParams`,`errorMsg` ,`jobMeta` FROM `%s` where `jobStatus` <> 'error' and `jobStatus` <> 'failed' and `jobStatus` <> 'finished' and `jobStatus` <> 'killed' order by `jobTime` DESC" % (self.jobtablename)
cursor.execute(query)
@@ -805,44 +780,37 @@ def GetPendingJobs(self):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get pending jobs %d, time elapsed %f s" % (len(ret), elapsed))
return ret
-
+ @record
def SetJobError(self, jobId, errorMsg):
try:
- start_time = timeit.default_timer()
sql = """update `%s` set jobStatus = 'error', `errorMsg` = '%s' where `jobId` = '%s' """ % (self.jobtablename, errorMsg, jobId)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: set job %s error status in database, time elapsed %f s" % (jobId, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def UpdateJobTextField(self, jobId, field, value):
try:
- start_time = timeit.default_timer()
sql = "update `%s` set `%s` = '%s' where `jobId` = '%s' " % (self.jobtablename, field, value, jobId)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: update job %s, field %s to %s, time elapsed %f s" % (jobId, field, value, elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetJobTextField(self, jobId, field):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `jobId`, `%s` FROM `%s` where `jobId` = '%s' " % (field, self.jobtablename, jobId)
ret = None
@@ -851,16 +819,14 @@ def GetJobTextField(self, jobId, field):
for (jobId, value) in cursor:
ret = value
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get filed %s of job %s , time elapsed %f s" % (field, jobId, elapsed))
return ret
+ @record
def AddandGetJobRetries(self, jobId):
- start_time = timeit.default_timer()
sql = """update `%s` set `retries` = `retries` + 1 where `jobId` = '%s' """ % (self.jobtablename, jobId)
cursor = self.conn.cursor()
cursor.execute(sql)
@@ -876,29 +842,25 @@ def AddandGetJobRetries(self, jobId):
ret = value
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get and update retries for job %s , time elapsed %f s" % (jobId, elapsed))
return ret
+ @record
def UpdateClusterStatus(self, clusterStatus):
try:
status = base64.b64encode(json.dumps(clusterStatus))
- start_time = timeit.default_timer()
sql = "INSERT INTO `%s` (status) VALUES ('%s')" % (self.clusterstatustablename, status)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: update cluster status, time elapsed %f s" % (elapsed))
return True
except Exception as e:
- logger.error('Exception: '+ str(e))
+ logger.error('Exception: %s', str(e))
return False
+ @record
def GetClusterStatus(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `time`, `status` FROM `%s` order by `time` DESC limit 1" % (self.clusterstatustablename)
ret = None
@@ -909,17 +871,14 @@ def GetClusterStatus(self):
ret = json.loads(base64.b64decode(value))
time = t
except Exception as e:
- logger.error('Exception: '+ str(e))
- pass
+ logger.error('Exception: %s', str(e))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get cluster status , time elapsed %f s" % (elapsed))
return ret, time
+ @record
def GetUsers(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT `identityName`,`uid` FROM `%s`" % (self.identitytablename)
ret = []
@@ -928,14 +887,12 @@ def GetUsers(self):
for (identityName,uid) in cursor:
ret.append((identityName,uid))
except Exception as e:
- logger.error('Exception: '+ str(e))
- pass
+ logger.error('Exception: %s', str(e))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info("DataHandler: get users, time elapsed %f s" % (elapsed))
return ret
+ @record
def GetActiveJobsCount(self):
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM `%s` where `jobStatus` <> 'error' and `jobStatus` <> 'failed' and `jobStatus` <> 'finished' and `jobStatus` <> 'killed' " % (self.jobtablename)
@@ -948,6 +905,7 @@ def GetActiveJobsCount(self):
return ret
+ @record
def GetALLJobsCount(self):
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM `%s`" % (self.jobtablename)
diff --git a/src/utils/SQLDataHandler.py b/src/utils/SQLDataHandler.py
index ca7e31581..f352ebea4 100755
--- a/src/utils/SQLDataHandler.py
+++ b/src/utils/SQLDataHandler.py
@@ -3,6 +3,7 @@
import json
import base64
import os
+import logging
import timeit
@@ -11,16 +12,16 @@
from config import config
from config import global_vars
-from MyLogger import MyLogger
+from MySQLDataHandler import record
-logger = MyLogger()
+logger = logging.getLogger(__name__)
### set to a larger number if flask is running on multithreading
sql_max_connect_num = 35
sql_live_connect_num = 25
-class SQLConnManager:
+class SQLConnManager(object):
@staticmethod
def Connect():
@@ -142,7 +143,7 @@ def ReturnConnection(conn):
global_vars["sql_lock"].release()
return None
-class DataHandler:
+class DataHandler(object):
def __init__(self):
start_time = timeit.default_timer()
self.CreateDatabase()
@@ -150,7 +151,7 @@ def __init__(self):
logger.debug ("********************** created a new Data Handler *******************")
self.conn = SQLConnManager.GetConnection()
logger.debug ("Get database connection %s" % str(self.conn))
-
+
#print "Connecting to server ..."
self.jobtablename = "jobs-%s" % config["clusterId"]
self.acltablename = "acl-%s" % config["clusterId"]
@@ -164,8 +165,6 @@ def __init__(self):
elapsed = timeit.default_timer() - start_time
logger.debug ("DataHandler initialization, time elapsed %f s" % elapsed)
-
-
def CreateDatabase(self):
if "initSQLDB" not in global_vars or not global_vars["initSQLDB"]:
logger.info("===========init SQL database===============")
@@ -327,7 +326,7 @@ def CreateTable(self):
self.conn.commit()
cursor.close()
-
+
sql = """
if not exists (select * from sysobjects where name='%s' and xtype='U')
CREATE TABLE [dbo].[%s]
@@ -349,41 +348,34 @@ def CreateTable(self):
self.conn.commit()
cursor.close()
-
+ @record
def AddStorage(self, vcName, url, storageType, metadata, defaultMountPath):
try:
- start_time = timeit.default_timer()
sql = "INSERT INTO [%s] (storageType, url, metadata, vcName, defaultMountPath) VALUES (?,?,?,?,?)""" % self.storagetablename
cursor = self.conn.cursor()
cursor.execute(sql, (storageType, url, metadata, vcName, defaultMountPath))
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: AddStorage to DB: url : %s, vcName: %s , time elapsed %f s" % (url, vcName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def DeleteStorage(self, vcName, url):
try:
- start_time = timeit.default_timer()
sql = "DELETE FROM [%s] WHERE [url] = '%s' and [vcName] = '%s'" % (self.storagetablename, url, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteStorage: url:%s, vcName:%s, time elapsed %f s" % (url, vcName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def ListStorages(self, vcName):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [vcName],[url],[storageType],[metadata],[defaultMountPath] FROM [%s] WHERE [vcName] = '%s' " % (self.storagetablename, vcName)
ret = []
@@ -402,45 +394,36 @@ def ListStorages(self, vcName):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: ListStorages time elapsed %f s" % (elapsed))
return ret
-
+ @record
def UpdateStorage(self, vcName, url, storageType, metadata, defaultMountPath):
try:
- start_time = timeit.default_timer()
sql = """update [%s] set storageType = '%s', metadata = '%s', defaultMountPath = '%s' where [vcName] = '%s' and [url] = '%s' """ % (self.storagetablename, storageType, metadata, defaultMountPath, vcName, url)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateStorage: vcName: %s, url: %s, time elapsed %f s" % (vcName, url, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def AddVC(self, vcName, quota, metadata):
try:
- start_time = timeit.default_timer()
sql = """INSERT INTO [%s] (vcName, quota, metadata) VALUES (?,?,?)""" % self.vctablename
cursor = self.conn.cursor()
cursor.execute(sql, vcName, quota, metadata)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: AddVC to DB: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def ListVCs(self, vcName):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [vcName],[quota],[metadata] FROM [%s]" % (self.vctablename)
ret = []
@@ -457,45 +440,36 @@ def ListVCs(self, vcName):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: ListVCs time elapsed %f s" % ( elapsed))
- return ret
-
+ return ret
+ @record
def DeleteVC(self, vcName):
try:
- start_time = timeit.default_timer()
sql = "DELETE FROM [%s] WHERE [vcName] = '%s'" % (self.vctablename, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteVC: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def UpdateVC(self, vcName, quota, metadata):
try:
- start_time = timeit.default_timer()
sql = """update [%s] set quota = '%s', metadata = '%s' where [vcName] = '%s'""" % (self.vctablename, quota, metadata, vcName)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateVC: vcName: %s , time elapsed %f s" % (vcName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetIdentityInfo(self, identityName):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [identityName],[uid],[gid],[groups] FROM [%s] where [identityName] = '%s'" % (self.identitytablename, identityName)
ret = []
@@ -513,35 +487,29 @@ def GetIdentityInfo(self, identityName):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetIdentityInfo time elapsed %f s" % (elapsed))
- return ret
-
+ return ret
+ @record
def UpdateIdentityInfo(self, identityName, uid, gid, groups):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
-
+
if len(self.GetIdentityInfo(identityName)) == 0:
sql = """INSERT INTO [%s] (identityName,uid,gid,groups) VALUES (?,?,?,?)""" % self.identitytablename
cursor.execute(sql, identityName, uid, gid, json.dumps(groups))
else:
sql = """update [%s] set uid = '%s', gid = '%s', groups = '%s' where [identityName] = '%s' """ % (self.identitytablename, uid, gid, groups, identityName)
cursor.execute(sql)
-
+
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateIdentityInfo %s to database , time elapsed %f s" % (identityName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetAceCount(self, identityId, resource):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM [%s] where [identityId] = '%s' and [resource] = '%s'" % (self.acltablename,identityId, resource)
cursor.execute(query)
@@ -550,88 +518,73 @@ def GetAceCount(self, identityId, resource):
ret = c[0]
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetAceCount time elapsed %f s" % ( elapsed))
- return ret
-
+ return ret
+ @record
def UpdateAce(self, identityName, identityId, resource, permissions, isDeny):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
-
+
if self.GetAceCount(identityId, resource) == 0:
sql = """INSERT INTO [%s] (identityName,identityId,resource,permissions,isDeny) VALUES (?,?,?,?,?)""" % self.acltablename
cursor.execute(sql, identityName, identityId, resource, permissions, isDeny)
else:
sql = """update [%s] set permissions = '%s' where [identityName] = '%s' and [resource] = '%s' """ % (self.acltablename, permissions, identityName, resource)
cursor.execute(sql)
-
+
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateAce %s - %s to database , time elapsed %f s" % (identityName, resource, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def UpdateAclIdentityId(self, identityName, identityId):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
sql = """update [%s] set identityName = '%s' where [identityName] = '%s' """ % (self.acltablename, identityId, identityName)
cursor.execute(sql)
-
+
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: UpdateAclIdentityId %s - %s to database , time elapsed %f s" % (identityName, identityId, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def DeleteResourceAcl(self, resource):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
-
+
sql = "DELETE FROM [%s] WHERE [resource] = '%s'" % (self.acltablename, resource)
cursor = self.conn.cursor()
-
+
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteResourceAcl %s, time elapsed %f s" % (resource, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def DeleteAce(self, identityName, resource):
try:
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
-
+
sql = "DELETE FROM [%s] WHERE [identityName] = '%s' and [resource] = '%s'" % (self.acltablename, identityName, resource)
cursor = self.conn.cursor()
-
+
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: DeleteAce %s : %s, time elapsed %f s" % (resource, identityName, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetAcl(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [identityName],[identityId],[resource],[permissions],[isDeny] FROM [%s]" % (self.acltablename)
ret = []
@@ -650,13 +603,10 @@ def GetAcl(self):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetAcl time elapsed %f s" % ( elapsed))
- return ret
-
+ return ret
+ @record
def GetResourceAcl(self, resource):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [identityName],[identityId],[resource],[permissions],[isDeny] FROM [%s] where [resource] = '%s'" % (self.acltablename, resource)
ret = []
@@ -675,30 +625,24 @@ def GetResourceAcl(self, resource):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: GetResourceAcl time elapsed %f s" % ( elapsed))
- return ret
-
+ return ret
+ @record
def AddJob(self, jobParams):
try:
- start_time = timeit.default_timer()
sql = """INSERT INTO [%s] (jobId, familyToken, isParent, jobName, userName, vcName, jobType,jobParams ) VALUES (?,?,?,?,?,?,?)""" % self.jobtablename
cursor = self.conn.cursor()
jobParam = base64.b64encode(json.dumps(jobParams))
cursor.execute(sql, jobParams["jobId"], jobParams["familyToken"], jobParams["isParent"], jobParams["jobName"], jobParams["userName"], jobParams["vcName"], jobParams["jobType"],jobParam)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: added job %s to database, time elapsed %f s" % (jobParams["jobId"],elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or")):
- start_time = timeit.default_timer()
ret = []
cursor = self.conn.cursor()
try:
@@ -715,16 +659,14 @@ def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or"
else:
status_list = [ " [jobStatus] %s '%s' " % (op[0],s) for s in status.split(',')]
status_statement = (" "+op[1]+" ").join(status_list)
- query += " and ( %s ) " % status_statement
+ query += " and ( %s ) " % status_statement
query += " order by [jobTime] Desc"
- start_time1 = timeit.default_timer()
cursor.execute(query)
- elapsed1 = timeit.default_timer() - start_time1
- start_time2 = timeit.default_timer()
+ fetch_start = timeit.default_timer()
data = cursor.fetchall()
- elapsed2 = timeit.default_timer() - start_time2
- logger.info ("(fetchall time: %f)" % (elapsed2))
+ fetch_time = timeit.default_timer() - fetch_start
+ logger.info ("(fetchall time: %f)" % (fetch_time))
for (jobId,jobName,userName, vcName,jobStatus,jobStatusDetail, jobType, jobDescriptionPath, jobDescription, jobTime, endpoints, jobParams,errorMsg, jobMeta) in data:
record = {}
record["jobId"] = jobId
@@ -744,16 +686,12 @@ def GetJobList(self, userName, vcName, num = None, status = None, op = ("=","or"
ret.append(record)
except Exception as e:
logger.error('Exception: '+ str(e))
- pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get job list for user %s , time elapsed %f s (SQL query time: %f)" % (userName, elapsed, elapsed1))
return ret
-
+ @record
def GetJob(self, **kwargs):
- start_time = timeit.default_timer()
valid_keys = ["jobId", "familyToken", "isParent", "jobName", "userName", "vcName", "jobStatus", "jobType", "jobTime"]
if len(kwargs) != 1: return []
key, expected = kwargs.popitem()
@@ -767,29 +705,23 @@ def GetJob(self, **kwargs):
ret = [dict(zip(columns, row)) for row in cursor.fetchall()]
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get job details with query %s=%s , time elapsed %f s" % (key, expected, elapsed))
return ret
-
+ @record
def AddCommand(self,jobId,command):
try:
- start_time = timeit.default_timer()
sql = """INSERT INTO [%s] (jobId, command) VALUES (?,?)""" % self.commandtablename
cursor = self.conn.cursor()
cursor.execute(sql, jobId, command)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: add command to database, jobId: %s , time elapsed %f s" % (jobId, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetPendingCommands(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [id], [jobId], [command] FROM [%s] WHERE [status] = 'pending' order by [time]" % (self.commandtablename)
cursor.execute(query)
@@ -802,29 +734,23 @@ def GetPendingCommands(self):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get pending command , time elapsed %f s" % (elapsed))
- return ret
-
+ return ret
+ @record
def FinishCommand(self,commandId):
try:
- start_time = timeit.default_timer()
sql = """update [%s] set status = 'run' where [id] = '%s' """ % (self.commandtablename, commandId)
cursor = self.conn.cursor()
cursor.execute(sql)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: set command %s as finished , time elapsed %f s" % (commandId, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetCommands(self, jobId):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [time], [command], [status], [output] FROM [%s] WHERE [jobId] = '%s' order by [time]" % (self.commandtablename, jobId)
cursor.execute(query)
@@ -838,13 +764,10 @@ def GetCommands(self, jobId):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get command list for job %s , time elapsed %f s" % (jobId, elapsed))
- return ret
-
+ return ret
+ @record
def GetPendingJobs(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [jobId],[jobName],[userName], [vcName], [jobStatus], [jobType], [jobDescriptionPath], [jobDescription], [jobTime], [endpoints], [jobParams],[errorMsg] ,[jobMeta] FROM [%s] where [jobStatus] <> 'error' and [jobStatus] <> 'failed' and [jobStatus] <> 'finished' and [jobStatus] <> 'killed' order by [jobTime] DESC" % (self.jobtablename)
cursor.execute(query)
@@ -867,45 +790,36 @@ def GetPendingJobs(self):
ret.append(record)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get pending jobs , time elapsed %f s" % (elapsed))
- return ret
-
+ return ret
+ @record
def SetJobError(self,jobId,errorMsg):
try:
- start_time = timeit.default_timer()
sql = """update [%s] set jobStatus = 'error', [errorMsg] = ? where [jobId] = '%s' """ % (self.jobtablename,jobId)
cursor = self.conn.cursor()
cursor.execute(sql,errorMsg)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: set job %s error status in database, time elapsed %f s" % (jobId, elapsed))
return True
except Exception as e:
logger.error('Exception: '+ str(e))
- return False
-
+ return False
+ @record
def UpdateJobTextField(self,jobId,field,value):
try:
- start_time = timeit.default_timer()
sql = """update [%s] set [%s] = ? where [jobId] = '%s' """ % (self.jobtablename,field, jobId)
cursor = self.conn.cursor()
cursor.execute(sql,value)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: update job %s, field %s , time elapsed %f s" % (jobId, field, elapsed))
return True
except Exception, e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetJobTextField(self,jobId,field):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [jobId], [%s] FROM [%s] where [jobId] = '%s' " % (field, self.jobtablename,jobId)
ret = None
@@ -918,12 +832,10 @@ def GetJobTextField(self,jobId,field):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get filed %s of job %s , time elapsed %f s" % (field, jobId, elapsed))
return ret
+ @record
def AddandGetJobRetries(self,jobId):
- start_time = timeit.default_timer()
sql = """update [%s] set [retries] = [retries] + 1 where [jobId] = '%s' """ % (self.jobtablename, jobId)
cursor = self.conn.cursor()
cursor.execute(sql)
@@ -939,30 +851,24 @@ def AddandGetJobRetries(self,jobId):
ret = value
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get and update retries for job %s , time elapsed %f s" % (jobId, elapsed))
return ret
-
+ @record
def UpdateClusterStatus(self,clusterStatus):
try:
- start_time = timeit.default_timer()
sql = """INSERT INTO [%s] (status) VALUES (?)""" % self.clusterstatustablename
cursor = self.conn.cursor()
status = base64.b64encode(json.dumps(clusterStatus))
cursor.execute(sql,status)
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: update cluster status, time elapsed %f s" % (elapsed))
return True
except Exception, e:
logger.error('Exception: '+ str(e))
return False
-
+ @record
def GetClusterStatus(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT TOP 1 [time], [status] FROM [%s] order by [time] DESC" % (self.clusterstatustablename)
ret = None
@@ -977,13 +883,10 @@ def GetClusterStatus(self):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get cluster status , time elapsed %f s" % (elapsed))
return ret, time
-
+ @record
def GetUsers(self):
- start_time = timeit.default_timer()
cursor = self.conn.cursor()
query = "SELECT [identityName],[uid] FROM [%s]" % (self.identitytablename)
ret = []
@@ -996,11 +899,9 @@ def GetUsers(self):
pass
self.conn.commit()
cursor.close()
- elapsed = timeit.default_timer() - start_time
- logger.info ("DataHandler: get users, time elapsed %f s" % ( elapsed))
return ret
-
+ @record
def GetActiveJobsCount(self):
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM [%s] where [jobStatus] <> 'error' and [jobStatus] <> 'failed' and [jobStatus] <> 'finished' and [jobStatus] <> 'killed' " % (self.jobtablename)
@@ -1011,8 +912,9 @@ def GetActiveJobsCount(self):
self.conn.commit()
cursor.close()
- return ret
+ return ret
+ @record
def GetALLJobsCount(self):
cursor = self.conn.cursor()
query = "SELECT count(ALL id) as c FROM [%s]" % (self.jobtablename)
@@ -1023,7 +925,7 @@ def GetALLJobsCount(self):
self.conn.commit()
cursor.close()
- return ret
+ return ret
def __del__(self):
logger.debug("********************** deleted a DataHandler instance *******************")
@@ -1039,7 +941,7 @@ def Close(self):
CREATE_TABLE = False
CREATE_DB = True
dataHandler = DataHandler()
-
+
if TEST_INSERT_JOB:
jobParams = {}
jobParams["id"] = "dist-tf-00001"
@@ -1047,9 +949,8 @@ def Close(self):
jobParams["user-id"] = "hongzl"
jobParams["job-meta-path"] = "/dlws/jobfiles/***"
jobParams["job-meta"] = "ADSCASDcAE!EDASCASDFD"
-
+
dataHandler.AddJob(jobParams)
-
if CREATE_TABLE:
dataHandler.CreateTable()
diff --git a/src/utils/authorization.py b/src/utils/authorization.py
index 112f45c4e..4145102fa 100755
--- a/src/utils/authorization.py
+++ b/src/utils/authorization.py
@@ -1,5 +1,5 @@
from DataHandler import DataHandler, DataManager
-from MyLogger import MyLogger
+import logging
import json
import requests
import random
@@ -7,7 +7,7 @@
import timeit
from cache import fcache
-logger = MyLogger()
+logger = logging.getLogger(__name__)
def enum(*sequential, **named):
enums = dict(zip(sequential, range(len(sequential))), **named)
diff --git a/src/utils/cache.py b/src/utils/cache.py
index d3f51b538..a3a174447 100644
--- a/src/utils/cache.py
+++ b/src/utils/cache.py
@@ -1,94 +1,97 @@
-from functools import wraps
-import threading
-from datetime import datetime
-from datetime import timedelta
-import time
-import Queue
-import copy
-
-# decorator (with different TTL for each function)
-# No removal of entries (designed for small number of entries to be always kept in memory)
-# option to invalidate specific entries
-# non-blocking (updates done by background thread; stale data returned while fetching in progress) : avoids thundering herd for data source
-def fcache(TTLInSec=30):
- def fcache_decorator(func):
- @wraps(func)
- def wrapped_function(*args, **kwargs):
- val = CacheManager.GetValue(func, TTLInSec, args)
- if (None == val):
- return func(*args)
- return copy.deepcopy(val[0])
- return wrapped_function
- return fcache_decorator
-
-
-
-class CacheManager(object):
- data = {}
- taskQueue = Queue.Queue()
- pendingTasks = set()
-
- @staticmethod
- def Invalidate(funcName, *args):
- key = CacheManager._GetKey(funcName, args)
- if key in CacheManager.data:
- val = CacheManager.data[key]
- CacheManager.data[key] = [val[0], datetime.now()]
- print("Cache invalidated " + key)
-
- @staticmethod
- def GetValue(func, ttl, args):
- val = None
- key = CacheManager._GetKey(func.__name__, args)
- needUpdate = False
- if key not in CacheManager.data:
- print("Cache miss " + key)
- needUpdate = True
- else:
- val = CacheManager.data[key]
- print("Cache hit " + key + " " + str(val[1]) + " " + str(len(CacheManager.data)) + " " + str(CacheManager.taskQueue.qsize()))
- if CacheManager._Invalid(val):
- needUpdate = True
-
- if needUpdate and key not in CacheManager.pendingTasks:
- CacheManager.taskQueue.put((func, ttl, args))
- CacheManager.pendingTasks.add(key)
-
- return val
-
-
- @staticmethod
- def _GetKey(funcName, args):
- key = funcName
- for arg in args:
- key += "__"
- key += str(arg)
- return key
-
- @staticmethod
- def _Invalid(value):
- if value[1] < datetime.now():
- return True
- return False
-
- @staticmethod
- def _WorkerThreadFunc():
- while (True):
- try:
- while not CacheManager.taskQueue.empty():
- task = CacheManager.taskQueue.get()
- key = CacheManager._GetKey(task[0].__name__, task[2])
- if key in CacheManager.pendingTasks:
- if key not in CacheManager.data or CacheManager._Invalid(CacheManager.data[key]):
- result = task[0](*(task[2]))
- CacheManager.data[key] = [result, datetime.now() + timedelta(seconds=int(task[1]))]
- print("Cache inserted " + key)
- CacheManager.pendingTasks.remove(key)
- time.sleep(0.001)
- except Exception as e:
- print('cache exception: '+ str(e))
-
-workerThread = threading.Thread(target=CacheManager._WorkerThreadFunc, args=())
-workerThread.daemon = True
-workerThread.start()
-
+from functools import wraps
+import threading
+from datetime import datetime
+from datetime import timedelta
+import time
+import Queue
+import copy
+import logging
+
+logger = logging.getLogger(__name__)
+
+# decorator (with different TTL for each function)
+# No removal of entries (designed for small number of entries to be always kept in memory)
+# option to invalidate specific entries
+# non-blocking (updates done by background thread; stale data returned while fetching in progress) : avoids thundering herd for data source
+def fcache(TTLInSec=30):
+ def fcache_decorator(func):
+ @wraps(func)
+ def wrapped_function(*args, **kwargs):
+ val = CacheManager.GetValue(func, TTLInSec, args)
+ if (None == val):
+ return func(*args)
+ return copy.deepcopy(val[0])
+ return wrapped_function
+ return fcache_decorator
+
+
+
+class CacheManager(object):
+ data = {}
+ taskQueue = Queue.Queue()
+ pendingTasks = set()
+
+ @staticmethod
+ def Invalidate(funcName, *args):
+ key = CacheManager._GetKey(funcName, args)
+ if key in CacheManager.data:
+ val = CacheManager.data[key]
+ CacheManager.data[key] = [val[0], datetime.now()]
+ logger.info("Cache invalidated %s", key)
+
+ @staticmethod
+ def GetValue(func, ttl, args):
+ val = None
+ key = CacheManager._GetKey(func.__name__, args)
+ needUpdate = False
+ if key not in CacheManager.data:
+ logger.info("Cache miss %s", key)
+ needUpdate = True
+ else:
+ val = CacheManager.data[key]
+ logger.info("Cache hit %s %s %s %s", key, str(val[1]), str(len(CacheManager.data)), str(CacheManager.taskQueue.qsize()))
+ if CacheManager._Invalid(val):
+ needUpdate = True
+
+ if needUpdate and key not in CacheManager.pendingTasks:
+ CacheManager.taskQueue.put((func, ttl, args))
+ CacheManager.pendingTasks.add(key)
+
+ return val
+
+
+ @staticmethod
+ def _GetKey(funcName, args):
+ key = funcName
+ for arg in args:
+ key += "__"
+ key += str(arg)
+ return key
+
+ @staticmethod
+ def _Invalid(value):
+ if value[1] < datetime.now():
+ return True
+ return False
+
+ @staticmethod
+ def _WorkerThreadFunc():
+ while (True):
+ try:
+ while not CacheManager.taskQueue.empty():
+ task = CacheManager.taskQueue.get()
+ key = CacheManager._GetKey(task[0].__name__, task[2])
+ if key in CacheManager.pendingTasks:
+ if key not in CacheManager.data or CacheManager._Invalid(CacheManager.data[key]):
+ result = task[0](*(task[2]))
+ CacheManager.data[key] = [result, datetime.now() + timedelta(seconds=int(task[1]))]
+ logger.info("Cache inserted %s", key)
+ CacheManager.pendingTasks.remove(key)
+ time.sleep(0.001)
+ except Exception as e:
+ logger.warning('cache exception: %s', str(e))
+
+workerThread = threading.Thread(target=CacheManager._WorkerThreadFunc, args=())
+workerThread.daemon = True
+workerThread.start()
+
diff --git a/src/utils/config.py b/src/utils/config.py
index 7c19c84ad..25d6e572e 100755
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -1,11 +1,11 @@
import yaml
import os
-from Queue import Queue
+from Queue import Queue
import threading
try:
- f = open(os.path.join(os.path.dirname(os.path.realpath(__file__)),"config.yaml"))
- config = yaml.load(f)
+ f = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.yaml"))
+ config = yaml.full_load(f)
except Exception:
config = {}
()
diff --git a/src/utils/config.yaml b/src/utils/config.yaml
new file mode 100644
index 000000000..7f3e1964e
--- /dev/null
+++ b/src/utils/config.yaml
@@ -0,0 +1,36 @@
+# It's a dummy config, for unit test only!!!
+
+# database :
+# hostname :
+# username :
+# password :
+# database :
+# mysql :
+# hostname : hao-dev-infra01.eastus.cloudapp.azure.com
+# port : 3306
+# username : root
+#     password : <redacted>  # never commit credentials, even in commented-out examples
+# kubelet-path : /usr/local/bin/kubectl
+storage-mount-path : /dlwsdata
+# root-path : /DLWorkspace/src/
+root-path : ".." # for unittest only
+# nvidiaDriverPath : /opt/nvidia-driver/current
+# clusterId : b6815f6d-612c-4feb-a8f3-bc7ceaa43993
+# domain : eastus.cloudapp.azure.com
+# apiserver: https://hao-dev-infra01.eastus.cloudapp.azure.com:1443
+# certificate-authority: /etc/kubernetes/ssl/ca.pem
+# client-certificate: /etc/kubernetes/ssl/apiserver.pem
+# client-key: /etc/kubernetes/ssl/apiserver-key.pem
+# pod_ip_range: 10.2.0.0/16
+# per_user_gpu_limit: -1
+rest-api: http://faked.uri/
+# usefreeflow: False
+# mountdescription : {'hdfs': 'Hadoop file system (replicated distribute storage).', 'azurefileshare': 'Azure file storage', 'localHDD': 'Local HDD. ', 'glusterfs': 'GlusterFS (replicated distributed storage)', 'nfs': 'NFS (remote file share)', 'emptyDir': 'Kubernetes emptyDir (folder will be erased after job termination).', 'local': 'Local SSD. '}
+# mountpoints : {'rootshare': {'curphysicalmountpoint': '/mntdlws/nfs', 'mountpoints': '', 'server': '192.168.255.1', 'type': 'nfs', 'filesharename': '/data/share'}}
+# mounthomefolder : yes
+# deploymounts : []
+# default-storage-folders : ['jobfiles', 'storage', 'work', 'namenodeshare']
+# webportal_node: hao-dev-infra01.eastus.cloudapp.azure.com
+# datasource : MySQL
+# kube_custom_scheduler: False
+# WinbindServers: []
diff --git a/src/utils/k8sUtils.py b/src/utils/k8sUtils.py
index 7f6943d65..e90713f3b 100755
--- a/src/utils/k8sUtils.py
+++ b/src/utils/k8sUtils.py
@@ -9,6 +9,7 @@
from tzlocal import get_localzone
import pytz
+import logging
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage"))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils"))
@@ -28,6 +29,7 @@
import pycurl
from StringIO import StringIO
+logger = logging.getLogger(__name__)
def curl_get(url):
curl = pycurl.Curl()
@@ -53,7 +55,7 @@ def kubectl_create(jobfile, EXEC=True):
try:
output = subprocess32.check_output(["bash", "-c", config["kubelet-path"] + " create -f " + jobfile])
except Exception as e:
- print e
+ logger.exception("kubectl create")
output = ""
else:
output = "Job " + jobfile + " is not submitted to kubernetes cluster"
@@ -64,10 +66,10 @@ def kubectl_delete(jobfile, EXEC=True):
if EXEC:
try:
cmd = "bash -c '" + config["kubelet-path"] + " delete -f " + jobfile + "'"
- print cmd
+ logger.info("executing %s", cmd)
output = os.system(cmd)
except Exception as e:
- print e
+ logger.exception("kubectl delete")
output = -1
else:
output = -1
@@ -83,7 +85,7 @@ def kubectl_exec(params, timeout=None):
# TODO set the timeout
output = subprocess32.check_output(["bash", "-c", config["kubelet-path"] + " " + params], timeout=timeout)
except Exception as e:
- print "EXCEPTION: " + str(e)
+ logger.exception("kubectl exec")
output = ""
return output
@@ -166,7 +168,7 @@ def GetPod(selector):
output = kubectl_exec(" get pod -o yaml --show-all -l " + selector)
podInfo = yaml.load(output)
except Exception as e:
- print e
+ logger.exception("kubectl get pod")
podInfo = None
return podInfo
@@ -355,19 +357,6 @@ def GetJobStatus(jobId):
return output, detail
-def all_pod_ready(job_id):
- pods = GetPod("run=" + job_id)
- print("\n\n\n--------------------------------------------\n\n")
- print("=======%s" % pods)
- if pods is None:
- return False
- if "items" in pods:
- pod_status = [check_pod_status(pod) for pod in pods["items"]]
- if any([status != "Running" for status in pod_status]):
- return False
- return True
-
-
def get_node_labels(key):
url = "%s/api/v1/nodes" % (config["apiserver"])
responseStr = curl_get(url)
diff --git a/src/utils/notify.py b/src/utils/notify.py
new file mode 100644
index 000000000..584f99c7f
--- /dev/null
+++ b/src/utils/notify.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+import threading
+import time
+import logging
+import urlparse
+import json
+import smtplib
+
+from Queue import Queue
+from Queue import Empty
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+class NotifyMsg(object):
+ def __init__(self, email, alert_name):
+ self.email = email
+ self.alert_name = alert_name
+
+ def labels(self):
+ raise NotImplementedError()
+
+ def subject(self):
+ raise NotImplementedError()
+
+ def body(self):
+ return self.subject()
+
+
+class JobStateChangedMsg(NotifyMsg):
+ def __init__(self, email, alert_name, job_name, job_state):
+ super(JobStateChangedMsg, self).__init__(email, alert_name)
+ self.job_name = job_name
+ self.job_state = job_state
+
+ def labels(self):
+ return {"job_name": self.job_name, "job_state": self.job_state}
+
+ def subject(self):
+ return "Your job %s has changed to state of %s" % (self.job_name, self.job_state)
+
+
+def new_job_state_change_message(email, job_name, state):
+ return JobStateChangedMsg(email, "job-state-changed", job_name, state)
+
+
+class Notifier(object):
+ def __init__(self, config):
+ self.queue = Queue()
+ self.running = False
+ self.thread = None
+
+ self.cluster = None
+ self.alert_manager_url = None
+ self.smtp_url = self.smtp_from = self.smtp_auth_name = self.smtp_auth_pass = None
+
+ if config is not None and "notifier" in config:
+ notifier_config = config["notifier"]
+
+ self.cluster = notifier_config.get("cluster")
+ self.smtp_url = notifier_config.get("smtp-url")
+ self.smtp_from = notifier_config.get("smtp-from")
+ self.smtp_auth_name = notifier_config.get("smtp-auth-username")
+ self.smtp_auth_pass = notifier_config.get("smtp-auth-password")
+
+ alert_manager_url = notifier_config.get("alert-manager-url")
+ if alert_manager_url is not None and len(alert_manager_url) > 0:
+ if alert_manager_url[-1] == "/":
+ self.alert_manager_url = alert_manager_url + "api/v1/alerts"
+ else:
+ self.alert_manager_url = alert_manager_url + "/api/v1/alerts"
+
+ if self.cluster is None or \
+ self.alert_manager_url is None and (
+ self.smtp_url is None or \
+ self.smtp_from is None or \
+ self.smtp_auth_name is None or \
+ self.smtp_auth_pass is None):
+ logger.warning("Notifier not configured")
+
+ def start(self):
+ if not self.running:
+ self.running = True
+ self.thread = threading.Thread(target=self.process, name="notifier")
+ self.thread.start()
+
+ def stop(self):
+ if self.running:
+ self.running = False
+ self.thread.join()
+ self.thread = None
+
+ def notify(self, msg):
+ self.queue.put(msg)
+
+ def process(self):
+ while self.running:
+ try:
+ msg = self.queue.get(block=True, timeout=1) # 1s timeout
+ except Empty:
+ continue
+
+ retry_count = 0
+ sent = False
+
+ while retry_count < 3:
+ if self.send(msg):
+ sent = True
+ break
+ time.sleep(0.2)
+ retry_count += 1
+
+ if not sent:
+ logger.error("failed to send out, discard msg: %s", msg)
+
+ def send(self, msg):
+ subject = msg.subject()
+
+ try:
+ if self.alert_manager_url is not None:
+ labels = msg.labels()
+ labels.update({
+ "alertname": msg.alert_name,
+ "type": "user_alert",
+ "cluster": self.cluster,
+ "user_email": msg.email,
+ "subject": subject,
+ })
+
+ resp = requests.post(self.alert_manager_url, timeout=5,
+ data=json.dumps([{"labels": labels}]))
+ resp.raise_for_status()
+ return True
+ elif self.smtp_url is not None and \
+ self.smtp_from is not None and \
+ self.smtp_auth_name is not None and \
+ self.smtp_auth_pass is not None:
+ smtp_send_email(self.smtp_url, self.smtp_from,
+ self.smtp_auth_name, self.smtp_auth_pass,
+ msg.email, subject, msg.body())
+ return True
+ else:
+ # not configured, discard message
+ return True
+ except Exception as e:
+ logger.exception("sending email failed")
+ return False
+
+
+def smtp_send_email(smtp_url, smtp_from, smtp_auth_name, smtp_auth_pass, to, subject, body):
+ msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (smtp_from, to, subject, body)
+ conn = smtplib.SMTP(smtp_url)
+ conn.starttls()
+ conn.login(smtp_auth_name, smtp_auth_pass)
+ conn.sendmail(smtp_from, to, msg)
+
+
+if __name__ == "__main__":
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+ notifier = Notifier({"notifier": {"cluster": "local", "alert-manager-url": "http://localhost:9093/alert-manager"}})
+ notifier.start()
+
+ notifier.notify(new_job_state_change_message("dixu@microsoft.com", "job-id", "stopped"))
+ notifier.stop()