Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
[Multiple Hardwares] [Device Plugin] Read computing devices from layo…
Browse files Browse the repository at this point in the history
…ut.yaml (#5168)

* fix

* fix

* fix
  • Loading branch information
hzy46 committed Dec 16, 2020
1 parent d6c022c commit 27ddaca
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 6 deletions.
19 changes: 19 additions & 0 deletions src/device-plugin/config/device_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,31 @@ def __init__(self, cluster_conf, service_conf, default_service_conf):
self.service_conf = service_conf
self.default_service_conf = default_service_conf

# This function finds all valid computing device types from `layout.yaml`.
def find_computing_device_types(self):
    """Return the computing device types used by PAI worker machines.

    Reads ``self.cluster_conf``: ``machine-sku`` maps SKU names to their
    attributes and ``machine-list`` is the list of machines. A device type
    is collected when a machine has ``pai-worker`` equal to the string
    ``'true'`` and its ``machine-type`` names a SKU that declares a
    ``computing-device`` entry.

    Returns:
        list: The unique device types, sorted so that the result (and
        whatever is generated from it) is identical from run to run
        instead of following set iteration order.

    Raises:
        KeyError: If ``machine-sku`` or ``machine-list`` is missing, if a
            worker entry has no ``machine-type``, or if a
            ``computing-device`` entry has no ``type``.
    """
    # SKU name -> device type, only for SKUs that declare a computing device.
    sku_device_types = {
        sku_name: sku_attrs['computing-device']['type']
        for sku_name, sku_attrs in self.cluster_conf['machine-sku'].items()
        if 'computing-device' in sku_attrs
    }
    # `.get` keeps machines without a `pai-worker` key out of the result;
    # `machine-type` is only looked up for machines that are workers.
    device_types = {
        sku_device_types[machine['machine-type']]
        for machine in self.cluster_conf['machine-list']
        if machine.get('pai-worker') == 'true'
        and machine['machine-type'] in sku_device_types
    }
    return sorted(device_types)

def validation_pre(self):
    """Fill in the default device list when the service config omits it.

    The default list is copied rather than assigned directly: ``run()``
    appends to ``self.service_conf['devices']`` in place, and sharing the
    same list object would let those appends modify
    ``self.default_service_conf`` as well.

    Returns:
        tuple: Always ``(True, None)``.
    """
    if 'devices' not in self.service_conf:
        self.service_conf['devices'] = list(self.default_service_conf['devices'])
    return True, None

def run(self):
    """Merge the computing devices found in `layout.yaml` into the config.

    Every type returned by ``find_computing_device_types()`` that is not
    already listed in ``self.service_conf['devices']`` is appended to that
    list in place.

    Returns:
        The updated ``self.service_conf``.
    """
    for device_type in self.find_computing_device_types():
        configured = self.service_conf['devices']
        if device_type in configured:
            # Already configured; do not add a duplicate.
            continue
        configured.append(device_type)
    return self.service_conf

def validation_post(self, conf):
Expand Down
15 changes: 13 additions & 2 deletions src/device-plugin/deploy/delete.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,33 @@

pushd $(dirname "$0") > /dev/null

PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource daemonset --name k8s-host-device-plugin-daemonset --namespace kube-system
# Remove device plugins of computing devices.

# Begin: NVIDIA GPU device plugin
{% if 'nvidia.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource daemonset --name nvidia-device-plugin-daemonset --namespace kube-system

{% endif %}
# End: NVIDIA GPU device plugin

# Begin: AMD GPU device plugin
{% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource daemonset --name amdgpu-device-plugin-daemonset --namespace kube-system

{% endif %}
# End: AMD GPU device plugin

# Please add other types of computing devices here.
# Use comment "Begin: <device plugin name>" and "End: <device plugin name>"

# Other devices (except computing devices)

PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource daemonset --name k8s-host-device-plugin-daemonset --namespace kube-system

{% if 'rdma/hca' in cluster_cfg['device-plugin']['devices'] %}

Expand Down
24 changes: 20 additions & 4 deletions src/device-plugin/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,40 @@

pushd $(dirname "$0") > /dev/null

# host device plugin
kubectl apply --overwrite=true -f device-plugin.yaml || exit $?
# Deploy device plugins of computing devices.
# Computing device refers to the main hardware used in a machine for computing,
# e.g. GPU, TPU, and NPU. We don't treat CPU as a computing device.
# In practice, one computing device usually requires one device plugin in the cluster.
# Users can specify their computing device type in `layout.yaml`,
# and the specified device types will be added to cluster_cfg['device-plugin']['devices'] automatically.

# NVIDIA GPU device plugin
# Begin: NVIDIA GPU device plugin
{% if 'nvidia.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

svn cat https://github.com/NVIDIA/k8s-device-plugin.git/tags/1.0.0-beta4/nvidia-device-plugin.yml \
| kubectl apply --overwrite=true -f - || exit $?

{% endif %}
# End: NVIDIA GPU device plugin

# AMD GPU device plugin
# Begin: AMD GPU device plugin
{% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

svn cat https://github.com/RadeonOpenCompute/k8s-device-plugin.git/trunk/k8s-ds-amdgpu-dp.yaml \
| kubectl apply --overwrite=true -f - || exit $?

{% endif %}
# End: AMD GPU device plugin

# Please add other types of computing devices here.
# Use comment "Begin: <device plugin name>" and "End: <device plugin name>"
# The scripts in `delete.sh.template` should also be modified accordingly.

# Other devices (except computing devices)
# Besides the computing device plugins, some other device plugins are also needed.

# host device plugin
kubectl apply --overwrite=true -f device-plugin.yaml || exit $?

# Mellanox InfiniBand device plugin
{% if 'rdma/hca' in cluster_cfg['device-plugin']['devices'] %}
Expand Down

0 comments on commit 27ddaca

Please sign in to comment.