Skip to content

Commit

Permalink
gpu: fix failure and add pytorch test case
Browse files Browse the repository at this point in the history
  • Loading branch information
Chi Song authored and squirrelsc committed Feb 26, 2022
1 parent 733e4ce commit 4661125
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 44 deletions.
23 changes: 14 additions & 9 deletions lisa/features/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ class GpuSettings(schema.FeatureSettings):
DEFAULT_CUDA_DRIVER_VERSION = "10.1.105-1"


class ComputeSDK(str, Enum):
    """GPU driver / compute SDK families a node can be set up with.

    Members mix in ``str`` and each value equals the member name, so a
    member compares equal to its plain string form (``"GRID"``, ...).
    """

    # GRID Driver
    GRID = "GRID"
    # CUDA Driver
    CUDA = "CUDA"
    # AMD GPU
    AMD = "AMD"


class Gpu(Feature):
Expand Down Expand Up @@ -84,6 +83,10 @@ def on_before_deployment(cls, *args: Any, **kwargs: Any) -> None:
):
cls._install_by_platform(*args, **kwargs)

@classmethod
def remove_virtual_gpus(cls, devices: List[PciDevice]) -> List[PciDevice]:
    """Return the devices whose vendor is not "Microsoft Corporation".

    A new list is built; the input list is not modified and the relative
    order of the kept devices is preserved.
    """
    kept_devices = []
    for device in devices:
        if device.vendor != "Microsoft Corporation":
            kept_devices.append(device)
    return kept_devices

def is_supported(self) -> bool:
    """Report whether the GPU feature is supported.

    This version has no implementation and always raises
    NotImplementedError.
    """
    raise NotImplementedError

Expand All @@ -109,7 +112,7 @@ def install_compute_sdk(self, version: str = "") -> None:
)

# install the driver
supported_driver = self._get_supported_driver()
supported_driver = self.get_supported_driver()
for driver in supported_driver:
if driver == ComputeSDK.GRID:
if not version:
Expand Down Expand Up @@ -146,23 +149,25 @@ def get_gpu_count_with_lsvmbus(self) -> int:
def get_gpu_count_with_lspci(self) -> int:
    """Count the GPU-type devices lspci reports, minus Microsoft virtual ones."""
    gpu_devices = self._node.tools[Lspci].get_devices_by_type(
        constants.DEVICE_TYPE_GPU
    )
    # Remove Microsoft Virtual one. It presents with GRID driver.
    return len(self.remove_virtual_gpus(gpu_devices))

def get_gpu_count_with_vendor_cmd(self) -> int:
    """Return the GPU count reported by the node's NvidiaSmi tool."""
    return self._node.tools[NvidiaSmi].get_gpu_count()

def get_supported_driver(self) -> List[ComputeSDK]:
    """Return the ComputeSDK entries that apply to this node.

    This version has no implementation and always raises
    NotImplementedError.
    """
    raise NotImplementedError()

def _initialize(self, *args: Any, **kwargs: Any) -> None:
    """Create the (initially empty) ``gpu_vendor`` string set on the instance."""
    self.gpu_vendor: Set[str] = set()

@classmethod
def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
    """Platform-specific installation hook.

    Receives whatever positional and keyword arguments the caller forwards.
    This version has no implementation and always raises
    NotImplementedError.
    """
    raise NotImplementedError()

def _get_supported_driver(self) -> List[ComputeSDK]:
    """Return the ComputeSDK entries that apply to this node.

    This version has no implementation and always raises
    NotImplementedError.
    """
    # NOTE(review): a public get_supported_driver with the same signature is
    # also visible in this file, and install_compute_sdk calls that one -
    # confirm whether this private variant is still needed.
    raise NotImplementedError()

# download and install NVIDIA grid driver
def _install_grid_driver(self, driver_url: str) -> None:
self._log.debug("Starting GRID driver installation")
Expand Down
70 changes: 44 additions & 26 deletions lisa/sut_orchestrator/azure/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from dataclasses_json import dataclass_json
from PIL import Image, UnidentifiedImageError

from lisa import Logger, features, schema, search_space
from lisa import Environment, Logger, features, schema, search_space
from lisa.features import NvmeSettings
from lisa.features.gpu import ComputeSDK
from lisa.features.resize import ResizeAction
Expand Down Expand Up @@ -135,10 +135,9 @@ def _get_console_log(self, saved_path: Optional[Path]) -> bytes:


class Gpu(AzureFeatureMixin, features.Gpu):
_grid_supported_skus = ["Standard_NV"]
_cuda_supported_skus = ["Standard_NC", "Standard_ND"]
_gpu_extension_template = json.loads(
"""
_grid_supported_skus = re.compile(r"^Standard_[^_]+(_v3)?$", re.I)
_amd_supported_skus = re.compile(r"^Standard_[^_]+_v4$", re.I)
_gpu_extension_template = """
{
"name": "[concat(parameters('nodes')[copyIndex('vmCopy')]['name'], '/gpu-extension')]",
"type": "Microsoft.Compute/virtualMachines/extensions",
Expand All @@ -150,17 +149,20 @@ class Gpu(AzureFeatureMixin, features.Gpu):
},
"dependsOn": [
"[concat('Microsoft.Compute/virtualMachines/', parameters('nodes')[copyIndex('vmCopy')]['name'])]"
],
"properties": {
]
}
""" # noqa: E501
_gpu_extension_nvidia_properties = json.loads(
"""
{
"publisher": "Microsoft.HpcCompute",
"type": "NvidiaGpuDriverLinux",
"typeHandlerVersion": "1.6",
"autoUpgradeMinorVersion": true,
"settings": {
}
}
}
""" # noqa: E501
"""
)

def is_supported(self) -> bool:
Expand All @@ -171,27 +173,17 @@ def is_supported(self) -> bool:

return False

def _initialize(self, *args: Any, **kwargs: Any) -> None:
    """Run the parent initialization, then ``_initialize_information``."""
    # NOTE(review): another _initialize with an extra `_is_nvidia` assignment
    # appears further down in this view - confirm which one should remain.
    super()._initialize(*args, **kwargs)
    # called with the node this feature is attached to.
    self._initialize_information(self._node)

@classmethod
def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
    """Append the GPU extension resource to the ARM template.

    Reads ``template`` and ``log`` from ``kwargs`` and appends
    ``cls._gpu_extension_template`` to ``template["resources"]``.
    """
    # NOTE(review): the object appended is the class attribute itself, not a
    # copy. Another _install_by_platform further down in this view loads a
    # fresh copy instead - confirm which definition should remain.

    template: Any = kwargs.get("template")
    log = cast(Logger, kwargs.get("log"))
    log.debug("updating arm template to support GPU extension.")
    resources = template["resources"]
    resources.append(cls._gpu_extension_template)

def _get_supported_driver(self) -> List[ComputeSDK]:
def get_supported_driver(self) -> List[ComputeSDK]:
driver_list = []
node_runbook = self._node.capability.get_extended_runbook(
AzureNodeSchema, AZURE
)
if any(map((node_runbook.vm_size).__contains__, self._grid_supported_skus)):
if re.match(self._grid_supported_skus, node_runbook.vm_size):
driver_list.append(ComputeSDK.GRID)
if any(map((node_runbook.vm_size).__contains__, self._cuda_supported_skus)):
elif re.match(self._amd_supported_skus, node_runbook.vm_size):
driver_list.append(ComputeSDK.AMD)
self._is_nvidia: bool = False
else:
driver_list.append(ComputeSDK.CUDA)

if not driver_list:
Expand All @@ -201,6 +193,32 @@ def _get_supported_driver(self) -> List[ComputeSDK]:
)
return driver_list

def _initialize(self, *args: Any, **kwargs: Any) -> None:
    """Run the parent initialization, load node information, default to NVIDIA."""
    super()._initialize(*args, **kwargs)
    # called with the node this feature is attached to.
    self._initialize_information(self._node)
    # Default value; get_supported_driver sets it to False when the VM size
    # matches the AMD pattern.
    self._is_nvidia = True

@classmethod
def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
    """Add the NVIDIA GPU extension resource to the ARM template.

    Reads ``template``, ``environment`` and ``log`` from ``kwargs``. The
    VM size of the first node in the environment decides the outcome:
    sizes matching the AMD pattern leave the template untouched, all
    others get the extension appended to ``template["resources"]``.
    """
    arm_template: Any = kwargs.get("template")
    environment = cast(Environment, kwargs.get("environment"))
    logger = cast(Logger, kwargs.get("log"))
    logger.debug("updating arm template to support GPU extension.")
    template_resources = arm_template["resources"]

    # load a copy to avoid side effect.
    extension = json.loads(cls._gpu_extension_template)

    first_node: Node = environment.nodes[0]
    node_runbook = first_node.capability.get_extended_runbook(AzureNodeSchema)
    if re.match(cls._amd_supported_skus, node_runbook.vm_size):
        # skip AMD, because no AMD GPU Linux extension.
        return

    extension["properties"] = cls._gpu_extension_nvidia_properties
    template_resources.append(extension)


class Infiniband(AzureFeatureMixin, features.Infiniband):
@classmethod
Expand All @@ -213,7 +231,7 @@ def on_before_deployment(cls, *args: Any, **kwargs: Any) -> None:

def is_over_sriov(self) -> bool:
    """Report whether any device listed by lspci is a virtual function.

    Returns:
        True when at least one device has "Virtual Function" in its
        ``device_info``; False otherwise (including an empty device list).
    """
    lspci = self._node.tools[Lspci]
    # The device list is fetched once, through get_devices() only.
    device_list = lspci.get_devices()
    return any("Virtual Function" in device.device_info for device in device_list)

# nd stands for network direct
Expand Down
114 changes: 105 additions & 9 deletions microsoft/testsuites/gpu/gpusuite.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import re
from pathlib import Path

from assertpy import assert_that
Expand All @@ -17,7 +18,16 @@
simple_requirement,
)
from lisa.features import Gpu, SerialConsole
from lisa.tools import Lspci, NvidiaSmi, Reboot
from lisa.features.gpu import ComputeSDK
from lisa.operating_system import Debian
from lisa.tools import Lspci, NvidiaSmi, Pip, Python, Reboot, Tar, Wget
from lisa.util import get_matched_str

# URL of the cuDNN archive that _install_cudnn downloads.
_cudnn_location = (
    "https://partnerpipelineshare.blob.core.windows.net/"
    "packages/cudnn-10.0-linux-x64-v7.5.0.56.tgz"
)
# File name the downloaded archive is saved under; _install_cudnn also checks
# for this file to decide whether the download can be skipped.
_cudnn_file_name = "cudnn.tgz"


@TestSuiteMetadata(
Expand All @@ -31,6 +41,8 @@
class GpuTestSuite(TestSuite):
TIMEOUT = 2000

_pytorch_pattern = re.compile(r"^gpu count: (?P<count>\d+)", re.M)

@TestCaseMetadata(
description="""
This test case verifies if gpu drivers are loaded fine.
Expand All @@ -50,10 +62,6 @@ class GpuTestSuite(TestSuite):
priority=1,
)
def verify_load_gpu_driver(self, node: Node, log_path: Path, log: Logger) -> None:
gpu_feature = node.features[Gpu]
if not gpu_feature.is_supported():
raise SkippedException(f"GPU is not supported with distro {node.os.name}")

_check_driver_installed(node)

@TestCaseMetadata(
Expand All @@ -76,10 +84,9 @@ def verify_load_gpu_driver(self, node: Node, log_path: Path, log: Logger) -> Non
priority=2,
)
def verify_gpu_adapter_count(self, node: Node, log_path: Path, log: Logger) -> None:
gpu_feature = node.features[Gpu]
if not gpu_feature.is_supported():
raise SkippedException(f"GPU is not supported with distro {node.os.name}")
_check_driver_installed(node)

gpu_feature = node.features[Gpu]
assert isinstance(node.capability.gpu_count, int)
expected_count = node.capability.gpu_count

Expand Down Expand Up @@ -116,15 +123,79 @@ def verify_gpu_adapter_count(self, node: Node, log_path: Path, log: Logger) -> N
),
)
def verify_gpu_rescind_validation(self, node: Node) -> None:
    """Disable each non-virtual GPU PCI device, then re-enable devices.

    The driver check runs first and may raise SkippedException when the
    node is not suitable for this test.
    """
    _check_driver_installed(node)

    lspci = node.tools[Lspci]
    gpu = node.features[Gpu]

    # 1. Disable GPU devices.
    # Only the filtered list is disabled, one device at a time. A blanket
    # disable by device type would presumably also touch the Microsoft
    # virtual adapter that remove_virtual_gpus() deliberately filters out.
    gpu_devices = lspci.get_devices_by_type(device_type=constants.DEVICE_TYPE_GPU)
    gpu_devices = gpu.remove_virtual_gpus(gpu_devices)

    for device in gpu_devices:
        lspci.disable_device(device)

    # 2. Enable GPU devices.
    lspci.enable_devices()

@TestCaseMetadata(
description="""
This test case will run PyTorch to check CUDA driver installed correctly.
1. Install PyTorch.
2. Check GPU count by torch.cuda.device_count()
3. Compare with PCI result
""",
priority=3,
requirement=simple_requirement(
supported_features=[Gpu],
),
)
def verify_gpu_cuda_with_pytorch(self, node: Node) -> None:
    """Check that PyTorch reports the same GPU count as lspci.

    Installs cuDNN and the ``torch`` package when missing, runs a
    one-line PyTorch script on the node, and asserts that the parsed GPU
    count is positive and equal to the lspci-based count.
    """
    # One-liner run by the node's Python; its output line is what
    # self._pytorch_pattern extracts the count from.
    torch_probe = "import torch;print(f'gpu count: {torch.cuda.device_count()}')"

    _check_driver_installed(node)
    _install_cudnn(node)

    gpu_feature = node.features[Gpu]

    pip_tool = node.tools[Pip]
    torch_installed = pip_tool.exists_package("torch")
    if not torch_installed:
        pip_tool.install_packages("torch")

    python_tool = node.tools[Python]
    pci_gpu_count = gpu_feature.get_gpu_count_with_lspci()

    probe_result = python_tool.run(f'-c "{torch_probe}"', force_run=True)
    probe_output = probe_result.stdout
    matched_count = get_matched_str(probe_output, self._pytorch_pattern)
    probe_result.assert_exit_code(
        message=f"failed on run gpu script: {torch_probe}, output: {probe_output}"
    )

    assert_that(matched_count).described_as(
        f"gpu count is not in result: {probe_output}"
    ).is_not_empty()

    torch_gpu_count = int(matched_count)
    assert_that(torch_gpu_count).described_as(
        "GPU must be greater than zero."
    ).is_greater_than(0)
    assert_that(torch_gpu_count).described_as(
        "cannot detect GPU from PyTorch"
    ).is_equal_to(pci_gpu_count)


def _check_driver_installed(node: Node) -> None:
gpu = node.features[Gpu]

if not gpu.is_supported():
raise SkippedException(f"GPU is not supported with distro {node.os.name}")
if ComputeSDK.AMD in gpu.get_supported_driver():
raise SkippedException("AMD vm sizes is not supported")
try:
_ = node.tools[NvidiaSmi]
except Exception as identifier:
Expand All @@ -134,6 +205,31 @@ def _check_driver_installed(node: Node) -> None:
)


def _install_cudnn(node: Node) -> None:
    """Download the cuDNN archive and copy its libraries to the system lib dir.

    Does nothing when the archive file already exists in the Wget tool
    path. The copy target is "/usr/lib/x86_64-linux-gnu/" on Debian-type
    systems and "/usr/lib64/" otherwise.
    """
    wget_tool = node.tools[Wget]
    tar_tool = node.tools[Tar]

    download_dir = wget_tool.get_tool_path(use_global=True)
    extract_dir = tar_tool.get_tool_path(use_global=True)

    # An archive already on disk is treated as "installed before".
    already_downloaded = node.shell.exists(download_dir / _cudnn_file_name)
    if already_downloaded:
        return

    archive_path = wget_tool.get(
        url=_cudnn_location,
        filename=str(_cudnn_file_name),
        file_path=str(download_dir),
    )
    tar_tool.extract(archive_path, dest_dir=str(extract_dir))

    library_dir = (
        "/usr/lib/x86_64-linux-gnu/" if isinstance(node.os, Debian) else "/usr/lib64/"
    )
    node.execute(
        f"cp -p {extract_dir}/cuda/lib64/libcudnn* {library_dir}",
        shell=True,
        sudo=True,
    )


# We use platform to install the driver by default. If in future, it needs to
# install independently, this logic can be reused.
def _ensure_driver_installed(
Expand Down

0 comments on commit 4661125

Please sign in to comment.