gpu: fix failure and add pytorch test case

microsoft · Feb 26, 2022 · 4661125 · 4661125
1 parent 733e4ce
commit 4661125
Show file tree

Hide file tree

Showing 3 changed files with 163 additions and 44 deletions.
diff --git a/lisa/features/gpu.py b/lisa/features/gpu.py
@@ -30,11 +30,10 @@ class GpuSettings(schema.FeatureSettings):
 DEFAULT_CUDA_DRIVER_VERSION = "10.1.105-1"
 
 
-class ComputeSDK(Enum):
-    # GRID Driver
-    GRID = 1
-    # CUDA Driver
-    CUDA = 2
+class ComputeSDK(str, Enum):
+    """Kinds of GPU compute driver/SDK referenced by the Gpu feature.
+
+    Mixing in ``str`` makes each member compare equal to its string value.
+    """
+
+    # GRID driver
+    GRID = "GRID"
+    # CUDA driver
+    CUDA = "CUDA"
+    # AMD GPU
+    AMD = "AMD"
 
 
 class Gpu(Feature):
@@ -84,6 +83,10 @@ def on_before_deployment(cls, *args: Any, **kwargs: Any) -> None:
         ):
             cls._install_by_platform(*args, **kwargs)
 
+    @classmethod
+    def remove_virtual_gpus(cls, devices: List[PciDevice]) -> List[PciDevice]:
+        """Return ``devices`` without the entries whose vendor is Microsoft.
+
+        A new list is built; the list passed in is not modified.
+        """
+        virtual_vendor = "Microsoft Corporation"
+        return [device for device in devices if device.vendor != virtual_vendor]
+
     def is_supported(self) -> bool:
+        """Tell whether GPU is supported; this base version must be overridden."""
         raise NotImplementedError
 
@@ -109,7 +112,7 @@ def install_compute_sdk(self, version: str = "") -> None:
             )
 
         # install the driver
-        supported_driver = self._get_supported_driver()
+        supported_driver = self.get_supported_driver()
         for driver in supported_driver:
             if driver == ComputeSDK.GRID:
                 if not version:
@@ -146,23 +149,25 @@ def get_gpu_count_with_lsvmbus(self) -> int:
     def get_gpu_count_with_lspci(self) -> int:
+        """Count GPU-type devices listed by lspci, with virtual GPUs removed."""
         lspci_tool = self._node.tools[Lspci]
         device_list = lspci_tool.get_devices_by_type(constants.DEVICE_TYPE_GPU)
+        # Remove Microsoft Virtual one. It presents with GRID driver.
+        device_list = self.remove_virtual_gpus(device_list)
 
         return len(device_list)
 
     def get_gpu_count_with_vendor_cmd(self) -> int:
+        """Return the GPU count reported by the NvidiaSmi tool."""
         nvidiasmi = self._node.tools[NvidiaSmi]
         return nvidiasmi.get_gpu_count()
 
+    def get_supported_driver(self) -> List[ComputeSDK]:
+        """List the supported compute SDKs; this base version must be overridden."""
+        raise NotImplementedError()
+
     def _initialize(self, *args: Any, **kwargs: Any) -> None:
+        # start from an empty vendor set
         self.gpu_vendor: Set[str] = set()
 
     @classmethod
     def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
+        """Platform-specific install hook invoked from on_before_deployment."""
         raise NotImplementedError()
 
-    def _get_supported_driver(self) -> List[ComputeSDK]:
-        raise NotImplementedError()
-
     # download and install NVIDIA grid driver
     def _install_grid_driver(self, driver_url: str) -> None:
         self._log.debug("Starting GRID driver installation")

diff --git a/lisa/sut_orchestrator/azure/features.py b/lisa/sut_orchestrator/azure/features.py
@@ -22,7 +22,7 @@
 from dataclasses_json import dataclass_json
 from PIL import Image, UnidentifiedImageError
 
-from lisa import Logger, features, schema, search_space
+from lisa import Environment, Logger, features, schema, search_space
 from lisa.features import NvmeSettings
 from lisa.features.gpu import ComputeSDK
 from lisa.features.resize import ResizeAction
@@ -135,10 +135,9 @@ def _get_console_log(self, saved_path: Optional[Path]) -> bytes:
 
 
 class Gpu(AzureFeatureMixin, features.Gpu):
-    _grid_supported_skus = ["Standard_NV"]
-    _cuda_supported_skus = ["Standard_NC", "Standard_ND"]
-    _gpu_extension_template = json.loads(
-        """
+    _grid_supported_skus = re.compile(r"^Standard_[^_]+(_v3)?$", re.I)
+    _amd_supported_skus = re.compile(r"^Standard_[^_]+_v4$", re.I)
+    _gpu_extension_template = """
         {
         "name": "[concat(parameters('nodes')[copyIndex('vmCopy')]['name'], '/gpu-extension')]",
         "type": "Microsoft.Compute/virtualMachines/extensions",
@@ -150,17 +149,20 @@ class Gpu(AzureFeatureMixin, features.Gpu):
         },
         "dependsOn": [
             "[concat('Microsoft.Compute/virtualMachines/', parameters('nodes')[copyIndex('vmCopy')]['name'])]"
-        ],
-        "properties": {
+        ]
+    }
+    """  # noqa: E501
+    _gpu_extension_nvidia_properties = json.loads(
+        """
+        {
             "publisher": "Microsoft.HpcCompute",
             "type": "NvidiaGpuDriverLinux",
             "typeHandlerVersion": "1.6",
             "autoUpgradeMinorVersion": true,
             "settings": {
             }
         }
-    }
-    """  # noqa: E501
+    """
     )
 
     def is_supported(self) -> bool:
@@ -171,27 +173,17 @@ def is_supported(self) -> bool:
 
         return False
 
-    def _initialize(self, *args: Any, **kwargs: Any) -> None:
-        super()._initialize(*args, **kwargs)
-        self._initialize_information(self._node)
-
-    @classmethod
-    def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
-
-        template: Any = kwargs.get("template")
-        log = cast(Logger, kwargs.get("log"))
-        log.debug("updating arm template to support GPU extension.")
-        resources = template["resources"]
-        resources.append(cls._gpu_extension_template)
-
-    def _get_supported_driver(self) -> List[ComputeSDK]:
+    def get_supported_driver(self) -> List[ComputeSDK]:
         driver_list = []
         node_runbook = self._node.capability.get_extended_runbook(
             AzureNodeSchema, AZURE
         )
-        if any(map((node_runbook.vm_size).__contains__, self._grid_supported_skus)):
+        if re.match(self._grid_supported_skus, node_runbook.vm_size):
             driver_list.append(ComputeSDK.GRID)
-        if any(map((node_runbook.vm_size).__contains__, self._cuda_supported_skus)):
+        elif re.match(self._amd_supported_skus, node_runbook.vm_size):
+            driver_list.append(ComputeSDK.AMD)
+            self._is_nvidia: bool = False
+        else:
             driver_list.append(ComputeSDK.CUDA)
 
         if not driver_list:
@@ -201,6 +193,32 @@ def _get_supported_driver(self) -> List[ComputeSDK]:
             )
         return driver_list
 
+    def _initialize(self, *args: Any, **kwargs: Any) -> None:
+        """Run the base initialization, then set up Azure-specific state."""
+        super()._initialize(*args, **kwargs)
+        self._initialize_information(self._node)
+        # Default to NVIDIA; get_supported_driver sets this to False when the
+        # vm size matches the AMD pattern.
+        self._is_nvidia = True
+
+    @classmethod
+    def _install_by_platform(cls, *args: Any, **kwargs: Any) -> None:
+        """Add the GPU driver extension to the ARM template when applicable.
+
+        Reads ``template``, ``environment`` and ``log`` from ``kwargs``. The
+        extension resource is appended to ``template["resources"]`` unless the
+        vm size of the first node matches the AMD pattern.
+        """
+        # function-scope import, so the file's import block stays untouched
+        import copy
+
+        template: Any = kwargs.get("template")
+        environment = cast(Environment, kwargs.get("environment"))
+        log = cast(Logger, kwargs.get("log"))
+        log.debug("updating arm template to support GPU extension.")
+        resources = template["resources"]
+
+        node: Node = environment.nodes[0]
+        runbook = node.capability.get_extended_runbook(AzureNodeSchema)
+        if re.match(cls._amd_supported_skus, runbook.vm_size):
+            # skip AMD, because no AMD GPU Linux extension.
+            return
+
+        # load a copy to avoid side effect.
+        gpu_template = json.loads(cls._gpu_extension_template)
+        # The properties dict lives on the class; copy it as well, so that a
+        # change made to one template cannot leak into other deployments.
+        gpu_template["properties"] = copy.deepcopy(
+            cls._gpu_extension_nvidia_properties
+        )
+        resources.append(gpu_template)
+
 
 class Infiniband(AzureFeatureMixin, features.Infiniband):
     @classmethod
@@ -213,7 +231,7 @@ def on_before_deployment(cls, *args: Any, **kwargs: Any) -> None:
 
     def is_over_sriov(self) -> bool:
+        """Return True if any lspci device info contains ``Virtual Function``."""
         lspci = self._node.tools[Lspci]
-        device_list = lspci.get_device_list()
+        device_list = lspci.get_devices()
         return any("Virtual Function" in device.device_info for device in device_list)
 
     # nd stands for network direct

diff --git a/microsoft/testsuites/gpu/gpusuite.py b/microsoft/testsuites/gpu/gpusuite.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import re
 from pathlib import Path
 
 from assertpy import assert_that
@@ -17,7 +18,16 @@
     simple_requirement,
 )
 from lisa.features import Gpu, SerialConsole
-from lisa.tools import Lspci, NvidiaSmi, Reboot
+from lisa.features.gpu import ComputeSDK
+from lisa.operating_system import Debian
+from lisa.tools import Lspci, NvidiaSmi, Pip, Python, Reboot, Tar, Wget
+from lisa.util import get_matched_str
+
+# Tarball downloaded by _install_cudnn. Judging by the file name it is
+# cuDNN 7.5.0.56 for CUDA 10.0 on linux-x64 -- TODO confirm.
+_cudnn_location = (
+    "https://partnerpipelineshare.blob.core.windows.net/"
+    "packages/cudnn-10.0-linux-x64-v7.5.0.56.tgz"
+)
+# file name the tarball is saved under on the node
+_cudnn_file_name = "cudnn.tgz"
 
 
 @TestSuiteMetadata(
@@ -31,6 +41,8 @@
 class GpuTestSuite(TestSuite):
     TIMEOUT = 2000
 
+    _pytorch_pattern = re.compile(r"^gpu count: (?P<count>\d+)", re.M)
+
     @TestCaseMetadata(
         description="""
             This test case verifies if gpu drivers are loaded fine.
@@ -50,10 +62,6 @@ class GpuTestSuite(TestSuite):
         priority=1,
     )
     def verify_load_gpu_driver(self, node: Node, log_path: Path, log: Logger) -> None:
-        gpu_feature = node.features[Gpu]
-        if not gpu_feature.is_supported():
-            raise SkippedException(f"GPU is not supported with distro {node.os.name}")
-
         _check_driver_installed(node)
 
     @TestCaseMetadata(
@@ -76,10 +84,9 @@ def verify_load_gpu_driver(self, node: Node, log_path: Path, log: Logger) -> Non
         priority=2,
     )
     def verify_gpu_adapter_count(self, node: Node, log_path: Path, log: Logger) -> None:
-        gpu_feature = node.features[Gpu]
-        if not gpu_feature.is_supported():
-            raise SkippedException(f"GPU is not supported with distro {node.os.name}")
+        _check_driver_installed(node)
 
+        gpu_feature = node.features[Gpu]
         assert isinstance(node.capability.gpu_count, int)
         expected_count = node.capability.gpu_count
 
@@ -116,15 +123,79 @@ def verify_gpu_adapter_count(self, node: Node, log_path: Path, log: Logger) -> N
         ),
     )
     def verify_gpu_rescind_validation(self, node: Node) -> None:
+        _check_driver_installed(node)
+
         lspci = node.tools[Lspci]
+        gpu = node.features[Gpu]
+
         # 1. Disable GPU devices.
-        lspci.disable_devices(device_type=constants.DEVICE_TYPE_GPU)
+        gpu_devices = lspci.get_devices_by_type(device_type=constants.DEVICE_TYPE_GPU)
+        gpu_devices = gpu.remove_virtual_gpus(gpu_devices)
+
+        for device in gpu_devices:
+            lspci.disable_device(device)
+
         # 2. Enable GPU devices.
         lspci.enable_devices()
 
+    @TestCaseMetadata(
+        description="""
+        This test case will run PyTorch to check CUDA driver installed correctly.
+
+        1. Install PyTorch.
+        2. Check GPU count by torch.cuda.device_count()
+        3. Compare with PCI result
+        """,
+        priority=3,
+        requirement=simple_requirement(
+            supported_features=[Gpu],
+        ),
+    )
+    def verify_gpu_cuda_with_pytorch(self, node: Node) -> None:
+        # raises SkippedException when GPU is unsupported or the vm size is AMD
+        _check_driver_installed(node)
+
+        _install_cudnn(node)
+
+        gpu = node.features[Gpu]
+
+        # install torch only when pip does not report it already
+        pip = node.tools[Pip]
+        if not pip.exists_package("torch"):
+            pip.install_packages("torch")
+
+        # one-liner whose output line is what _pytorch_pattern parses
+        gpu_script = "import torch;print(f'gpu count: {torch.cuda.device_count()}')"
+        python = node.tools[Python]
+        # the lspci count, with virtual GPUs removed, is the expected value
+        expected_count = gpu.get_gpu_count_with_lspci()
+
+        script_result = python.run(
+            f'-c "{gpu_script}"',
+            force_run=True,
+        )
+        gpu_count_str = get_matched_str(script_result.stdout, self._pytorch_pattern)
+        # check the exit code before interpreting the parsed count
+        script_result.assert_exit_code(
+            message=f"failed on run gpu script: {gpu_script}, "
+            f"output: {script_result.stdout}"
+        )
+
+        assert_that(gpu_count_str).described_as(
+            f"gpu count is not in result: {script_result.stdout}"
+        ).is_not_empty()
+
+        gpu_count = int(gpu_count_str)
+        assert_that(gpu_count).described_as(
+            "GPU must be greater than zero."
+        ).is_greater_than(0)
+        assert_that(gpu_count).described_as(
+            "cannot detect GPU from PyTorch"
+        ).is_equal_to(expected_count)
+
 
 def _check_driver_installed(node: Node) -> None:
+    gpu = node.features[Gpu]
 
+    if not gpu.is_supported():
+        raise SkippedException(f"GPU is not supported with distro {node.os.name}")
+    if ComputeSDK.AMD in gpu.get_supported_driver():
+        raise SkippedException("AMD vm sizes is not supported")
     try:
         _ = node.tools[NvidiaSmi]
     except Exception as identifier:
@@ -134,6 +205,31 @@ def _check_driver_installed(node: Node) -> None:
         )
 
 
+def _install_cudnn(node: Node) -> None:
+    """Download the cuDNN tarball and copy its libcudnn files to the lib dir.
+
+    Nothing is done when the tarball already exists under the wget tool path.
+    """
+    wget = node.tools[Wget]
+    tar = node.tools[Tar]
+
+    download_dir = wget.get_tool_path(use_global=True)
+    extracted_path = tar.get_tool_path(use_global=True)
+    if node.shell.exists(download_dir / _cudnn_file_name):
+        # the tarball is already there; skip the download and the copy
+        return
+
+    tarball = wget.get(
+        url=_cudnn_location,
+        filename=str(_cudnn_file_name),
+        file_path=str(download_dir),
+    )
+    tar.extract(tarball, dest_dir=str(extracted_path))
+
+    # Debian uses a different library directory than the other distros
+    target_path = (
+        "/usr/lib/x86_64-linux-gnu/" if isinstance(node.os, Debian) else "/usr/lib64/"
+    )
+    node.execute(
+        f"cp -p {extracted_path}/cuda/lib64/libcudnn* {target_path}",
+        shell=True,
+        sudo=True,
+    )
+
+
 # We use platform to install the driver by default. If in future, it needs to
 # install independently, this logic can be reused.
 def _ensure_driver_installed(