11 changes: 8 additions & 3 deletions llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ declare <vscale x 1 x i64> @llvm.riscv.vle.mask.nxv1i64(
define <vscale x 1 x double> @test1(i64 %avl, <vscale x 1 x double> %a, <vscale x 1 x double> %b) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, mu
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
; CHECK-NEXT: vfadd.vv v8, v8, v9
; CHECK-NEXT: ret
Expand Down Expand Up @@ -412,7 +413,7 @@ define i64 @avl_forward1b_neg(<vscale x 2 x i32> %v, <vscale x 2 x i32>* %p) nou
; CHECK-LABEL: avl_forward1b_neg:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli a1, 6, e16, m1, ta, mu
; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: ret
Expand Down Expand Up @@ -467,6 +468,7 @@ entry:
define void @avl_forward4(<vscale x 2 x i32> %v, <vscale x 2 x i32>* %p, i64 %reg) nounwind {
; CHECK-LABEL: avl_forward4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a1, a1, e16, m1, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
Expand All @@ -480,10 +482,10 @@ entry:
define i64 @avl_forward4b(<vscale x 2 x i32> %v, <vscale x 2 x i32>* %p, i64 %reg) nounwind {
; CHECK-LABEL: avl_forward4b:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a2, a1, e16, m1, ta, mu
; CHECK-NEXT: vsetvli a1, a1, e16, m1, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: ret
entry:
%vl = tail call i64 @llvm.riscv.vsetvli(i64 %reg, i64 1, i64 0)
Expand All @@ -496,6 +498,7 @@ entry:
define <vscale x 1 x i64> @vleNff(i64* %str, i64 %n, i64 %x) {
; CHECK-LABEL: vleNff:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a1, a1, e8, m4, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
; CHECK-NEXT: vle64ff.v v8, (a0)
; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, mu
Expand All @@ -516,6 +519,7 @@ entry:
define <vscale x 1 x i64> @vleNff2(i64* %str, i64 %n, i64 %x) {
; CHECK-LABEL: vleNff2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a1, a1, e8, m4, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
; CHECK-NEXT: vle64ff.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a2
Expand All @@ -541,6 +545,7 @@ define <vscale x 2 x i32> @avl_forward5(<vscale x 2 x i32>* %addr) {
; CHECK-LABEL: avl_forward5:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli a1, a1, e8, m4, ta, mu
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: ret
Expand Down
14 changes: 11 additions & 3 deletions openmp/libomptarget/plugins/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/StringRef.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
Expand All @@ -33,6 +35,8 @@

#include "llvm/Frontend/OpenMP/OMPConstants.h"

using namespace llvm;

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err) \
Expand Down Expand Up @@ -1529,13 +1533,14 @@ int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
return false;

// A subarchitecture was not specified. Assume it is compatible.
if (!info->Arch)
if (!info || !info->Arch)
return true;

int32_t NumberOfDevices = 0;
if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
return false;

StringRef ArchStr = StringRef(info->Arch).drop_front(sizeof("sm_") - 1);
for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
CUdevice Device;
if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
Expand All @@ -1551,8 +1556,11 @@ int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
Device) != CUDA_SUCCESS)
return false;

std::string ArchStr = "sm_" + std::to_string(Major) + std::to_string(Minor);
if (ArchStr != info->Arch)
// A cubin generated for a given compute capability can run on any GPU with
// the same major revision and the same or a higher minor revision.
int32_t ImageMajor = ArchStr[0] - '0';
int32_t ImageMinor = ArchStr[1] - '0';
if (Major != ImageMajor || Minor < ImageMinor)
return false;
}

Expand Down