Skip to content

Commit

Permalink
[CUDA] Added __hmma_m16n16k16_* builtins to support mma instructions on sm_70
Browse files Browse the repository at this point in the history

Differential Revision: https://reviews.llvm.org/D38742

llvm-svn: 315624
  • Loading branch information
Artem-B committed Oct 12, 2017
1 parent ead69ee commit 91cc00b
Show file tree
Hide file tree
Showing 3 changed files with 377 additions and 0 deletions.
13 changes: 13 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Expand Up @@ -688,5 +688,18 @@ BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "")
BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "")
BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "")

// Builtins to support WMMA instructions on sm_70
TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "ptx60")

TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "ptx60")
TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "ptx60")

#undef BUILTIN
#undef TARGET_BUILTIN
198 changes: 198 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Expand Up @@ -9731,6 +9731,204 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
Builder.CreateStore(Pred, PredOutPtr);
return Builder.CreateExtractValue(ResultPair, 0);
}
case NVPTX::BI__hmma_m16n16k16_ld_a:
case NVPTX::BI__hmma_m16n16k16_ld_b:
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
case NVPTX::BI__hmma_m16n16k16_ld_c_f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Value *Src = EmitScalarExpr(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
llvm::APSInt isColMajorArg;
if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
return nullptr;
bool isColMajor = isColMajorArg.getSExtValue();
unsigned IID;
unsigned NumResults;
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_ld_a:
IID = isColMajor ? Intrinsic::nvvm_wmma_load_a_f16_col_stride
: Intrinsic::nvvm_wmma_load_a_f16_row_stride;
NumResults = 8;
break;
case NVPTX::BI__hmma_m16n16k16_ld_b:
IID = isColMajor ? Intrinsic::nvvm_wmma_load_b_f16_col_stride
: Intrinsic::nvvm_wmma_load_b_f16_row_stride;
NumResults = 8;
break;
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f16_col_stride
: Intrinsic::nvvm_wmma_load_c_f16_row_stride;
NumResults = 4;
break;
case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f32_col_stride
: Intrinsic::nvvm_wmma_load_c_f32_row_stride;
NumResults = 8;
break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
Value *Result =
Builder.CreateCall(CGM.getIntrinsic(IID),
{Builder.CreatePointerCast(Src, VoidPtrTy), Ldm});

// Save returned values.
for (unsigned i = 0; i < NumResults; ++i) {
Builder.CreateAlignedStore(
Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
Dst.getElementType()),
Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
}
return Result;
}

case NVPTX::BI__hmma_m16n16k16_st_c_f16:
case NVPTX::BI__hmma_m16n16k16_st_c_f32: {
Value *Dst = EmitScalarExpr(E->getArg(0));
Address Src = EmitPointerWithAlignment(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
llvm::APSInt isColMajorArg;
if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
return nullptr;
bool isColMajor = isColMajorArg.getSExtValue();
unsigned IID;
unsigned NumResults = 8;
// PTX Instructions (and LLVM instrinsics) are defined for slice _d_, yet
// for some reason nvcc builtins use _c_.
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_st_c_f16:
IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f16_col_stride
: Intrinsic::nvvm_wmma_store_d_f16_row_stride;
NumResults = 4;
break;
case NVPTX::BI__hmma_m16n16k16_st_c_f32:
IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f32_col_stride
: Intrinsic::nvvm_wmma_store_d_f32_row_stride;
break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
Function *Intrinsic = CGM.getIntrinsic(IID);
llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
SmallVector<Value *, 10> Values;
Values.push_back(Builder.CreatePointerCast(Dst, VoidPtrTy));
for (unsigned i = 0; i < NumResults; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
Values.push_back(Builder.CreateBitCast(V, ParamType));
}
Values.push_back(Ldm);
Value *Result = Builder.CreateCall(Intrinsic, Values);
return Result;
}

// BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf)
// --> Intrinsic::nvvm_wmma_mma_sync<layout A,B><DType><CType><Satf>
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
case NVPTX::BI__hmma_m16n16k16_mma_f16f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Address SrcA = EmitPointerWithAlignment(E->getArg(1));
Address SrcB = EmitPointerWithAlignment(E->getArg(2));
Address SrcC = EmitPointerWithAlignment(E->getArg(3));
llvm::APSInt LayoutArg;
if (!E->getArg(4)->isIntegerConstantExpr(LayoutArg, getContext()))
return nullptr;
int Layout = LayoutArg.getSExtValue();
if (Layout < 0 || Layout > 3)
return nullptr;
llvm::APSInt SatfArg;
if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
return nullptr;
bool Satf = SatfArg.getSExtValue();

// clang-format off
#define MMA_VARIANTS(type) {{ \
Intrinsic::nvvm_wmma_mma_sync_row_row_##type, \
Intrinsic::nvvm_wmma_mma_sync_row_row_##type##_satfinite, \
Intrinsic::nvvm_wmma_mma_sync_row_col_##type, \
Intrinsic::nvvm_wmma_mma_sync_row_col_##type##_satfinite, \
Intrinsic::nvvm_wmma_mma_sync_col_row_##type, \
Intrinsic::nvvm_wmma_mma_sync_col_row_##type##_satfinite, \
Intrinsic::nvvm_wmma_mma_sync_col_col_##type, \
Intrinsic::nvvm_wmma_mma_sync_col_col_##type##_satfinite \
}}
// clang-format on

auto getMMAIntrinsic = [Layout, Satf](std::array<unsigned, 8> Variants) {
unsigned Index = Layout * 2 + Satf;
assert(Index < 8);
return Variants[Index];
};
unsigned IID;
unsigned NumEltsC;
unsigned NumEltsD;
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16));
NumEltsC = 4;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16));
NumEltsC = 4;
NumEltsD = 8;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32));
NumEltsC = 8;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32));
NumEltsC = 8;
NumEltsD = 8;
break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
#undef MMA_VARIANTS

SmallVector<Value *, 24> Values;
Function *Intrinsic = CGM.getIntrinsic(IID);
llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0);
// Load A
for (unsigned i = 0; i < 8; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcA.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
Values.push_back(Builder.CreateBitCast(V, ABType));
}
// Load B
for (unsigned i = 0; i < 8; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcB.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
Values.push_back(Builder.CreateBitCast(V, ABType));
}
// Load C
llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16);
for (unsigned i = 0; i < NumEltsC; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcC.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
Values.push_back(Builder.CreateBitCast(V, CType));
}
Value *Result = Builder.CreateCall(Intrinsic, Values);
llvm::Type *DType = Dst.getElementType();
for (unsigned i = 0; i < NumEltsD; ++i)
Builder.CreateAlignedStore(
Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
return Result;
}
default:
return nullptr;
}
Expand Down
166 changes: 166 additions & 0 deletions clang/test/CodeGen/builtins-nvptx-sm_70.cu
@@ -0,0 +1,166 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
// RUN: -fcuda-is-device -target-feature +ptx60 \
// RUN: -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s

#if !defined(CUDA_VERSION)
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

typedef unsigned long long uint64_t;
#endif
// We have to keep all builtins that depend on particular target feature in the
// same function, because the codegen will stop after the very first function
// that encounters an error, so -verify will not be able to find errors in
// subsequent functions.

// CHECK-LABEL: nvvm_wmma
__device__ void nvvm_wmma(int *src, int *dst,
float *fsrc, float *fdst,
int ldm) {
// CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.row.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
__hmma_m16n16k16_ld_a(dst, src, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.col.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
__hmma_m16n16k16_ld_a(dst, src+1, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.row.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
__hmma_m16n16k16_ld_b(dst, src, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.col.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
__hmma_m16n16k16_ld_b(dst, src+2, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f32
// expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f32
// expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f16
// expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f32
// expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f32
// expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1);

// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
// CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32.satfinite
// expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
}

0 comments on commit 91cc00b

Please sign in to comment.