diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index cf50f2a59f602..ea6f02d1b1eb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2491,6 +2491,47 @@ def int_nvvm_tex_unified_cube_array_level_v4u32_f32 llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], "llvm.nvvm.tex.unified.cube.array.level.v4u32.f32">; +def int_nvvm_tex_unified_cube_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.grad.v4f32.f32">; +def int_nvvm_tex_unified_cube_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.grad.v4s32.f32">; +def int_nvvm_tex_unified_cube_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.grad.v4u32.f32">; + +def int_nvvm_tex_unified_cube_array_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.grad.v4f32.f32">; +def int_nvvm_tex_unified_cube_array_grad_v4s32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.grad.v4s32.f32">; +def int_nvvm_tex_unified_cube_array_grad_v4u32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, + llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.unified.cube.array.grad.v4u32.f32">; + def int_nvvm_tld4_unified_r_2d_v4f32_f32 : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7abe984b34e19..ded2f2584014d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -309,6 +309,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: case NVPTXISD::TexUnifiedCubeArrayU32Float: case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + case NVPTXISD::TexUnifiedCubeFloatFloatGrad: + case NVPTXISD::TexUnifiedCubeS32FloatGrad: + case NVPTXISD::TexUnifiedCubeU32FloatGrad: + case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: + case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: + case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: case NVPTXISD::Tld4UnifiedR2DFloatFloat: case NVPTXISD::Tld4UnifiedG2DFloatFloat: case NVPTXISD::Tld4UnifiedB2DFloatFloat: @@ -2763,6 +2769,24 @@ bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { case NVPTXISD::Tld4UnifiedA2DU64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; break; + case NVPTXISD::TexUnifiedCubeFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R; + break; + case NVPTXISD::TexUnifiedCubeS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R; + break; + case NVPTXISD::TexUnifiedCubeU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R; + break; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R; + break; + case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R; + break; + case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R; + break; } // Copy over operands diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 34c5569b8076e..b2e527c170a0e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1257,6 +1257,18 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::TexUnifiedCubeArrayU32Float"; case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; + case NVPTXISD::TexUnifiedCubeFloatFloatGrad: + return "NVPTXISD::TexUnifiedCubeFloatFloatGrad"; + case NVPTXISD::TexUnifiedCubeS32FloatGrad: + return "NVPTXISD::TexUnifiedCubeS32FloatGrad"; + case NVPTXISD::TexUnifiedCubeU32FloatGrad: + return "NVPTXISD::TexUnifiedCubeU32FloatGrad"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad"; + case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: + return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad"; + case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: + return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad"; case NVPTXISD::Tld4UnifiedR2DFloatFloat: return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; case NVPTXISD::Tld4UnifiedG2DFloatFloat: @@ -3652,6 +3664,19 @@ static unsigned getOpcForTextureInstr(unsigned Intrinsic) { case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32FloatGrad; + case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32FloatGrad; + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad; + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: return NVPTXISD::Tld4UnifiedR2DFloatFloat; case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: @@ -4536,6 +4561,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: @@ -4652,6 +4679,10 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 06adc0c47f051..18e6179b06819 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -240,6 +240,12 @@ enum NodeType : unsigned { TexUnifiedCubeArrayS32FloatLevel, TexUnifiedCubeArrayU32Float, TexUnifiedCubeArrayU32FloatLevel, + TexUnifiedCubeFloatFloatGrad, + TexUnifiedCubeS32FloatGrad, + TexUnifiedCubeU32FloatGrad, + TexUnifiedCubeArrayFloatFloatGrad, + TexUnifiedCubeArrayS32FloatGrad, + TexUnifiedCubeArrayU32FloatGrad, Tld4UnifiedR2DFloatFloat, Tld4UnifiedG2DFloatFloat, Tld4UnifiedB2DFloatFloat, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 6b062a7f39127..12b6fad34a562 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -3754,6 +3754,62 @@ defm TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.u32.f32", Int32Regs, Float32Regs>; +class TEX_UNIFIED_CUBE_GRAD_base + : NVPTXInst<(outs outtype:$r, outtype:$g, + outtype:$b, outtype:$a), + !con(tex, (ins intype:$x, intype:$y, intype:$z, + intype:$gradx0, intype:$gradx1, + intype:$gradx2, intype:$grady0, + intype:$grady1, intype:$grady2)), + inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}]," + " \\{$gradx0, $gradx1, $gradx2, $gradx2\\}," + " \\{$grady0, $grady1, $grady2, $grady2\\};", + []>; + +multiclass TEX_UNIFIED_CUBE_GRAD { + def _R : TEX_UNIFIED_CUBE_GRAD_base; + def _I : TEX_UNIFIED_CUBE_GRAD_base; +} + +defm TEX_UNIFIED_CUBE_F32_F32_GRAD + : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.f32.f32", Float32Regs, Float32Regs>; +defm TEX_UNIFIED_CUBE_S32_F32_GRAD + : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.s32.f32", Int32Regs, Float32Regs>; +defm TEX_UNIFIED_CUBE_U32_F32_GRAD + : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.u32.f32", Int32Regs, Float32Regs>; + +class TEX_UNIFIED_CUBE_ARRAY_GRAD_base + : NVPTXInst<(outs outtype:$r, outtype:$g, + outtype:$b, outtype:$a), + !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z, + intype:$gradx0, intype:$gradx1, + intype:$gradx2, intype:$grady0, + intype:$grady1, intype:$grady2)), + inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $z\\}]," + " \\{$gradx0, $gradx1, $gradx2, $gradx2\\}," + " \\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +multiclass TEX_UNIFIED_CUBE_ARRAY_GRAD { + def _R : TEX_UNIFIED_CUBE_ARRAY_GRAD_base; + def _I : TEX_UNIFIED_CUBE_ARRAY_GRAD_base; +} + +defm TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD + : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.f32.f32", + Float32Regs, Float32Regs>; +defm TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD + : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.s32.f32", + Int32Regs, Float32Regs>; +defm TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD + : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.u32.f32", + Int32Regs, Float32Regs>; + class TLD4_UNIFIED_2D_base : NVPTXInst<(outs outtype:$v0, outtype:$v1, diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 85f75df39c0d0..f2515f971595b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -1319,6 +1319,18 @@ static unsigned texRegisterToIndexOpcode(unsigned RegOC) { return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_I; case NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R: return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_I; + case NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_I; + case NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_I; + case NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_I; + case NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_I; + case NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_I; + case NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R: + return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_I; case NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R: return NVPTX::TLD4_UNIFIED_R_2D_F32_F32_I; case NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R: diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py index d63cfc521117d..7d86696087438 100644 --- a/llvm/test/CodeGen/NVPTX/surf-tex.py +++ b/llvm/test/CodeGen/NVPTX/surf-tex.py @@ -1,12 +1,12 @@ # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll -# RUN: llc -mcpu=sm_20 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll -# RUN: %if ptxas %{ llc -mcpu=sm_20 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %} +# RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll --check-prefixes=CHECK,CHECK-CUDA +# RUN: %if ptxas %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %} # We only need to run this second time for texture tests, because # there is a difference between unified and non-unified intrinsics. # # RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll -# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll +# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll --check-prefixes=CHECK,CHECK-NVCL # RUN: %if ptxas %{ llc %t-nvcl.ll -verify-machineinstrs -o - | %ptxas-verify %} # Verify that all instructions and intrinsics defined in TableGen @@ -115,6 +115,15 @@ def get_llvm_value_type(vec, ty_ptx): return value[vec].format(ty=ty) +id_counter = 0 + + +def get_table_gen_id(): + global id_counter + id_counter += 1 + return id_counter + + def gen_triple(target): if target == "cuda": print('target triple = "nvptx64-unknown-cuda"\n') @@ -260,8 +269,9 @@ def gen_suld_tests(target, global_surf): ret void } ; CHECK-LABEL: .entry ${test_name}_global - ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}] - ; + ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_surf} + ; CHECK-CUDA: ${instruction} ${reg_ret}, [[[REG${reg_id}]], ${reg_access}] + ; CHECK-NVCL: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}] define void @${test_name}_global(${retty}* %ret, ${access}) { %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf}) %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access}) @@ -304,6 +314,7 @@ def gen_suld_tests(target, global_surf): "reg_ret": get_ptx_vec_reg(vec, dtype), "reg_surf": get_ptx_surface(target), "reg_access": get_ptx_surface_access(geom), + "reg_id": get_table_gen_id(), } gen_test(template, params) generated_items.append((params["intrinsic"], params["instruction"])) @@ -353,8 +364,9 @@ def gen_sust_tests(target, global_surf): ret void } ; CHECK-LABEL: .entry ${test_name}_global - ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value} - ; + ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_surf} + ; CHECK-CUDA: ${instruction} [[[REG${reg_id}]], ${reg_access}], ${reg_value} + ; CHECK-NVCL: ${instruction} [${global_surf}, ${reg_access}], ${reg_value} define void @${test_name}_global(${value}, ${access}) { %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf}) tail call void @${intrinsic}(i64 %gs, ${access}, ${value}) @@ -408,6 +420,7 @@ def gen_sust_tests(target, global_surf): "reg_value": get_ptx_vec_reg(vec, ctype), "reg_surf": get_ptx_surface(target), "reg_access": get_ptx_surface_access(geom), + "reg_id": get_table_gen_id(), } gen_test(template, params) generated_items.append((params["intrinsic"], params["instruction"])) @@ -614,7 +627,9 @@ def gen_tex_tests(target, global_tex, global_sampler): ret void } ; CHECK-LABEL: .entry ${test_name}_global - ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}] + ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_tex} + ; CHECK-CUDA: ${instruction} ${ptx_ret}, [[[REG${reg_id}]], ${ptx_global_sampler} ${ptx_access}] + ; CHECK-NVCL: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}] define void @${test_name}_global(${retty}* %ret, ${access}) { %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex}) ${get_sampler_handle} @@ -656,8 +671,8 @@ def gen_tex_tests(target, global_tex, global_sampler): # FIXME: missing intrinsics. # Support for tex.grad.{cube, acube} introduced in PTX ISA version - # 4.3. - if mipmap == "grad" and geom in ("cube", "acube"): + # 4.3, currently supported only in unified mode. + if not is_unified(target) and mipmap == "grad" and geom in ("cube", "acube"): continue # The instruction returns a two-element vector for destination @@ -698,6 +713,7 @@ def gen_tex_tests(target, global_tex, global_sampler): "ptx_tex": get_ptx_texture(target), "ptx_access": get_ptx_texture_access(geom, ctype), "ptx_global_sampler": get_ptx_global_sampler(target, global_sampler), + "reg_id": get_table_gen_id(), } gen_test(template, params) generated_items.append((params["intrinsic"], params["instruction"])) @@ -798,7 +814,9 @@ def gen_tld4_tests(target, global_tex, global_sampler): ret void } ; CHECK-LABEL: .entry ${test_name}_global - ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}] + ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_tex} + ; CHECK-CUDA: ${instruction} ${ptx_ret}, [[[REG${reg_id}]], ${ptx_global_sampler} ${ptx_access}] + ; CHECK-NVCL: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}] define void @${test_name}_global(${retty}* %ret, ${access}) { %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex}) ${get_sampler_handle} @@ -844,6 +862,7 @@ def gen_tld4_tests(target, global_tex, global_sampler): "ptx_tex": get_ptx_texture(target), "ptx_access": get_ptx_tld4_access(geom), "ptx_global_sampler": get_ptx_global_sampler(target, global_sampler), + "reg_id": get_table_gen_id(), } gen_test(template, params) generated_items.append((params["intrinsic"], params["instruction"]))