94 changes: 45 additions & 49 deletions llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -8,14 +8,14 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "nvptx64-unknown-unknown"

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) #1

define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {
define ptr @memcpy_caller(ptr %dst, ptr %src, i64 %n) #0 {
entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 false)
ret i8* %dst
tail call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 false)
ret ptr %dst

; IR-LABEL: @memcpy_caller
; IR: entry:
@@ -24,16 +24,16 @@ entry:

; IR: loop-memcpy-expansion:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store i8 [[Load]], i8* [[DstGep]]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR-LABEL: post-loop-memcpy-expansion:
; IR: ret i8* %dst
; IR: ret ptr %dst

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
; PTX: $L__BB[[LABEL:[_0-9]+]]:
@@ -45,10 +45,10 @@ entry:

}

define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
define ptr @memcpy_volatile_caller(ptr %dst, ptr %src, i64 %n) #0 {
entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 true)
ret i8* %dst
tail call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 true)
ret ptr %dst

; IR-LABEL: @memcpy_volatile_caller
; IR: entry:
@@ -57,16 +57,16 @@ entry:

; IR: loop-memcpy-expansion:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store volatile i8 [[Load]], i8* [[DstGep]]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load volatile i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store volatile i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR-LABEL: post-loop-memcpy-expansion:
; IR: ret i8* %dst
; IR: ret ptr %dst


; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
@@ -78,54 +78,50 @@ entry:
; PTX: @%p[[PRED]] bra $L__BB[[LABEL]]
}

define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
define ptr @memcpy_casting_caller(ptr %dst, ptr %src, i64 %n) #0 {
entry:
%0 = bitcast i32* %dst to i8*
%1 = bitcast i32* %src to i8*
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 %n, i1 false)
ret i8* %0
tail call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 false)
ret ptr %dst

; Check that casts in calls to memcpy are handled properly
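; With opaque pointers the old i32*-to-i8* bitcasts become no-ops, so the
; expansion is expected to index the arguments directly. A rough sketch of
; the loop body this implies (hypothetical value names, for illustration):
;   %src.gep = getelementptr inbounds i8, ptr %src, i64 %loop-index
;   %byte = load i8, ptr %src.gep
;   %dst.gep = getelementptr inbounds i8, ptr %dst, i64 %loop-index
;   store i8 %byte, ptr %dst.gep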
; IR-LABEL: @memcpy_casting_caller
; IR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
; IR: getelementptr inbounds i8, ptr %src
; IR: getelementptr inbounds i8, ptr %dst
}

define i8* @memcpy_known_size(i8* %dst, i8* %src) {
define ptr @memcpy_known_size(ptr %dst, ptr %src) {
entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i1 false)
ret i8* %dst
tail call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 144, i1 false)
ret ptr %dst

; Check that calls with compile-time constant size are handled correctly
; IR-LABEL: @memcpy_known_size
; IR: entry:
; IR: br label %load-store-loop
; IR: load-store-loop:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store i8 [[Load]], i8* [[DstGep]]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
}

define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
define ptr @memset_caller(ptr %dst, i32 %c, i64 %n) #0 {
entry:
%0 = trunc i32 %c to i8
tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i1 false)
ret i8* %dst
tail call void @llvm.memset.p0.i64(ptr %dst, i8 %0, i64 %n, i1 false)
ret ptr %dst

; IR-LABEL: @memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop
; IR: loadstoreloop:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64
; IR-NEXT: store i8 [[VAL]], ptr [[STOREPTR]]

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
; PTX: ld.param.u32 %r[[C:[0-9]+]]
Expand All @@ -137,26 +133,26 @@ entry:
; PTX: @%p[[PRED]] bra $L__BB[[LABEL]]
}

define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
define ptr @volatile_memset_caller(ptr %dst, i32 %c, i64 %n) #0 {
entry:
%0 = trunc i32 %c to i8
tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i1 true)
ret i8* %dst
tail call void @llvm.memset.p0.i64(ptr %dst, i8 %0, i64 %n, i1 true)
ret ptr %dst

; IR-LABEL: @volatile_memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
; IR: loadstoreloop:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
; IR-NEXT: store volatile i8 [[VAL]], i8* [[STOREPTR]]
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64
; IR-NEXT: store volatile i8 [[VAL]], ptr [[STOREPTR]]
}

define i8* @memmove_caller(i8* %dst, i8* %src, i64 %n) #0 {
define ptr @memmove_caller(ptr %dst, ptr %src, i64 %n) #0 {
entry:
tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 false)
ret i8* %dst
tail call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 false)
ret ptr %dst

; IR-LABEL: @memmove_caller
; IR: icmp ult i8* %src, %dst
; IR: icmp ult ptr %src, %dst
; IR: [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
; IR-NEXT: %index_ptr = sub i64 [[PHIVAL]], 1
; IR: [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -9,15 +9,15 @@ define void @kernel() {
; LABEL: @lower_alloca
; PTX-LABEL: .visible .entry kernel(
%A = alloca i32
; CHECK: addrspacecast i32* %A to i32 addrspace(5)*
; CHECK: store i32 0, i32 addrspace(5)* {{%.+}}
; CHECK: addrspacecast ptr %A to ptr addrspace(5)
; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
; PTX: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
store i32 0, i32* %A
call void @callee(i32* %A)
store i32 0, ptr %A
call void @callee(ptr %A)
ret void
}
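
; A minimal sketch of what the pass is expected to produce here
; (hypothetical value names, matching the CHECK lines above):
;   %A.local = addrspacecast ptr %A to ptr addrspace(5)
;   store i32 0, ptr addrspace(5) %A.local   ; emitted as st.local.u32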

declare void @callee(i32*)
declare void @callee(ptr)

!nvvm.annotations = !{!0}
!0 = !{void ()* @kernel, !"kernel", i32 1}
!0 = !{ptr @kernel, !"kernel", i32 1}
25 changes: 12 additions & 13 deletions llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -6,30 +6,29 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%class.outer = type <{ %class.inner, i32, [4 x i8] }>
%class.inner = type { i32*, i32* }
%class.inner = type { ptr, ptr }

; Check that nvptx-lower-args preserves arg alignment
define void @load_alignment(%class.outer* nocapture readonly byval(%class.outer) align 8 %arg) {
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
; IR: load %class.outer, %class.outer addrspace(101)*
; IR: load %class.outer, ptr addrspace(101)
; IR-SAME: align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
%arg.idx = getelementptr %class.outer, %class.outer* %arg, i64 0, i32 0, i32 0
%arg.idx.val = load i32*, i32** %arg.idx, align 8
%arg.idx1 = getelementptr %class.outer, %class.outer* %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load i32*, i32** %arg.idx1, align 8
%arg.idx2 = getelementptr %class.outer, %class.outer* %arg, i64 0, i32 1
%arg.idx2.val = load i32, i32* %arg.idx2, align 8
%arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
%arg.idx.val = load ptr, ptr %arg, align 8
%arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load ptr, ptr %arg.idx1, align 8
%arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1
%arg.idx2.val = load i32, ptr %arg.idx2, align 8
%arg.idx.val.val = load i32, ptr %arg.idx.val, align 4
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
store i32 %add.i, i32* %arg.idx1.val, align 4
store i32 %add.i, ptr %arg.idx1.val, align 4

; Let the pointer escape so we still create a local copy, which this test
; uses to check the load alignment.
%tmp = call i32* @escape(i32* nonnull %arg.idx2)
%tmp = call ptr @escape(ptr nonnull %arg.idx2)
ret void
}

; Function Attrs: convergent nounwind
declare dso_local i32* @escape(i32*) local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
58 changes: 28 additions & 30 deletions llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -21,15 +21,15 @@
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
%tmp = icmp eq i32 %arg2, 3
br i1 %tmp, label %bb3, label %bb6

bb3: ; preds = %bb
%tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
%tmp5 = load i32, i32* %tmp4, align 4
store i32 %tmp5, i32* %arg, align 4
%tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
%tmp5 = load i32, ptr %tmp4, align 4
store i32 %tmp5, ptr %arg, align 4
br label %bb6

bb6: ; preds = %bb3, %bb
@@ -55,12 +55,12 @@ bb6: ; preds = %bb3, %bb
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, i32* %tmp3, align 4
store i32 %tmp4, i32* %arg, align 4
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, ptr %tmp3, align 4
store i32 %tmp4, ptr %arg, align 4
ret void
}

@@ -80,13 +80,12 @@ bb:
; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast(i8* nocapture %out, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 %n64
%bc = bitcast i32* %gep to i8*
%load = load i8, i8* %bc, align 4
store i8 %load, i8* %out, align 4
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
%load = load i8, ptr %gep, align 4
store i8 %load, ptr %out, align 4
ret void
}

@@ -106,14 +105,13 @@ bb:
; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast_asc(i8* nocapture %out, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 %n64
%bc = bitcast i32* %gep to i8*
%asc = addrspacecast i8* %bc to i8 addrspace(101)*
%load = load i8, i8 addrspace(101)* %asc, align 4
store i8 %load, i8* %out, align 4
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
%asc = addrspacecast ptr %gep to ptr addrspace(101)
%load = load i8, ptr addrspace(101) %asc, align 4
store i8 %load, ptr %out, align 4
ret void
}

@@ -141,18 +139,18 @@ bb:
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: convergent norecurse nounwind mustprogress
define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, i32* %tmp3, align 4
store i32 %tmp4, i32* %arg, align 4
%tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, ptr %tmp3, align 4
store i32 %tmp4, ptr %arg, align 4
%tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
ret void
}

; Function Attrs: convergent nounwind
declare dso_local i32* @escape(i32*) local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr


!llvm.module.flags = !{!0, !1, !2}
@@ -161,8 +159,8 @@ declare dso_local i32* @escape(i32*) local_unnamed_addr
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
!6 = !{void (i8*, %struct.ham*, i32)* @gep_bitcast, !"kernel", i32 1}
!7 = !{void (i8*, %struct.ham*, i32)* @gep_bitcast_asc, !"kernel", i32 1}
!3 = !{ptr @static_offset, !"kernel", i32 1}
!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -6,61 +6,61 @@ target triple = "nvptx64-nvidia-cuda"

; Verify that both %input and %output are converted to global pointers and then
; addrspacecast'ed back to the original type.
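; A minimal sketch of the lowered IR this round-trip describes (hypothetical
; value names; the test itself only checks the resulting PTX):
;   %input.global = addrspacecast ptr %input to ptr addrspace(1)
;   %input.gen    = addrspacecast ptr addrspace(1) %input.global to ptr
;   %v = load float, ptr %input.gen, align 4   ; emitted as ld.global.f32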
define void @kernel(float* %input, float* %output) {
define void @kernel(ptr %input, ptr %output) {
; CHECK-LABEL: .visible .entry kernel(
; CHECK: cvta.to.global.u64
; CHECK: cvta.to.global.u64
%1 = load float, float* %input, align 4
%1 = load float, ptr %input, align 4
; CHECK: ld.global.f32
store float %1, float* %output, align 4
store float %1, ptr %output, align 4
; CHECK: st.global.f32
ret void
}

define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) {
define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
; CHECK-LABEL: .visible .entry kernel2(
; CHECK-NOT: cvta.to.global.u64
%1 = load float, float addrspace(1)* %input, align 4
%1 = load float, ptr addrspace(1) %input, align 4
; CHECK: ld.global.f32
store float %1, float addrspace(1)* %output, align 4
store float %1, ptr addrspace(1) %output, align 4
; CHECK: st.global.f32
ret void
}

%struct.S = type { i32*, i32* }
%struct.S = type { ptr, ptr }

define void @ptr_in_byval_kernel(%struct.S* byval(%struct.S) %input, i32* %output) {
define void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 8
%v = load i32, i32* %b, align 4
%b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
%b = load ptr, ptr %b_ptr, align 8
%v = load i32, ptr %b, align 4
; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
store i32 %v, i32* %output, align 4
store i32 %v, ptr %output, align 4
; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
ret void
}

; Regular functions lower byval arguments differently. We need to make
; sure that we're loading byval argument data using [symbol+offset].
; There's also no assumption that all pointers within are in global space.
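; Sketched side by side (assumed shapes, pieced together from the CHECK
; lines in this file rather than from actual ptxas output):
;   kernel:   ld.param.u64 %rd1, [ptr_in_byval_kernel_param_0+8];
;             cvta.to.global.u64 %rd2, %rd1;    (pointer assumed global)
;   function: ld.param.u64 %rd1, [ptr_in_byval_func_param_0+8];
;             (no cvta; the pointed-to address space is not assumed)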
define void @ptr_in_byval_func(%struct.S* byval(%struct.S) %input, i32* %output) {
define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) {
; CHECK-LABEL: .visible .func ptr_in_byval_func(
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 8
%v = load i32, i32* %b, align 4
%b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
%b = load ptr, ptr %b_ptr, align 8
%v = load i32, ptr %b, align 4
; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
store i32 %v, i32* %output, align 4
store i32 %v, ptr %output, align 4
; CHECK: st.u32 [%[[optr]]], %[[val]]
ret void
}

!nvvm.annotations = !{!0, !1, !2}
!0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
!1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
!0 = !{ptr @kernel, !"kernel", i32 1}
!1 = !{ptr @kernel2, !"kernel", i32 1}
!2 = !{ptr @ptr_in_byval_kernel, !"kernel", i32 1}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/machine-sink.ll
@@ -15,8 +15,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define float @post_dominate(float %x, i1 %cond) {
; CHECK-LABEL: post_dominate(
entry:
%0 = load float, float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
%1 = load float, float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
%0 = load float, ptr addrspacecast (ptr addrspace(3) @scalar1 to ptr), align 4
%1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar2 to ptr), align 4
; CHECK: ld.shared.f32
; CHECK: ld.shared.f32
%2 = fmul float %0, %0
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/NVPTX/managed.ll
@@ -12,8 +12,8 @@
; CHECK: .extern .global .align 4 .u32 decl_g;
@decl_g = external addrspace(1) global i32, align 4
; CHECK: .extern .global .attribute(.managed) .align 8 .b32 managed_decl_g;
@managed_decl_g = external addrspace(1) global i32*, align 8
@managed_decl_g = external addrspace(1) global ptr, align 8

!nvvm.annotations = !{!0, !1}
!0 = !{i32 addrspace(1)* @managed_g, !"managed", i32 1}
!1 = !{i32* addrspace(1)* @managed_decl_g, !"managed", i32 1}
!0 = !{ptr addrspace(1) @managed_g, !"managed", i32 1}
!1 = !{ptr addrspace(1) @managed_decl_g, !"managed", i32 1}
86 changes: 43 additions & 43 deletions llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -3,143 +3,143 @@
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}

declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
declare void @llvm.nvvm.mbarrier.init(ptr %a, i32 %b)
declare void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %a, i32 %b)

; CHECK-LABEL: barrierinit
define void @barrierinit(i64* %a, i32 %b) {
define void @barrierinit(ptr %a, i32 %b) {
; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
tail call void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
tail call void @llvm.nvvm.mbarrier.init(ptr %a, i32 %b)
ret void
}

; CHECK-LABEL: barrierinitshared
define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
define void @barrierinitshared(ptr addrspace(3) %a, i32 %b) {
; CHECK_PTX32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
tail call void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
tail call void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %a, i32 %b)
ret void
}

declare void @llvm.nvvm.mbarrier.inval(i64* %a)
declare void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
declare void @llvm.nvvm.mbarrier.inval(ptr %a)
declare void @llvm.nvvm.mbarrier.inval.shared(ptr addrspace(3) %a)

; CHECK-LABEL: barrierinval
define void @barrierinval(i64* %a) {
define void @barrierinval(ptr %a) {
; CHECK_PTX32: mbarrier.inval.b64 [%r{{[0-1]+}}];
; CHECK_PTX64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
tail call void @llvm.nvvm.mbarrier.inval(i64* %a)
tail call void @llvm.nvvm.mbarrier.inval(ptr %a)
ret void
}

; CHECK-LABEL: barrierinvalshared
define void @barrierinvalshared(i64 addrspace(3)* %a) {
define void @barrierinvalshared(ptr addrspace(3) %a) {
; CHECK_PTX32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
; CHECK_PTX64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
tail call void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
tail call void @llvm.nvvm.mbarrier.inval.shared(ptr addrspace(3) %a)
ret void
}

declare i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
declare i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
declare i64 @llvm.nvvm.mbarrier.arrive(ptr %a)
declare i64 @llvm.nvvm.mbarrier.arrive.shared(ptr addrspace(3) %a)

; CHECK-LABEL: barrierarrive
define void @barrierarrive(i64* %a) {
define void @barrierarrive(ptr %a) {
; CHECK_PTX32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive(ptr %a)
ret void
}

; CHECK-LABEL: barrierarriveshared
define void @barrierarriveshared(i64 addrspace(3)* %a) {
define void @barrierarriveshared(ptr addrspace(3) %a) {
; CHECK_PTX32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(ptr addrspace(3) %a)
ret void
}

declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %a, i32 %b)

; CHECK-LABEL: barrierarrivenoComplete
define void @barrierarrivenoComplete(i64* %a, i32 %b) {
define void @barrierarrivenoComplete(ptr %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %a, i32 %b)
ret void
}

; CHECK-LABEL: barrierarrivenoCompleteshared
define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
define void @barrierarrivenoCompleteshared(ptr addrspace(3) %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %a, i32 %b)
ret void
}

declare i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
declare i64 @llvm.nvvm.mbarrier.arrive.drop(ptr %a)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(ptr addrspace(3) %a)

; CHECK-LABEL: barrierarrivedrop
define void @barrierarrivedrop(i64* %a) {
define void @barrierarrivedrop(ptr %a) {
; CHECK_PTX32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(ptr %a)
ret void
}

; CHECK-LABEL: barrierarrivedropshared
define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
define void @barrierarrivedropshared(ptr addrspace(3) %a) {
; CHECK_PTX32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(ptr addrspace(3) %a)
ret void
}

declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(ptr %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(ptr addrspace(3) %a, i32 %b)

; CHECK-LABEL: barrierarrivedropnoComplete
define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
define void @barrierarrivedropnoComplete(ptr %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(ptr %a, i32 %b)
ret void
}

; CHECK-LABEL: barrierarrivedropnoCompleteshared
define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
define void @barrierarrivedropnoCompleteshared(ptr addrspace(3) %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(ptr addrspace(3) %a, i32 %b)
ret void
}

declare i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
declare i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
declare i1 @llvm.nvvm.mbarrier.test.wait(ptr %a, i64 %b)
declare i1 @llvm.nvvm.mbarrier.test.wait.shared(ptr addrspace(3) %a, i64 %b)

; CHECK-LABEL: barriertestwait
define void @barriertestwait(i64* %a, i64 %b) {
define void @barriertestwait(ptr %a, i64 %b) {
; CHECK_PTX32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(ptr %a, i64 %b)
ret void
}

; CHECK-LABEL: barriertestwaitshared
define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
define void @barriertestwaitshared(ptr addrspace(3) %a, i64 %b) {
; CHECK_PTX32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(ptr addrspace(3) %a, i64 %b)
ret void
}

declare i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)

; CHECK-LABEL: barrierpendingcount
define i32 @barrierpendingcount(i64* %a, i64 %b) {
define i32 @barrierpendingcount(ptr %a, i64 %b) {
; CHECK_PTX32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
%ret = tail call i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/minmax-negative.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=nvptx -O0 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -O0 | %ptxas-verify %}

define i16 @test1(i16* %sur1) {
define i16 @test1(ptr %sur1) {
; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767
%_tmp21.i = icmp sle i16 0, 0
%_tmp22.i = select i1 %_tmp21.i, i16 0, i16 32767
store i16 %_tmp22.i, i16* %sur1
store i16 %_tmp22.i, ptr %sur1
ret i16 0
}
54 changes: 25 additions & 29 deletions llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -5,40 +5,36 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: t1
define <4 x float> @t1(i8* %p1) {
define <4 x float> @t1(ptr %p1) {
; CHECK-NOT: ld.v4
; CHECK-NOT: ld.v2
; CHECK-NOT: ld.f32
; CHECK: ld.u8
%cast = bitcast i8* %p1 to <4 x float>*
%r = load <4 x float>, <4 x float>* %cast, align 1
%r = load <4 x float>, ptr %p1, align 1
ret <4 x float> %r
}

; CHECK-LABEL: t2
define <4 x float> @t2(i8* %p1) {
define <4 x float> @t2(ptr %p1) {
; CHECK-NOT: ld.v4
; CHECK-NOT: ld.v2
; CHECK: ld.f32
%cast = bitcast i8* %p1 to <4 x float>*
%r = load <4 x float>, <4 x float>* %cast, align 4
%r = load <4 x float>, ptr %p1, align 4
ret <4 x float> %r
}

; CHECK-LABEL: t3
define <4 x float> @t3(i8* %p1) {
define <4 x float> @t3(ptr %p1) {
; CHECK-NOT: ld.v4
; CHECK: ld.v2
%cast = bitcast i8* %p1 to <4 x float>*
%r = load <4 x float>, <4 x float>* %cast, align 8
%r = load <4 x float>, ptr %p1, align 8
ret <4 x float> %r
}

; CHECK-LABEL: t4
define <4 x float> @t4(i8* %p1) {
define <4 x float> @t4(ptr %p1) {
; CHECK: ld.v4
%cast = bitcast i8* %p1 to <4 x float>*
%r = load <4 x float>, <4 x float>* %cast, align 16
%r = load <4 x float>, ptr %p1, align 16
ret <4 x float> %r
}

@@ -50,9 +46,9 @@ define <4 x float> @t4(i8* %p1) {
; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
; CHECK: ret
define void @test_v1halfp0a1(<1 x half> * noalias readonly %from, <1 x half> * %to) {
%1 = load <1 x half>, <1 x half> * %from , align 1
store <1 x half> %1, <1 x half> * %to , align 1
define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) {
%1 = load <1 x half>, ptr %from , align 1
store <1 x half> %1, ptr %to , align 1
ret void
}

@@ -68,9 +64,9 @@ define void @test_v1halfp0a1(<1 x half> * noalias readonly %from, <1 x half> * %
; CHECK-DAG: ld.u8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
; CHECK-DAG: st.u8 [%[[TO]]+3],
; CHECK: ret
define void @test_v2halfp0a1(<2 x half> * noalias readonly %from, <2 x half> * %to) {
%1 = load <2 x half>, <2 x half> * %from , align 1
store <2 x half> %1, <2 x half> * %to , align 1
define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) {
%1 = load <2 x half>, ptr %from , align 1
store <2 x half> %1, ptr %to , align 1
ret void
}

@@ -94,43 +90,43 @@ define void @test_v2halfp0a1(<2 x half> * noalias readonly %from, <2 x half> * %
; CHECK-DAG: ld.u8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7]
; CHECK-DAG: st.u8 [%[[TO]]+7], [[B7]]
; CHECK: ret
define void @test_v4halfp0a1(<4 x half> * noalias readonly %from, <4 x half> * %to) {
%1 = load <4 x half>, <4 x half> * %from , align 1
store <4 x half> %1, <4 x half> * %to , align 1
define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
%1 = load <4 x half>, ptr %from , align 1
store <4 x half> %1, ptr %to , align 1
ret void
}


; CHECK-LABEL: s1
define void @s1(<4 x float>* %p1, <4 x float> %v) {
define void @s1(ptr %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
; CHECK-NOT: st.v2
; CHECK-NOT: st.f32
; CHECK: st.u8
store <4 x float> %v, <4 x float>* %p1, align 1
store <4 x float> %v, ptr %p1, align 1
ret void
}

; CHECK-LABEL: s2
define void @s2(<4 x float>* %p1, <4 x float> %v) {
define void @s2(ptr %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
; CHECK-NOT: st.v2
; CHECK: st.f32
store <4 x float> %v, <4 x float>* %p1, align 4
store <4 x float> %v, ptr %p1, align 4
ret void
}

; CHECK-LABEL: s3
define void @s3(<4 x float>* %p1, <4 x float> %v) {
define void @s3(ptr %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
store <4 x float> %v, <4 x float>* %p1, align 8
store <4 x float> %v, ptr %p1, align 8
ret void
}

; CHECK-LABEL: s4
define void @s4(<4 x float>* %p1, <4 x float> %v) {
define void @s4(ptr %p1, <4 x float> %v) {
; CHECK: st.v4
store <4 x float> %v, <4 x float>* %p1, align 16
store <4 x float> %v, ptr %p1, align 16
ret void
}

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/no-extra-parens.ll
@@ -6,10 +6,10 @@

@"$str" = private addrspace(1) constant [4 x i8] c"str\00"

declare void @str2(i8* %str)
declare void @str2(ptr %str)
define void @str1() {
entry:
;; CHECK: mov.u64 %rd{{[0-9]+}}, $str;
tail call void @str2(i8* getelementptr ([4 x i8], [4 x i8]* addrspacecast ([4 x i8] addrspace(1)* @"$str" to [4 x i8]*), i64 0, i64 0))
tail call void @str2(ptr addrspacecast (ptr addrspace(1) @"$str" to ptr))
ret void
}
42 changes: 20 additions & 22 deletions llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -7,60 +7,58 @@
; CHECK-NOT: call void @llvm.nvvm.barrier0

; Function Attrs: nounwind
define void @foo(float* %output) #1 {
define void @foo(ptr %output) #1 {
entry:
%output.addr = alloca float*, align 8
store float* %output, float** %output.addr, align 8
%0 = load float*, float** %output.addr, align 8
%arrayidx = getelementptr inbounds float, float* %0, i64 0
%1 = load float, float* %arrayidx, align 4
%output.addr = alloca ptr, align 8
store ptr %output, ptr %output.addr, align 8
%0 = load ptr, ptr %output.addr, align 8
%1 = load float, ptr %0, align 4
%conv = fpext float %1 to double
%cmp = fcmp olt double %conv, 1.000000e+01
br i1 %cmp, label %if.then, label %if.else

if.then: ; preds = %entry
%2 = load float*, float** %output.addr, align 8
%3 = load float, float* %2, align 4
%2 = load ptr, ptr %output.addr, align 8
%3 = load float, ptr %2, align 4
%conv1 = fpext float %3 to double
%add = fadd double %conv1, 1.000000e+00
%conv2 = fptrunc double %add to float
store float %conv2, float* %2, align 4
store float %conv2, ptr %2, align 4
br label %if.end

if.else: ; preds = %entry
%4 = load float*, float** %output.addr, align 8
%5 = load float, float* %4, align 4
%4 = load ptr, ptr %output.addr, align 8
%5 = load float, ptr %4, align 4
%conv3 = fpext float %5 to double
%add4 = fadd double %conv3, 2.000000e+00
%conv5 = fptrunc double %add4 to float
store float %conv5, float* %4, align 4
store float %conv5, ptr %4, align 4
br label %if.end

if.end: ; preds = %if.else, %if.then
call void @llvm.nvvm.barrier0()
%6 = load float*, float** %output.addr, align 8
%arrayidx6 = getelementptr inbounds float, float* %6, i64 0
%7 = load float, float* %arrayidx6, align 4
%6 = load ptr, ptr %output.addr, align 8
%7 = load float, ptr %6, align 4
%conv7 = fpext float %7 to double
%cmp8 = fcmp olt double %conv7, 1.000000e+01
br i1 %cmp8, label %if.then9, label %if.else13

if.then9: ; preds = %if.end
%8 = load float*, float** %output.addr, align 8
%9 = load float, float* %8, align 4
%8 = load ptr, ptr %output.addr, align 8
%9 = load float, ptr %8, align 4
%conv10 = fpext float %9 to double
%add11 = fadd double %conv10, 3.000000e+00
%conv12 = fptrunc double %add11 to float
store float %conv12, float* %8, align 4
store float %conv12, ptr %8, align 4
br label %if.end17

if.else13: ; preds = %if.end
%10 = load float*, float** %output.addr, align 8
%11 = load float, float* %10, align 4
%10 = load ptr, ptr %output.addr, align 8
%11 = load float, ptr %10, align 4
%conv14 = fpext float %11 to double
%add15 = fadd double %conv14, 4.000000e+00
%conv16 = fptrunc double %add15 to float
store float %conv16, float* %10, align 4
store float %conv16, ptr %10, align 4
br label %if.end17

if.end17: ; preds = %if.else13, %if.then9
@@ -70,5 +68,5 @@ if.end17: ; preds = %if.else13, %if.then
; Function Attrs: noduplicate nounwind
declare void @llvm.nvvm.barrier0() #2

!0 = !{void (float*)* @foo, !"kernel", i32 1}
!0 = !{ptr @foo, !"kernel", i32 1}
!1 = !{null, !"align", i32 8}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/nofunc.ll
@@ -10,8 +10,8 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

@Funcs = local_unnamed_addr addrspace(1) externally_initialized
global [1 x void (i8*)*] [void (i8*)* @func], align 8
global [1 x ptr] [ptr @func], align 8

declare void @func(i8*)
declare void @func(ptr)

; CHECK: Funcs[1] = {func}
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/NVPTX/nounroll.ll
@@ -9,7 +9,7 @@ target triple = "nvptx64-unknown-unknown"
; #pragma nounroll
; for (int i = 0; i < 2; ++i)
; output[i] = input[i];
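; In IR this pragma is normally carried as loop metadata on the backedge
; branch (elided from the visible hunks); a representative sketch, with
; illustrative metadata ids:
;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !9
;   !9 = distinct !{!9, !10}
;   !10 = !{!"llvm.loop.unroll.disable"}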
define void @nounroll(float* %input, float* %output) {
define void @nounroll(ptr %input, ptr %output) {
; CHECK-LABEL: .visible .func nounroll(
entry:
br label %for.body
@@ -18,11 +18,11 @@ for.body:
; CHECK: .pragma "nounroll"
%i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%idxprom = sext i32 %i.06 to i64
%arrayidx = getelementptr inbounds float, float* %input, i64 %idxprom
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %input, i64 %idxprom
%0 = load float, ptr %arrayidx, align 4
; CHECK: ld.f32
%arrayidx2 = getelementptr inbounds float, float* %output, i64 %idxprom
store float %0, float* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom
store float %0, ptr %arrayidx2, align 4
; CHECK: st.f32
%inc = add nuw nsw i32 %i.06, 1
%exitcond = icmp eq i32 %inc, 2
@@ -39,7 +39,7 @@ for.end:
; #pragma unroll 1
; for (int i = 0; i < 2; ++i)
; output[i] = input[i];
define void @unroll1(float* %input, float* %output) {
define void @unroll1(ptr %input, ptr %output) {
; CHECK-LABEL: .visible .func unroll1(
entry:
br label %for.body
@@ -48,11 +48,11 @@ for.body:
; CHECK: .pragma "nounroll"
%i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%idxprom = sext i32 %i.06 to i64
%arrayidx = getelementptr inbounds float, float* %input, i64 %idxprom
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %input, i64 %idxprom
%0 = load float, ptr %arrayidx, align 4
; CHECK: ld.f32
%arrayidx2 = getelementptr inbounds float, float* %output, i64 %idxprom
store float %0, float* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom
store float %0, ptr %arrayidx2, align 4
; CHECK: st.f32
%inc = add nuw nsw i32 %i.06, 1
%exitcond = icmp eq i32 %inc, 2
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -3,7 +3,7 @@

target triple = "nvptx-unknown-nvcl"

define void @foo(i64 %img, i64 %sampler, <5 x float>* align 32 %v1, i32* %v2) {
define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
; The parameter alignment is determined by the align attribute (default 1).
; CHECK-LABEL: .entry foo(
; CHECK: .param .u32 .ptr .align 32 foo_param_2
@@ -12,6 +12,6 @@ define void @foo(i64 %img, i64 %sampler, <5 x float>* align 32 %v1, i32* %v2) {
}

!nvvm.annotations = !{!1, !2, !3}
!1 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"kernel", i32 1}
!2 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"rdoimage", i32 0}
!3 = !{void (i64, i64, <5 x float>*, i32*)* @foo, !"sampler", i32 1}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @foo, !"rdoimage", i32 0}
!3 = !{ptr @foo, !"sampler", i32 1}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -8,12 +8,12 @@

@"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"

declare i32 @__nvvm_reflect(i8*)
declare i32 @__nvvm_reflect(ptr)

; COMMON-LABEL: @foo
define i32 @foo(float %a, float %b) {
; COMMON-NOT: call i32 @__nvvm_reflect
%reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%reflect = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @"$str" to ptr))
; SM20: ret i32 200
; SM35: ret i32 350
ret i32 %reflect
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
@@ -1,11 +1,11 @@
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-reflect | FileCheck %s

declare i32 @__nvvm_reflect(i8*)
declare i32 @__nvvm_reflect(ptr)
@str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"

define i32 @foo() {
%call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
%call = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @str to ptr))
; CHECK: ret i32 42
ret i32 %call
}
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -13,14 +13,14 @@

@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"

declare i32 @__nvvm_reflect(i8*)
declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*)
declare i32 @__nvvm_reflect(ptr)
declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4))

; CHECK-LABEL: @foo
define float @foo(float %a, float %b) {
; CHECK-NOT: call i32 @__nvvm_reflect
%ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0))
%reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
%ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str)
%reflect = tail call i32 @__nvvm_reflect(ptr %ptr)
%cmp = icmp ugt i32 %reflect, 0
br i1 %cmp, label %use_mul, label %use_add

@@ -41,15 +41,15 @@ exit:
ret float %ret
}

declare i32 @llvm.nvvm.reflect.p0i8(i8*)
declare i32 @llvm.nvvm.reflect.p0(ptr)

; CHECK-LABEL: define i32 @intrinsic
define i32 @intrinsic() {
; CHECK-NOT: call i32 @llvm.nvvm.reflect
; USE_FTZ_0: ret i32 0
; USE_FTZ_1: ret i32 1
%ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0))
%reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr)
%ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str)
%reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr)
ret i32 %reflect
}

@@ -61,7 +61,7 @@ define i32 @intrinsic() {
; CHECK-LABEL: @bar
define float @bar(float %a, float %b) {
; CHECK-NOT: call i32 @__nvvm_reflect
%reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%reflect = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @"$str" to ptr))
%cmp = icmp ne i32 %reflect, 0
br i1 %cmp, label %use_mul, label %use_add

24 changes: 12 additions & 12 deletions llvm/test/CodeGen/NVPTX/packed-aggr.ll
@@ -16,37 +16,37 @@ declare void @func()
; CHECK: .extern .func func
; CHECK: .u8 p;

%t1 = type <{ i16, i8*, i8, void ()*, i8*, i32 }>
%t1 = type <{ i16, ptr, i8, ptr, ptr, i32 }>
@s1 = addrspace(1) global %t1 <{
; ERROR: initialized packed aggregate with pointers 's1' requires at least PTX ISA version 7.1
; CHECK32: .global .align 1 .u8 s1[19] = {
; CHECK64: .global .align 1 .u8 s1[31] = {
i16 12,
; CHECK-SAME: 12, 0,
i8* addrspacecast (i8 addrspace(1)* @p to i8*),
ptr addrspacecast (ptr addrspace(1) @p to ptr),
; CHECK-SAME: 0xFF(generic(p)), 0xFF00(generic(p)), 0xFF0000(generic(p)), 0xFF000000(generic(p)),
; CHECK64-SAME: 0xFF00000000(generic(p)), 0xFF0000000000(generic(p)), 0xFF000000000000(generic(p)), 0xFF00000000000000(generic(p)),
i8 34,
; CHECK-SAME: 34
void ()* @func,
ptr @func,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i8* addrspacecast (i8 addrspace(1)* getelementptr (i8, i8 addrspace(1)* @p, i32 3) to i8*),
ptr addrspacecast (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @p, i32 3) to ptr),
; CHECK-SAME: 0xFF(generic(p)+3), 0xFF00(generic(p)+3), 0xFF0000(generic(p)+3), 0xFF000000(generic(p)+3),
; CHECK64-SAME: 0xFF00000000(generic(p)+3), 0xFF0000000000(generic(p)+3), 0xFF000000000000(generic(p)+3), 0xFF00000000000000(generic(p)+3),
i32 56 }>, align 1
; CHECK-SAME: 56, 0, 0, 0};

;; Test a case where an unaligned pointer is in a nested struct.

%t2i = type <{ void ()* }>
%t2i = type <{ ptr }>
%t2o = type { i8, %t2i, i32 }
@s2 = addrspace(1) global %t2o {
; CHECK32: .global .align 8 .u8 s2[12] = {
; CHECK64: .global .align 8 .u8 s2[16] = {
i8 12,
; CHECK-SAME: 12,
%t2i <{ void()* @func }>,
%t2i <{ ptr @func }>,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i32 34}
@@ -57,32 +57,32 @@
;; is printed in bytes and uses the mask() operator for pointers even though
;; the pointers are aligned.
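;; For reference, the byte-wise form checked below emits one mask() term per
;; address byte of the symbol, e.g. for the 64-bit case (abbreviated sketch):
;;   .u8 s3[9] = {0xFF(func), 0xFF00(func), ..., 0xFF00000000000000(func), 56};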

%t3 = type <{ void ()*, i8 }>
%t3 = type <{ ptr, i8 }>
@s3 = addrspace(1) global %t3 <{
; CHECK32: .global .align 1 .u8 s3[5] = {
; CHECK64: .global .align 1 .u8 s3[9] = {
void ()* @func,
ptr @func,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i8 56 }>, align 1
; CHECK-SAME: 56};

;; Test that a packed struct with aligned pointers is printed in words.

%t4 = type <{ void ()*, i64 }>
%t4 = type <{ ptr, i64 }>
@s4 = addrspace(1) global %t4 <{
; CHECK32: .global .align 1 .u32 s4[3] = {
; CHECK64: .global .align 1 .u64 s4[2] = {
void()* @func,
ptr @func,
; CHECK-SAME: func,
i64 15}>, align 1
; CHECK32-SAME: 15, 0};
; CHECK64-SAME: 15};

;; Test that a packed struct with unaligned pointers inside an array is handled.

%t5 = type <{ void ()*, i16 }>
@a5 = addrspace(1) global [2 x %t5] [%t5 <{ void()* @func, i16 5 }>, %t5 <{ void()* @func, i16 9 }> ]
%t5 = type <{ ptr, i16 }>
@a5 = addrspace(1) global [2 x %t5] [%t5 <{ ptr @func, i16 5 }>, %t5 <{ ptr @func, i16 9 }> ]
; CHECK32: .global .align 8 .u8 a5[12] = {
; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 5, 0,
; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 9, 0};
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/NVPTX/param-align.ll
@@ -1,16 +1,16 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}

;;; Need 4-byte alignment on float* passed byval
define ptx_device void @t1(float* byval(float) %x) {
;;; Need 4-byte alignment on ptr passed byval
define ptx_device void @t1(ptr byval(float) %x) {
; CHECK: .func t1
; CHECK: .param .align 4 .b8 t1_param_0[4]
ret void
}


;;; Need 8-byte alignment on double* passed byval
define ptx_device void @t2(double* byval(double) %x) {
;;; Need 8-byte alignment on ptr passed byval
define ptx_device void @t2(ptr byval(double) %x) {
; CHECK: .func t2
; CHECK: .param .align 8 .b8 t2_param_0[8]
ret void
@@ -19,27 +19,27 @@ define ptx_device void @t2(double* byval(double) %x) {

;;; Need 4-byte alignment on float2* passed byval
%struct.float2 = type { float, float }
define ptx_device void @t3(%struct.float2* byval(%struct.float2) %x) {
define ptx_device void @t3(ptr byval(%struct.float2) %x) {
; CHECK: .func t3
; CHECK: .param .align 4 .b8 t3_param_0[8]
ret void
}

;;; Need at least 4-byte alignment in order to avoid miscompilation by
;;; ptxas for sm_50+
define ptx_device void @t4(i8* byval(i8) %x) {
define ptx_device void @t4(ptr byval(i8) %x) {
; CHECK: .func t4
; CHECK: .param .align 4 .b8 t4_param_0[1]
ret void
}

;;; Make sure we adjust alignment at the call site as well.
define ptx_device void @t5(i8* align 2 byval(i8) %x) {
define ptx_device void @t5(ptr align 2 byval(i8) %x) {
; CHECK: .func t5
; CHECK: .param .align 4 .b8 t5_param_0[1]
; CHECK: {
; CHECK: .param .align 4 .b8 param0[1];
; CHECK: call.uni
call void @t4(i8* byval(i8) %x)
call void @t4(ptr byval(i8) %x)
ret void
}
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -3,26 +3,26 @@
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

define ptx_kernel void @t1(i1* %a) {
define ptx_kernel void @t1(ptr %a) {
; PTX32: mov.u16 %rs{{[0-9]+}}, 0;
; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
; PTX64: mov.u16 %rs{{[0-9]+}}, 0;
; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
store i1 false, i1* %a
store i1 false, ptr %a
ret void
}


define ptx_kernel void @t2(i1* %a, i8* %b) {
define ptx_kernel void @t2(ptr %a, ptr %b) {
; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;

%t1 = load i1, i1* %a
%t1 = load i1, ptr %a
%t2 = select i1 %t1, i8 1, i8 2
store i8 %t2, i8* %b
store i8 %t2, ptr %b
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/pr16278.ll
@@ -5,6 +5,6 @@

define float @foo() {
; CHECK: ld.const.f32
%val = load float, float addrspace(4)* @one_f
%val = load float, ptr addrspace(4) @one_f
ret float %val
}
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/NVPTX/pr17529.ll
@@ -6,15 +6,14 @@ target triple = "nvptx64-nvidia-cuda"

; Function Attrs: nounwind
; CHECK: .func kernelgen_memcpy
define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 {
define ptx_device void @kernelgen_memcpy(ptr nocapture %dst) #0 {
entry:
br i1 undef, label %for.end, label %vector.body

vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%scevgep9 = getelementptr i8, i8* %dst, i64 %index
%scevgep910 = bitcast i8* %scevgep9 to <4 x i8>*
store <4 x i8> undef, <4 x i8>* %scevgep910, align 1
%scevgep9 = getelementptr i8, ptr %dst, i64 %index
store <4 x i8> undef, ptr %scevgep9, align 1
%index.next = add i64 %index, 4
%0 = icmp eq i64 undef, %index.next
br i1 %0, label %middle.block, label %vector.body
@@ -23,13 +22,12 @@ middle.block: ; preds = %vector.body
br i1 undef, label %for.end, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block
%scevgep2 = getelementptr i8, i8* %dst, i64 0
br label %for.body

for.body: ; preds = %for.body, %for.body.preheader1
%lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ]
store i8 undef, i8* %lsr.iv3, align 1
%scevgep4 = getelementptr i8, i8* %lsr.iv3, i64 1
%lsr.iv3 = phi ptr [ %dst, %for.body.preheader1 ], [ %scevgep4, %for.body ]
store i8 undef, ptr %lsr.iv3, align 1
%scevgep4 = getelementptr i8, ptr %lsr.iv3, i64 1
br label %for.body

for.end: ; preds = %middle.block, %entry
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -11,20 +11,20 @@
; CHECK-LABEL: test_gv_float()
define float @test_gv_float() {
; CHECK: ld.global.nc.f32
%v = load float, float* @gv_float
%v = load float, ptr @gv_float
ret float %v
}

; CHECK-LABEL: test_gv_float2()
define <2 x float> @test_gv_float2() {
; CHECK: ld.global.nc.v2.f32
%v = load <2 x float>, <2 x float>* @gv_float2
%v = load <2 x float>, ptr @gv_float2
ret <2 x float> %v
}

; CHECK-LABEL: test_gv_float4()
define <4 x float> @test_gv_float4() {
; CHECK: ld.global.nc.v4.f32
%v = load <4 x float>, <4 x float>* @gv_float4
%v = load <4 x float>, ptr @gv_float4
ret <4 x float> %v
}
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/refl1.ll
@@ -5,10 +5,10 @@ target triple = "nvptx-nvidia-cuda"

; Function Attrs: nounwind
; CHECK: .entry foo
define void @foo(float* nocapture %a) #0 {
%val = load float, float* %a
define void @foo(ptr nocapture %a) #0 {
%val = load float, ptr %a
%tan = tail call fastcc float @__nv_fast_tanf(float %val)
store float %tan, float* %a
store float %tan, ptr %a
ret void
}

@@ -37,4 +37,4 @@ attributes #2 = { alwaysinline inlinehint nounwind readnone }

!nvvm.annotations = !{!0}

!0 = !{void (float*)* @foo, !"kernel", i32 1}
!0 = !{ptr @foo, !"kernel", i32 1}
28 changes: 13 additions & 15 deletions llvm/test/CodeGen/NVPTX/reg-copy.ll
@@ -4,19 +4,19 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"

define void @PR24303(float* %f) {
define void @PR24303(ptr %f) {
; CHECK-LABEL: .visible .entry PR24303(
; Do not use mov.f or mov.u to convert between float and int.
; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}
; CHECK-NOT: mov.{{f|u}}{{32|64}} %r{{[0-9]+}}, %f{{[0-9]+}}
entry:
%arrayidx1 = getelementptr inbounds float, float* %f, i64 1
%0 = load float, float* %f, align 4
%1 = load float, float* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds float, float* %f, i64 2
%arrayidx3 = getelementptr inbounds float, float* %f, i64 3
%2 = load float, float* %arrayidx2, align 4
%3 = load float, float* %arrayidx3, align 4
%arrayidx1 = getelementptr inbounds float, ptr %f, i64 1
%0 = load float, ptr %f, align 4
%1 = load float, ptr %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds float, ptr %f, i64 2
%arrayidx3 = getelementptr inbounds float, ptr %f, i64 3
%2 = load float, ptr %arrayidx2, align 4
%3 = load float, ptr %arrayidx3, align 4
%mul.i = fmul float %0, %2
%mul4.i = fmul float %1, %3
%mul5.i = fmul float %0, %3
@@ -209,17 +209,15 @@ if.then.93.i:                                     ; preds = %if.then.88.i, %if.e
_ZN12cuda_builtinmlIfEENS_7complexIT_EERKS3_S5_.exit: ; preds = %if.then.93.i, %lor.lhs.false.67.i, %land.lhs.true.i, %entry
%84 = phi i32 [ %4, %land.lhs.true.i ], [ %4, %entry ], [ %82, %if.then.93.i ], [ %4, %lor.lhs.false.67.i ]
%85 = phi i32 [ %5, %land.lhs.true.i ], [ %5, %entry ], [ %83, %if.then.93.i ], [ %5, %lor.lhs.false.67.i ]
%arrayidx5 = getelementptr inbounds float, float* %f, i64 5
%86 = bitcast float* %arrayidx5 to i32*
store i32 %84, i32* %86, align 4
%arrayidx7 = getelementptr inbounds float, float* %f, i64 6
%87 = bitcast float* %arrayidx7 to i32*
store i32 %85, i32* %87, align 4
%arrayidx5 = getelementptr inbounds float, ptr %f, i64 5
store i32 %84, ptr %arrayidx5, align 4
%arrayidx7 = getelementptr inbounds float, ptr %f, i64 6
store i32 %85, ptr %arrayidx7, align 4
ret void
}

declare float @llvm.nvvm.fabs.f(float)

!nvvm.annotations = !{!0}

!0 = !{void (float*)* @PR24303, !"kernel", i32 1}
!0 = !{ptr @PR24303, !"kernel", i32 1}
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -30,39 +30,39 @@ entry:
; CHECK-DAG: .reg .f64 %fd<

; Verify that we use correct register types.
store i8 1, i8* %s8, align 1
store i8 1, ptr %s8, align 1
; CHECK: mov.u16 [[R1:%rs[0-9]]], 1;
; CHECK-NEXT: st.u8 {{.*}}, [[R1]]
store i8 2, i8* %u8, align 1
store i8 2, ptr %u8, align 1
; CHECK: mov.u16 [[R2:%rs[0-9]]], 2;
; CHECK-NEXT: st.u8 {{.*}}, [[R2]]
store i16 3, i16* %s16, align 2
store i16 3, ptr %s16, align 2
; CHECK: mov.u16 [[R3:%rs[0-9]]], 3;
; CHECK-NEXT: st.u16 {{.*}}, [[R3]]
store i16 4, i16* %u16, align 2
store i16 4, ptr %u16, align 2
; CHECK: mov.u16 [[R4:%rs[0-9]]], 4;
; CHECK-NEXT: st.u16 {{.*}}, [[R4]]
store i32 5, i32* %s32, align 4
store i32 5, ptr %s32, align 4
; CHECK: mov.u32 [[R5:%r[0-9]]], 5;
; CHECK-NEXT: st.u32 {{.*}}, [[R5]]
store i32 6, i32* %u32, align 4
store i32 6, ptr %u32, align 4
; CHECK: mov.u32 [[R6:%r[0-9]]], 6;
; CHECK-NEXT: st.u32 {{.*}}, [[R6]]
store i64 7, i64* %s64, align 8
store i64 7, ptr %s64, align 8
; CHECK: mov.u64 [[R7:%rd[0-9]]], 7;
; CHECK-NEXT: st.u64 {{.*}}, [[R7]]
store i64 8, i64* %u64, align 8
store i64 8, ptr %u64, align 8
; CHECK: mov.u64 [[R8:%rd[0-9]]], 8;
; CHECK-NEXT: st.u64 {{.*}}, [[R8]]

; FP constants are stored via integer registers, but that's an
; implementation detail that's irrelevant here.
store float 9.000000e+00, float* %f32, align 4
store double 1.000000e+01, double* %f64, align 8
store float 9.000000e+00, ptr %f32, align 4
store double 1.000000e+01, ptr %f64, align 8
; Instead, we force a load into a register and then verify register type.
%f32v = load volatile float, float* %f32, align 4
%f32v = load volatile float, ptr %f32, align 4
; CHECK: ld.volatile.f32 %f{{[0-9]+}}
%f64v = load volatile double, double* %f64, align 8
%f64v = load volatile double, ptr %f64, align 8
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}
ret void
; CHECK: ret;
19 changes: 9 additions & 10 deletions llvm/test/CodeGen/NVPTX/sched1.ll
@@ -3,7 +3,7 @@

; Ensure source scheduling is working

define void @foo(i32* %a) {
define void @foo(ptr %a) {
; CHECK: .func foo
; CHECK: ld.u32
; CHECK-NEXT: ld.u32
@@ -12,20 +12,19 @@ define void @foo(i32* %a) {
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
%ptr0 = getelementptr i32, i32* %a, i32 0
%val0 = load i32, i32* %ptr0
%ptr1 = getelementptr i32, i32* %a, i32 1
%val1 = load i32, i32* %ptr1
%ptr2 = getelementptr i32, i32* %a, i32 2
%val2 = load i32, i32* %ptr2
%ptr3 = getelementptr i32, i32* %a, i32 3
%val3 = load i32, i32* %ptr3
%val0 = load i32, ptr %a
%ptr1 = getelementptr i32, ptr %a, i32 1
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr i32, ptr %a, i32 2
%val2 = load i32, ptr %ptr2
%ptr3 = getelementptr i32, ptr %a, i32 3
%val3 = load i32, ptr %ptr3

%t0 = add i32 %val0, %val1
%t1 = add i32 %t0, %val2
%t2 = add i32 %t1, %val3

store i32 %t2, i32* %a
store i32 %t2, ptr %a

ret void
}
19 changes: 9 additions & 10 deletions llvm/test/CodeGen/NVPTX/sched2.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}

define void @foo(<2 x i32>* %a) {
define void @foo(ptr %a) {
; CHECK: .func foo
; CHECK: ld.v2.u32
; CHECK-NEXT: ld.v2.u32
@@ -13,20 +13,19 @@ define void @foo(<2 x i32>* %a) {
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
%ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
%val0 = load <2 x i32>, <2 x i32>* %ptr0
%ptr1 = getelementptr <2 x i32>, <2 x i32>* %a, i32 1
%val1 = load <2 x i32>, <2 x i32>* %ptr1
%ptr2 = getelementptr <2 x i32>, <2 x i32>* %a, i32 2
%val2 = load <2 x i32>, <2 x i32>* %ptr2
%ptr3 = getelementptr <2 x i32>, <2 x i32>* %a, i32 3
%val3 = load <2 x i32>, <2 x i32>* %ptr3
%val0 = load <2 x i32>, ptr %a
%ptr1 = getelementptr <2 x i32>, ptr %a, i32 1
%val1 = load <2 x i32>, ptr %ptr1
%ptr2 = getelementptr <2 x i32>, ptr %a, i32 2
%val2 = load <2 x i32>, ptr %ptr2
%ptr3 = getelementptr <2 x i32>, ptr %a, i32 3
%val3 = load <2 x i32>, ptr %ptr3

%t0 = add <2 x i32> %val0, %val1
%t1 = add <2 x i32> %t0, %val2
%t2 = add <2 x i32> %t1, %val3

store <2 x i32> %t2, <2 x i32>* %a
store <2 x i32> %t2, ptr %a

ret void
}
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/NVPTX/sext-in-reg.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"


define void @one(i64 %a, i64 %b, i64* %p1, i64* %p2) {
define void @one(i64 %a, i64 %b, ptr %p1, ptr %p2) {
; CHECK: cvt.s64.s8
; CHECK: cvt.s64.s8
entry:
Expand All @@ -15,14 +15,14 @@ entry:
%shr = ashr i64 %a, 16
%shr9 = ashr i64 %b, 16
%add = add nsw i64 %conv4, %conv1
store i64 %add, i64* %p1, align 8
store i64 %add, ptr %p1, align 8
%add17 = add nsw i64 %shr9, %shr
store i64 %add17, i64* %p2, align 8
store i64 %add17, ptr %p2, align 8
ret void
}


define void @two(i64 %a, i64 %b, i64* %p1, i64* %p2) {
define void @two(i64 %a, i64 %b, ptr %p1, ptr %p2) {
entry:
; CHECK: cvt.s64.s32
; CHECK: cvt.s64.s32
Expand All @@ -33,14 +33,14 @@ entry:
%shr = ashr i64 %a, 16
%shr9 = ashr i64 %b, 16
%add = add nsw i64 %conv4, %conv1
store i64 %add, i64* %p1, align 8
store i64 %add, ptr %p1, align 8
%add17 = add nsw i64 %shr9, %shr
store i64 %add17, i64* %p2, align 8
store i64 %add17, ptr %p2, align 8
ret void
}


define void @three(i64 %a, i64 %b, i64* %p1, i64* %p2) {
define void @three(i64 %a, i64 %b, ptr %p1, ptr %p2) {
entry:
; CHECK: cvt.s64.s16
; CHECK: cvt.s64.s16
Expand All @@ -51,14 +51,14 @@ entry:
%shr = ashr i64 %a, 16
%shr9 = ashr i64 %b, 16
%add = add nsw i64 %conv4, %conv1
store i64 %add, i64* %p1, align 8
store i64 %add, ptr %p1, align 8
%add17 = add nsw i64 %shr9, %shr
store i64 %add17, i64* %p2, align 8
store i64 %add17, ptr %p2, align 8
ret void
}


define void @four(i32 %a, i32 %b, i32* %p1, i32* %p2) {
define void @four(i32 %a, i32 %b, ptr %p1, ptr %p2) {
entry:
; CHECK: cvt.s32.s8
; CHECK: cvt.s32.s8
Expand All @@ -69,14 +69,14 @@ entry:
%shr = ashr i32 %a, 16
%shr9 = ashr i32 %b, 16
%add = add nsw i32 %conv4, %conv1
store i32 %add, i32* %p1, align 4
store i32 %add, ptr %p1, align 4
%add17 = add nsw i32 %shr9, %shr
store i32 %add17, i32* %p2, align 4
store i32 %add17, ptr %p2, align 4
ret void
}


define void @five(i32 %a, i32 %b, i32* %p1, i32* %p2) {
define void @five(i32 %a, i32 %b, ptr %p1, ptr %p2) {
entry:
; CHECK: cvt.s32.s16
; CHECK: cvt.s32.s16
Expand All @@ -87,14 +87,14 @@ entry:
%shr = ashr i32 %a, 16
%shr9 = ashr i32 %b, 16
%add = add nsw i32 %conv4, %conv1
store i32 %add, i32* %p1, align 4
store i32 %add, ptr %p1, align 4
%add17 = add nsw i32 %shr9, %shr
store i32 %add17, i32* %p2, align 4
store i32 %add17, ptr %p2, align 4
ret void
}


define void @six(i16 %a, i16 %b, i16* %p1, i16* %p2) {
define void @six(i16 %a, i16 %b, ptr %p1, ptr %p2) {
entry:
; CHECK: cvt.s16.s8
; CHECK: cvt.s16.s8
Expand All @@ -105,8 +105,8 @@ entry:
%shr = ashr i16 %a, 8
%shr9 = ashr i16 %b, 8
%add = add nsw i16 %conv4, %conv1
store i16 %add, i16* %p1, align 4
store i16 %add, ptr %p1, align 4
%add17 = add nsw i16 %shr9, %shr
store i16 %add17, i16* %p2, align 4
store i16 %add17, ptr %p2, align 4
ret void
}
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/NVPTX/shfl.ll
@@ -62,30 +62,30 @@ define float @shfl_down_float(float %in) {

; Try the rest of the shfl modes. Hopefully they're declared in such a way
; that if shfl.down works correctly, they also work correctly.
define void @shfl_rest(i32 %in_i32, float %in_float, i32* %out_i32, float* %out_float) {
define void @shfl_rest(i32 %in_i32, float %in_float, ptr %out_i32, ptr %out_float) {
; CHECK: shfl.up.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 1, 2;
%up_i32 = call i32 @llvm.nvvm.shfl.up.i32(i32 %in_i32, i32 1, i32 2)
store i32 %up_i32, i32* %out_i32
store i32 %up_i32, ptr %out_i32

; CHECK: shfl.up.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 3, 4;
%up_float = call float @llvm.nvvm.shfl.up.f32(float %in_float, i32 3, i32 4)
store float %up_float, float* %out_float
store float %up_float, ptr %out_float

; CHECK: shfl.bfly.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 6;
%bfly_i32 = call i32 @llvm.nvvm.shfl.bfly.i32(i32 %in_i32, i32 5, i32 6)
store i32 %bfly_i32, i32* %out_i32
store i32 %bfly_i32, ptr %out_i32

; CHECK: shfl.bfly.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 7, 8;
%bfly_float = call float @llvm.nvvm.shfl.bfly.f32(float %in_float, i32 7, i32 8)
store float %bfly_float, float* %out_float
store float %bfly_float, ptr %out_float

; CHECK: shfl.idx.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 9, 10;
%idx_i32 = call i32 @llvm.nvvm.shfl.idx.i32(i32 %in_i32, i32 9, i32 10)
store i32 %idx_i32, i32* %out_i32
store i32 %idx_i32, ptr %out_i32

; CHECK: shfl.idx.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 11, 12;
%idx_float = call float @llvm.nvvm.shfl.idx.f32(float %in_float, i32 11, i32 12)
store float %idx_float, float* %out_float
store float %idx_float, ptr %out_float

ret void
}
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -2,7 +2,7 @@
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}

; CHECK: shift_parts_left_128
define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
define void @shift_parts_left_128(ptr %val, ptr %amtptr) {
; CHECK: shl.b64
; CHECK: mov.u32
; CHECK: sub.s32
@@ -13,15 +13,15 @@ define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
; CHECK: setp.gt.s32
; CHECK: selp.b64
; CHECK: shl.b64
%amt = load i128, i128* %amtptr
%a = load i128, i128* %val
%amt = load i128, ptr %amtptr
%a = load i128, ptr %val
%val0 = shl i128 %a, %amt
store i128 %val0, i128* %val
store i128 %val0, ptr %val
ret void
}

; CHECK: shift_parts_right_128
define void @shift_parts_right_128(i128* %val, i128* %amtptr) {
define void @shift_parts_right_128(ptr %val, ptr %amtptr) {
; CHECK: shr.u64
; CHECK: sub.s32
; CHECK: shl.b64
@@ -31,9 +31,9 @@ define void @shift_parts_right_128(i128* %val, i128* %amtptr) {
; CHECK: setp.gt.s32
; CHECK: selp.b64
; CHECK: shr.s64
%amt = load i128, i128* %amtptr
%a = load i128, i128* %val
%amt = load i128, ptr %amtptr
%a = load i128, ptr %val
%val0 = ashr i128 %a, %amt
store i128 %val0, i128* %val
store i128 %val0, ptr %val
ret void
}
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -10,17 +10,17 @@ define float @device_func(float %a) noinline {
}

; CHECK: .entry kernel_func
define void @kernel_func(float* %a) {
%val = load float, float* %a
define void @kernel_func(ptr %a) {
%val = load float, ptr %a
; CHECK: call.uni (retval0),
; CHECK: device_func,
%mul = call float @device_func(float %val)
store float %mul, float* %a
store float %mul, ptr %a
ret void
}



!nvvm.annotations = !{!1}

!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
!1 = !{ptr @kernel_func, !"kernel", i32 1}
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/NVPTX/st-generic.ll
@@ -5,66 +5,66 @@

;; i8

define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
define void @st_global_i8(ptr addrspace(0) %ptr, i8 %a) {
; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i8 %a, i8 addrspace(0)* %ptr
store i8 %a, ptr addrspace(0) %ptr
ret void
}

;; i16

define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
define void @st_global_i16(ptr addrspace(0) %ptr, i16 %a) {
; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i16 %a, i16 addrspace(0)* %ptr
store i16 %a, ptr addrspace(0) %ptr
ret void
}

;; i32

define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
define void @st_global_i32(ptr addrspace(0) %ptr, i32 %a) {
; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
; PTX32: ret
; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; PTX64: ret
store i32 %a, i32 addrspace(0)* %ptr
store i32 %a, ptr addrspace(0) %ptr
ret void
}

;; i64

define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) {
; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
; PTX32: ret
; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; PTX64: ret
store i64 %a, i64 addrspace(0)* %ptr
store i64 %a, ptr addrspace(0) %ptr
ret void
}

;; f32

define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
define void @st_global_f32(ptr addrspace(0) %ptr, float %a) {
; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
; PTX32: ret
; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; PTX64: ret
store float %a, float addrspace(0)* %ptr
store float %a, ptr addrspace(0) %ptr
ret void
}

;; f64

define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
define void @st_global_f64(ptr addrspace(0) %ptr, double %a) {
; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
; PTX32: ret
; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; PTX64: ret
store double %a, double addrspace(0)* %ptr
store double %a, ptr addrspace(0) %ptr
ret void
}
15 changes: 6 additions & 9 deletions llvm/test/CodeGen/NVPTX/store-retval.ll
@@ -21,7 +21,7 @@

%struct.StNoalign = type { [5 x i32] }

define %struct.StNoalign @func_StNoalign(%struct.StNoalign* nocapture noundef readonly byval(%struct.StNoalign) align 4 %in) {
define %struct.StNoalign @func_StNoalign(ptr nocapture noundef readonly byval(%struct.StNoalign) align 4 %in) {
; CHECK-LABEL: .func{{.*}}func_StNoalign
; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StNoalign_param_0];
; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}};
@@ -30,15 +30,14 @@ define %struct.StNoalign @func_StNoalign(%struct.StNoalign* nocapture noundef re
; CHECK-NOT: st.param.b32 [func_retval0+12], %r{{[0-9]+}};
; CHECK: st.param.b32 [func_retval0+16], [[R1]];
; CHECK-NEXT: ret;
%arrayidx = getelementptr inbounds %struct.StNoalign, %struct.StNoalign* %in, i32 0, i32 0, i32 0
%1 = load i32, i32* %arrayidx, align 4
%1 = load i32, ptr %in, align 4
%.fca.0.4.insert = insertvalue %struct.StNoalign { [5 x i32] [i32 undef, i32 undef, i32 undef, i32 undef, i32 poison] }, i32 %1, 0, 4
ret %struct.StNoalign %.fca.0.4.insert
}

%struct.StAlign8 = type { [5 x i32], [4 x i8] }

define %struct.StAlign8 @func_StAlign8(%struct.StAlign8* nocapture noundef readonly byval(%struct.StAlign8) align 8 %in) {
define %struct.StAlign8 @func_StAlign8(ptr nocapture noundef readonly byval(%struct.StAlign8) align 8 %in) {
; CHECK-LABEL: .func{{.*}}func_StAlign8
; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StAlign8_param_0];
; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}};
@@ -48,15 +47,14 @@ define %struct.StAlign8 @func_StAlign8(%struct.StAlign8* nocapture noundef reado
; CHECK: st.param.b32 [func_retval0+16], [[R1]];
; CHECK-NOT: st.param.v4.b8 [func_retval0+20], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}};
; CHECK-NEXT: ret;
%arrayidx = getelementptr inbounds %struct.StAlign8, %struct.StAlign8* %in, i32 0, i32 0, i32 0
%1 = load i32, i32* %arrayidx, align 8
%1 = load i32, ptr %in, align 8
%.fca.0.4.insert = insertvalue %struct.StAlign8 { [5 x i32] [i32 undef, i32 undef, i32 undef, i32 undef, i32 poison], [4 x i8] poison }, i32 %1, 0, 4
ret %struct.StAlign8 %.fca.0.4.insert
}

%struct.StAlign16 = type { [5 x i32], [12 x i8] }

define %struct.StAlign16 @func_StAlign16(%struct.StAlign16* nocapture noundef readonly byval(%struct.StAlign16) align 16 %in) {
define %struct.StAlign16 @func_StAlign16(ptr nocapture noundef readonly byval(%struct.StAlign16) align 16 %in) {
; CHECK-LABEL: .func{{.*}}func_StAlign16
; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StAlign16_param_0];
; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}};
@@ -68,8 +66,7 @@ define %struct.StAlign16 @func_StAlign16(%struct.StAlign16* nocapture noundef re
; CHECK-NOT: st.param.v4.b8 [func_retval0+24], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}};
; CHECK-NOT: st.param.v4.b8 [func_retval0+28], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}};
; CHECK-NEXT: ret;
%arrayidx = getelementptr inbounds %struct.StAlign16, %struct.StAlign16* %in, i32 0, i32 0, i32 0
%1 = load i32, i32* %arrayidx, align 16
%1 = load i32, ptr %in, align 16
%.fca.0.4.insert = insertvalue %struct.StAlign16 { [5 x i32] [i32 undef, i32 undef, i32 undef, i32 undef, i32 poison], [12 x i8] poison }, i32 %1, 0, 4
ret %struct.StAlign16 %.fca.0.4.insert
}
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -6,12 +6,12 @@
target triple = "nvptx-unknown-cuda"

declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))


; SM20-LABEL: .entry foo
; SM30-LABEL: .entry foo
define void @foo(i64 %img, float* %red, i32 %idx) {
define void @foo(i64 %img, ptr %red, i32 %idx) {
; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
@@ -22,17 +22,17 @@ define void @foo(i64 %img, float* %red, i32 %idx) {
%ret = sitofp i32 %val to float
; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}

@surf0 = internal addrspace(1) global i64 0, align 8

; SM20-LABEL: .entry bar
; SM30-LABEL: .entry bar
define void @bar(float* %red, i32 %idx) {
define void @bar(ptr %red, i32 %idx) {
; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}]
; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}]
%val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
@@ -41,15 +41,15 @@ define void @bar(float* %red, i32 %idx) {
%ret = sitofp i32 %val to float
; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}




!nvvm.annotations = !{!1, !2, !3}
!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @bar, !"kernel", i32 1}
!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -6,16 +6,16 @@ target triple = "nvptx-unknown-nvcl"
declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)

; CHECK: .entry foo
define void @foo(i64 %img, float* %red, i32 %idx) {
define void @foo(i64 %img, ptr %red, i32 %idx) {
; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
%val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
%ret = sitofp i32 %val to float
; CHECK: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}

!nvvm.annotations = !{!1, !2}
!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
!2 = !{void (i64, float*, i32)* @foo, !"rdwrimage", i32 0}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @foo, !"rdwrimage", i32 0}
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -6,7 +6,7 @@
target triple = "nvptx-unknown-cuda"

declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))


; SM20-LABEL: .entry foo
@@ -29,7 +29,7 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
; SM30-LABEL: .entry bar
define void @bar(i32 %val, i32 %idx) {
; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val)
@@ -38,7 +38,7 @@ define void @bar(i32 %val, i32 %idx) {


!nvvm.annotations = !{!1, !2, !3}
!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
!2 = !{void (i32, i32)* @bar, !"kernel", i32 1}
!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @bar, !"kernel", i32 1}
!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -13,5 +13,5 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
}

!nvvm.annotations = !{!1, !2}
!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
!2 = !{void (i64, i32, i32)* @foo, !"wroimage", i32 0}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @foo, !"wroimage", i32 0}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/NVPTX/symbol-naming.ll
@@ -26,14 +26,14 @@ target triple = "nvptx64-unknown-unknown"
; Function Attrs: nounwind
define internal void @.function.() {
entry:
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
%call = call i32 (ptr, ...) @printf(ptr @.str)
ret void
}

; Function Attrs: nounwind
define internal void @_$_function_$_() {
entry:
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @_$_str, i32 0, i32 0))
%call = call i32 (ptr, ...) @printf(ptr @_$_str)
ret void
}

@@ -45,4 +45,4 @@ entry:
ret void
}

declare i32 @printf(i8*, ...)
declare i32 @printf(ptr, ...)
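; For context, the @.str and @_$_str globals passed to @printf above are
; defined near the top of the file, outside this hunk. A minimal sketch of
; their presumed shape -- the [13 x i8] array type comes from the old
; getelementptr operands, while the linkage and string contents are
; illustrative assumptions. With opaque pointers the zero-index GEP folds
; away, which is why the migrated calls pass the globals directly:
@.str = private unnamed_addr constant [13 x i8] c"Hello world\0A\00"
@_$_str = private unnamed_addr constant [13 x i8] c"Hello world\0A\00"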
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -7,11 +7,11 @@
target triple = "nvptx-unknown-cuda"

declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))

; SM20-LABEL: .entry foo
; SM30-LABEL: .entry foo
define void @foo(i64 %img, float* %red, i32 %idx) {
define void @foo(i64 %img, ptr %red, i32 %idx) {
; SM20: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0];
; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
; SM30: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0];
@@ -20,7 +20,7 @@ define void @foo(i64 %img, float* %red, i32 %idx) {
%ret = extractvalue { float, float, float, float } %val, 0
; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}

@@ -29,26 +29,26 @@ define void @foo(i64 %img, float* %red, i32 %idx) {

; SM20-LABEL: .entry bar
; SM30-LABEL: .entry bar
define void @bar(float* %red, i32 %idx) {
define void @bar(ptr %red, i32 %idx) {
; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}]
; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
%val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
%ret = extractvalue { float, float, float, float } %val, 0
; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}

declare float @texfunc(i64)

; SM20-LABEL: .entry baz
; SM30-LABEL: .entry baz
define void @baz(float* %red, i32 %idx) {
define void @baz(ptr %red, i32 %idx) {
; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}]
; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
%val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
@@ -65,12 +65,12 @@ define void @baz(float* %red, i32 %idx) {
%ret2 = fadd float %ret, %texcall
; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RET2]]
; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RET2]]
store float %ret2, float* %red
store float %ret2, ptr %red
ret void
}

!nvvm.annotations = !{!1, !2, !3, !4}
!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
!3 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
!4 = !{void (float*, i32)* @baz, !"kernel", i32 1}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @bar, !"kernel", i32 1}
!3 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
!4 = !{ptr @baz, !"kernel", i32 1}
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -6,16 +6,16 @@ target triple = "nvptx-unknown-nvcl"
declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)

; CHECK: .entry foo
define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
%val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
%ret = extractvalue { float, float, float, float } %val, 0
; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
store float %ret, float* %red
store float %ret, ptr %red
ret void
}

!nvvm.annotations = !{!1, !2, !3}
!1 = !{void (i64, i64, float*, i32)* @foo, !"kernel", i32 1}
!2 = !{void (i64, i64, float*, i32)* @foo, !"rdoimage", i32 0}
!3 = !{void (i64, i64, float*, i32)* @foo, !"sampler", i32 1}
!1 = !{ptr @foo, !"kernel", i32 1}
!2 = !{ptr @foo, !"rdoimage", i32 0}
!3 = !{ptr @foo, !"sampler", i32 1}
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.nvvm.txq.width(i64)
declare i32 @llvm.nvvm.txq.height(i64)
declare i32 @llvm.nvvm.suq.width(i64)
declare i32 @llvm.nvvm.suq.height(i64)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))


; SM20-LABEL: @t0
@@ -28,7 +28,7 @@ define i32 @t0(i64 %texHandle) {
; SM30-LABEL: @t1
define i32 @t1() {
; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0]
; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
%width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
@@ -49,7 +49,7 @@ define i32 @t2(i64 %texHandle) {
; SM30-LABEL: @t3
define i32 @t3() {
; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
%texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0]
; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
%height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
@@ -70,7 +70,7 @@ define i32 @s0(i64 %surfHandle) {
; SM30-LABEL: @s1
define i32 @s1() {
; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0]
; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
%width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
@@ -91,7 +91,7 @@ define i32 @s2(i64 %surfHandle) {
; SM30-LABEL: @s3
define i32 @s3() {
; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
%surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0]
; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
%height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
@@ -101,5 +101,5 @@ define i32 @s3() {


!nvvm.annotations = !{!1, !2}
!1 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
!2 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
!1 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
!2 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/tuple-literal.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}

define ptx_device void @test_function({i8, i8}*) {
define ptx_device void @test_function(ptr) {
ret void
}
42 changes: 20 additions & 22 deletions llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -5,29 +5,27 @@

; CHECK: .address_size [[BITS:32|64]]

%struct.__va_list_tag = type { i8*, i8*, i32, i32 }
%struct.__va_list_tag = type { ptr, ptr, i32, i32 }

@foo_ptr = internal addrspace(1) global i32 (i32, ...)* @foo, align 8
@foo_ptr = internal addrspace(1) global ptr @foo, align 8

define i32 @foo(i32 %a, ...) {
entry:
%al = alloca [1 x %struct.__va_list_tag], align 8
%ap = bitcast [1 x %struct.__va_list_tag]* %al to i8*
%al2 = alloca [1 x %struct.__va_list_tag], align 8
%ap2 = bitcast [1 x %struct.__va_list_tag]* %al2 to i8*

; Test va_start
; CHECK: .param .align 8 .b8 foo_vararg[]
; CHECK: mov.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg;
; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR]];

call void @llvm.va_start(i8* %ap)
call void @llvm.va_start(ptr %al)

; Test va_copy()
; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0];
; CHECK-NEXT: st.u[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]];

call void @llvm.va_copy(i8* %ap2, i8* %ap)
call void @llvm.va_copy(ptr %al2, ptr %al)

; Test va_arg(ap, int32_t)
; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0];
@@ -37,7 +35,7 @@ entry:
; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]];
; CHECK-NEXT: ld.local.u32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]];

%0 = va_arg i8* %ap, i32
%0 = va_arg ptr %al, i32

; Test va_arg(ap, int64_t)
; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0];
@@ -47,7 +45,7 @@ entry:
; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]];
; CHECK-NEXT: ld.local.u64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];

%1 = va_arg i8* %ap, i64
%1 = va_arg ptr %al, i64

; Test va_arg(ap, double)
; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0];
@@ -57,9 +55,9 @@ entry:
; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]];
; CHECK-NEXT: ld.local.f64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];

%2 = va_arg i8* %ap, double
%2 = va_arg ptr %al, double

; Test va_arg(ap, void *)
; Test va_arg(ap, ptr)
; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0];
; CHECK32-NEXT: add.s32 [[VA_PTR_TMP:%r[0-9]+]], [[VA_PTR]], 3;
; CHECK64-NEXT: add.s64 [[VA_PTR_TMP:%rd[0-9]+]], [[VA_PTR]], 7;
@@ -70,17 +68,17 @@ entry:
; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]];
; CHECK-NEXT: ld.local.u[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]];

%3 = va_arg i8* %ap, i8*
%call = call i32 @bar(i32 %a, i32 %0, i64 %1, double %2, i8* %3)
%3 = va_arg ptr %al, ptr
%call = call i32 @bar(i32 %a, i32 %0, i64 %1, double %2, ptr %3)

call void @llvm.va_end(i8* %ap)
%4 = va_arg i8* %ap2, i32
call void @llvm.va_end(i8* %ap2)
call void @llvm.va_end(ptr %al)
%4 = va_arg ptr %al2, i32
call void @llvm.va_end(ptr %al2)
%5 = add i32 %call, %4
ret i32 %5
}

define i32 @test_foo(i32 %i, i64 %l, double %d, i8* %p) {
define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
; Test indirect variadic function call.

; Load arguments to temporary variables
@@ -101,12 +99,12 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, i8* %p) {
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]

entry:
%ptr = load i32 (i32, ...)*, i32 (i32, ...)** addrspacecast (i32 (i32, ...)* addrspace(1)* @foo_ptr to i32 (i32, ...)**), align 8
%call = call i32 (i32, ...) %ptr(i32 4, i32 %i, i64 %l, double %d, i8* %p)
%ptr = load ptr, ptr addrspacecast (ptr addrspace(1) @foo_ptr to ptr), align 8
%call = call i32 (i32, ...) %ptr(i32 4, i32 %i, i64 %l, double %d, ptr %p)
ret i32 %call
}

declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)
declare void @llvm.va_copy(i8*, i8*)
declare i32 @bar(i32, i32, i64, double, i8*)
declare void @llvm.va_start(ptr)
declare void @llvm.va_end(ptr)
declare void @llvm.va_copy(ptr, ptr)
declare i32 @bar(i32, i32, i64, double, ptr)
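; The recurring pattern in this file's diff: with typed pointers every va_*
; intrinsic needed an i8* view of the va_list alloca, so the old code kept
; bitcast copies (%ap, %ap2) of %al and %al2; with opaque pointers the
; allocas are passed directly and the bitcasts disappear. A minimal
; before/after sketch of that pattern, shown purely for illustration:
;
;   before:  %al = alloca [1 x %struct.__va_list_tag], align 8
;            %ap = bitcast [1 x %struct.__va_list_tag]* %al to i8*
;            call void @llvm.va_start(i8* %ap)
;
;   after:   %al = alloca [1 x %struct.__va_list_tag], align 8
;            call void @llvm.va_start(ptr %al)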
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/NVPTX/vec8.ll
@@ -4,7 +4,7 @@
target triple = "nvptx-unknown-cuda"

; CHECK: .visible .func foo
define void @foo(<8 x i8> %a, i8* %b) {
define void @foo(<8 x i8> %a, ptr %b) {
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]
@@ -13,7 +13,7 @@ define void @foo(<8 x i8> %a, i8* %b) {
%t0 = extractelement <8 x i8> %a, i32 1
%t1 = extractelement <8 x i8> %a, i32 6
%t = add i8 %t0, %t1
store i8 %t, i8* %b
store i8 %t, ptr %b
ret void
}

10 changes: 5 additions & 5 deletions llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -5,15 +5,15 @@
; scalarized. If codegen fails, then the type legalizer incorrectly
; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.

define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
%aval = load <2 x i32>, <2 x i32>* %a
%bval = load <2 x i32>, <2 x i32>* %b
define void @foo(ptr %a, ptr %b, ptr %r1, ptr %r2) {
%aval = load <2 x i32>, ptr %a
%bval = load <2 x i32>, ptr %b
%res = icmp slt <2 x i32> %aval, %bval
%t1 = extractelement <2 x i1> %res, i32 0
%t2 = extractelement <2 x i1> %res, i32 1
%t1a = zext i1 %t1 to i32
%t2a = zext i1 %t2 to i32
store i32 %t1a, i32* %r1
store i32 %t2a, i32* %r2
store i32 %t1a, ptr %r1
store i32 %t2a, ptr %r2
ret void
}
49 changes: 24 additions & 25 deletions llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -9,60 +9,60 @@
; which will load two floats at once into scalar registers.

; CHECK-LABEL: foo
define void @foo(<2 x float>* %a) {
define void @foo(ptr %a) {
; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
%t1 = load <2 x float>, <2 x float>* %a
%t1 = load <2 x float>, ptr %a
%t2 = fmul <2 x float> %t1, %t1
store <2 x float> %t2, <2 x float>* %a
store <2 x float> %t2, ptr %a
ret void
}

; CHECK-LABEL: foo2
define void @foo2(<4 x float>* %a) {
define void @foo2(ptr %a) {
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
%t1 = load <4 x float>, <4 x float>* %a
%t1 = load <4 x float>, ptr %a
%t2 = fmul <4 x float> %t1, %t1
store <4 x float> %t2, <4 x float>* %a
store <4 x float> %t2, ptr %a
ret void
}

; CHECK-LABEL: foo3
define void @foo3(<8 x float>* %a) {
define void @foo3(ptr %a) {
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
%t1 = load <8 x float>, <8 x float>* %a
%t1 = load <8 x float>, ptr %a
%t2 = fmul <8 x float> %t1, %t1
store <8 x float> %t2, <8 x float>* %a
store <8 x float> %t2, ptr %a
ret void
}



; CHECK-LABEL: foo4
define void @foo4(<2 x i32>* %a) {
define void @foo4(ptr %a) {
; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
%t1 = load <2 x i32>, <2 x i32>* %a
%t1 = load <2 x i32>, ptr %a
%t2 = mul <2 x i32> %t1, %t1
store <2 x i32> %t2, <2 x i32>* %a
store <2 x i32> %t2, ptr %a
ret void
}

; CHECK-LABEL: foo5
define void @foo5(<4 x i32>* %a) {
define void @foo5(ptr %a) {
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
%t1 = load <4 x i32>, <4 x i32>* %a
%t1 = load <4 x i32>, ptr %a
%t2 = mul <4 x i32> %t1, %t1
store <4 x i32> %t2, <4 x i32>* %a
store <4 x i32> %t2, ptr %a
ret void
}

; CHECK-LABEL: foo6
define void @foo6(<8 x i32>* %a) {
define void @foo6(ptr %a) {
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
%t1 = load <8 x i32>, <8 x i32>* %a
%t1 = load <8 x i32>, ptr %a
%t2 = mul <8 x i32> %t1, %t1
store <8 x i32> %t2, <8 x i32>* %a
store <8 x i32> %t2, ptr %a
ret void
}

@@ -71,8 +71,7 @@ define void @foo6(<8 x i32>* %a) {
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; CHECK-LABEL: foo_complex
define void @foo_complex(i8* nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
%targ0.1.typed = bitcast i8* %alloc0 to [1024 x [131072 x i8]]*
define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
%t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
%t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%t2 = lshr i32 %t1, 8
@@ -86,14 +85,14 @@ define void @foo_complex(i8* nocapture readonly align 16 dereferenceable(1342177
%t10 = or i32 %t4, 129
%t11 = zext i32 %t10 to i64
%t20 = zext i32 %t2 to i64
%t27 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t9
%t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9
; CHECK: ld.v2.u8
%t28 = load i8, i8* %t27, align 2
%t31 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t11
%t32 = load i8, i8* %t31, align 1
%t28 = load i8, ptr %t27, align 2
%t31 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t11
%t32 = load i8, ptr %t31, align 1
%t33 = icmp ult i8 %t28, %t32
%t34 = select i1 %t33, i8 %t32, i8 %t28
store i8 %t34, i8* %t31
store i8 %t34, ptr %t31
; CHECK: ret
ret void
}
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -4,13 +4,13 @@
; This test makes sure that vector selects are scalarized by the type legalizer.
; If not, type legalization will fail.

define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
define void @foo(ptr addrspace(1) %def_a, ptr addrspace(1) %def_b, ptr addrspace(1) %def_c) {
entry:
%tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a
%tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c
%tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b
%tmp4 = load <2 x i32>, ptr addrspace(1) %def_a
%tmp6 = load <2 x i32>, ptr addrspace(1) %def_c
%tmp8 = load <2 x i32>, ptr addrspace(1) %def_b
%0 = icmp sge <2 x i32> %tmp4, zeroinitializer
%cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
store <2 x i32> %cond, ptr addrspace(1) %def_c
ret void
}
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -3,29 +3,29 @@

; CHECK: .visible .func foo1
; CHECK: st.v2.f32
define void @foo1(<2 x float> %val, <2 x float>* %ptr) {
store <2 x float> %val, <2 x float>* %ptr
define void @foo1(<2 x float> %val, ptr %ptr) {
store <2 x float> %val, ptr %ptr
ret void
}

; CHECK: .visible .func foo2
; CHECK: st.v4.f32
define void @foo2(<4 x float> %val, <4 x float>* %ptr) {
store <4 x float> %val, <4 x float>* %ptr
define void @foo2(<4 x float> %val, ptr %ptr) {
store <4 x float> %val, ptr %ptr
ret void
}

; CHECK: .visible .func foo3
; CHECK: st.v2.u32
define void @foo3(<2 x i32> %val, <2 x i32>* %ptr) {
store <2 x i32> %val, <2 x i32>* %ptr
define void @foo3(<2 x i32> %val, ptr %ptr) {
store <2 x i32> %val, ptr %ptr
ret void
}

; CHECK: .visible .func foo4
; CHECK: st.v4.u32
define void @foo4(<4 x i32> %val, <4 x i32>* %ptr) {
store <4 x i32> %val, <4 x i32>* %ptr
define void @foo4(<4 x i32> %val, ptr %ptr) {
store <4 x i32> %val, ptr %ptr
ret void
}

30 changes: 15 additions & 15 deletions llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
@@ -8,24 +8,24 @@ target triple = "nvptx64-nvidia-cuda"
; CHECK: ld.global.v2.f32
; CHECK: st.global.v2.f32
; CHECK: st.global.v2.f32
define void @test1(float addrspace(1)* noalias align 8 %in, float addrspace(1)* noalias align 8 %out) {
%in.1 = getelementptr float, float addrspace(1)* %in, i32 1
%in.2 = getelementptr float, float addrspace(1)* %in, i32 2
%in.3 = getelementptr float, float addrspace(1)* %in, i32 3
%v0 = load float, float addrspace(1)* %in, align 8
%v1 = load float, float addrspace(1)* %in.1, align 4
%v2 = load float, float addrspace(1)* %in.2, align 8
%v3 = load float, float addrspace(1)* %in.3, align 4
define void @test1(ptr addrspace(1) noalias align 8 %in, ptr addrspace(1) noalias align 8 %out) {
%in.1 = getelementptr float, ptr addrspace(1) %in, i32 1
%in.2 = getelementptr float, ptr addrspace(1) %in, i32 2
%in.3 = getelementptr float, ptr addrspace(1) %in, i32 3
%v0 = load float, ptr addrspace(1) %in, align 8
%v1 = load float, ptr addrspace(1) %in.1, align 4
%v2 = load float, ptr addrspace(1) %in.2, align 8
%v3 = load float, ptr addrspace(1) %in.3, align 4
%sum0 = fadd float %v0, %v1
%sum1 = fadd float %v1, %v2
%sum2 = fadd float %v3, %v1
%sum3 = fadd float %v2, %v3
%out.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.3 = getelementptr float, float addrspace(1)* %out, i32 3
store float %sum0, float addrspace(1)* %out, align 8
store float %sum1, float addrspace(1)* %out.1, align 4
store float %sum2, float addrspace(1)* %out.2, align 8
store float %sum3, float addrspace(1)* %out.3, align 4
%out.1 = getelementptr float, ptr addrspace(1) %out, i32 1
%out.2 = getelementptr float, ptr addrspace(1) %out, i32 2
%out.3 = getelementptr float, ptr addrspace(1) %out, i32 3
store float %sum0, ptr addrspace(1) %out, align 8
store float %sum1, ptr addrspace(1) %out.1, align 4
store float %sum2, ptr addrspace(1) %out.2, align 8
store float %sum3, ptr addrspace(1) %out.3, align 4
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -5,6 +5,6 @@
@g = common addrspace(1) global i32 zeroinitializer

define i32 @func0() {
%val = load i32, i32 addrspace(1)* @g
%val = load i32, ptr addrspace(1) @g
ret i32 %val
}