diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll index 1b09dcdbc2c6ec..c3ce12201dec0c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -23,17 +23,16 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc ; GCN: buffer_store_dword [[RESULT]] -define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(ptr addrspace(1) %out, i32 %in) { entry: %0 = alloca [2 x i32], addrspace(5) - %1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %1 - store i32 1, i32 addrspace(5)* %2 - %3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in - %4 = load i32, i32 addrspace(5)* %3 - %5 = call i32 @llvm.amdgcn.workitem.id.x() - %6 = add i32 %4, %5 - store i32 %6, i32 addrspace(1)* %out + %1 = getelementptr [2 x i32], ptr addrspace(5) %0, i32 0, i32 1 + store i32 0, ptr addrspace(5) %0 + store i32 1, ptr addrspace(5) %1 + %2 = getelementptr [2 x i32], ptr addrspace(5) %0, i32 0, i32 %in + %3 = load i32, ptr addrspace(5) %2 + %4 = call i32 @llvm.amdgcn.workitem.id.x() + %5 = add i32 %3, %4 + store i32 %5, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index a2010c9f07f340..3c328e96a11689 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -49,12 +49,11 @@ ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 -; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() -; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* -; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 1 -; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP0]], align 4, !invariant.load !0 -; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 2 -; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP1]], align 4, !range !1, !invariant.load !0 +; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, ptr addrspace(4) [[DISPATCH_PTR]], i64 1 +; HSAOPT: [[LDXY:%[0-9]+]] = load i32, ptr addrspace(4) [[GEP0]], align 4, !invariant.load !0 +; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, ptr addrspace(4) [[DISPATCH_PTR]], i64 2 +; HSAOPT: [[LDZU:%[0-9]+]] = load i32, ptr addrspace(4) [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 ; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2 @@ -67,11 +66,10 @@ ; HSAOPT: [[ADD_YZ_X_X_YZ_SIZE:%[0-9]+]] = add i32 [[YZ_X_XID]], [[Y_X_Z_SIZE]] ; HSAOPT: [[ADD_ZID:%[0-9]+]] = add i32 [[ADD_YZ_X_X_YZ_SIZE]], [[WORKITEM_ID_Z]] -; HSAOPT: [[LOCAL_GEP:%[0-9]+]] = getelementptr inbounds [256 x [5 x i32]], [256 x [5 x i32]] addrspace(3)* @mova_same_clause.stack, i32 0, i32 [[ADD_ZID]] -; HSAOPT: %arrayidx1 = getelementptr 
inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}} -; HSAOPT: %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}} -; HSAOPT: %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 0 -; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 1 +; HSAOPT: [[LOCAL_GEP:%[0-9]+]] = getelementptr inbounds [256 x [5 x i32]], ptr addrspace(3) @mova_same_clause.stack, i32 0, i32 [[ADD_ZID]] +; HSAOPT: %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(3) [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}} +; HSAOPT: %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(3) [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}} +; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(3) [[LOCAL_GEP]], i32 0, i32 1 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0 @@ -79,45 +77,43 @@ ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1 -define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } ; OPT-LABEL: @high_alignment( -; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}} -define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +; OPT: getelementptr inbounds [256 x [8 x i32]], ptr addrspace(3) 
@high_alignment.stack, i32 0, i32 %{{[0-9]+}} +define amdgpu_kernel void @high_alignment(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca [8 x i32], align 16, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [8 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [8 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [8 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } @@ -126,23 +122,22 @@ entry: ; OPT: alloca [5 x i32] ; SI-NOT: ds_write -define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @no_replace_inbounds_gep(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, 
align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } @@ -161,24 +156,20 @@ entry: ; SI-NOT: v_movrel %struct.point = type { i32, i32 } -define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(ptr addrspace(1) %out) #0 { entry: %a = alloca %struct.point, addrspace(5) %b = alloca %struct.point, addrspace(5) - %a.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 - %a.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1 - %b.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0 - %b.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %a.x.ptr - store i32 1, i32 addrspace(5)* %a.y.ptr - store i32 2, i32 addrspace(5)* %b.x.ptr - store i32 3, i32 addrspace(5)* %b.y.ptr - %a.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0 - %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr - %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr + %a.y.ptr = getelementptr %struct.point, ptr addrspace(5) %a, i32 0, i32 1 + %b.y.ptr = getelementptr %struct.point, ptr addrspace(5) %b, i32 0, i32 1 + store i32 0, ptr addrspace(5) %a + store i32 1, ptr addrspace(5) %a.y.ptr + store i32 2, ptr addrspace(5) %b + store i32 3, ptr addrspace(5) %b.y.ptr + %a.indirect = load i32, ptr addrspace(5) %a + %b.indirect = load i32, ptr addrspace(5) %b %0 = add i32 %a.indirect, %b.indirect - store i32 %0, i32 addrspace(1)* %out + store i32 %0, ptr addrspace(1) %out ret void } @@ -190,35 +181,31 @@ entry: ; R600-NOT: MOVA_INT ; SI-NOT: v_movrel -define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %prv_array_const = alloca [2 x i32], addrspace(5) %prv_array = alloca [2 x i32], addrspace(5) - %a = load i32, i32 addrspace(1)* %in - %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0 - store i32 %a, i32 addrspace(5)* %a_dst_ptr - %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1 - store i32 %b, i32 addrspace(5)* %b_dst_ptr + %a = load i32, ptr addrspace(1) %in + %b_src_ptr = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %b = load i32, ptr addrspace(1) %b_src_ptr + store i32 %a, ptr addrspace(5) %prv_array_const + %b_dst_ptr = getelementptr inbounds [2 x i32], ptr addrspace(5) %prv_array_const, i32 0, i32 1 + store i32 %b, ptr addrspace(5) %b_dst_ptr br label %for.body for.body: %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0 - %x = load i32, i32 addrspace(5)* %x_ptr - %y_ptr = 
getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 - %y = load i32, i32 addrspace(5)* %y_ptr + %x = load i32, ptr addrspace(5) %prv_array_const + %y = load i32, ptr addrspace(5) %prv_array %xy = add i32 %x, %y - store i32 %xy, i32 addrspace(5)* %y_ptr + store i32 %xy, ptr addrspace(5) %prv_array %count = add i32 %inc, 1 %done = icmp eq i32 %count, 4095 br i1 %done, label %for.end, label %for.body for.end: - %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 - %value = load i32, i32 addrspace(5)* %value_ptr - store i32 %value, i32 addrspace(1)* %out + %value = load i32, ptr addrspace(5) %prv_array + store i32 %value, ptr addrspace(1) %out ret void } @@ -235,17 +222,16 @@ for.end: ; SI-PROMOTE-VECT: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 4 ; SI-PROMOTE-VECT: s_lshr_b32 [[SREG:s[0-9]+]], 0x10000, [[SCALED_IDX]] ; SI-PROMOTE-VECT: s_and_b32 s{{[0-9]+}}, [[SREG]], 0xffff -define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %0 = alloca [2 x i16], addrspace(5) - %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1 - store i16 0, i16 addrspace(5)* %1 - store i16 1, i16 addrspace(5)* %2 - %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index - %4 = load i16, i16 addrspace(5)* %3 - %5 = sext i16 %4 to i32 - store i32 %5, i32 addrspace(1)* %out + %1 = getelementptr inbounds [2 x i16], ptr addrspace(5) %0, i32 0, i32 1 + store i16 0, ptr addrspace(5) %0 + store i16 1, ptr addrspace(5) %1 + %2 = getelementptr inbounds [2 x i16], ptr addrspace(5) %0, i32 0, i32 %index + %3 = load i16, ptr addrspace(5) %2 + %4 = sext i16 %3 to i32 + store i32 %4, ptr addrspace(1) %out ret void } @@ -258,17 +244,16 @@ entry: ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0 -define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %0 = alloca [2 x i8], addrspace(5) - %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1 - store i8 0, i8 addrspace(5)* %1 - store i8 1, i8 addrspace(5)* %2 - %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index - %4 = load i8, i8 addrspace(5)* %3 - %5 = sext i8 %4 to i32 - store i32 %5, i32 addrspace(1)* %out + %1 = getelementptr inbounds [2 x i8], ptr addrspace(5) %0, i32 0, i32 1 + store i8 0, ptr addrspace(5) %0 + store i8 1, ptr addrspace(5) %1 + %2 = getelementptr inbounds [2 x i8], ptr addrspace(5) %0, i32 0, i32 %index + %3 = load i8, ptr addrspace(5) %2 + %4 = sext i8 %3 to i32 + store i32 %4, ptr addrspace(1) %out ret void } @@ -278,109 +263,103 @@ entry: ; ; A total of 5 bytes should be allocated and used. 
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; -define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1, addrspace(5) %1 = alloca [2 x i8], align 1, addrspace(5) - %2 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0 - %3 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1 - %4 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2 - %5 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0 - %6 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1 - store i8 0, i8 addrspace(5)* %2 - store i8 1, i8 addrspace(5)* %3 - store i8 2, i8 addrspace(5)* %4 - store i8 1, i8 addrspace(5)* %5 - store i8 0, i8 addrspace(5)* %6 - %7 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in - %8 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in - %9 = load i8, i8 addrspace(5)* %7 - %10 = load i8, i8 addrspace(5)* %8 - %11 = add i8 %9, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out + %2 = getelementptr [3 x i8], ptr addrspace(5) %0, i32 0, i32 1 + %3 = getelementptr [3 x i8], ptr addrspace(5) %0, i32 0, i32 2 + %4 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1 + store i8 0, ptr addrspace(5) %0 + store i8 1, ptr addrspace(5) %2 + store i8 2, ptr addrspace(5) %3 + store i8 1, ptr addrspace(5) %1 + store i8 0, ptr addrspace(5) %4 + %5 = getelementptr [3 x i8], ptr addrspace(5) %0, i32 0, i32 %in + %6 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 %in + %7 = load i8, ptr addrspace(5) %5 + %8 = load i8, ptr addrspace(5) %6 + %9 = add i8 %7, %8 + %10 = sext i8 %9 to i32 + store i32 %10, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]], addrspace(5) - %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8 addrspace(5)* %gep0 - store i8 1, i8 addrspace(5)* %gep1 - %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8 addrspace(5)* %gep2 + %gep1 = getelementptr [2 x [2 x i8]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + store i8 0, ptr addrspace(5) %alloca + store i8 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [2 x [2 x i8]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index + %load = load i8, ptr addrspace(5) %gep2 %sext = sext i8 %load to i32 - store i32 %sext, i32 addrspace(1)* %out + store i32 %sext, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]], addrspace(5) - %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32 addrspace(5)* %gep2 - store i32 %load, 
i32 addrspace(1)* %out + %gep1 = getelementptr [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index + %load = load i32, ptr addrspace(5) %gep2 + store i32 %load, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]], addrspace(5) - %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64 addrspace(5)* %gep0 - store i64 1, i64 addrspace(5)* %gep1 - %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64 addrspace(5)* %gep2 - store i64 %load, i64 addrspace(1)* %out + %gep1 = getelementptr [2 x [2 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + store i64 0, ptr addrspace(5) %alloca + store i64 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [2 x [2 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index + %load = load i64, ptr addrspace(5) %gep2 + store i64 %load, ptr addrspace(1) %out ret void } %struct.pair32 = type { i32, i32 } -define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5) - %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32 addrspace(5)* %gep2 - store i32 %load, i32 addrspace(1)* %out + %gep0 = getelementptr [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, ptr addrspace(5) %gep0 + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, ptr addrspace(5) %gep2 + store i32 %load, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32], addrspace(5) - %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32 addrspace(5)* %gep2 - store i32 %load, i32 addrspace(1)* %out + %gep0 = getelementptr [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, 
i32 0, i32 1 + %gep1 = getelementptr [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + store i32 0, ptr addrspace(5) %gep0 + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, i32 %index, i32 0 + %load = load i32, ptr addrspace(5) %gep2 + store i32 %load, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32], addrspace(5) - %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %tmp1 - store i32 1, i32 addrspace(5)* %tmp2 + %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %tmp2 %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2 - %load = load i32, i32 addrspace(5)* %sel - store i32 %load, i32 addrspace(1)* %out + %sel = select i1 %cmp, ptr addrspace(5) %tmp, ptr addrspace(5) %tmp2 + %load = load i32, ptr addrspace(5) %sel + store i32 %load, ptr addrspace(1) %out ret void } @@ -392,35 +371,34 @@ entry: ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, ; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ; -define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], addrspace(5) - %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a - store i32 5, i32 addrspace(5)* %tmp0 - %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32 + %tmp0 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a + store i32 5, ptr addrspace(5) %tmp0 + %tmp1 = ptrtoint ptr addrspace(5) %alloca to i32 %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)* - %tmp4 = getelementptr i32, i32 addrspace(5)* %tmp3, i32 %b - %tmp5 = load i32, i32 addrspace(5)* %tmp4 - store i32 %tmp5, i32 addrspace(1)* %out + %tmp3 = inttoptr i32 %tmp2 to ptr addrspace(5) + %tmp4 = getelementptr i32, ptr addrspace(5) %tmp3, i32 %b + %tmp5 = load i32, ptr addrspace(5) %tmp4 + store i32 %tmp5, ptr addrspace(1) %out ret void } ; OPT-LABEL: @pointer_typed_alloca( -; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} -; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 { +; OPT: getelementptr inbounds [256 x ptr addrspace(1)], ptr addrspace(3) @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} +; OPT: load ptr addrspace(1), ptr addrspace(3) %{{[0-9]+}}, align 4 +define amdgpu_kernel void @pointer_typed_alloca(ptr addrspace(1) %A) #1 { entry: - %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) - store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 - %ld0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0 - store i32 1, i32 addrspace(1)* %arrayidx, align 4 - %ld1 = load i32 
addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1 - store i32 2, i32 addrspace(1)* %arrayidx1, align 4 - %ld2 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %ld2, i32 2 - store i32 3, i32 addrspace(1)* %arrayidx2, align 4 + %A.addr = alloca ptr addrspace(1), align 4, addrspace(5) + store ptr addrspace(1) %A, ptr addrspace(5) %A.addr, align 4 + %ld0 = load ptr addrspace(1), ptr addrspace(5) %A.addr, align 4 + store i32 1, ptr addrspace(1) %ld0, align 4 + %ld1 = load ptr addrspace(1), ptr addrspace(5) %A.addr, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %ld1, i32 1 + store i32 2, ptr addrspace(1) %arrayidx1, align 4 + %ld2 = load ptr addrspace(1), ptr addrspace(5) %A.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %ld2, i32 2 + store i32 3, ptr addrspace(1) %arrayidx2, align 4 ret void } @@ -460,11 +438,11 @@ entry: ; SI: buffer_load_dword ; SI: buffer_load_dword -define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16i32_stack(ptr addrspace(1) %out, i32 %a) { %alloca = alloca [2 x <16 x i32>], addrspace(5) - %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>] addrspace(5)* %alloca, i32 0, i32 %a - %tmp5 = load <16 x i32>, <16 x i32> addrspace(5)* %tmp0 - store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out + %tmp0 = getelementptr [2 x <16 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %a + %tmp5 = load <16 x i32>, ptr addrspace(5) %tmp0 + store <16 x i32> %tmp5, ptr addrspace(1) %out ret void } @@ -504,11 +482,11 @@ define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { ; SI: buffer_load_dword ; SI: buffer_load_dword -define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16float_stack(ptr addrspace(1) %out, i32 %a) { %alloca = alloca [2 x <16 x float>], addrspace(5) - %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>] addrspace(5)* %alloca, i32 0, i32 %a - %tmp5 = load <16 x float>, <16 x float> addrspace(5)* %tmp0 - store <16 x float> %tmp5, <16 x float> addrspace(1)* %out + %tmp0 = getelementptr [2 x <16 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a + %tmp5 = load <16 x float>, ptr addrspace(5) %tmp0 + store <16 x float> %tmp5, ptr addrspace(1) %out ret void } @@ -520,35 +498,35 @@ define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 % ; SI: buffer_load_dword ; SI: buffer_load_dword -define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) { %alloca = alloca [16 x <2 x float>], addrspace(5) - %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>] addrspace(5)* %alloca, i32 0, i32 %a - %tmp5 = load <2 x float>, <2 x float> addrspace(5)* %tmp0 - store <2 x float> %tmp5, <2 x float> addrspace(1)* %out + %tmp0 = getelementptr [16 x <2 x float>], ptr addrspace(5) %alloca, i32 0, i32 %a + %tmp5 = load <2 x float>, ptr addrspace(5) %tmp0 + store <2 x float> %tmp5, ptr addrspace(1) %out ret void } ; OPT-LABEL: @direct_alloca_read_0xi32( -; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* -; OPT: load [0 x i32], [0 x i32] addrspace(3)* -define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +; OPT: store [0 x i32] undef, ptr addrspace(3) +; 
OPT: load [0 x i32], ptr addrspace(3) +define amdgpu_kernel void @direct_alloca_read_0xi32(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [0 x i32], addrspace(5) - store [0 x i32] [], [0 x i32] addrspace(5)* %tmp - %load = load [0 x i32], [0 x i32] addrspace(5)* %tmp - store [0 x i32] %load, [0 x i32] addrspace(1)* %out + store [0 x i32] [], ptr addrspace(5) %tmp + %load = load [0 x i32], ptr addrspace(5) %tmp + store [0 x i32] %load, ptr addrspace(1) %out ret void } ; OPT-LABEL: @direct_alloca_read_1xi32( -; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* -; OPT: load [1 x i32], [1 x i32] addrspace(3)* -define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +; OPT: store [1 x i32] zeroinitializer, ptr addrspace(3) +; OPT: load [1 x i32], ptr addrspace(3) +define amdgpu_kernel void @direct_alloca_read_1xi32(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [1 x i32], addrspace(5) - store [1 x i32] [i32 0], [1 x i32] addrspace(5)* %tmp - %load = load [1 x i32], [1 x i32] addrspace(5)* %tmp - store [1 x i32] %load, [1 x i32] addrspace(1)* %out + store [1 x i32] [i32 0], ptr addrspace(5) %tmp + %load = load [1 x i32], ptr addrspace(5) %tmp + store [1 x i32] %load, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll index 570498e86564ce..b509c52c40e11e 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -7,7 +7,7 @@ ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @no_args() { %alloca = alloca i8, addrspace(5) - store volatile i8 0, i8 addrspace(5)* %alloca + store volatile i8 0, ptr addrspace(5) %alloca ret void } @@ -15,7 +15,7 @@ define amdgpu_kernel void @no_args() { ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align32(<8 x i32>) { %alloca = alloca i8, addrspace(5) - store volatile i8 0, i8 addrspace(5)* %alloca + store volatile i8 0, ptr addrspace(5) %alloca ret void } @@ -23,7 +23,7 @@ define amdgpu_kernel void @force_align32(<8 x i32>) { ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align64(<16 x i32>) { %alloca = alloca i8, addrspace(5) - store volatile i8 0, i8 addrspace(5)* %alloca + store volatile i8 0, ptr addrspace(5) %alloca ret void } @@ -31,7 +31,7 @@ define amdgpu_kernel void @force_align64(<16 x i32>) { ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align128(<32 x i32>) { %alloca = alloca i8, addrspace(5) - store volatile i8 0, i8 addrspace(5)* %alloca + store volatile i8 0, ptr addrspace(5) %alloca ret void } @@ -39,6 +39,6 @@ define amdgpu_kernel void @force_align128(<32 x i32>) { ; CHECK: ScratchSize: 5{{$}} define amdgpu_kernel void @force_align256(<64 x i32>) { %alloca = alloca i8, addrspace(5) - store volatile i8 0, i8 addrspace(5)* %alloca + store volatile i8 0, ptr addrspace(5) %alloca ret void } diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 4cd9940b014827..0faf4749755d57 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -59,11 +59,11 @@ ; ALL: ; ScratchSize: 32772 define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { %large = alloca [8192 x i32], align 4, addrspace(5) - %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 - store volatile i32 %x, i32 addrspace(5)* %gep - %gep1 
= getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y - %val = load volatile i32, i32 addrspace(5)* %gep1 - store volatile i32 %val, i32 addrspace(1)* undef + %gep = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 8191 + store volatile i32 %x, ptr addrspace(5) %gep + %gep1 = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 %y + %val = load volatile i32, ptr addrspace(5) %gep1 + store volatile i32 %val, ptr addrspace(1) undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll index 4436b60be2a9de..43539b6052fb97 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -20,11 +20,11 @@ ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { %large = alloca [8192 x i32], align 4, addrspace(5) - %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 - store volatile i32 %x, i32 addrspace(5)* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y - %val = load volatile i32, i32 addrspace(5)* %gep1 - store volatile i32 %val, i32 addrspace(1)* undef + %gep = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 8191 + store volatile i32 %x, ptr addrspace(5) %gep + %gep1 = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 %y + %val = load volatile i32, ptr addrspace(5) %gep1 + store volatile i32 %val, ptr addrspace(1) undef ret void } @@ -46,11 +46,11 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 { %large = alloca [8192 x i32], align 4, addrspace(5) - %gep = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 8191 - store volatile i32 %x, i32 addrspace(5)* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %large, i32 0, i32 %y - %val = load volatile i32, i32 addrspace(5)* %gep1 - store volatile i32 %val, i32 addrspace(1)* undef + %gep = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 8191 + store volatile i32 %x, ptr addrspace(5) %gep + %gep1 = getelementptr [8192 x i32], ptr addrspace(5) %large, i32 0, i32 %y + %val = load volatile i32, ptr addrspace(5) %gep1 + store volatile i32 %val, ptr addrspace(1) undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 9df93bc970a96a..619c3f4c16b002 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -14,7 +14,7 @@ ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an ; alignment less than the stack alignment. 
-define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 { +define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s9 @@ -90,24 +90,23 @@ entry: bb.0: %alloca = alloca [16 x i32], align 4, addrspace(5) - %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 %cond1 = icmp eq i32 %arg.cond1, 0 br i1 %cond1, label %bb.1, label %bb.2 bb.1: ; Use the alloca outside of the defining block. - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in - %load = load i32, i32 addrspace(5)* %gep2 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in + %load = load i32, ptr addrspace(5) %gep2 %tid = call i32 @llvm.amdgcn.workitem.id.x() %add = add i32 %load, %tid - store i32 %add, i32 addrspace(1)* %out + store i32 %add, ptr addrspace(1) %out br label %bb.2 bb.2: - store volatile i32 0, i32 addrspace(1)* undef + store volatile i32 0, ptr addrspace(1) undef ret void } ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 @@ -119,7 +118,7 @@ bb.2: ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 ; ASSUME1024: ; ScratchSize: 1040 -define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { +define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 @@ -190,19 +189,18 @@ entry: bb.0: %alloca = alloca [16 x i32], align 64, addrspace(5) - %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in - %load = load i32, i32 addrspace(5)* %gep2 + %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in + %load = load i32, ptr addrspace(5) %gep2 %tid = call i32 @llvm.amdgcn.workitem.id.x() %add = add i32 %load, %tid - store i32 %add, i32 addrspace(1)* %out + store i32 %add, ptr addrspace(1) %out br label %bb.1 bb.1: - store volatile i32 0, i32 addrspace(1)* undef + store volatile i32 0, ptr addrspace(1) undef ret void } @@ -216,7 +214,7 @@ bb.1: ; ASSUME1024: ; ScratchSize: 1088 -define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { +define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; MUBUF-LABEL: 
func_non_entry_block_static_alloca_align4: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -293,28 +291,27 @@ entry: bb.0: %alloca = alloca [16 x i32], align 4, addrspace(5) - %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 %cond1 = icmp eq i32 %arg.cond1, 0 br i1 %cond1, label %bb.1, label %bb.2 bb.1: ; Use the alloca outside of the defining block. - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in - %load = load i32, i32 addrspace(5)* %gep2 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in + %load = load i32, ptr addrspace(5) %gep2 %tid = call i32 @llvm.amdgcn.workitem.id.x() %add = add i32 %load, %tid - store i32 %add, i32 addrspace(1)* %out + store i32 %add, ptr addrspace(1) %out br label %bb.2 bb.2: - store volatile i32 0, i32 addrspace(1)* undef + store volatile i32 0, ptr addrspace(1) undef ret void } -define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { +define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; MUBUF-LABEL: func_non_entry_block_static_alloca_align64: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -386,19 +383,18 @@ entry: bb.0: %alloca = alloca [16 x i32], align 64, addrspace(5) - %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in - %load = load i32, i32 addrspace(5)* %gep2 + %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in + %load = load i32, ptr addrspace(5) %gep2 %tid = call i32 @llvm.amdgcn.workitem.id.x() %add = add i32 %load, %tid - store i32 %add, i32 addrspace(1)* %out + store i32 %add, ptr addrspace(1) %out br label %bb.1 bb.1: - store volatile i32 0, i32 addrspace(1)* undef + store volatile i32 0, ptr addrspace(1) undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll index 234412c72b2476..f1e5e68927be71 100644 --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -18,7 +18,7 @@ ; OPTNONE-NOT: s_mov_b32 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { - store volatile i32 0, i32 addrspace(5)* undef + store volatile i32 0, ptr addrspace(5) undef ret void } @@ -27,7 +27,7 @@ define amdgpu_kernel void @store_to_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3] ; OPT: buffer_store_dword v{{[0-9]+}}, off, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { - store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) + store 
volatile i32 0, ptr addrspace(5) inttoptr (i32 124 to ptr addrspace(5))
   ret void
 }
 
@@ -36,7 +36,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 {
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3]
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offen glc{{$}}
 define amdgpu_kernel void @load_from_undef() #0 {
-  %ld = load volatile i32, i32 addrspace(5)* undef
+  %ld = load volatile i32, ptr addrspace(5) undef
   ret void
 }
 
@@ -45,7 +45,7 @@ define amdgpu_kernel void @load_from_undef() #0 {
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3]
 ; OPT: buffer_load_dword v{{[0-9]+}}, off, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offset:124 glc{{$}}
 define amdgpu_kernel void @load_from_inttoptr() #0 {
-  %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*)
+  %ld = load volatile i32, ptr addrspace(5) inttoptr (i32 124 to ptr addrspace(5))
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
index c5c6467550ef9f..cafb1e1cd836b0 100644
--- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
@@ -36,21 +36,20 @@
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}}
-define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
-  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
-  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %gep.index = getelementptr inbounds i32, ptr addrspace(1) %index.array, i64 %idxprom
+  %index.load = load i32, ptr addrspace(1) %gep.index
   %index = and i32 %index.load, 2
   %alloca = alloca [2 x <4 x i32>], align 16, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 1
-  store <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep0
-  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>] addrspace(5)* %alloca, i32 0, i32 %index
-  %load = load <4 x i32>, <4 x i32> addrspace(5)* %gep2
-  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x <4 x i32>], ptr addrspace(5) %alloca, i32 0, i32 1
+  store <4 x i32> zeroinitializer, ptr addrspace(5) %alloca
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x <4 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %index
+  %load = load <4 x i32>, ptr addrspace(5) %gep2
+  store <4 x i32> %load, ptr addrspace(1) %out
   ret void
 }
 
@@ -106,21 +105,20 @@ entry:
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:28{{$}}
-define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
-  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
-  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %gep.index = getelementptr inbounds i32, ptr addrspace(1) %index.array, i64 %idxprom
+  %index.load = load i32, ptr addrspace(1) %gep.index
   %index = and i32 %index.load, 2
   %alloca = alloca [2 x <8 x i32>], align 32, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 1
-  store <8 x i32> zeroinitializer, <8 x i32> addrspace(5)* %gep0
-  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 %index
-  %load = load <8 x i32>, <8 x i32> addrspace(5)* %gep2
-  store <8 x i32> %load, <8 x i32> addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x <8 x i32>], ptr addrspace(5) %alloca, i32 0, i32 1
+  store <8 x i32> zeroinitializer, ptr addrspace(5) %alloca
+  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x <8 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %index
+  %load = load <8 x i32>, ptr addrspace(5) %gep2
+  store <8 x i32> %load, ptr addrspace(1) %out
   ret void
 }
 
@@ -144,21 +142,20 @@ entry:
 ; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}}
 ; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_i64(ptr addrspace(1) %out, ptr addrspace(1) %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
-  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
-  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %gep.index = getelementptr inbounds i32, ptr addrspace(1) %index.array, i64 %idxprom
+  %index.load = load i32, ptr addrspace(1) %gep.index
   %index = and i32 %index.load, 2
   %alloca = alloca [2 x i64], align 16, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(5)* %alloca, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(5)* %alloca, i32 0, i32 1
-  store i64 0, i64 addrspace(5)* %gep0
-  store i64 34359738602, i64 addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(5)* %alloca, i32 0, i32 %index
-  %load = load i64, i64 addrspace(5)* %gep2
-  store i64 %load, i64 addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x i64], ptr addrspace(5) %alloca, i32 0, i32 1
+  store i64 0, ptr addrspace(5) %alloca
+  store i64 34359738602, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x i64], ptr addrspace(5) %alloca, i32 0, i32 %index
+  %load = load i64, ptr addrspace(5) %gep2
+  store i64 %load, ptr addrspace(1) %out
   ret void
 }
 
@@ -181,21 +178,20 @@ entry:
 ; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}}
 ; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_f64(ptr addrspace(1) %out, ptr addrspace(1) %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
-  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
-  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %gep.index = getelementptr inbounds i32, ptr addrspace(1) %index.array, i64 %idxprom
+  %index.load = load i32, ptr addrspace(1) %gep.index
   %index = and i32 %index.load, 2
   %alloca = alloca [2 x double], align 16, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 1
-  store double 0.0, double addrspace(5)* %gep0
-  store double 4.0, double addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x double], [2 x double] addrspace(5)* %alloca, i32 0, i32 %index
-  %load = load double, double addrspace(5)* %gep2
-  store double %load, double addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x double], ptr addrspace(5) %alloca, i32 0, i32 1
+  store double 0.0, ptr addrspace(5) %alloca
+  store double 4.0, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x double], ptr addrspace(5) %alloca, i32 0, i32 %index
+  %load = load double, ptr addrspace(5) %gep2
+  store double %load, ptr addrspace(1) %out
   ret void
 }
 
@@ -230,21 +226,20 @@ entry:
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
-define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
-  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
-  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %gep.index = getelementptr inbounds i32, ptr addrspace(1) %index.array, i64 %idxprom
+  %index.load = load i32, ptr addrspace(1) %gep.index
   %index = and i32 %index.load, 2
   %alloca = alloca [2 x <2 x i64>], align 16, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 1
-  store <2 x i64> zeroinitializer, <2 x i64> addrspace(5)* %gep0
-  store <2 x i64> <i64 1, i64 2>, <2 x i64> addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>] addrspace(5)* %alloca, i32 0, i32 %index
-  %load = load <2 x i64>, <2 x i64> addrspace(5)* %gep2
-  store <2 x i64> %load, <2 x i64> addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x <2 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1
+  store <2 x i64> zeroinitializer, ptr addrspace(5) %alloca
+  store <2 x i64> <i64 1, i64 2>, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x <2 x i64>], ptr addrspace(5) %alloca, i32 0, i32 %index
+  %load = load <2 x i64>, ptr addrspace(5) %gep2
+  store <2 x i64> %load, ptr addrspace(1) %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index c0cb4701f29303..7a0c0ef2566efe 100644
--- 
a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -2,9 +2,9 @@ ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -atomic-expand < %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s -define i32 @load_atomic_private_seq_cst_i32(i32 addrspace(5)* %ptr) { +define i32 @load_atomic_private_seq_cst_i32(ptr addrspace(5) %ptr) { ; IR-LABEL: @load_atomic_private_seq_cst_i32( -; IR-NEXT: [[LOAD:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4 +; IR-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4 ; IR-NEXT: ret i32 [[LOAD]] ; ; GCN-LABEL: load_atomic_private_seq_cst_i32: @@ -13,13 +13,13 @@ define i32 @load_atomic_private_seq_cst_i32(i32 addrspace(5)* %ptr) { ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] - %load = load atomic i32, i32 addrspace(5)* %ptr seq_cst, align 4 + %load = load atomic i32, ptr addrspace(5) %ptr seq_cst, align 4 ret i32 %load } -define i64 @load_atomic_private_seq_cst_i64(i64 addrspace(5)* %ptr) { +define i64 @load_atomic_private_seq_cst_i64(ptr addrspace(5) %ptr) { ; IR-LABEL: @load_atomic_private_seq_cst_i64( -; IR-NEXT: [[LOAD:%.*]] = load i64, i64 addrspace(5)* [[PTR:%.*]], align 8 +; IR-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8 ; IR-NEXT: ret i64 [[LOAD]] ; ; GCN-LABEL: load_atomic_private_seq_cst_i64: @@ -30,13 +30,13 @@ define i64 @load_atomic_private_seq_cst_i64(i64 addrspace(5)* %ptr) { ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] - %load = load atomic i64, i64 addrspace(5)* %ptr seq_cst, align 8 + %load = load atomic i64, ptr addrspace(5) %ptr seq_cst, align 8 ret i64 %load } -define void @atomic_store_seq_cst_i32(i32 addrspace(5)* %ptr, i32 %val) { +define void @atomic_store_seq_cst_i32(ptr addrspace(5) %ptr, i32 %val) { ; IR-LABEL: @atomic_store_seq_cst_i32( -; IR-NEXT: store i32 [[VAL:%.*]], i32 addrspace(5)* [[PTR:%.*]], align 4 +; IR-NEXT: store i32 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 4 ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_store_seq_cst_i32: @@ -45,13 +45,13 @@ define void @atomic_store_seq_cst_i32(i32 addrspace(5)* %ptr, i32 %val) { ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] - store atomic i32 %val, i32 addrspace(5)* %ptr seq_cst, align 4 + store atomic i32 %val, ptr addrspace(5) %ptr seq_cst, align 4 ret void } -define void @atomic_store_seq_cst_i64(i64 addrspace(5)* %ptr, i64 %val) { +define void @atomic_store_seq_cst_i64(ptr addrspace(5) %ptr, i64 %val) { ; IR-LABEL: @atomic_store_seq_cst_i64( -; IR-NEXT: store i64 [[VAL:%.*]], i64 addrspace(5)* [[PTR:%.*]], align 8 +; IR-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 8 ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_store_seq_cst_i64: @@ -62,13 +62,13 @@ define void @atomic_store_seq_cst_i64(i64 addrspace(5)* %ptr, i64 %val) { ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] - store atomic i64 %val, i64 addrspace(5)* %ptr seq_cst, align 8 + store atomic i64 %val, ptr addrspace(5) %ptr seq_cst, align 8 ret void } -define i32 @load_atomic_private_seq_cst_syncscope_i32(i32 addrspace(5)* %ptr) { +define i32 @load_atomic_private_seq_cst_syncscope_i32(ptr addrspace(5) %ptr) { ; IR-LABEL: 
-; IR-NEXT: [[LOAD:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: ret i32 [[LOAD]]
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_syncscope_i32:
@@ -77,13 +77,13 @@ define i32 @load_atomic_private_seq_cst_syncscope_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %load = load atomic i32, i32 addrspace(5)* %ptr syncscope("agent") seq_cst, align 4
+ %load = load atomic i32, ptr addrspace(5) %ptr syncscope("agent") seq_cst, align 4
 ret i32 %load
 }

-define void @atomic_store_seq_cst_syncscope_i32(i32 addrspace(5)* %ptr, i32 %val) {
+define void @atomic_store_seq_cst_syncscope_i32(ptr addrspace(5) %ptr, i32 %val) {
 ; IR-LABEL: @atomic_store_seq_cst_syncscope_i32(
-; IR-NEXT: store i32 [[VAL:%.*]], i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: store i32 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: ret void
 ;
 ; GCN-LABEL: atomic_store_seq_cst_syncscope_i32:
@@ -92,21 +92,21 @@ define void @atomic_store_seq_cst_syncscope_i32(i32 addrspace(5)* %ptr, i32 %val
 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- store atomic i32 %val, i32 addrspace(5)* %ptr syncscope("agent") seq_cst, align 4
+ store atomic i32 %val, ptr addrspace(5) %ptr syncscope("agent") seq_cst, align 4
 ret void
 }

-define i32 @cmpxchg_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @cmpxchg_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
 ; IR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 1, i32 [[TMP1]]
-; IR-NEXT: store i32 [[TMP3]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: [[TMP4:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
 ; IR-NEXT: [[TMP5:%.*]] = insertvalue { i32, i1 } [[TMP4]], i1 [[TMP2]], 1
 ; IR-NEXT: [[RESULT_0:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
 ; IR-NEXT: [[RESULT_1:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; IR-NEXT: store i1 [[RESULT_1]], i1 addrspace(1)* poison, align 1
+; IR-NEXT: store i1 [[RESULT_1]], ptr addrspace(1) poison, align 1
 ; IR-NEXT: ret i32 [[RESULT_0]]
 ;
 ; GCN-LABEL: cmpxchg_private_i32:
@@ -125,24 +125,24 @@ define i32 @cmpxchg_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = cmpxchg i32 addrspace(5)* %ptr, i32 0, i32 1 acq_rel monotonic
+ %result = cmpxchg ptr addrspace(5) %ptr, i32 0, i32 1 acq_rel monotonic
 %result.0 = extractvalue { i32, i1 } %result, 0
 %result.1 = extractvalue { i32, i1 } %result, 1
- store i1 %result.1, i1 addrspace(1)* poison
+ store i1 %result.1, ptr addrspace(1) poison
 ret i32 %result.0
 }

-define i64 @cmpxchg_private_i64(i64 addrspace(5)* %ptr) {
+define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @cmpxchg_private_i64(
-; IR-NEXT: [[TMP1:%.*]] = load i64, i64 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
 ; IR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 1, i64 [[TMP1]]
-; IR-NEXT: store i64 [[TMP3]], i64 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: [[TMP4:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP1]], 0
 ; IR-NEXT: [[TMP5:%.*]] = insertvalue { i64, i1 } [[TMP4]], i1 [[TMP2]], 1
 ; IR-NEXT: [[RESULT_0:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
 ; IR-NEXT: [[RESULT_1:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; IR-NEXT: store i1 [[RESULT_1]], i1 addrspace(1)* poison, align 1
+; IR-NEXT: store i1 [[RESULT_1]], ptr addrspace(1) poison, align 1
 ; IR-NEXT: ret i64 [[RESULT_0]]
 ;
 ; GCN-LABEL: cmpxchg_private_i64:
@@ -165,18 +165,18 @@ define i64 @cmpxchg_private_i64(i64 addrspace(5)* %ptr) {
 ; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = cmpxchg i64 addrspace(5)* %ptr, i64 0, i64 1 acq_rel monotonic
+ %result = cmpxchg ptr addrspace(5) %ptr, i64 0, i64 1 acq_rel monotonic
 %result.0 = extractvalue { i64, i1 } %result, 0
 %result.1 = extractvalue { i64, i1 } %result, 1
- store i1 %result.1, i1 addrspace(1)* poison
+ store i1 %result.1, ptr addrspace(1) poison
 ret i64 %result.0
 }

-define i32 @atomicrmw_xchg_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_xchg_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_xchg_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
-; IR-NEXT: store i32 4, i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-NEXT: store i32 4, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_xchg_private_i32:
@@ -189,15 +189,15 @@ define i32 @atomicrmw_xchg_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw xchg i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw xchg ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_add_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_add_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_add_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = add i32 [[TMP1]], 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_add_private_i32:
@@ -210,15 +210,15 @@ define i32 @atomicrmw_add_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw add i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw add ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_sub_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_sub_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_sub_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = sub i32 [[TMP1]], 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_sub_private_i32:
@@ -231,15 +231,15 @@ define i32 @atomicrmw_sub_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw sub i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw sub ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_and_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_and_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_and_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = and i32 [[TMP1]], 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_and_private_i32:
@@ -252,16 +252,16 @@ define i32 @atomicrmw_and_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw and i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw and ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_nand_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_nand_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 4
 ; IR-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_nand_private_i32:
@@ -275,15 +275,15 @@ define i32 @atomicrmw_nand_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw nand i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw nand ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_or_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_or_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_or_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = or i32 [[TMP1]], 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_or_private_i32:
@@ -296,15 +296,15 @@ define i32 @atomicrmw_or_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw or i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw or ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_xor_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_xor_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_xor_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = xor i32 [[TMP1]], 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_xor_private_i32:
@@ -317,16 +317,16 @@ define i32 @atomicrmw_xor_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw xor i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw xor ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_max_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_max_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_max_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP1]], 4
 ; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_max_private_i32:
@@ -339,16 +339,16 @@ define i32 @atomicrmw_max_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw max i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw max ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_min_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_min_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_min_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp sle i32 [[TMP1]], 4
 ; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_min_private_i32:
@@ -361,16 +361,16 @@ define i32 @atomicrmw_min_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw min i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw min ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_umax_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_umax_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_umax_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 4
 ; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_umax_private_i32:
@@ -383,16 +383,16 @@ define i32 @atomicrmw_umax_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw umax i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw umax ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define i32 @atomicrmw_umin_private_i32(i32 addrspace(5)* %ptr) {
+define i32 @atomicrmw_umin_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_umin_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[TMP2:%.*]] = icmp ule i32 [[TMP1]], 4
 ; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret i32 [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_umin_private_i32:
@@ -405,15 +405,15 @@ define i32 @atomicrmw_umin_private_i32(i32 addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw umin i32 addrspace(5)* %ptr, i32 4 seq_cst
+ %result = atomicrmw umin ptr addrspace(5) %ptr, i32 4 seq_cst
 ret i32 %result
 }

-define float @atomicrmw_fadd_private_i32(float addrspace(5)* %ptr) {
+define float @atomicrmw_fadd_private_i32(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @atomicrmw_fadd_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load float, float addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = fadd float [[TMP1]], 2.000000e+00
-; IR-NEXT: store float [[NEW]], float addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store float [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret float [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_fadd_private_i32:
@@ -426,15 +426,15 @@ define float @atomicrmw_fadd_private_i32(float addrspace(5)* %ptr) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd float addrspace(5)* %ptr, float 2.0 seq_cst
+ %result = atomicrmw fadd ptr addrspace(5) %ptr, float 2.0 seq_cst
 ret float %result
 }

-define float @atomicrmw_fsub_private_i32(float addrspace(5)* %ptr, float %val) {
+define float @atomicrmw_fsub_private_i32(ptr addrspace(5) %ptr, float %val) {
 ; IR-LABEL: @atomicrmw_fsub_private_i32(
-; IR-NEXT: [[TMP1:%.*]] = load float, float addrspace(5)* [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR:%.*]], align 4
 ; IR-NEXT: [[NEW:%.*]] = fsub float [[TMP1]], [[VAL:%.*]]
-; IR-NEXT: store float [[NEW]], float addrspace(5)* [[PTR]], align 4
+; IR-NEXT: store float [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT: ret float [[TMP1]]
 ;
 ; GCN-LABEL: atomicrmw_fsub_private_i32:
@@ -447,23 +447,22 @@ define float @atomicrmw_fsub_private_i32(float addrspace(5)* %ptr, float %val) {
 ; GCN-NEXT: v_mov_b32_e32 v0, v2
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fsub float addrspace(5)* %ptr, float %val seq_cst
+ %result = atomicrmw fsub ptr addrspace(5) %ptr, float %val seq_cst
 ret float %result
 }

-define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addrspace(1) %out, i32 %in) nounwind {
 ; IR-LABEL: @alloca_promote_atomicrmw_private_lds_promote(
 ; IR-NEXT: entry:
 ; IR-NEXT: [[TMP:%.*]] = alloca [2 x i32], align 4, addrspace(5)
-; IR-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 0
-; IR-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 1
-; IR-NEXT: store i32 0, i32 addrspace(5)* [[GEP1]], align 4
-; IR-NEXT: store i32 1, i32 addrspace(5)* [[GEP2]], align 4
-; IR-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 [[IN:%.*]]
-; IR-NEXT: [[TMP0:%.*]] = load i32, i32 addrspace(5)* [[GEP3]], align 4
+; IR-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 1
+; IR-NEXT: store i32 0, ptr addrspace(5) [[TMP]], align 4
+; IR-NEXT: store i32 1, ptr addrspace(5) [[GEP2]], align 4
+; IR-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[GEP3]], align 4
 ; IR-NEXT: [[NEW:%.*]] = add i32 [[TMP0]], 7
-; IR-NEXT: store i32 [[NEW]], i32 addrspace(5)* [[GEP3]], align 4
-; IR-NEXT: store i32 [[TMP0]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[GEP3]], align 4
+; IR-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT: ret void
 ;
 ; GCN-LABEL: alloca_promote_atomicrmw_private_lds_promote:
@@ -480,33 +479,31 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(i32 addr
 ; GCN-NEXT: s_endpgm
 entry:
 %tmp = alloca [2 x i32], addrspace(5)
- %gep1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %gep1
- store i32 1, i32 addrspace(5)* %gep2
- %gep3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
- %rmw = atomicrmw add i32 addrspace(5)* %gep3, i32 7 acq_rel
- store i32 %rmw, i32 addrspace(1)* %out
+ %gep2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %gep2
+ %gep3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+ %rmw = atomicrmw add ptr addrspace(5) %gep3, i32 7 acq_rel
+ store i32 %rmw, ptr addrspace(1) %out
 ret void
 }

-define amdgpu_kernel void @alloca_promote_cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out, i32 %in) nounwind {
 ; IR-LABEL: @alloca_promote_cmpxchg_private(
 ; IR-NEXT: entry:
 ; IR-NEXT: [[TMP:%.*]] = alloca [2 x i32], align 4, addrspace(5)
-; IR-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 0
-; IR-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 1
-; IR-NEXT: store i32 0, i32 addrspace(5)* [[GEP1]], align 4
-; IR-NEXT: store i32 1, i32 addrspace(5)* [[GEP2]], align 4
-; IR-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* [[TMP]], i32 0, i32 [[IN:%.*]]
-; IR-NEXT: [[TMP0:%.*]] = load i32, i32 addrspace(5)* [[GEP3]], align 4
+; IR-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 1
+; IR-NEXT: store i32 0, ptr addrspace(5) [[TMP]], align 4
+; IR-NEXT: store i32 1, ptr addrspace(5) [[GEP2]], align 4
+; IR-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[GEP3]], align 4
 ; IR-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
 ; IR-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 1, i32 [[TMP0]]
-; IR-NEXT: store i32 [[TMP2]], i32 addrspace(5)* [[GEP3]], align 4
+; IR-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[GEP3]], align 4
 ; IR-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP0]], 0
 ; IR-NEXT: [[TMP4:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP1]], 1
 ; IR-NEXT: [[VAL:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; IR-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT: ret void
 ;
 ; GCN-LABEL: alloca_promote_cmpxchg_private:
@@ -523,13 +520,12 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(i32 addrspace(1)* %out
 ; GCN-NEXT: s_endpgm
 entry:
 %tmp = alloca [2 x i32], addrspace(5)
- %gep1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %gep1
- store i32 1, i32 addrspace(5)* %gep2
- %gep3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
- %xchg = cmpxchg i32 addrspace(5)* %gep3, i32 0, i32 1 acq_rel monotonic
+ %gep2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %gep2
+ %gep3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+ %xchg = cmpxchg ptr addrspace(5) %gep3, i32 0, i32 1 acq_rel monotonic
 %val = extractvalue { i32, i1 } %xchg, 0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
 ret void
 }
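All of the IR checks in private-memory-atomics.ll follow from one fact: address space 5 is per-lane scratch that no other thread can observe, so -atomic-expand is free to lower every atomic into ordinary memory operations. As a sketch of the shape those checks verify (illustrative values, not a check line from the patch), an atomicrmw add on a private pointer:

  %old = atomicrmw add ptr addrspace(5) %p, i32 4 seq_cst

is rewritten by the pass into the non-atomic sequence:

  %old = load i32, ptr addrspace(5) %p, align 4
  %new = add i32 %old, 4
  store i32 %new, ptr addrspace(5) %p, align 4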
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 7cc2b8214a36d0..1556ae82910d94 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -102,7 +102,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT: .end_amdhsa_kernel
 ; GFX9-NEXT: .text
 %alloca.align = alloca i32, align 128, addrspace(5)
- store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+ store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
 ret void
 }
@@ -205,7 +205,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT: .end_amdhsa_kernel
 ; GFX9-NEXT: .text
 %alloca.align = alloca i32, align 4, addrspace(5)
- store volatile i32 9, i32 addrspace(5)* %alloca.align, align 4
+ store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
 ret void
 }
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT: .end_amdhsa_kernel
 ; GFX9-NEXT: .text
 %alloca.align = alloca i32, align 4, addrspace(5)
- store volatile i32 9, i32 addrspace(5)* %alloca.align, align 4
+ store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
 ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 37df92e9700a85..b2c4444d7f841a 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -26,8 +26,8 @@
 ; GCN: ; ScratchSize: 144
 define void @needs_align16_default_stack_align(i32 %idx) #0 {
 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
- %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
- store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
 ret void
 }

@@ -47,8 +47,8 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 {
 ; GCN: ; ScratchSize: 160
 define void @needs_align16_stack_align4(i32 %idx) #2 {
 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
- %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
- store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
 ret void
 }

@@ -68,8 +68,8 @@ define void @needs_align16_stack_align4(i32 %idx) #2 {
 ; GCN: ; ScratchSize: 192
 define void @needs_align32(i32 %idx) #0 {
 %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
- %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
- store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32
 ret void
 }

@@ -84,8 +84,8 @@ define void @needs_align32(i32 %idx) #0 {
 ; GCN: ; ScratchSize: 52
 define void @force_realign4(i32 %idx) #1 {
 %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
- store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+ %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
+ store volatile i32 3, ptr addrspace(5) %gep0, align 4
 ret void
 }

@@ -95,7 +95,7 @@ define void @force_realign4(i32 %idx) #1 {
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
 %alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 2, i32 addrspace(5)* %alloca
+ store volatile i32 2, ptr addrspace(5) %alloca
 call void @needs_align16_default_stack_align(i32 1)
 ret void
 }
@@ -106,7 +106,7 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_5() {
 %alloca0 = alloca i8, align 1, addrspace(5)
- store volatile i8 2, i8 addrspace(5)* %alloca0
+ store volatile i8 2, ptr addrspace(5) %alloca0
 call void @needs_align16_default_stack_align(i32 1)
 ret void

@@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_call_align16_from_5() {
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align4_from_5() {
 %alloca0 = alloca i8, align 1, addrspace(5)
- store volatile i8 2, i8 addrspace(5)* %alloca0
+ store volatile i8 2, ptr addrspace(5) %alloca0
 call void @needs_align16_stack_align4(i32 1)
 ret void

@@ -134,7 +134,7 @@ define amdgpu_kernel void @kernel_call_align4_from_5() {
 ; GCN: s_mov_b32 s33, [[FP_COPY]]
 define void @default_realign_align128(i32 %idx) #0 {
 %alloca.align = alloca i32, align 128, addrspace(5)
- store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+ store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
 ret void
 }

@@ -144,7 +144,7 @@ define void @default_realign_align128(i32 %idx) #0 {
 ; GCN-NOT: s32
 define void @disable_realign_align128(i32 %idx) #3 {
 %alloca.align = alloca i32, align 128, addrspace(5)
- store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
+ store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
 ret void
 }

@@ -181,13 +181,13 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN: s_setpc_b64 s[30:31]
 %temp = alloca i32, align 1024, addrspace(5)
- store volatile i32 0, i32 addrspace(5)* %temp, align 1024
+ store volatile i32 0, ptr addrspace(5) %temp, align 1024
 call void @extern_func(<32 x i32> %a, i32 %b)
 ret void
 }

 %struct.Data = type { [9 x i32] }
-define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 {
+define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 {
 ; The local object allocation needed an alignment of 1024.
 ; Since the function argument is accessed in a loop with an
 ; index variable, the base pointer first get loaded into a VGPR
@@ -212,7 +212,7 @@ define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 begin:
 %local_var = alloca i32, align 1024, addrspace(5)
- store volatile i32 0, i32 addrspace(5)* %local_var, align 1024
+ store volatile i32 0, ptr addrspace(5) %local_var, align 1024
 br label %loop_body

 loop_end: ; preds = %loop_body
@@ -222,8 +222,8 @@ loop_end: ; preds = %loop_body
 loop_body: ; preds = %loop_end, %begin
 %lp_idx = phi i32 [ 0, %begin ], [ %idx_next, %loop_end ]
- %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %lp_idx
- %val = load i32, i32 addrspace(5)* %ptr, align 8
+ %ptr = getelementptr inbounds %struct.Data, ptr addrspace(5) %arg, i32 0, i32 0, i32 %lp_idx
+ %val = load i32, ptr addrspace(5) %ptr, align 8
 %lp_cond = icmp eq i32 %val, %lp_idx
 br i1 %lp_cond, label %loop_end, label %exit

@@ -245,7 +245,7 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: ;;#ASMEND
 ; GCN: s_setpc_b64 s[30:31]
 %local_val = alloca i32, align 128, addrspace(5)
- store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
+ store volatile i32 %b, ptr addrspace(5) %local_val, align 128
 ; Use all clobberable registers, so BP has to spill to a VGPR.
 call void asm sideeffect "",
 "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -265,7 +265,7 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
 ; GCN: v_mov_b32_e32 v0, s34
 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32
 %local_val = alloca i32, align 128, addrspace(5)
- store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
+ store volatile i32 %b, ptr addrspace(5) %local_val, align 128
 call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
 "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -287,7 +287,7 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
 ret void
 }

-define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, [4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #5 {
+define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 {
 ; If the size of the offset exceeds the MUBUF offset field we need another
 ; scratch VGPR to hold the offset.
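Every function in stack-realign.ll exercises the same trigger in a different shape: an alloca whose alignment exceeds the function's incoming stack alignment, which forces the backend to set up a frame pointer (and in the harder cases a base pointer) and realign the stack. A minimal reproducer in the same style (an illustrative function, not one of the tests):

define void @realign_sketch() {
  ; align 128 exceeds the default incoming stack alignment, so the
  ; prologue must realign the stack before this store
  %v = alloca i32, align 128, addrspace(5)
  store volatile i32 9, ptr addrspace(5) %v, align 128
  ret void
}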
@@ -306,7 +306,7 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
 %local_val = alloca i32, align 128, addrspace(5)
- store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
+ store volatile i32 %b, ptr addrspace(5) %local_val, align 128
 call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
 "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
index 9060fcd1ecc02e..6969811f672d31 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
@@ -1,15 +1,14 @@
 ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 ; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s

-declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i32, i1) #1
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1

 ; ERROR: error: <unknown>:0:0: stack frame size (131061) exceeds limit (131056) in function 'stack_size_limit_wave64'
 ; GCN: ; ScratchSize: 131061
 define amdgpu_kernel void @stack_size_limit_wave64() #0 {
 entry:
 %alloca = alloca [131057 x i8], align 1, addrspace(5)
- %alloca.bc = bitcast [131057 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
- call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 131057, i32 1, i1 true)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 9, i32 131057, i32 1, i1 true)
 ret void
 }

@@ -18,8 +17,7 @@ entry:
 define amdgpu_kernel void @stack_size_limit_wave32() #1 {
 entry:
 %alloca = alloca [262113 x i8], align 1, addrspace(5)
- %alloca.bc = bitcast [262113 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
- call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 262113, i32 1, i1 true)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 9, i32 262113, i32 1, i1 true)
 ret void
 }

@@ -28,8 +26,7 @@ entry:
 define amdgpu_kernel void @max_stack_size_wave64() #0 {
 entry:
 %alloca = alloca [131052 x i8], align 1, addrspace(5)
- %alloca.bc = bitcast [131052 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
- call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 131052, i32 1, i1 true)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 9, i32 131052, i32 1, i1 true)
 ret void
 }

@@ -38,8 +35,7 @@ entry:
 define amdgpu_kernel void @max_stack_size_wave32() #1 {
 entry:
 %alloca = alloca [262108 x i8], align 1, addrspace(5)
- %alloca.bc = bitcast [262108 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
- call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 262108, i32 1, i1 true)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 9, i32 262108, i32 1, i1 true)
 ret void
 }
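The stack-size-overflow.ll hunks also show the intrinsic-mangling side of the migration: with opaque pointers the mangled name encodes only the pointer's address space, so llvm.memset.p5i8.i32 becomes llvm.memset.p5.i32 and the bitcast of the alloca to i8 addrspace(5)* disappears entirely. A minimal sketch of the new form, reusing the five-operand declaration style this test keeps (the buffer size is illustrative):

declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1)

define amdgpu_kernel void @memset_sketch() {
  %buf = alloca [16 x i8], align 1, addrspace(5)
  ; the alloca feeds the intrinsic directly; no i8 bitcast is needed
  call void @llvm.memset.p5.i32(ptr addrspace(5) %buf, i8 9, i32 16, i32 1, i1 true)
  ret void
}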