205 changes: 99 additions & 106 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll
@@ -6,35 +6,34 @@ declare void @function1()
declare void @function2() #0

; Function Attrs: noinline
-define void @function3(i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  store i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink, align 8
+define void @function3(ptr addrspace(4) %argptr, ptr addrspace(1) %sink) #2 {
+  store ptr addrspace(4) %argptr, ptr addrspace(1) %sink, align 8
ret void
}

; Function Attrs: noinline
-define void @function4(i64 %arg, i64* %a) #2 {
-  store i64 %arg, i64* %a
+define void @function4(i64 %arg, ptr %a) #2 {
+  store i64 %arg, ptr %a
ret void
}

; Function Attrs: noinline
-define void @function5(i8 addrspace(4)* %ptr, i64* %sink) #2 {
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 168
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %sink
+define void @function5(ptr addrspace(4) %ptr, ptr %sink) #2 {
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 168
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %sink
ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1

; CHECK: amdhsa.kernels:
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel10
-define amdgpu_kernel void @test_kernel10(i8* %a) {
-  store i8 3, i8* %a, align 1
+define amdgpu_kernel void @test_kernel10(ptr %a) {
+  store i8 3, ptr %a, align 1
ret void
}

@@ -43,9 +42,9 @@ define amdgpu_kernel void @test_kernel10(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel20
-define amdgpu_kernel void @test_kernel20(i8* %a) {
+define amdgpu_kernel void @test_kernel20(ptr %a) {
  call void @function1()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
ret void
}

@@ -54,9 +53,9 @@ define amdgpu_kernel void @test_kernel20(i8* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel21
-define amdgpu_kernel void @test_kernel21(i8* %a) #0 {
+define amdgpu_kernel void @test_kernel21(ptr %a) #0 {
  call void @function1()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
ret void
}

@@ -65,9 +64,9 @@ define amdgpu_kernel void @test_kernel21(i8* %a) #0 {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel22
-define amdgpu_kernel void @test_kernel22(i8* %a) {
+define amdgpu_kernel void @test_kernel22(ptr %a) {
  call void @function2()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
ret void
}

@@ -76,12 +75,11 @@ define amdgpu_kernel void @test_kernel22(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel30
-define amdgpu_kernel void @test_kernel30(i128* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192
-  %cast = bitcast i8 addrspace(4)* %gep to i128 addrspace(4)*
-  %x = load i128, i128 addrspace(4)* %cast
-  store i128 %x, i128* %a
+define amdgpu_kernel void @test_kernel30(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 192
+  %x = load i128, ptr addrspace(4) %gep
+  store i128 %x, ptr %a
ret void
}

@@ -90,12 +88,11 @@ define amdgpu_kernel void @test_kernel30(i128* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel40
-define amdgpu_kernel void @test_kernel40(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel40(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 200
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
ret void
}

@@ -104,12 +101,11 @@ define amdgpu_kernel void @test_kernel40(i64* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel41
-define amdgpu_kernel void @test_kernel41(i64* %a) #0 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel41(ptr %a) #0 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 200
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
ret void
}

@@ -118,12 +114,11 @@ define amdgpu_kernel void @test_kernel41(i64* %a) #0 {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel42
-define amdgpu_kernel void @test_kernel42(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel42(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 192
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
ret void
}

@@ -132,12 +127,11 @@ define amdgpu_kernel void @test_kernel42(i64* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel43
-define amdgpu_kernel void @test_kernel43(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel43(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 208
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
ret void
}

@@ -146,11 +140,11 @@ define amdgpu_kernel void @test_kernel43(i64* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel44
-define amdgpu_kernel void @test_kernel44(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 199
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel44(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 199
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -159,11 +153,11 @@ define amdgpu_kernel void @test_kernel44(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel45
-define amdgpu_kernel void @test_kernel45(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel45(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 200
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -172,11 +166,11 @@ define amdgpu_kernel void @test_kernel45(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel46
-define amdgpu_kernel void @test_kernel46(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 207
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel46(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 207
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -185,11 +179,11 @@ define amdgpu_kernel void @test_kernel46(i8* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel47
-define amdgpu_kernel void @test_kernel47(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel47(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 208
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -198,11 +192,11 @@ define amdgpu_kernel void @test_kernel47(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel50
-define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 %b
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel50(ptr %a, i32 %b) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 %b
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -211,12 +205,12 @@ define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel51
-define amdgpu_kernel void @test_kernel51(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
-  %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 184
-  %x = load i8, i8 addrspace(4)* %gep2, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel51(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep1 = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 16
+  %gep2 = getelementptr inbounds i8, ptr addrspace(4) %gep1, i64 184
+  %x = load i8, ptr addrspace(4) %gep2, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -225,12 +219,12 @@ define amdgpu_kernel void @test_kernel51(i8* %a) {
; CHECK: - .args:
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel52
-define amdgpu_kernel void @test_kernel52(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
-  %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 16
-  %x = load i8, i8 addrspace(4)* %gep2, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel52(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep1 = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 16
+  %gep2 = getelementptr inbounds i8, ptr addrspace(4) %gep1, i64 16
+  %x = load i8, ptr addrspace(4) %gep2, align 1
+  store i8 %x, ptr %a, align 1
ret void
}

@@ -239,12 +233,11 @@ define amdgpu_kernel void @test_kernel52(i8* %a) {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel60
-define amdgpu_kernel void @test_kernel60(i64* %a) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  call void @function4(i64 %x, i64* %a)
+define amdgpu_kernel void @test_kernel60(ptr %a) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 200
+  %x = load i64, ptr addrspace(4) %gep
+  call void @function4(i64 %x, ptr %a)
ret void
}

@@ -253,10 +246,10 @@ define amdgpu_kernel void @test_kernel60(i64* %a) #2 {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel61
-define amdgpu_kernel void @test_kernel61(i64* %a) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 32
-  call void @function5(i8 addrspace(4)* %gep, i64* %a)
+define amdgpu_kernel void @test_kernel61(ptr %a) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 32
+  call void @function5(ptr addrspace(4) %gep, ptr %a)
ret void
}

@@ -265,10 +258,10 @@ define amdgpu_kernel void @test_kernel61(i64* %a) #2 {
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel70
-define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink, align 8
+define amdgpu_kernel void @test_kernel70(ptr addrspace(1) %sink) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  store ptr addrspace(4) %gep, ptr addrspace(1) %sink, align 8
ret void
}

@@ -277,10 +270,10 @@ define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* %sink) #
; CHECK: - .args:
; CHECK: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel71
-define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  call void @function3(i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink)
+define amdgpu_kernel void @test_kernel71(ptr addrspace(1) %sink) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  call void @function3(ptr addrspace(4) %gep, ptr addrspace(1) %sink)
ret void
}

@@ -290,9 +283,9 @@ define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #
; CHECK-NOT: hidden_queue_ptr
; CHECK-LABEL: .name: test_kernel72
define amdgpu_kernel void @test_kernel72() #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* undef, align 8
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  store ptr addrspace(4) %gep, ptr addrspace(1) undef, align 8
ret void
}

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
@@ -2,7 +2,7 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -S -opaque-pointers -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_x_opaque_pointer(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_x_opaque_pointer(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_local_size_x_opaque_pointer(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
@@ -17,12 +17,12 @@ define amdgpu_kernel void @get_local_size_x_opaque_pointer(i16 addrspace(1)* %ou
%local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
%gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
%local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  store i16 %local.size, ptr addrspace(1) %out
ret void
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_y_opaque_pointer(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_y_opaque_pointer(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_local_size_y_opaque_pointer(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
@@ -38,12 +38,12 @@ define amdgpu_kernel void @get_local_size_y_opaque_pointer(i16 addrspace(1)* %ou
%local.size.offset = select i1 %cmp.id.count, i64 14, i64 20
%gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
%local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  store i16 %local.size, ptr addrspace(1) %out
ret void
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_z_opaque_pointer(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_z_opaque_pointer(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_local_size_z_opaque_pointer(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
@@ -59,7 +59,7 @@ define amdgpu_kernel void @get_local_size_z_opaque_pointer(i16 addrspace(1)* %ou
%local.size.offset = select i1 %cmp.id.count, i64 16, i64 22
%gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
%local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  store i16 %local.size, ptr addrspace(1) %out
ret void
}

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -7,14 +7,14 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1

; There should be no spill code inserted between the xor and the real terminator
-define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-LABEL: name: extract_w_offset_vgpr
; GCN: bb.0.entry:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
-; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
+; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440
@@ -106,6 +106,6 @@ entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, 1
%value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
ret void
}
108 changes: 54 additions & 54 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll

Large diffs are not rendered by default.

100 changes: 50 additions & 50 deletions llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -11,8 +11,8 @@
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load i32, i32 addrspace(4)* %in
-  store i32 %ld, i32 addrspace(1)* %out
+  %ld = load i32, ptr addrspace(4) %in
+  store i32 %ld, ptr addrspace(1) %out
ret void
}

@@ -22,8 +22,8 @@ entry:
; EG: VTX_READ_64
define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
-  store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+  %ld = load <2 x i32>, ptr addrspace(4) %in
+  store <2 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -33,8 +33,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <3 x i32>, <3 x i32> addrspace(4)* %in
-  store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+  %ld = load <3 x i32>, ptr addrspace(4) %in
+  store <3 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -44,8 +44,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
-  store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+  %ld = load <4 x i32>, ptr addrspace(4) %in
+  store <4 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -56,8 +56,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
-  store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+  %ld = load <8 x i32>, ptr addrspace(4) %in
+  store <8 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -70,8 +70,8 @@ entry:
; EG: VTX_READ_32
define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <9 x i32>, <9 x i32> addrspace(4)* %in
-  store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+  %ld = load <9 x i32>, ptr addrspace(4) %in
+  store <9 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -84,8 +84,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <10 x i32>, <10 x i32> addrspace(4)* %in
-  store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+  %ld = load <10 x i32>, ptr addrspace(4) %in
+  store <10 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -98,8 +98,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <11 x i32>, <11 x i32> addrspace(4)* %in
-  store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+  %ld = load <11 x i32>, ptr addrspace(4) %in
+  store <11 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -112,8 +112,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <12 x i32>, <12 x i32> addrspace(4)* %in
-  store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+  %ld = load <12 x i32>, ptr addrspace(4) %in
+  store <12 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -126,8 +126,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
-  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
-  store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+  %ld = load <16 x i32>, ptr addrspace(4) %in
+  store <16 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -140,9 +140,9 @@ entry:
; EG: CF_END
; EG: VTX_READ_32
define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load i32, i32 addrspace(4)* %in
+  %ld = load i32, ptr addrspace(4) %in
  %ext = zext i32 %ld to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
ret void
}

@@ -157,19 +157,19 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load i32, i32 addrspace(4)* %in
+  %ld = load i32, ptr addrspace(4) %in
  %ext = sext i32 %ld to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
; GCN: s_load_dword
; GCN: store_dwordx2
define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
+  %ld = load <1 x i32>, ptr addrspace(4) %in
  %ext = zext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -178,19 +178,19 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
; GCN: store_dwordx2
define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
+  %ld = load <1 x i32>, ptr addrspace(4) %in
  %ext = sext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: store_dwordx4
define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
+  %ld = load <2 x i32>, ptr addrspace(4) %in
  %ext = zext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -202,9 +202,9 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou

; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
+  %ld = load <2 x i32>, ptr addrspace(4) %in
  %ext = sext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -214,9 +214,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %ld = load <4 x i32>, ptr addrspace(4) %in
  %ext = zext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -231,9 +231,9 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %ld = load <4 x i32>, ptr addrspace(4) %in
  %ext = sext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -250,9 +250,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GCN-SA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
+  %ld = load <8 x i32>, ptr addrspace(4) %in
  %ext = zext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -278,9 +278,9 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
+  %ld = load <8 x i32>, ptr addrspace(4) %in
  %ext = sext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -299,9 +299,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
+  %ld = load <16 x i32>, ptr addrspace(4) %in
  %ext = sext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -326,9 +326,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
+  %ld = load <16 x i32>, ptr addrspace(4) %in
  %ext = zext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -378,9 +378,9 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4

define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
+  %ld = load <32 x i32>, ptr addrspace(4) %in
  %ext = sext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -429,9 +429,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
+  %ld = load <32 x i32>, ptr addrspace(4) %in
  %ext = zext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -481,8 +481,8 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
-  store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
+  %ld = load <32 x i32>, ptr addrspace(4) %in
+  store <32 x i32> %ld, ptr addrspace(1) %out
ret void
}

24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -12,8 +12,8 @@
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load float, float addrspace(1)* %in
-  store float %tmp0, float addrspace(1)* %out
+  %tmp0 = load float, ptr addrspace(1) %in
+  store float %tmp0, ptr addrspace(1) %out
ret void
}

@@ -24,8 +24,8 @@ entry:
; R600: VTX_READ_64
define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
-  store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+  %tmp0 = load <2 x float>, ptr addrspace(1) %in
+  store <2 x float> %tmp0, ptr addrspace(1) %out
ret void
}

@@ -37,8 +37,8 @@ entry:
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
-  store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
+  %tmp0 = load <3 x float>, ptr addrspace(1) %in
+  store <3 x float> %tmp0, ptr addrspace(1) %out
ret void
}

@@ -49,8 +49,8 @@ entry:
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
-  store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
+  %tmp0 = load <4 x float>, ptr addrspace(1) %in
+  store <4 x float> %tmp0, ptr addrspace(1) %out
ret void
}

@@ -64,8 +64,8 @@ entry:
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
-  store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
+  %tmp0 = load <8 x float>, ptr addrspace(1) %in
+  store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}

@@ -162,8 +162,8 @@ entry:
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
-  store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
+  %tmp0 = load <16 x float>, ptr addrspace(1) %in
+  store <16 x float> %tmp0, ptr addrspace(1) %out
ret void
}

100 changes: 50 additions & 50 deletions llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -12,8 +12,8 @@
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load i32, i32 addrspace(1)* %in
-  store i32 %ld, i32 addrspace(1)* %out
+  %ld = load i32, ptr addrspace(1) %in
+  store i32 %ld, ptr addrspace(1) %out
ret void
}

@@ -24,8 +24,8 @@ entry:
; EG: VTX_READ_64
define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
-  store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+  %ld = load <2 x i32>, ptr addrspace(1) %in
+  store <2 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -37,8 +37,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
-  store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+  %ld = load <3 x i32>, ptr addrspace(1) %in
+  store <3 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -49,8 +49,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
-  store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+  %ld = load <4 x i32>, ptr addrspace(1) %in
+  store <4 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -64,8 +64,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
-  store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+  %ld = load <8 x i32>, ptr addrspace(1) %in
+  store <8 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -78,8 +78,8 @@ entry:
; GCN-HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <9 x i32>, <9 x i32> addrspace(1)* %in
-  store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
+  %ld = load <9 x i32>, ptr addrspace(1) %in
+  store <9 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -92,8 +92,8 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx2
define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <10 x i32>, <10 x i32> addrspace(1)* %in
-  store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
+  %ld = load <10 x i32>, ptr addrspace(1) %in
+  store <10 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -109,8 +109,8 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx3
define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <11 x i32>, <11 x i32> addrspace(1)* %in
-  store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
+  %ld = load <11 x i32>, ptr addrspace(1) %in
+  store <11 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -124,8 +124,8 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx4
define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <12 x i32>, <12 x i32> addrspace(1)* %in
-  store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
+  %ld = load <12 x i32>, ptr addrspace(1) %in
+  store <12 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -146,8 +146,8 @@ entry:
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
-  %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
-  store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+  %ld = load <16 x i32>, ptr addrspace(1) %in
+  store <16 x i32> %ld, ptr addrspace(1) %out
ret void
}

@@ -161,9 +161,9 @@ entry:

; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load i32, i32 addrspace(1)* %in
+  %ld = load i32, ptr addrspace(1) %in
  %ext = zext i32 %ld to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
ret void
}

@@ -180,9 +180,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load i32, i32 addrspace(1)* %in
+  %ld = load i32, ptr addrspace(1) %in
  %ext = sext i32 %ld to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
ret void
}

@@ -193,9 +193,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA: {{flat|global}}_load_dword
; GCN-HSA: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+  %ld = load <1 x i32>, ptr addrspace(1) %in
  %ext = zext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -206,9 +206,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+  %ld = load <1 x i32>, ptr addrspace(1) %in
  %ext = sext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -219,9 +219,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA: {{flat|global}}_load_dwordx2
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %ld = load <2 x i32>, ptr addrspace(1) %in
  %ext = zext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -235,9 +235,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %ld = load <2 x i32>, ptr addrspace(1) %in
  %ext = sext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -250,9 +250,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %ld = load <4 x i32>, ptr addrspace(1) %in
  %ext = zext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -271,9 +271,9 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %ld = load <4 x i32>, ptr addrspace(1) %in
  %ext = sext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -294,9 +294,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+  %ld = load <8 x i32>, ptr addrspace(1) %in
  %ext = zext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -326,9 +326,9 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+  %ld = load <8 x i32>, ptr addrspace(1) %in
  %ext = sext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -372,9 +372,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+  %ld = load <16 x i32>, ptr addrspace(1) %in
  %ext = sext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -407,9 +407,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+  %ld = load <16 x i32>, ptr addrspace(1) %in
  %ext = zext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -507,9 +507,9 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4

define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+  %ld = load <32 x i32>, ptr addrspace(1) %in
  %ext = sext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -574,9 +574,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+  %ld = load <32 x i32>, ptr addrspace(1) %in
  %ext = zext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
ret void
}

@@ -642,8 +642,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
-  store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
+  %ld = load <32 x i32>, ptr addrspace(1) %in
+  store <32 x i32> %ld, ptr addrspace(1) %out
ret void
}

22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -44,13 +44,13 @@ bb23: ; preds = %bb23, %bb
%tmp30 = sub i32 %tmp24, %tmp29
%tmp31 = add i32 %tmp30, %arg16
%tmp37 = icmp ult i32 %tmp31, %arg13
-  %tmp44 = load float, float addrspace(1)* undef, align 4
-  store float %tmp44, float addrspace(3)* undef, align 4
+  %tmp44 = load float, ptr addrspace(1) undef, align 4
+  store float %tmp44, ptr addrspace(3) undef, align 4
%tmp47 = add i32 %tmp24, %arg2
br i1 %tmp37, label %bb23, label %.loopexit
}

-define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3)* nocapture %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(1)* nocapture readonly %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i1 zeroext %arg17, i1 zeroext %arg18) #0 {
+define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) nocapture %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, ptr addrspace(1) nocapture readonly %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i1 zeroext %arg17, i1 zeroext %arg18) #0 {
; GFX9-LABEL: lsr_order_mul24_1:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -100,7 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_cbranch_execnz .LBB1_2
-; GFX9-NEXT: .LBB1_3: ; %Flow3
+; GFX9-NEXT: .LBB1_3: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -137,11 +137,11 @@ bb23: ; preds = %bb19, %bb23
%tmp40 = and i1 %tmp39, %arg17
%tmp41 = zext i32 %tmp35 to i64
%tmp42 = select i1 %tmp40, i64 %tmp41, i64 0
-  %tmp43 = getelementptr inbounds float, float addrspace(1)* %arg10, i64 %tmp42
-  %tmp44 = load float, float addrspace(1)* %tmp43, align 4
+  %tmp43 = getelementptr inbounds float, ptr addrspace(1) %arg10, i64 %tmp42
+  %tmp44 = load float, ptr addrspace(1) %tmp43, align 4
%tmp45 = select i1 %tmp40, float %tmp44, float 0.000000e+00
-  %tmp46 = getelementptr inbounds float, float addrspace(3)* %arg3, i32 %tmp36
-  store float %tmp45, float addrspace(3)* %tmp46, align 4
+  %tmp46 = getelementptr inbounds float, ptr addrspace(3) %arg3, i32 %tmp36
+  store float %tmp45, ptr addrspace(3) %tmp46, align 4
%tmp47 = add i32 %tmp24, %arg2
%tmp48 = icmp ult i32 %tmp47, %arg1
br i1 %tmp48, label %bb23, label %.loopexit
@@ -170,17 +170,17 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
%mul0 = mul i32 %b, %s
; CHECK: mul i32
; CHECK-NOT: mul i32
-  store volatile i32 %mul0, i32 addrspace(1)* undef
+  store volatile i32 %mul0, ptr addrspace(1) undef

; foo((b + 1) * s);
%b1 = add i32 %b, 1
%mul1 = mul i32 %b1, %s
-  store volatile i32 %mul1, i32 addrspace(1)* undef
+  store volatile i32 %mul1, ptr addrspace(1) undef

; foo((b + 2) * s);
%b2 = add i32 %b, 2
%mul2 = mul i32 %b2, %s
-  store volatile i32 %mul2, i32 addrspace(1)* undef
+  store volatile i32 %mul2, ptr addrspace(1) undef
ret void
}

@@ -16,16 +16,16 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def %23
; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
-; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
-; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset.cast, addrspace 4)
+; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
+; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
+; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
Expand All @@ -41,17 +41,17 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
-; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
-; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset.cast, addrspace 4)
+; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1
-; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
+; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
-; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
+; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX908-NEXT: S_ENDPGM 0
; REGALLOC-GFX90A-LABEL: name: partial_copy
; REGALLOC-GFX90A: bb.0 (%ir-block.0):
Expand All @@ -62,15 +62,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
-; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset.cast, addrspace 4)
+; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
-; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
+; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
@@ -85,24 +85,24 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset.cast, addrspace 4)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 undef)
%v0 = call <4 x i32> asm sideeffect "; def $0", "=v" ()
%v1 = call <2 x i32> asm sideeffect "; def $0", "=v" ()
%mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
store volatile <4 x i32> %v0, <4 x i32> addrspace(1)* undef
store volatile <2 x i32> %v1, <2 x i32> addrspace(1)* undef
store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
store volatile <4 x i32> %v0, ptr addrspace(1) undef
store volatile <2 x i32> %v1, ptr addrspace(1) undef
store volatile <4 x i32> %mai, ptr addrspace(1) undef
ret void
}

14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
@@ -3,17 +3,15 @@
; The types of the users of the addrspacecast should not be changed.

; CHECK-LABEL: @invalid_bitcast_addrspace(
; CHECK: getelementptr inbounds [256 x [1 x i32]], [256 x [1 x i32]] addrspace(3)* @invalid_bitcast_addrspace.data, i32 0, i32 %14
; CHECK: bitcast [1 x i32] addrspace(3)* %{{[0-9]+}} to half addrspace(3)*
; CHECK: addrspacecast half addrspace(3)* %tmp to half*
; CHECK: bitcast half* %tmp1 to <2 x i16>*
; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [1 x i32]], ptr addrspace(3) @invalid_bitcast_addrspace.data, i32 0, i32 %{{[0-9]+}}
; CHECK: [[ASC:%[a-z0-9]+]] = addrspacecast ptr addrspace(3) [[GEP]] to ptr
; CHECK: [[LOAD:%[a-z0-9]+]] = load <2 x i16>, ptr [[ASC]]
; CHECK: bitcast <2 x i16> [[LOAD]] to <2 x half>
define amdgpu_kernel void @invalid_bitcast_addrspace() #0 {
entry:
%data = alloca [1 x i32], addrspace(5)
%tmp = bitcast [1 x i32] addrspace(5)* %data to half addrspace(5)*
%tmp1 = addrspacecast half addrspace(5)* %tmp to half*
%tmp2 = bitcast half* %tmp1 to <2 x i16>*
%tmp3 = load <2 x i16>, <2 x i16>* %tmp2, align 2
%tmp1 = addrspacecast ptr addrspace(5) %data to ptr
%tmp3 = load <2 x i16>, ptr %tmp1, align 2
%tmp4 = bitcast <2 x i16> %tmp3 to <2 x half>
ret void
}
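; Note: an illustrative sketch, not part of the original test. With opaque
; pointers, pointee-only bitcasts such as the removed %tmp and %tmp2 above are
; no-ops and fold away: only the address-space cast survives, and the load type
; attaches directly to the `ptr` value. A minimal, self-contained example of the
; same fold (hypothetical function name):
define void @opaque_ptr_cast_sketch() {
entry:
  %buf = alloca [1 x i32], addrspace(5)
  ; no "bitcast ... to half addrspace(5)*" step is needed with opaque pointers
  %flat = addrspacecast ptr addrspace(5) %buf to ptr
  %val = load <2 x i16>, ptr %flat, align 2
  ret void
}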
277 changes: 134 additions & 143 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -156,23 +156,23 @@ define amdgpu_kernel void @kernel_64_256() #7 {
; CHECK-NEXT: call void @merge_cycle_0()
; CHECK-NEXT: call void @default_captured_address()
; CHECK-NEXT: call void @externally_visible_default()
; CHECK-NEXT: [[F32:%.*]] = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
; CHECK-NEXT: [[F32:%.*]] = call float @bitcasted_function()
; CHECK-NEXT: ret void
;
call void @merge_cycle_0()
call void @default_captured_address()
call void @externally_visible_default()
%f32 = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
%f32 = call float @bitcasted_function()
ret void
}
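; Note: an illustrative sketch, not part of the original test. The typed-pointer
; form above called @bitcasted_function through a constant-expression bitcast of
; its function type; with opaque pointers that cast is a no-op, so the call is
; written directly even though the callee's declared return type differs. A
; minimal, hypothetical reduction of the pattern:
declare i32 @sketch_callee()

define float @opaque_ptr_call_sketch() {
  ; typed-pointer spelling: %r = call float bitcast (i32 ()* @sketch_callee to float ()*)()
  %r = call float @sketch_callee()
  ret float %r
}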

define internal void @default_captured_address() {
; CHECK-LABEL: define {{[^@]+}}@default_captured_address
; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: store volatile void ()* @default_captured_address, void ()** undef, align 8
; CHECK-NEXT: store volatile ptr @default_captured_address, ptr undef, align 8
; CHECK-NEXT: ret void
;
store volatile void ()* @default_captured_address, void ()** undef, align 8
store volatile ptr @default_captured_address, ptr undef, align 8
ret void
}

24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -24,19 +24,19 @@ define internal void @indirect() {
define amdgpu_kernel void @test_simple_indirect_call() {
; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8, addrspace(5)
; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast void ()* addrspace(5)* [[FPTR]] to void ()**
; AKF_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR_CAST]], align 8
; AKF_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR_CAST]], align 8
; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr
; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8
; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8
; AKF_GCN-NEXT: call void [[FP]]()
; AKF_GCN-NEXT: ret void
;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8, addrspace(5)
; ATTRIBUTOR_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast void ()* addrspace(5)* [[FPTR]] to void ()**
; ATTRIBUTOR_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR_CAST]], align 8
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR_CAST]], align 8
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; ATTRIBUTOR_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8
; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
; ATTRIBUTOR_GCN-NEXT: ret void
;
@@ -62,10 +62,10 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-NEXT: s_endpgm
%fptr = alloca void()*, addrspace(5)
%fptr.cast = addrspacecast void()* addrspace(5)* %fptr to void()**
store void()* @indirect, void()** %fptr.cast
%fp = load void()*, void()** %fptr.cast
%fptr = alloca ptr, addrspace(5)
%fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr
store ptr @indirect, ptr %fptr.cast
%fp = load ptr, ptr %fptr.cast
call void %fp()
ret void
}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -25,10 +25,10 @@ define void @spill_csr_s5_copy() #0 {
bb:
%alloca = alloca i32, addrspace(5)
%tmp = tail call i64 @func() #1
%tmp1 = getelementptr inbounds i32, i32 addrspace(1)* null, i64 %tmp
%tmp2 = load i32, i32 addrspace(1)* %tmp1, align 4
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) null, i64 %tmp
%tmp2 = load i32, ptr addrspace(1) %tmp1, align 4
%tmp3 = zext i32 %tmp2 to i64
store volatile i32 9, i32 addrspace(5)* %alloca
store volatile i32 9, ptr addrspace(5) %alloca
ret void
}

20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
@@ -2,38 +2,38 @@

; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8:
; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v4i64_v4i8(< 4 x i8> addrspace(1)* %out, <4 x i64> %in) {
define amdgpu_kernel void @trunc_store_v4i64_v4i8(ptr addrspace(1) %out, <4 x i64> %in) {
entry:
%trunc = trunc <4 x i64> %in to < 4 x i8>
store <4 x i8> %trunc, <4 x i8> addrspace(1)* %out
store <4 x i8> %trunc, ptr addrspace(1) %out
ret void
}

; GCN-LABEL: {{^}}trunc_store_v8i64_v8i8:
; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v8i64_v8i8(< 8 x i8> addrspace(1)* %out, <8 x i64> %in) {
define amdgpu_kernel void @trunc_store_v8i64_v8i8(ptr addrspace(1) %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to < 8 x i8>
store <8 x i8> %trunc, <8 x i8> addrspace(1)* %out
store <8 x i8> %trunc, ptr addrspace(1) %out
ret void
}

; GCN-LABEL: {{^}}trunc_store_v8i64_v8i16:
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v8i64_v8i16(< 8 x i16> addrspace(1)* %out, <8 x i64> %in) {
define amdgpu_kernel void @trunc_store_v8i64_v8i16(ptr addrspace(1) %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to < 8 x i16>
store <8 x i16> %trunc, <8 x i16> addrspace(1)* %out
store <8 x i16> %trunc, ptr addrspace(1) %out
ret void
}

; GCN-LABEL: {{^}}trunc_store_v8i64_v8i32:
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @trunc_store_v8i64_v8i32(< 8 x i32> addrspace(1)* %out, <8 x i64> %in) {
define amdgpu_kernel void @trunc_store_v8i64_v8i32(ptr addrspace(1) %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to <8 x i32>
store <8 x i32> %trunc, <8 x i32> addrspace(1)* %out
store <8 x i32> %trunc, ptr addrspace(1) %out
ret void
}

@@ -42,9 +42,9 @@ entry:
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) {
define amdgpu_kernel void @trunc_store_v16i64_v16i32(ptr addrspace(1) %out, <16 x i64> %in) {
entry:
%trunc = trunc <16 x i64> %in to <16 x i32>
store <16 x i32> %trunc, <16 x i32> addrspace(1)* %out
store <16 x i32> %trunc, ptr addrspace(1) %out
ret void
}
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -34,7 +34,7 @@ B30.1:
B30.2:
%v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v0, %B2 ]
%ve0 = extractelement <4 x float> %v3, i32 0
store float %ve0, float addrspace(3)* undef, align 4
store float %ve0, ptr addrspace(3) undef, align 4
ret void
}

@@ -50,10 +50,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
; CHECK-NEXT: .LBB1_1: ; %bb9
@@ -67,25 +67,25 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; CHECK-NEXT: ; return to shader part epilog
bb:
%tmp = load volatile i32, i32 addrspace(1)* undef, align 4
%tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4
%tmp = load volatile i32, ptr addrspace(1) undef, align 4
%tmp1 = load volatile i32, ptr addrspace(1) undef, align 4
%tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
%tmp3 = bitcast i32 %tmp1 to float
%tmp4 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp3, float %tmp3, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
%tmp5 = extractelement <4 x float> %tmp4, i32 0
%tmp6 = fmul float %tmp5, undef
%tmp7 = fadd float %tmp6, %tmp6
%tmp8 = insertelement <4 x i32> %tmp2, i32 %tmp, i32 1
store <4 x i32> %tmp8, <4 x i32> addrspace(1)* undef, align 16
store float %tmp7, float addrspace(1)* undef, align 4
store <4 x i32> %tmp8, ptr addrspace(1) undef, align 16
store float %tmp7, ptr addrspace(1) undef, align 4
br label %bb9

bb9: ; preds = %bb9, %bb
%tmp10 = icmp eq i32 %tmp, 0
br i1 %tmp10, label %bb9, label %bb11

bb11: ; preds = %bb9
store <4 x i32> %tmp2, <4 x i32> addrspace(1)* undef, align 16
store <4 x i32> %tmp2, ptr addrspace(1) undef, align 16
ret float undef
}

Expand Down Expand Up @@ -118,7 +118,7 @@ define amdgpu_kernel void @partially_undef_copy() #0 {
%partially.undef.0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
%partially.undef.1 = insertelement <4 x i32> %partially.undef.0, i32 %tmp1, i32 0

store volatile <4 x i32> %partially.undef.1, <4 x i32> addrspace(1)* undef, align 16
store volatile <4 x i32> %partially.undef.1, ptr addrspace(1) undef, align 16
tail call void asm sideeffect "v_nop", "v={v[5:8]}"(<4 x i32> %partially.undef.0)
ret void
}
@@ -12,10 +12,10 @@
define void @foo() #0 {
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: store i32 0, i32* @x, align 4
; CHECK-NEXT: store i32 0, ptr @x, align 4
; CHECK-NEXT: ret void
;
store i32 0, i32* @x
store i32 0, ptr @x
ret void
}

20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -2,7 +2,7 @@
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor < %s | FileCheck %s

;.
; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global i32* null
; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = global i32 0
;.
define weak void @weak() {
@@ -15,17 +15,17 @@ define weak void @weak() {
ret void
}

@G1 = global i32* null
@G1 = global ptr null

define internal void @internal1() {
; CHECK-LABEL: define {{[^@]+}}@internal1
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** @G1, align 8
; CHECK-NEXT: store i32 0, i32* [[TMP1]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @G1, align 8
; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
; CHECK-NEXT: ret void
;
%1 = load i32*, i32** @G1
store i32 0, i32* %1
%1 = load ptr, ptr @G1
store i32 0, ptr %1
ret void
}

@@ -44,7 +44,7 @@ define amdgpu_kernel void @kernel1() #0 {
define internal void @internal3() {
; CHECK-LABEL: define {{[^@]+}}@internal3
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @G2, align 4
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @G2, align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; CHECK: 3:
@@ -54,7 +54,7 @@ define internal void @internal3() {
; CHECK: 4:
; CHECK-NEXT: ret void
;
%1 = load i32, i32* @G2, align 4
%1 = load i32, ptr @G2, align 4
%2 = icmp eq i32 %1, 0
br i1 %2, label %3, label %4
3:
@@ -68,10 +68,10 @@ define internal void @internal3() {
define internal void @internal4() {
; CHECK-LABEL: define {{[^@]+}}@internal4
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: store i32 1, i32* @G2, align 4
; CHECK-NEXT: store i32 1, ptr @G2, align 4
; CHECK-NEXT: ret void
;
store i32 1, i32* @G2, align 4
store i32 1, ptr @G2, align 4
ret void
}

@@ -12,10 +12,10 @@
define void @func() #0 {
; CHECK-LABEL: define {{[^@]+}}@func
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: store i32 0, i32* @x, align 4
; CHECK-NEXT: store i32 0, ptr @x, align 4
; CHECK-NEXT: ret void
;
store i32 0, i32* @x
store i32 0, ptr @x
ret void
}

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -80,20 +80,20 @@ exit:
ret i32 1
}

define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #1 {
define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
; CHECK-LABEL: define {{[^@]+}}@kernel
; CHECK-SAME: (i32 addrspace(1)* [[M:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-SAME: (ptr addrspace(1) [[M:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[R:%.*]] = call i32 @fib(i32 5)
; CHECK-NEXT: [[R2:%.*]] = call i32 @fib_internal(i32 5)
; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[M]], align 4
; CHECK-NEXT: store i32 [[R2]], i32 addrspace(1)* [[M]], align 4
; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[M]], align 4
; CHECK-NEXT: store i32 [[R2]], ptr addrspace(1) [[M]], align 4
; CHECK-NEXT: ret void
;
%r = call i32 @fib(i32 5)
%r2 = call i32 @fib_internal(i32 5)

store i32 %r, i32 addrspace(1)* %m
store i32 %r2, i32 addrspace(1)* %m
store i32 %r, ptr addrspace(1) %m
store i32 %r2, ptr addrspace(1) %m
ret void
}

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -8,20 +8,20 @@
define void @func1() {
; CHECK-LABEL: define {{[^@]+}}@func1
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: store i32 0, i32* @x, align 4
; CHECK-NEXT: store i32 0, ptr @x, align 4
; CHECK-NEXT: ret void
;
store i32 0, i32* @x
store i32 0, ptr @x
ret void
}

define void @func4() {
; CHECK-LABEL: define {{[^@]+}}@func4
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: store i32 0, i32* @x, align 4
; CHECK-NEXT: store i32 0, ptr @x, align 4
; CHECK-NEXT: ret void
;
store i32 0, i32* @x
store i32 0, ptr @x
ret void
}

33 changes: 16 additions & 17 deletions llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll
@@ -4,24 +4,23 @@

; OPT-LABEL: @vector_addrspacecast(
; OPT: alloca [3 x i32]
; OPT: store i32 0, i32 addrspace(5)* %a0, align 4
; OPT: store i32 1, i32 addrspace(5)* %a1, align 4
; OPT: store i32 2, i32 addrspace(5)* %a2, align 4
; OPT: %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
; OPT: %ac = addrspacecast i32 addrspace(5)* %tmp to i32*
; OPT: %data = load i32, i32* %ac, align 4
define amdgpu_kernel void @vector_addrspacecast(i32 addrspace(1)* %out, i64 %index) {
; OPT: store i32 0, ptr addrspace(5) %alloca, align 4
; OPT: store i32 1, ptr addrspace(5) %a1, align 4
; OPT: store i32 2, ptr addrspace(5) %a2, align 4
; OPT: %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
; OPT: %ac = addrspacecast ptr addrspace(5) %tmp to ptr
; OPT: %data = load i32, ptr %ac, align 4
define amdgpu_kernel void @vector_addrspacecast(ptr addrspace(1) %out, i64 %index) {
entry:
%alloca = alloca [3 x i32], addrspace(5)
%a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
%a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
%a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
store i32 0, i32 addrspace(5)* %a0
store i32 1, i32 addrspace(5)* %a1
store i32 2, i32 addrspace(5)* %a2
%tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
%ac = addrspacecast i32 addrspace(5)* %tmp to i32 *
%data = load i32, i32 * %ac
store i32 %data, i32 addrspace(1)* %out
%a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
%a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
store i32 0, ptr addrspace(5) %alloca
store i32 1, ptr addrspace(5) %a1
store i32 2, ptr addrspace(5) %a2
%tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
%ac = addrspacecast ptr addrspace(5) %tmp to ptr
%data = load i32, ptr %ac
store i32 %data, ptr addrspace(1) %out
ret void
}
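; Note: an illustrative sketch, not part of the original test. The old %a0
; disappeared above because a getelementptr with all-zero indices yields its
; base pointer, so the updated test stores through %alloca directly. A minimal,
; hypothetical example of the equivalence:
define void @zero_gep_sketch() {
  %stack = alloca [3 x i32], addrspace(5)
  ; %p and %stack name the same address; the GEP adds no byte offset
  %p = getelementptr [3 x i32], ptr addrspace(5) %stack, i32 0, i32 0
  store i32 0, ptr addrspace(5) %p
  ret void
}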
19 changes: 9 additions & 10 deletions llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
@@ -1,17 +1,16 @@
; RUN: opt -S -mtriple=amdgcn-- -passes=load-store-vectorizer < %s | FileCheck -check-prefix=OPT %s

; OPT-LABEL: @func(
define void @func(i32 addrspace(7)* %out) {
define void @func(ptr addrspace(7) %out) {
entry:
%a0 = getelementptr i32, i32 addrspace(7)* %out, i32 0
%a1 = getelementptr i32, i32 addrspace(7)* %out, i32 1
%a2 = getelementptr i32, i32 addrspace(7)* %out, i32 2
%a3 = getelementptr i32, i32 addrspace(7)* %out, i32 3
%a1 = getelementptr i32, ptr addrspace(7) %out, i32 1
%a2 = getelementptr i32, ptr addrspace(7) %out, i32 2
%a3 = getelementptr i32, ptr addrspace(7) %out, i32 3

; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> addrspace(7)* %0, align 4
store i32 0, i32 addrspace(7)* %a0
store i32 1, i32 addrspace(7)* %a1
store i32 2, i32 addrspace(7)* %a2
store i32 3, i32 addrspace(7)* %a3
; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 4
store i32 0, ptr addrspace(7) %out
store i32 1, ptr addrspace(7) %a1
store i32 2, ptr addrspace(7) %a2
store i32 3, ptr addrspace(7) %a3
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -382,7 +382,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,

main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
store <4 x float> %v, <4 x float> addrspace(1)* undef
store <4 x float> %v, ptr addrspace(1) undef
call void @extern_func()
%v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
ret <4 x float> %v1