From 7740f76ed4ca04930bd7b764561dc3a45dd764a5 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Tue, 30 Sep 2025 14:07:34 +0100
Subject: [PATCH] [AMDGPU] Propagate alias information in
 AMDGPULowerKernelArguments.

---
 .../CodeGen/AMDGPU/lower-noalias-kernargs.ll  | 629 ++++++++++++++++++
 1 file changed, 629 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll

diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
new file mode 100644
index 0000000000000..313ae3b883e56
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
@@ -0,0 +1,629 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-- -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck %s
+
+define amdgpu_kernel void @aliasinfo_2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_2I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 36
+; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 44
+; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %val = load i32, ptr addrspace(1) %in.gep, align 4
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
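+; Same kernel as above, but with noalias pointer arguments.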
+define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA(
+; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_2I32_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %val = load i32, ptr addrspace(1) %in.gep, align 4
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_AS(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_2I32_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 36
+; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 44
+; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4
+ ret void
+}
+
+define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA_AS(
+; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4
+ ret void
+}
+
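+; Variant with two <4 x i8> input pointers and two output pointers.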
+define amdgpu_kernel void @aliasinfo_v4f32_3v4i8(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 36
+; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 44
+; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 52
+; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 60
+; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1
+; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32>
+; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16
+; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
+ %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
+ %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
+ %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32>
+ %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
+ store <4 x float> %cvt, ptr addrspace(1) %out, align 16
+ store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
+ ret void
+}
+
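+; Same kernel as above, but with noalias pointer arguments.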
+define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(
+; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1
+; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32>
+; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
+ %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
+ %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
+ %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32>
+ %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
+ store <4 x float> %cvt, ptr addrspace(1) %out, align 16
+ store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
+ ret void
+}
+
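+; Same kernel as above, but with explicit !alias.scope/!noalias metadata on the accesses.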
+define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 36
+; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 44
+; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 52
+; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 60
+; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32>
+; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
+ %load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2
+ %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2
+ %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32>
+ %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
+ store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4
+ store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4
+ ret void
+}
+
+define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(
+; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32>
+; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
+ %load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2
+ %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2
+ %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32>
+ %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
+ store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4
+ store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4
+ ret void
+}
+
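+; LDS variant: <16 x half> loads feed WMMA intrinsics, with sched.group.barrier pipelining.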
+define amdgpu_kernel void @aliasinfo_10v16f16(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16(
+; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_10V16F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]]
+; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64
+; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128
+; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192
+; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256
+; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32
+; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false)
+; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false)
+; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false)
+; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false)
+; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false)
+; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]]
+; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32
+; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64
+; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32
+; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128
+; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32
+; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192
+; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32
+; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256
+; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
+ %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
+ %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
+ %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
+ %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
+ %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
+ store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
+ store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
+ store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
+ store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
+ store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ ret void
+}
+
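+; Same kernel as above, but with noalias pointer arguments.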
+define amdgpu_kernel void @aliasinfo_10v16f16_NA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA(
+; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]]
+; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64
+; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128
+; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192
+; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32
+; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256
+; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32
+; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false)
+; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false)
+; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false)
+; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false)
+; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false)
+; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]]
+; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32
+; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64
+; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32
+; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128
+; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32
+; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192
+; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32
+; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256
+; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
+ %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
+ %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
+ %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
+ %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
+ %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
+ store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
+ store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
+ store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
+ store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
+ store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ ret void
+}
+
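+; Same kernel as above, but with explicit !alias.scope/!noalias metadata on the accesses.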
+define amdgpu_kernel void @aliasinfo_10v16f16_AS(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_AS(
+; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_10V16F16_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]]
+; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64
+; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128
+; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192
+; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256
+; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false)
+; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false)
+; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false)
+; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false)
+; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false)
+; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]]
+; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64
+; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128
+; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192
+; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256
+; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2
+ %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2
+ %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2
+ %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2
+ %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2
+ %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
+ %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
+ %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
+ %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
+ %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
+ store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4
+ %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
+ store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4
+ %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
+ store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4
+ %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
+ store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4
+ %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
+ store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ ret void
+}
+
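+; Same kernel as above, with both noalias arguments and explicit metadata.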
+define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(
+; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]]
+; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64
+; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128
+; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192
+; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256
+; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]]
+; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false)
+; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false)
+; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false)
+; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false)
+; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false)
+; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]]
+; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64
+; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128
+; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192
+; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256
+; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]]
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+; CHECK-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2
+ %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2
+ %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2
+ %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2
+ %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2
+ %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
+ %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
+ %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
+ %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
+ %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
+ store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4
+ %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
+ store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4
+ %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
+ store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4
+ %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
+ store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4
+ %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
+ store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <16 x half>, i1 immarg) #1
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone speculatable }
+
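+; Alias scope domain (!0) and the scopes referenced by the *_AS tests.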
+!0 = distinct !{!0, !"alias_scope_0"}
+!1 = distinct !{!1, !0, !"alias_scope_1"}
+!2 = !{!1}
+!3 = distinct !{!3, !0, !"alias_scope_3"}
+!4 = !{!3}
+;.
+; CHECK: [[META0]] = !{}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"alias_scope_3"}
+; CHECK: [[META3]] = distinct !{[[META3]], !"alias_scope_0"}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"alias_scope_1"}
+;.