diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll new file mode 100644 index 0000000000000..e4567a7b2731f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll @@ -0,0 +1,425 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s + +; FIXME: The scale operands of WMMA instructions are limited to the low 256 VGPRs. +; Currently they are spilled because all low VGPRs are occupied, even though the VGPR budget is higher. +; Once fixed, scale operands must not be spilled because of the low-256 restriction; the FileCheck expectations below record the current output and should be updated then. +; CHECK: ; ScratchSize: 12 +; CHECK: ; Occupancy: 1 + + +; Function Attrs: nofree norecurse nounwind +define amdgpu_kernel void @spill_scale_test(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { + %3 = tail call i32 @llvm.amdgcn.workgroup.id.z() + %4 = shl i32 %3, 20 + %5 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) null, i32 0, i32 0, i32 0) + %6 = bitcast <4 x i32> %5 to <16 x i8> + %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <64 x i32> + %8 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) null, i32 %4, i32 0, i32 0) + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 13328 to ptr addrspace(3))) + %11 = bitcast <2 x i32> %10 to <8 x i8> + %12 = shufflevector <8 x i8> %11, <8 x i8> poison, <64 x i32> + %13 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 13344 to ptr addrspace(3))) + %14 = bitcast <2 x i32> %13 to <8 x i8> + %15 = shufflevector <8 x i8> %14, <8 x i8> poison, <64 x i32> + %16 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 13360 to ptr addrspace(3))) + %17 = bitcast <2 x i32> %16 to <8 x i8> + %18 = shufflevector <8 x i8> %17, <8 x i8> poison, <64 x i32> + %19 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) 
nonnull inttoptr (i32 12352 to ptr addrspace(3))) + %20 = bitcast <2 x i32> %19 to <8 x i8> + %21 = shufflevector <8 x i8> %20, <8 x i8> poison, <64 x i32> + %22 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull null) + %23 = bitcast <2 x i32> %22 to <8 x i8> + %24 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull null) + %25 = bitcast <2 x i32> %24 to <8 x i8> + %26 = shufflevector <8 x i8> %25, <8 x i8> poison, <64 x i32> + %27 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 112 to ptr addrspace(3))) + %28 = bitcast <2 x i32> %27 to <8 x i8> + %29 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 73696 to ptr addrspace(3))) + %30 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 73760 to ptr addrspace(3))) + %31 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 73792 to ptr addrspace(3))) + tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + %32 = shufflevector <16 x i8> %6, <16 x i8> %6, <64 x i32> + %33 = shufflevector <64 x i8> %32, <64 x i8> %7, <64 x i32> + %34 = shufflevector <64 x i8> %33, <64 x i8> %7, <64 x i32> + %35 = bitcast <64 x i8> %34 to <16 x i32> + %36 = shufflevector <16 x i8> %9, <16 x i8> %9, <64 x i32> + %37 = shufflevector <64 x i8> %36, <64 x i8> %7, <64 x i32> + %38 = shufflevector <64 x i8> %37, <64 x i8> %7, <64 x i32> + %39 = bitcast <64 x i8> %38 to <16 x i32> + %40 = shufflevector <64 x i8> , <64 x i8> %12, <64 x i32> + %41 = bitcast <64 x i8> %40 to <16 x i32> + %42 = shufflevector <64 x i8> , <64 x i8> %15, <64 x i32> + %43 = bitcast <64 x i8> %42 to <16 x i32> + %44 = shufflevector <64 x i8> , <64 x i8> %18, <64 x i32> + %45 = bitcast <64 x i8> %44 to <16 x i32> + %46 = shufflevector <64 x i8> , <64 x i8> %21, <64 x i32> + 
%47 = shufflevector <64 x i8> %46, <64 x i8> , <64 x i32> + %48 = bitcast <64 x i8> %47 to <16 x i32> + %49 = shufflevector <8 x i8> zeroinitializer, <8 x i8> %23, <64 x i32> + %50 = shufflevector <64 x i8> %49, <64 x i8> , <64 x i32> + %51 = shufflevector <64 x i8> %50, <64 x i8> , <64 x i32> + %52 = shufflevector <64 x i8> %51, <64 x i8> , <64 x i32> + %53 = shufflevector <64 x i8> %52, <64 x i8> , <64 x i32> + %54 = shufflevector <64 x i8> %53, <64 x i8> , <64 x i32> + %55 = shufflevector <64 x i8> %54, <64 x i8> , <64 x i32> + %56 = bitcast <64 x i8> %55 to <16 x i32> + %57 = shufflevector <64 x i8> , <64 x i8> %26, <64 x i32> + %58 = bitcast <64 x i8> %57 to <16 x i32> + %59 = shufflevector <8 x i8> %28, <8 x i8> zeroinitializer, <64 x i32> + %60 = shufflevector <64 x i8> %59, <64 x i8> , <64 x i32> + %61 = shufflevector <64 x i8> %60, <64 x i8> , <64 x i32> + %62 = shufflevector <64 x i8> %61, <64 x i8> , <64 x i32> + %63 = shufflevector <64 x i8> %62, <64 x i8> , <64 x i32> + %64 = shufflevector <64 x i8> %63, <64 x i8> , <64 x i32> + %65 = shufflevector <64 x i8> %64, <64 x i8> , <64 x i32> + %66 = bitcast <64 x i8> %65 to <16 x i32> + %.extract4 = extractelement <2 x i32> %29, i64 1 + %.extract5 = extractelement <2 x i32> %30, i64 0 + %.extract7 = extractelement <2 x i32> %31, i64 0 + %67 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %41, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %68 = extractelement <8 x float> %67, i64 2 + %69 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %43, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %70 = extractelement <8 x float> %69, i64 6 + %71 = extractelement <8 x float> %69, i64 7 + %72 = tail call <8 x float> 
@llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %45, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 %.extract4, i32 0, i32 0, i32 0, i1 false, i1 false) + %73 = extractelement <8 x float> %72, i64 7 + %74 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %48, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 %.extract5, i32 0, i32 0, i32 0, i1 false, i1 false) + %75 = extractelement <8 x float> %74, i64 0 + %76 = extractelement <8 x float> %74, i64 3 + %77 = extractelement <8 x float> %74, i64 5 + %78 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %56, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %79 = extractelement <8 x float> %78, i64 0 + %80 = extractelement <8 x float> %78, i64 2 + %81 = extractelement <8 x float> %78, i64 4 + %82 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %58, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 %.extract7, i32 0, i32 0, i32 0, i1 false, i1 false) + %83 = extractelement <8 x float> %82, i64 0 + %84 = extractelement <8 x float> %82, i64 1 + %85 = extractelement <8 x float> %82, i64 3 + %86 = extractelement <8 x float> %82, i64 6 + %87 = extractelement <8 x float> %82, i64 7 + %88 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %66, i32 0, <16 x i32> %39, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %89 = extractelement <8 x float> %88, i64 5 + %90 = extractelement <8 x float> %88, i64 6 + tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0) + %91 = tail call float @llvm.maxnum.f32(float 
%68, float 0.000000e+00) + %92 = tail call float @llvm.maxnum.f32(float %70, float 0.000000e+00) + %93 = tail call float @llvm.maxnum.f32(float %92, float %71) + %94 = tail call float @llvm.maxnum.f32(float %93, float 0.000000e+00) + %95 = tail call float @llvm.maxnum.f32(float %94, float %73) + %96 = tail call float @llvm.maxnum.f32(float %95, float %75) + %97 = tail call float @llvm.maxnum.f32(float %96, float 0.000000e+00) + %98 = tail call float @llvm.maxnum.f32(float %97, float %76) + %99 = tail call float @llvm.maxnum.f32(float %98, float 0.000000e+00) + %100 = tail call float @llvm.maxnum.f32(float %99, float %77) + %101 = tail call float @llvm.maxnum.f32(float %100, float 0.000000e+00) + %102 = tail call float @llvm.maxnum.f32(float %101, float %79) + %103 = tail call float @llvm.maxnum.f32(float %102, float 0.000000e+00) + %104 = tail call float @llvm.maxnum.f32(float %103, float %80) + %105 = tail call float @llvm.maxnum.f32(float %104, float 0.000000e+00) + %106 = tail call float @llvm.maxnum.f32(float %105, float %81) + %107 = tail call float @llvm.maxnum.f32(float %106, float 0.000000e+00) + %108 = tail call float @llvm.maxnum.f32(float %107, float %83) + %109 = tail call float @llvm.maxnum.f32(float %108, float %84) + %110 = tail call float @llvm.maxnum.f32(float %109, float 0.000000e+00) + %111 = tail call float @llvm.maxnum.f32(float %110, float %85) + %112 = tail call float @llvm.maxnum.f32(float %111, float 0.000000e+00) + %113 = tail call float @llvm.maxnum.f32(float %112, float %86) + %114 = tail call float @llvm.maxnum.f32(float %113, float %87) + %115 = tail call float @llvm.maxnum.f32(float %114, float 0.000000e+00) + %116 = tail call float @llvm.maxnum.f32(float %115, float %89) + %117 = tail call float @llvm.maxnum.f32(float %116, float %90) + %118 = tail call float @llvm.maxnum.f32(float %117, float 0.000000e+00) + %119 = bitcast float %91 to i32 + %120 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %119, i32 16415) + %121 = bitcast float 
%118 to i32 + %122 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %121, i32 16415) + %123 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18432 to ptr addrspace(3))) + %124 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 31760 to ptr addrspace(3))) + %125 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 31776 to ptr addrspace(3))) + %126 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 31792 to ptr addrspace(3))) + %127 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18496 to ptr addrspace(3))) + %128 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull null) + %129 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18512 to ptr addrspace(3))) + %130 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18432 to ptr addrspace(3))) + %131 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18432 to ptr addrspace(3))) + %132 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) nonnull inttoptr (i32 18432 to ptr addrspace(3))) + %133 = bitcast <2 x i32> %132 to <8 x i8> + %134 = bitcast <2 x i32> %131 to <8 x i8> + %135 = shufflevector <8 x i8> %134, <8 x i8> poison, <4 x i32> + %136 = bitcast <2 x i32> %130 to <8 x i8> + %137 = shufflevector <8 x i8> %136, <8 x i8> poison, <4 x i32> + %138 = bitcast <2 x i32> %129 to <8 x i8> + %139 = shufflevector <8 x i8> %138, <8 x i8> poison, <4 x i32> + %140 = bitcast <2 x i32> %128 to <8 x i8> + %141 = shufflevector <8 x i8> %140, <8 x i8> poison, <4 x i32> + %142 = bitcast <2 x i32> %127 to <8 x i8> + %143 = shufflevector <8 x i8> %142, <8 x i8> poison, <4 x i32> + %144 = bitcast <2 x i32> %126 to <8 x i8> + %145 = 
shufflevector <8 x i8> %144, <8 x i8> poison, <4 x i32> + %146 = bitcast <2 x i32> %125 to <8 x i8> + %147 = shufflevector <8 x i8> %146, <8 x i8> poison, <4 x i32> + %148 = bitcast <2 x i32> %124 to <8 x i8> + %149 = shufflevector <8 x i8> %148, <8 x i8> poison, <4 x i32> + %150 = bitcast <2 x i32> %123 to <8 x i8> + %151 = shufflevector <8 x i8> %150, <8 x i8> poison, <4 x i32> + %152 = bitcast i32 %122 to float + %153 = tail call float @llvm.maxnum.f32(float %118, float %152) + %154 = tail call float @llvm.maxnum.f32(float %153, float 0xFFF0000000000000) + %155 = fmul float %154, 0x3FC0527DC0000000 + %156 = fsub float 0.000000e+00, %155 + %157 = tail call float @llvm.amdgcn.exp2.f32(float %156) + %158 = insertelement <2 x float> , float %157, i64 1 + %159 = tail call float @llvm.amdgcn.exp2.f32(float 0.000000e+00) + %160 = bitcast i32 %120 to float + %161 = tail call float @llvm.maxnum.f32(float %91, float %160) + %162 = tail call float @llvm.maxnum.f32(float %161, float 0xFFF0000000000000) + %163 = shufflevector <8 x i8> %150, <8 x i8> poison, <64 x i32> + %164 = shufflevector <64 x i8> , <64 x i8> %163, <64 x i32> + %165 = shufflevector <4 x i8> %151, <4 x i8> poison, <64 x i32> + %166 = shufflevector <64 x i8> %164, <64 x i8> %165, <64 x i32> + %167 = bitcast <64 x i8> %166 to <16 x i32> + %168 = shufflevector <4 x i8> %149, <4 x i8> poison, <64 x i32> + %169 = shufflevector <64 x i8> , <64 x i8> %168, <64 x i32> + %170 = bitcast <64 x i8> %169 to <16 x i32> + %171 = shufflevector <4 x i8> %147, <4 x i8> poison, <64 x i32> + %172 = shufflevector <64 x i8> , <64 x i8> %171, <64 x i32> + %173 = bitcast <64 x i8> %172 to <16 x i32> + %174 = shufflevector <4 x i8> %145, <4 x i8> poison, <64 x i32> + %175 = shufflevector <64 x i8> , <64 x i8> %174, <64 x i32> + %176 = bitcast <64 x i8> %175 to <16 x i32> + %177 = shufflevector <4 x i8> %143, <4 x i8> zeroinitializer, <64 x i32> + %178 = shufflevector <64 x i8> %177, <64 x i8> , <64 x i32> + %179 = shufflevector 
<64 x i8> %178, <64 x i8> , <64 x i32> + %180 = shufflevector <64 x i8> %179, <64 x i8> , <64 x i32> + %181 = shufflevector <64 x i8> %180, <64 x i8> , <64 x i32> + %182 = shufflevector <64 x i8> %181, <64 x i8> , <64 x i32> + %183 = shufflevector <64 x i8> %182, <64 x i8> , <64 x i32> + %184 = shufflevector <64 x i8> %183, <64 x i8> , <64 x i32> + %185 = shufflevector <64 x i8> %184, <64 x i8> , <64 x i32> + %186 = shufflevector <64 x i8> %185, <64 x i8> , <64 x i32> + %187 = shufflevector <64 x i8> %186, <64 x i8> , <64 x i32> + %188 = shufflevector <64 x i8> %187, <64 x i8> , <64 x i32> + %189 = shufflevector <64 x i8> %188, <64 x i8> , <64 x i32> + %190 = shufflevector <64 x i8> %189, <64 x i8> , <64 x i32> + %191 = shufflevector <4 x i8> %141, <4 x i8> poison, <64 x i32> + %192 = shufflevector <64 x i8> %190, <64 x i8> %191, <64 x i32> + %193 = bitcast <64 x i8> %192 to <16 x i32> + %194 = shufflevector <4 x i8> %139, <4 x i8> zeroinitializer, <64 x i32> + %195 = shufflevector <8 x i8> %136, <8 x i8> poison, <64 x i32> + %196 = shufflevector <64 x i8> %194, <64 x i8> %195, <64 x i32> + %197 = shufflevector <4 x i8> %137, <4 x i8> poison, <64 x i32> + %198 = shufflevector <64 x i8> %196, <64 x i8> %197, <64 x i32> + %199 = shufflevector <8 x i8> %134, <8 x i8> poison, <64 x i32> + %200 = shufflevector <64 x i8> %198, <64 x i8> %199, <64 x i32> + %201 = shufflevector <4 x i8> %135, <4 x i8> poison, <64 x i32> + %202 = shufflevector <64 x i8> %200, <64 x i8> %201, <64 x i32> + %203 = shufflevector <8 x i8> %133, <8 x i8> poison, <64 x i32> + %204 = shufflevector <64 x i8> %202, <64 x i8> %203, <64 x i32> + %205 = shufflevector <64 x i8> %204, <64 x i8> , <64 x i32> + %206 = shufflevector <64 x i8> %205, <64 x i8> , <64 x i32> + %207 = shufflevector <64 x i8> %206, <64 x i8> , <64 x i32> + %208 = shufflevector <64 x i8> %207, <64 x i8> , <64 x i32> + %209 = shufflevector <64 x i8> %208, <64 x i8> , <64 x i32> + %210 = shufflevector <64 x i8> %209, <64 x i8> , <64 
x i32> + %211 = shufflevector <64 x i8> %210, <64 x i8> , <64 x i32> + %212 = shufflevector <64 x i8> %211, <64 x i8> , <64 x i32> + %213 = shufflevector <64 x i8> %212, <64 x i8> , <64 x i32> + %214 = bitcast <64 x i8> %213 to <16 x i32> + %215 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %167, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %216 = extractelement <8 x float> %215, i64 7 + %217 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %170, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %218 = extractelement <8 x float> %217, i64 0 + %219 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %173, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %220 = extractelement <8 x float> %219, i64 6 + %221 = extractelement <8 x float> %219, i64 7 + %222 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %176, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %223 = extractelement <8 x float> %222, i64 0 + %224 = extractelement <8 x float> %222, i64 3 + %225 = extractelement <8 x float> %222, i64 4 + %226 = extractelement <8 x float> %222, i64 5 + %227 = extractelement <8 x float> %222, i64 7 + %228 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %193, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %229 = extractelement <8 x float> %228, i64 0 + %230 = extractelement <8 x float> %228, i64 1 + %231 = extractelement <8 
x float> %228, i64 2 + %232 = extractelement <8 x float> %228, i64 3 + %233 = extractelement <8 x float> %228, i64 4 + %234 = extractelement <8 x float> %228, i64 5 + %235 = extractelement <8 x float> %228, i64 6 + %236 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %214, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %237 = extractelement <8 x float> %236, i64 0 + %238 = extractelement <8 x float> %236, i64 3 + %239 = extractelement <8 x float> %236, i64 5 + %240 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> zeroinitializer, i32 0, <16 x i32> %35, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %241 = extractelement <8 x float> %240, i64 0 + %242 = extractelement <8 x float> %240, i64 1 + %243 = extractelement <8 x float> %240, i64 4 + %244 = extractelement <8 x float> %240, i64 6 + %245 = tail call float @llvm.maxnum.f32(float %216, float 0.000000e+00) + %246 = tail call float @llvm.maxnum.f32(float %245, float %218) + %247 = tail call float @llvm.maxnum.f32(float %246, float 0.000000e+00) + %248 = tail call float @llvm.maxnum.f32(float %247, float %220) + %249 = tail call float @llvm.maxnum.f32(float %248, float %221) + %250 = tail call float @llvm.maxnum.f32(float %249, float %223) + %251 = tail call float @llvm.maxnum.f32(float %250, float 0.000000e+00) + %252 = tail call float @llvm.maxnum.f32(float %251, float %224) + %253 = tail call float @llvm.maxnum.f32(float %252, float %225) + %254 = tail call float @llvm.maxnum.f32(float %253, float %226) + %255 = tail call float @llvm.maxnum.f32(float %254, float 0.000000e+00) + %256 = tail call float @llvm.maxnum.f32(float %255, float %227) + %257 = tail call float @llvm.maxnum.f32(float %256, float %229) + %258 = tail call float @llvm.maxnum.f32(float %257, float %230) + 
%259 = tail call float @llvm.maxnum.f32(float %258, float %231) + %260 = tail call float @llvm.maxnum.f32(float %259, float %232) + %261 = tail call float @llvm.maxnum.f32(float %260, float %233) + %262 = tail call float @llvm.maxnum.f32(float %261, float %234) + %263 = tail call float @llvm.maxnum.f32(float %262, float %235) + %264 = tail call float @llvm.maxnum.f32(float %263, float 0.000000e+00) + %265 = tail call float @llvm.maxnum.f32(float %264, float %237) + %266 = tail call float @llvm.maxnum.f32(float %265, float 0.000000e+00) + %267 = tail call float @llvm.maxnum.f32(float %266, float %238) + %268 = tail call float @llvm.maxnum.f32(float %267, float 0.000000e+00) + %269 = tail call float @llvm.maxnum.f32(float %268, float %239) + %270 = tail call float @llvm.maxnum.f32(float %269, float 0.000000e+00) + %271 = tail call float @llvm.maxnum.f32(float %270, float %241) + %272 = tail call float @llvm.maxnum.f32(float %271, float %242) + %273 = tail call float @llvm.maxnum.f32(float %272, float 0.000000e+00) + %274 = tail call float @llvm.maxnum.f32(float %273, float %243) + %275 = tail call float @llvm.maxnum.f32(float %274, float 0.000000e+00) + %276 = tail call float @llvm.maxnum.f32(float %275, float %244) + %277 = tail call float @llvm.maxnum.f32(float %276, float 0.000000e+00) + %278 = bitcast float %277 to i32 + %279 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %278, i32 16415) + %280 = bitcast i32 %279 to float + %281 = tail call float @llvm.maxnum.f32(float %277, float %280) + %282 = tail call float @llvm.maxnum.f32(float %162, float %281) + %283 = fmul float %220, 0x3FC0527DC0000000 + %284 = fmul float %221, 0x3FC0527DC0000000 + %285 = fmul float %223, 0x3FC0527DC0000000 + %286 = fmul float %224, 0x3FC0527DC0000000 + %287 = fmul float %225, 0x3FC0527DC0000000 + %288 = fmul float %226, 0x3FC0527DC0000000 + %289 = fmul float %227, 0x3FC0527DC0000000 + %290 = fmul float %229, 0x3FC0527DC0000000 + %291 = fmul float %230, 0x3FC0527DC0000000 + %292 = fmul 
float %231, 0x3FC0527DC0000000 + %293 = fmul float %232, 0x3FC0527DC0000000 + %294 = fmul float %233, 0x3FC0527DC0000000 + %295 = fmul float %234, 0x3FC0527DC0000000 + %296 = fmul float %235, 0x3FC0527DC0000000 + %297 = fmul float %282, 0x3FC0527DC0000000 + %298 = fsub float %283, %297 + %299 = fsub float %284, %297 + %300 = fsub float %285, %297 + %301 = fsub float 0.000000e+00, %297 + %302 = fsub float %286, %297 + %303 = fsub float %287, %297 + %304 = fsub float %288, %297 + %305 = fsub float %289, %297 + %306 = fsub float %290, %297 + %307 = fsub float %291, %297 + %308 = fsub float %292, %297 + %309 = fsub float %293, %297 + %310 = fsub float %294, %297 + %311 = fsub float %295, %297 + %312 = fsub float %296, %297 + %313 = tail call float @llvm.amdgcn.exp2.f32(float %298) + %314 = tail call float @llvm.amdgcn.exp2.f32(float %299) + %315 = tail call float @llvm.amdgcn.exp2.f32(float %300) + %316 = tail call float @llvm.amdgcn.exp2.f32(float %301) + %317 = tail call float @llvm.amdgcn.exp2.f32(float %302) + %318 = tail call float @llvm.amdgcn.exp2.f32(float %303) + %319 = tail call float @llvm.amdgcn.exp2.f32(float %304) + %320 = tail call float @llvm.amdgcn.exp2.f32(float %305) + %321 = tail call float @llvm.amdgcn.exp2.f32(float %306) + %322 = tail call float @llvm.amdgcn.exp2.f32(float %307) + %323 = tail call float @llvm.amdgcn.exp2.f32(float %308) + %324 = tail call float @llvm.amdgcn.exp2.f32(float %309) + %325 = tail call float @llvm.amdgcn.exp2.f32(float %310) + %326 = tail call float @llvm.amdgcn.exp2.f32(float %311) + %327 = tail call float @llvm.amdgcn.exp2.f32(float %312) + %328 = insertelement <2 x float> , float %159, i64 0 + %329 = fadd <2 x float> %328, zeroinitializer + %330 = insertelement <2 x float> , float %313, i64 0 + %331 = fadd <2 x float> %330, %329 + %332 = insertelement <2 x float> , float %314, i64 0 + %333 = fadd <2 x float> %332, %331 + %334 = insertelement <2 x float> , float %315, i64 0 + %335 = fadd <2 x float> %334, %333 + %336 
= insertelement <2 x float> , float %316, i64 0 + %337 = fadd <2 x float> %336, %335 + %338 = insertelement <2 x float> , float %317, i64 0 + %339 = fadd <2 x float> %338, %337 + %340 = insertelement <2 x float> , float %318, i64 0 + %341 = fadd <2 x float> %340, %339 + %342 = insertelement <2 x float> , float %319, i64 0 + %343 = fadd <2 x float> %342, %341 + %344 = fadd <2 x float> %343, zeroinitializer + %345 = insertelement <2 x float> , float %320, i64 0 + %346 = fadd <2 x float> %345, %344 + %347 = insertelement <2 x float> , float %321, i64 0 + %348 = fadd <2 x float> %347, %346 + %349 = insertelement <2 x float> , float %322, i64 0 + %350 = fadd <2 x float> %349, %348 + %351 = insertelement <2 x float> , float %323, i64 0 + %352 = fadd <2 x float> %351, %350 + %353 = insertelement <2 x float> , float %324, i64 0 + %354 = fadd <2 x float> %353, %352 + %355 = insertelement <2 x float> , float %325, i64 0 + %356 = fadd <2 x float> %355, %354 + %357 = insertelement <2 x float> , float %326, i64 0 + %358 = fadd <2 x float> %357, %356 + %359 = insertelement <2 x float> , float %327, i64 0 + %360 = fadd <2 x float> %359, %358 + %361 = fadd <2 x float> %360, zeroinitializer + %362 = insertelement <2 x float> , float %159, i64 1 + %363 = fadd <2 x float> %362, zeroinitializer + %364 = fadd <2 x float> %158, %363 + %365 = fadd <2 x float> %158, %364 + %bc = bitcast <2 x float> %365 to <2 x i32> + %366 = extractelement <2 x i32> %bc, i64 0 + %367 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %366, i32 16415) + %368 = insertelement <2 x i32> poison, i32 %367, i64 0 + %369 = bitcast <2 x i32> %368 to <2 x float> + %370 = fadd <2 x float> %365, %369 + %bc4593 = bitcast <2 x float> %361 to <2 x i32> + %371 = extractelement <2 x i32> %bc4593, i64 0 + %372 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %371, i32 16415) + %373 = insertelement <2 x i32> poison, i32 %372, i64 0 + %374 = bitcast <2 x i32> %373 to <2 x float> + %375 = fadd <2 x float> %361, %374 + %376 = 
extractelement <2 x float> %370, i64 0 + %377 = fmul float %376, 0.000000e+00 + %378 = extractelement <2 x float> %375, i64 0 + %379 = fadd float %377, %378 + %380 = fmul float %379, 0.000000e+00 + %381 = insertelement <2 x float> , float %380, i64 0 + %382 = fadd <2 x float> %381, zeroinitializer + %bc4596 = bitcast <2 x float> %382 to <2 x i32> + %383 = extractelement <2 x i32> %bc4596, i64 1 + %384 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %383, i32 16415) + %385 = bitcast i32 %384 to float + %386 = insertelement <2 x float> , float %385, i64 1 + %387 = fmul <2 x float> %386, %382 + %388 = fadd <2 x float> %386, %382 + %389 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> zeroinitializer, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2139062143, i1 false, i1 false) + %390 = shufflevector <2 x float> %387, <2 x float> poison, <2 x i32> zeroinitializer + %391 = shufflevector <2 x float> %388, <2 x float> poison, <2 x i32> + %392 = fadd <2 x float> %390, %391 + %393 = shufflevector <2 x float> %392, <2 x float> poison, <4 x i32> + %394 = shufflevector <4 x float> %393, <4 x float> zeroinitializer, <8 x i32> + %395 = fdiv <8 x float> zeroinitializer, %394 + %396 = fcmp uno <8 x float> %395, zeroinitializer + %397 = select <8 x i1> %396, <8 x bfloat> splat (bfloat 0xR7FFF), <8 x bfloat> zeroinitializer + %398 = bitcast <8 x bfloat> %397 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %398, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %399 = fdiv <8 x float> %389, zeroinitializer + %400 = fcmp uno <8 x float> %399, zeroinitializer + %401 = select <8 x i1> %400, <8 x bfloat> splat (bfloat 0xR7FFF), <8 x bfloat> zeroinitializer + %402 = bitcast <8 x bfloat> %401 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %402, ptr addrspace(8) null, i32 0, i32 0, i32 0) + ret void +} + 
+attributes #0 = { nofree norecurse nounwind "amdgpu-flat-work-group-size"="1,128" "amdgpu-waves-per-eu"="1,1" }