diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 38718c43a61dd..7504f1a8cea09 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -150,7 +150,10 @@ class LiveRegOptimizer { if (!CVisited.insert(CII).second) continue; - if (CII->getParent() == II->getParent() && !IsLookThru(II)) + // Same-BB filter must look at the *user*; and allow non-lookthrough + // users when the def is a PHI (loop-header pattern). + if (CII->getParent() == II->getParent() && !IsLookThru(CII) && + !isa(II)) continue; if (isOpLegal(CII)) diff --git a/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll new file mode 100644 index 0000000000000..b508f739e7fd3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -passes=amdgpu-late-codegenprepare \ +; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s + +; Goal: With a loop-header PHI in illegal vector type and a same-BB +; non-lookthrough user (vector add) in the header, LRO should still coerce +; the PHI to i32 because a profitable sink (store) exists across BB. + +define amdgpu_kernel void @phi_samebb_nonlookthrough_store( + ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) { +; CHECK-LABEL: @phi_samebb_nonlookthrough_store( +entry: + br label %loop + +loop: ; preds = %entry, %loop + ; Loop-carried PHI in illegal vector type. + %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ] + + ; Same-BB non-lookthrough use in header. + %acc.next = add <4 x i8> %acc, %v + + ; Make it a real loop: either iterate or exit to the sink block. + br i1 %exit, label %store, label %loop + +store: ; preds = %loop + ; The across-BB sink: storing the PHI coerced to i32. + %acc.bc = bitcast <4 x i8> %acc to i32 + store i32 %acc.bc, ptr addrspace(1) %out, align 4 + ret void +} + +; After AMDGPULateCodeGenPrepare we expect: +; - PHI is coerced to i32 +; - A header bitcast materializes for the add +; This proves the same-BB non-lookthrough user (add) did not get pruned +; when the def is a PHI. + +; CHECK: loop: +; CHECK: %[[ACC_TC:[^ ]+]] = phi i32 +; CHECK: %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> +; CHECK: %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v +; CHECK: br i1 %exit, label %store, label %loop +; CHECK: store: +; CHECK: %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> +; CHECK: %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32 +; CHECK: store i32 %[[ST_I32]], +