[EarlyCSE] Update tests to use opaque pointers (NFC)
Update the EarlyCSE tests to use opaque pointers.

Worth noting that this leaves behind some bitcast ptr-to-ptr instructions
in the input IR that are no longer necessary. This is because these tests
use numbered (unnamed) instructions, so it's hard to drop the bitcasts in
an automated fashion (doing so would require renumbering all other
instructions as well). I'm leaving that as a problem for another day.
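
For example, the loop body of @test_cse in AArch64/intrinsics.ll now
begins with a no-op bitcast that is still referenced by a later call
(minimal excerpt from the updated test):

  %0 = bitcast ptr %a to ptr
  ...
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %4, ptr %0)

Dropping %0 and using %a directly would shift every later unnamed value
(%1, %2, ...), so all of their uses would have to be renumbered as well.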

The test updates have been performed using
https://gist.github.com/nikic/98357b71fd67756b0f064c9517b62a34.
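
The shape of the rewrite is visible throughout the diff: typed pointer
types become plain ptr, and the pointee-type suffix disappears from
intrinsic name mangling. For example (excerpt from @test_cse3, before
and after):

  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)

becomes

  %0 = bitcast ptr %a to ptr
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %0)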

Differential Revision: https://reviews.llvm.org/D127278
nikic committed Jun 10, 2022
1 parent 6bc8163 commit 3c514d3
Showing 30 changed files with 1,190 additions and 1,197 deletions.
92 changes: 46 additions & 46 deletions llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
@@ -3,11 +3,11 @@
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s

-define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
-; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -19,14 +19,14 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
+%0 = bitcast ptr %a to ptr
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
-%5 = bitcast i32* %a to i8*
-%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %4, ptr %0)
+%5 = bitcast ptr %a to ptr
+%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
@@ -37,12 +37,12 @@ for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}

-define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
-; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
-; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, i8* %0)
+; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a)
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -54,15 +54,15 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
+%0 = bitcast ptr %a to ptr
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
-%5 = bitcast i32* %a to i8*
-%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %4, ptr %0)
+%5 = bitcast ptr %a to ptr
+%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
@@ -73,12 +73,12 @@ for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}

-define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
+define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
-; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
-; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -90,12 +90,12 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
-%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
+%0 = bitcast ptr %a to ptr
+%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %0)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
-%1 = bitcast i32* %a to i8*
-%vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
+%1 = bitcast ptr %a to ptr
+%vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %1)
%vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
%vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
@@ -107,12 +107,12 @@ for.end: ; preds = %for.cond
}


-define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
+define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
-; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -124,15 +124,15 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
+%0 = bitcast ptr %a to ptr
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
-store i32 0, i32* %b, align 4
-%5 = bitcast i32* %a to i8*
-%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %4, ptr %0)
+store i32 0, ptr %b, align 4
+%5 = bitcast ptr %a to ptr
+%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
@@ -143,12 +143,12 @@ for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}

-define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
-; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
+; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -160,14 +160,14 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
+%0 = bitcast ptr %a to ptr
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
-%5 = bitcast i32* %a to i8*
-%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %4, ptr %0)
+%5 = bitcast ptr %a to ptr
+%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
@@ -178,13 +178,13 @@ for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}

-define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
-; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
-; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
+; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
@@ -196,15 +196,15 @@ for.cond: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
-%0 = bitcast i32* %a to i8*
+%0 = bitcast ptr %a to ptr
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
-call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
-call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
-%5 = bitcast i32* %a to i8*
-%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, ptr %0)
+call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)
+%5 = bitcast ptr %a to ptr
+%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
@@ -216,16 +216,16 @@ for.end: ; preds = %for.cond
}

; Function Attrs: nounwind
-declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
+declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind
-declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
+declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind readonly
-declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)

; Function Attrs: nounwind readonly
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
6 changes: 3 additions & 3 deletions llvm/test/Transforms/EarlyCSE/AArch64/ldstN.ll
@@ -3,14 +3,14 @@
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr)

; Although the store and the ld4 are using the same pointer, the
; data can not be reused because ld4 accesses multiple elements.
define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @foo() {
entry:
-store <4 x i16> undef, <4 x i16>* undef, align 8
-%0 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* undef)
+store <4 x i16> undef, ptr undef, align 8
+%0 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr undef)
ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %0
; CHECK-LABEL: @foo(
; CHECK: store
12 changes: 6 additions & 6 deletions llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
@@ -3,33 +3,33 @@
; CHECK-LABEL: @no_cse
; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
-define void @no_cse(i32 addrspace(1)* %out, <4 x i32> %in) {
+define void @no_cse(ptr addrspace(1) %out, <4 x i32> %in) {
%a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
%b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
%c = add i32 %a, %b
-store i32 %c, i32 addrspace(1)* %out
+store i32 %c, ptr addrspace(1) %out
ret void
}

; CHECK-LABEL: @cse_zero_offset
; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
; CHECK: add i32 [[CSE]], [[CSE]]
-define void @cse_zero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
+define void @cse_zero_offset(ptr addrspace(1) %out, <4 x i32> %in) {
%a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
%b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
%c = add i32 %a, %b
-store i32 %c, i32 addrspace(1)* %out
+store i32 %c, ptr addrspace(1) %out
ret void
}

; CHECK-LABEL: @cse_nonzero_offset
; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
; CHECK: add i32 [[CSE]], [[CSE]]
-define void @cse_nonzero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
+define void @cse_nonzero_offset(ptr addrspace(1) %out, <4 x i32> %in) {
%a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
%b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
%c = add i32 %a, %b
-store i32 %c, i32 addrspace(1)* %out
+store i32 %c, ptr addrspace(1) %out
ret void
}

2 changes: 1 addition & 1 deletion llvm/test/Transforms/EarlyCSE/PowerPC/read-reg.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64-unknown-linux-gnu"
define i64 @f(i64 %x) #0 {
entry:
%0 = call i64 @llvm.read_register.i64(metadata !0)
-call void bitcast (void (...)* @foo to void ()*)()
+call void @foo()
%1 = call i64 @llvm.read_register.i64(metadata !0)
%add = add nsw i64 %0, %1
ret i64 %add
