824 changes: 824 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -29,6 +29,7 @@ void llvm::initializeVectorization(PassRegistry &Registry) {
initializeBBVectorizePass(Registry);
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerPass(Registry);
}

void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
150 changes: 150 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
@@ -0,0 +1,150 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

declare i32 @llvm.amdgcn.workitem.id.x() #1
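; Test that accesses whose GEP indices match only after a sign or zero
; extension of a narrower index are still recognized as consecutive and
; merged.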

; CHECK-LABEL: @basic_merge_sext_index(
; CHECK: sext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float> zeroinitializer
define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%sext.id.x = sext i32 %id.x to i64
%a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
%c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
%a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
%c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

%ld.c = load float, float addrspace(1)* %c.idx.x, align 4
%ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4

store float 0.0, float addrspace(1)* %a.idx.x, align 4
store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

%add = fadd float %ld.c, %ld.c.idx.1
store float %add, float addrspace(1)* %b, align 4
ret void
}

; CHECK-LABEL: @basic_merge_zext_index(
; CHECK: zext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%zext.id.x = zext i32 %id.x to i64
%a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
%c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
%a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
%c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

%ld.c = load float, float addrspace(1)* %c.idx.x, align 4
%ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
store float 0.0, float addrspace(1)* %a.idx.x, align 4
store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

%add = fadd float %ld.c, %ld.c.idx.1
store float %add, float addrspace(1)* %b, align 4
ret void
}

; CHECK-LABEL: @merge_op_zext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%shl = shl i32 %id.x, 2
%zext.id.x = zext i32 %shl to i64
%a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
%c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x

%id.x.1 = or i32 %shl, 1
%id.x.1.ext = zext i32 %id.x.1 to i64

%a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
%c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext

%ld.c.0 = load float, float addrspace(1)* %c.0, align 4
store float 0.0, float addrspace(1)* %a.0, align 4
%ld.c.1 = load float, float addrspace(1)* %c.1, align 4
store float 0.0, float addrspace(1)* %a.1, align 4

%add = fadd float %ld.c.0, %ld.c.1
store float %add, float addrspace(1)* %b, align 4
ret void
}

; CHECK-LABEL: @merge_op_sext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%shl = shl i32 %id.x, 2
%sext.id.x = sext i32 %shl to i64
%a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
%c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x

%id.x.1 = or i32 %shl, 1
%id.x.1.ext = sext i32 %id.x.1 to i64

%a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
%c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext

%ld.c.0 = load float, float addrspace(1)* %c.0, align 4
store float 0.0, float addrspace(1)* %a.0, align 4
%ld.c.1 = load float, float addrspace(1)* %c.1, align 4
store float 0.0, float addrspace(1)* %a.1, align 4

%add = fadd float %ld.c.0, %ld.c.1
store float %add, float addrspace(1)* %b, align 4
ret void
}

; This case fails to vectorize without the extra extension handling in
; isConsecutiveAccess.
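;
; Roughly what the pass has to establish here (a hedged sketch, not the
; literal implementation): the shl leaves the low bits of %idx zero, so
;   %idx | 1  ==  %idx + 1            (setting a known-zero bit cannot carry)
; and therefore
;   zext(%idx | 1)  ==  zext(%idx) + 1
; which makes the two addresses exactly one element apart, i.e. consecutive.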

; CHECK-LABEL: @zext_trunc_phi_1(
; CHECK: loop:
; CHECK: load <2 x i32>
; CHECK: store <2 x i32>
define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
entry:
%cmp0 = icmp eq i32 %n, 0
br i1 %cmp0, label %exit, label %loop

loop:
%indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
%trunc.iv = trunc i64 %indvars.iv to i32
%idx = shl i32 %trunc.iv, 4

%idx.ext = zext i32 %idx to i64
%c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext
%a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext

%idx.1 = or i32 %idx, 1
%idx.1.ext = zext i32 %idx.1 to i64
%c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext
%a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext

%ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4
store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4
%ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4
store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4

%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32

%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
62 changes: 62 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -0,0 +1,62 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; Check the position of the inserted vector load relative to the
; existing adds.
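;
; In this test the merged <2 x float> load is expected to be inserted where
; the later of the two scalar loads was, so the unrelated adds %z and %w stay
; ahead of it and %foo stays after it (see the CHECK lines below).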

; CHECK-LABEL: @insert_load_point(
; CHECK: %z = add i32 %x, 4
; CHECK: %w = add i32 %y, 9
; CHECK: load <2 x float>
; CHECK: %foo = add i32 %z, %w
define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
entry:
%a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
%c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
%a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
%c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

%z = add i32 %x, 4
%ld.c = load float, float addrspace(1)* %c.idx.x, align 4
%w = add i32 %y, 9
%ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
%foo = add i32 %z, %w

store float 0.0, float addrspace(1)* %a.idx.x, align 4
store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

%add = fadd float %ld.c, %ld.c.idx.1
store float %add, float addrspace(1)* %b, align 4
store i32 %foo, i32 addrspace(3)* null, align 4
ret void
}

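; Likewise, check the position of the inserted vector store relative to the
; existing adds; it is expected to land where the later of the two scalar
; stores was.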
; CHECK-LABEL: @insert_store_point(
; CHECK: %z = add i32 %x, 4
; CHECK: %w = add i32 %y, 9
; CHECK: store <2 x float>
; CHECK: %foo = add i32 %z, %w
define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
entry:
%a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
%c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
%a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
%c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

%ld.c = load float, float addrspace(1)* %c.idx.x, align 4
%ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4

%z = add i32 %x, 4
store float 0.0, float addrspace(1)* %a.idx.x, align 4
%w = add i32 %y, 9
store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
%foo = add i32 %z, %w

%add = fadd float %ld.c, %ld.c.idx.1
store float %add, float addrspace(1)* %b, align 4
store i32 %foo, i32 addrspace(3)* null, align 4
ret void
}

attributes #0 = { nounwind }
@@ -0,0 +1,28 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; It is OK to vectorize the loads as long as the may-alias store (to %a)
; still occurs before the inserted vector load.
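;
; Expected shape after the pass (a sketch; the CHECK lines below are the
; authoritative ordering):
;   store double 0.0, double addrspace(1)* %a
;   load <2 x double>, ...                ; merged from %ld.c and %ld.c.idx.1
;   store double 0.0, double addrspace(1)* %a.idx.1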

; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
; CHECK: load <2 x double>
; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
entry:
%a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
%c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1

%ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a
store double 0.0, double addrspace(1)* %a, align 8

%ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a
store double 0.0, double addrspace(1)* %a.idx.1, align 8

%add = fadd double %ld.c, %ld.c.idx.1
store double %add, double addrspace(1)* %b

ret void
}

attributes #0 = { nounwind }
3 changes: 3 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg
@@ -0,0 +1,3 @@
if not 'AMDGPU' in config.root.targets:
    config.unsupported = True

635 changes: 635 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

Large diffs are not rendered by default.

91 changes: 91 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -0,0 +1,91 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
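; Test merging of loads and stores whose elements are themselves small vector
; types (<2 x i32>, <1 x i32>, <2 x i16>), plus cases that are expected to
; stay unmerged.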

; CHECK-LABEL: @merge_v2i32_v2i32(
; CHECK: load <4 x i32>
; CHECK: store <4 x i32> zeroinitializer
define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
%b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1

%ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4
%ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4

store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4
store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4

ret void
}

; CHECK-LABEL: @merge_v1i32_v1i32(
; CHECK: load <2 x i32>
; CHECK: store <2 x i32> zeroinitializer
define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
%b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1

%ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4
%ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4

store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4
store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4

ret void
}

; CHECK-LABEL: @no_merge_v3i32_v3i32(
; CHECK: load <3 x i32>
; CHECK: load <3 x i32>
; CHECK: store <3 x i32> zeroinitializer
; CHECK: store <3 x i32> zeroinitializer
define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
%b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1

%ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4
%ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4

store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4
store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4

ret void
}

; CHECK-LABEL: @merge_v2i16_v2i16(
; CHECK: load <4 x i16>
; CHECK: store <4 x i16> zeroinitializer
define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
%b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1

%ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4
%ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4

store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4
store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4

ret void
}

; Ideally these two loads would also be merged.
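; A hedged sketch of what the ideal output could look like (not what the pass
; currently produces): a single 64-bit access covering both values, e.g.
;   %wide = load <4 x i16>, <4 x i16> addrspace(1)* %a.cast, align 4
; with the i32 and <2 x i16> halves extracted from %wide afterwards
; (%a.cast is a hypothetical bitcast of %a for illustration).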
; CHECK-LABEL: @merge_load_i32_v2i16(
; CHECK: load i32,
; CHECK: load <2 x i16>
define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
entry:
%a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
%a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*

%ld.0 = load i32, i32 addrspace(1)* %a
%ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast

ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
@@ -0,0 +1,20 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s

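; The function is marked noimplicitfloat, so the pass must not turn these
; scalar i32 stores into a vector store.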
; CHECK-LABEL: @no_implicit_float(
; CHECK: store i32
; CHECK: store i32
; CHECK: store i32
; CHECK: store i32
define void @no_implicit_float(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

store i32 123, i32 addrspace(1)* %out.gep.1
store i32 456, i32 addrspace(1)* %out.gep.2
store i32 333, i32 addrspace(1)* %out.gep.3
store i32 1234, i32 addrspace(1)* %out
ret void
}

attributes #0 = { nounwind noimplicitfloat }