Skip to content

[LICM] freeze instruction inhibits hoisting of invariants #55672

@xortator

Description

@xortator

Godbolt example:
https://godbolt.org/z/bTGT3Mdxn

opt -passes=licm -debug-only=licm -S

; test_01: baseline reproducer WITHOUT freeze.
; The entry block branches directly on the null check of the loaded pointer.
; In the `opt` output shown later in this report, LICM hoists the load of
; %tmp119 (%tmp157) out of loop block bb150 into its preheader bb149.
define void @test_01(i8 addrspace(1)* addrspace(1)* %arg, i32 %arg2) {
bb:
  ; Load a pointer from %arg; it is tagged !dereferenceable_or_null !5 (16)
  ; and !align !6 (8).
  %tmp103 = load atomic i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %arg unordered, align 8, !dereferenceable_or_null !5, !align !6
  %tmp117 = icmp eq i8 addrspace(1)* %tmp103, null
  ; %tmp119 = address 8 bytes past %tmp103, viewed as an i32 pointer.
  ; It is computed before the loop, so it is invariant with respect to bb150.
  %tmp118 = getelementptr inbounds i8, i8 addrspace(1)* %tmp103, i64 8
  %tmp119 = bitcast i8 addrspace(1)* %tmp118 to i32 addrspace(1)*
  ; Null pointer -> return (bb122); non-null -> loop preheader (bb149).
  br i1 %tmp117, label %bb122, label %bb149

bb122:                                            ; preds = %bb
  ret void

bb149:                                            ; preds = %bb
  br label %bb150

; Loop block: branches back to itself unconditionally.
bb150:                                            ; preds = %bb150, %bb149
  ; %tmp151 is the counter: starts at 0, incremented by 1 per iteration.
  %tmp151 = phi i32 [ 0, %bb149 ], [ %tmp163, %bb150 ]
  %tmp152 = icmp ult i32 %tmp151, %arg2
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp152, i32 12) [ "deopt"() ]
  ; Load from the loop-invariant address %tmp119, placed after the first
  ; guard call. This is the instruction LICM is expected to hoist.
  %tmp157 = load atomic i32, i32 addrspace(1)* %tmp119 unordered, align 8
  %tmp158 = icmp ult i32 %tmp151, %tmp157
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp158, i32 12) [ "deopt"() ]
  %tmp163 = add i32 %tmp151, 1
  br label %bb150
}

; test_02: same IR as test_01 except that the branch condition is passed
; through `freeze` before being used by the entry-block branch.
; In the `opt` output shown later in this report, the load of %tmp119
; (%tmp157) is NOT hoisted and stays inside loop block bb150.
; NOTE(review): presumably hoisting in test_01 relies on the null-check branch
; proving %tmp103 non-null (hence dereferenceable) on the bb149 path, and the
; freeze hides that fact from the analysis -- confirm against LICM/ValueTracking.
define void @test_02(i8 addrspace(1)* addrspace(1)* %arg, i32 %arg2) {
bb:
  ; Load a pointer from %arg; it is tagged !dereferenceable_or_null !5 (16)
  ; and !align !6 (8).
  %tmp103 = load atomic i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %arg unordered, align 8, !dereferenceable_or_null !5, !align !6
  %tmp117 = icmp eq i8 addrspace(1)* %tmp103, null
  ; %tmp119 = address 8 bytes past %tmp103, viewed as an i32 pointer.
  ; It is computed before the loop, so it is invariant with respect to bb150.
  %tmp118 = getelementptr inbounds i8, i8 addrspace(1)* %tmp103, i64 8
  %tmp119 = bitcast i8 addrspace(1)* %tmp118 to i32 addrspace(1)*
  ; The only difference from test_01: the null-check result is frozen and the
  ; branch uses %freeze instead of %tmp117.
  %freeze = freeze i1 %tmp117
  br i1 %freeze, label %bb122, label %bb149

bb122:                                            ; preds = %bb
  ret void

bb149:                                            ; preds = %bb
  br label %bb150

; Loop block: branches back to itself unconditionally.
bb150:                                            ; preds = %bb150, %bb149
  ; %tmp151 is the counter: starts at 0, incremented by 1 per iteration.
  %tmp151 = phi i32 [ 0, %bb149 ], [ %tmp163, %bb150 ]
  %tmp152 = icmp ult i32 %tmp151, %arg2
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp152, i32 12) [ "deopt"() ]
  ; Load from the loop-invariant address %tmp119, placed after the first
  ; guard call. This is the load that remains in the loop after LICM.
  %tmp157 = load atomic i32, i32 addrspace(1)* %tmp119 unordered, align 8
  %tmp158 = icmp ult i32 %tmp151, %tmp157
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp158, i32 12) [ "deopt"() ]
  %tmp163 = add i32 %tmp151, 1
  br label %bb150
}

; Function Attrs: nocallback nofree nosync willreturn
declare void @llvm.experimental.guard(i1, ...) #2

!5 = !{i64 16}
!6 = !{i64 8}

Output:

LICM hoisting to bb149:   %tmp157 = load atomic i32, i32 addrspace(1)* %tmp119 unordered, align 8
; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"

define void @test_01(i8 addrspace(1)* addrspace(1)* %arg, i32 %arg2) {
bb:
  %tmp103 = load atomic i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %arg unordered, align 8, !dereferenceable_or_null !0, !align !1
  %tmp117 = icmp eq i8 addrspace(1)* %tmp103, null
  %tmp118 = getelementptr inbounds i8, i8 addrspace(1)* %tmp103, i64 8
  %tmp119 = bitcast i8 addrspace(1)* %tmp118 to i32 addrspace(1)*
  br i1 %tmp117, label %bb122, label %bb149

bb122:                                            ; preds = %bb
  ret void

bb149:                                            ; preds = %bb
  %tmp157 = load atomic i32, i32 addrspace(1)* %tmp119 unordered, align 8
  br label %bb150

bb150:                                            ; preds = %bb150, %bb149
  %tmp151 = phi i32 [ 0, %bb149 ], [ %tmp163, %bb150 ]
  %tmp152 = icmp ult i32 %tmp151, %arg2
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp152, i32 12) [ "deopt"() ]
  %tmp158 = icmp ult i32 %tmp151, %tmp157
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp158, i32 12) [ "deopt"() ]
  %tmp163 = add i32 %tmp151, 1
  br label %bb150
}

define void @test_02(i8 addrspace(1)* addrspace(1)* %arg, i32 %arg2) {
bb:
  %tmp103 = load atomic i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %arg unordered, align 8, !dereferenceable_or_null !0, !align !1
  %tmp117 = icmp eq i8 addrspace(1)* %tmp103, null
  %tmp118 = getelementptr inbounds i8, i8 addrspace(1)* %tmp103, i64 8
  %tmp119 = bitcast i8 addrspace(1)* %tmp118 to i32 addrspace(1)*
  %freeze = freeze i1 %tmp117
  br i1 %freeze, label %bb122, label %bb149

bb122:                                            ; preds = %bb
  ret void

bb149:                                            ; preds = %bb
  br label %bb150

bb150:                                            ; preds = %bb150, %bb149
  %tmp151 = phi i32 [ 0, %bb149 ], [ %tmp163, %bb150 ]
  %tmp152 = icmp ult i32 %tmp151, %arg2
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp152, i32 12) [ "deopt"() ]
  %tmp157 = load atomic i32, i32 addrspace(1)* %tmp119 unordered, align 8
  %tmp158 = icmp ult i32 %tmp151, %tmp157
  call void (i1, ...) @llvm.experimental.guard(i1 %tmp158, i32 12) [ "deopt"() ]
  %tmp163 = add i32 %tmp151, 1
  br label %bb150
}

; Function Attrs: nocallback nofree nosync willreturn
declare void @llvm.experimental.guard(i1, ...) #0

attributes #0 = { nocallback nofree nosync willreturn }

!0 = !{i64 16}
!1 = !{i64 8}

The only difference between test_01 and test_02 is the freeze on the loop-invariant condition. In the former case the invariant load gets hoisted, while in the presence of the freeze it does not. For us, this causes a massive negative performance impact now that freeze instructions are generated with -freeze-loop-unswitch-cond set to true.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions