1,784 changes: 892 additions & 892 deletions llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Large diffs are not rendered by default.

2,290 changes: 1,144 additions & 1,146 deletions llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
# REQUIRES: asserts

--- |
; IR-level stub for the MIR function of the same name that follows this
; embedded module. The body is empty (it just returns); the definition only
; carries the "amdgpu-flat-work-group-size"="1,64" function attribute.
define void @sink_and_inc_idx_when_skipping_small_region_1() "amdgpu-flat-work-group-size"="1,64" {
ret void
}

; IR-level stub with an empty body that carries the
; "amdgpu-flat-work-group-size"="1,64" function attribute.
; NOTE(review): presumably paired with a MIR function of the same name later
; in the file; that definition is not visible here -- confirm.
define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" {
ret void
}
---
name: sink_and_inc_idx_when_skipping_small_region_1
tracksRegLiveness: true
Expand Down
144 changes: 72 additions & 72 deletions llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory_clause.mir
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,11 @@ body: |
# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec
# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec
# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec
# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec
# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec
# GCN-NEXT: KILL %0{{$}}
# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec
# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec
# GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec
# GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec
# GCN-NEXT: KILL %1{{$}}

---
Expand All @@ -278,8 +280,10 @@ body: |
%4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec
%5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec
%6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec
%7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec
%8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec
%7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec
%8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec
%9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec
%10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec
...

# GCN-LABEL: {{^}}name: image_clause{{$}}
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
; GFX9-NEXT: .LBB1_2: ; %bb23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0
; GFX9-NEXT: v_add_u32_e32 v18, v9, v0
; GFX9-NEXT: v_add_u32_e32 v12, v17, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5
; GFX9-NEXT: v_add_u32_e32 v19, v3, v16
; GFX9-NEXT: v_add_u32_e32 v3, v9, v0
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4]
; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5
; GFX9-NEXT: v_add_u32_e32 v20, v3, v16
; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4]
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4]
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18
; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
; GFX9-NEXT: global_load_dword v3, v[18:19], off
Expand Down
140 changes: 73 additions & 67 deletions llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @max_occupancy() {
define amdgpu_kernel void @max_occupancy() #10 {
ret void
}

Expand Down Expand Up @@ -52,7 +52,7 @@ define amdgpu_kernel void @limited_occupancy_19() #2 {
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_24_vgprs() {
define amdgpu_kernel void @used_24_vgprs() #10 {
call void asm sideeffect "", "~{v23}" ()
ret void
}
Expand All @@ -63,7 +63,7 @@ define amdgpu_kernel void @used_24_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_28_vgprs() {
define amdgpu_kernel void @used_28_vgprs() #10 {
call void asm sideeffect "", "~{v27}" ()
ret void
}
Expand All @@ -74,7 +74,7 @@ define amdgpu_kernel void @used_28_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_32_vgprs() {
define amdgpu_kernel void @used_32_vgprs() #10 {
call void asm sideeffect "", "~{v31}" ()
ret void
}
Expand All @@ -86,7 +86,7 @@ define amdgpu_kernel void @used_32_vgprs() {
; GFX1030W64: ; Occupancy: 12
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_36_vgprs() {
define amdgpu_kernel void @used_36_vgprs() #10 {
call void asm sideeffect "", "~{v35}" ()
ret void
}
Expand All @@ -97,7 +97,7 @@ define amdgpu_kernel void @used_36_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_40_vgprs() {
define amdgpu_kernel void @used_40_vgprs() #10 {
call void asm sideeffect "", "~{v39}" ()
ret void
}
Expand All @@ -109,7 +109,7 @@ define amdgpu_kernel void @used_40_vgprs() {
; GFX1030W64: ; Occupancy: 10
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_44_vgprs() {
define amdgpu_kernel void @used_44_vgprs() #10 {
call void asm sideeffect "", "~{v43}" ()
ret void
}
Expand All @@ -120,7 +120,7 @@ define amdgpu_kernel void @used_44_vgprs() {
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_48_vgprs() {
define amdgpu_kernel void @used_48_vgprs() #10 {
call void asm sideeffect "", "~{v47}" ()
ret void
}
Expand All @@ -132,7 +132,7 @@ define amdgpu_kernel void @used_48_vgprs() {
; GFX1030W32: ; Occupancy: 16
; GFX1100W64: ; Occupancy: 12
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_56_vgprs() {
define amdgpu_kernel void @used_56_vgprs() #10 {
call void asm sideeffect "", "~{v55}" ()
ret void
}
Expand All @@ -143,7 +143,7 @@ define amdgpu_kernel void @used_56_vgprs() {
; GFX10W32: ; Occupancy: 16
; GFX1100W64: ; Occupancy: 10
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_64_vgprs() {
define amdgpu_kernel void @used_64_vgprs() #10 {
call void asm sideeffect "", "~{v63}" ()
ret void
}
Expand All @@ -155,7 +155,7 @@ define amdgpu_kernel void @used_64_vgprs() {
; GFX1030W32: ; Occupancy: 12
; GFX1100W64: ; Occupancy: 10
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_72_vgprs() {
define amdgpu_kernel void @used_72_vgprs() #10 {
call void asm sideeffect "", "~{v71}" ()
ret void
}
Expand All @@ -166,7 +166,7 @@ define amdgpu_kernel void @used_72_vgprs() {
; GFX10W32: ; Occupancy: 12
; GFX1100W64: ; Occupancy: 9
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_80_vgprs() {
define amdgpu_kernel void @used_80_vgprs() #10 {
call void asm sideeffect "", "~{v79}" ()
ret void
}
Expand All @@ -179,7 +179,7 @@ define amdgpu_kernel void @used_80_vgprs() {
; GFX1030W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 9
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_84_vgprs() {
define amdgpu_kernel void @used_84_vgprs() #10 {
call void asm sideeffect "", "~{v83}" ()
ret void
}
Expand All @@ -191,7 +191,7 @@ define amdgpu_kernel void @used_84_vgprs() {
; GFX1030W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 8
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_88_vgprs() {
define amdgpu_kernel void @used_88_vgprs() #10 {
call void asm sideeffect "", "~{v87}" ()
ret void
}
Expand All @@ -202,7 +202,7 @@ define amdgpu_kernel void @used_88_vgprs() {
; GFX10W32: ; Occupancy: 10
; GFX1100W64: ; Occupancy: 8
; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_96_vgprs() {
define amdgpu_kernel void @used_96_vgprs() #10 {
call void asm sideeffect "", "~{v95}" ()
ret void
}
Expand All @@ -214,7 +214,7 @@ define amdgpu_kernel void @used_96_vgprs() {
; GFX10W32: ; Occupancy: 9
; GFX1100W64: ; Occupancy: 7
; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_100_vgprs() {
define amdgpu_kernel void @used_100_vgprs() #10 {
call void asm sideeffect "", "~{v99}" ()
ret void
}
Expand All @@ -225,7 +225,7 @@ define amdgpu_kernel void @used_100_vgprs() {
; GFX10W32: ; Occupancy: 9
; GFX1100W64: ; Occupancy: 6
; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_112_vgprs() {
define amdgpu_kernel void @used_112_vgprs() #10 {
call void asm sideeffect "", "~{v111}" ()
ret void
}
Expand All @@ -236,7 +236,7 @@ define amdgpu_kernel void @used_112_vgprs() {
; GFX10W32: ; Occupancy: 8
; GFX1100W64: ; Occupancy: 5
; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_128_vgprs() {
define amdgpu_kernel void @used_128_vgprs() #10 {
call void asm sideeffect "", "~{v127}" ()
ret void
}
Expand All @@ -247,7 +247,7 @@ define amdgpu_kernel void @used_128_vgprs() {
; GFX10W32: ; Occupancy: 7
; GFX1100W64: ; Occupancy: 5
; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_144_vgprs() {
define amdgpu_kernel void @used_144_vgprs() #10 {
call void asm sideeffect "", "~{v143}" ()
ret void
}
Expand All @@ -259,7 +259,7 @@ define amdgpu_kernel void @used_144_vgprs() {
; GFX1030W32: ; Occupancy: 5
; GFX1100W64: ; Occupancy: 4
; GFX1100W32: ; Occupancy: 9
define amdgpu_kernel void @used_168_vgprs() {
define amdgpu_kernel void @used_168_vgprs() #10 {
call void asm sideeffect "", "~{v167}" ()
ret void
}
Expand All @@ -271,7 +271,7 @@ define amdgpu_kernel void @used_168_vgprs() {
; GFX1030W32: ; Occupancy: 4
; GFX1100W64: ; Occupancy: 3
; GFX1100W32: ; Occupancy: 7
define amdgpu_kernel void @used_200_vgprs() {
define amdgpu_kernel void @used_200_vgprs() #10 {
call void asm sideeffect "", "~{v199}" ()
ret void
}
Expand All @@ -282,7 +282,7 @@ define amdgpu_kernel void @used_200_vgprs() {
; GFX10W32: ; Occupancy: 4
; GFX1100W64: ; Occupancy: 2
; GFX1100W32: ; Occupancy: 5
define amdgpu_kernel void @used_256_vgprs() {
define amdgpu_kernel void @used_256_vgprs() #10 {
call void asm sideeffect "", "~{v255}" ()
ret void
}
Expand All @@ -292,7 +292,7 @@ define amdgpu_kernel void @used_256_vgprs() {
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_80_sgprs() {
define amdgpu_kernel void @used_80_sgprs() #10 {
call void asm sideeffect "", "~{s79}" ()
ret void
}
Expand All @@ -302,7 +302,7 @@ define amdgpu_kernel void @used_80_sgprs() {
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_88_sgprs() {
define amdgpu_kernel void @used_88_sgprs() #10 {
call void asm sideeffect "", "~{s87}" ()
ret void
}
Expand All @@ -312,7 +312,7 @@ define amdgpu_kernel void @used_88_sgprs() {
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_100_sgprs() {
define amdgpu_kernel void @used_100_sgprs() #10 {
call void asm sideeffect "", "~{s99}" ()
ret void
}
Expand All @@ -322,15 +322,16 @@ define amdgpu_kernel void @used_100_sgprs() {
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_101_sgprs() {
define amdgpu_kernel void @used_101_sgprs() #10 {
call void asm sideeffect "", "~{s100}" ()
ret void
}

; GCN-LABEL: {{^}}used_lds_6552:
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX9: ; Occupancy: 8
; GFX1010W64: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 16
; GFX10W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6552() {
Expand All @@ -339,9 +340,10 @@ define amdgpu_kernel void @used_lds_6552() {
}

; GCN-LABEL: {{^}}used_lds_6556:
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX9: ; Occupancy: 8
; GFX1010W64: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 16
; GFX10W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6556() {
Expand All @@ -350,9 +352,10 @@ define amdgpu_kernel void @used_lds_6556() {
}

; GCN-LABEL: {{^}}used_lds_13112:
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
; GFX9: ; Occupancy: 8
; GFX1010W64: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 16
; GFX10W32: ; Occupancy: 16
; GFX1100: ; Occupancy: 16
@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
define amdgpu_kernel void @used_lds_13112() {
Expand All @@ -361,56 +364,58 @@ define amdgpu_kernel void @used_lds_13112() {
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
; GFX9: ; Occupancy: 7{{$}}
; GFX10W64: ; Occupancy: 7{{$}}
; GFX10W32: ; Occupancy: 14{{$}}
; GFX1100W64: ; Occupancy: 7{{$}}
; GFX1100W32: ; Occupancy: 14{{$}}
; GFX9: ; Occupancy: 2{{$}}
; GFX10W64: ; Occupancy: 4{{$}}
; GFX10W32: ; Occupancy: 8{{$}}
; GFX1100W64: ; Occupancy: 4{{$}}
; GFX1100W32: ; Occupancy: 8{{$}}
@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
; Kernel whose only instruction besides the return is a volatile i8 store to
; the addrspace(3) global @lds8252. Attribute group #3 sets
; "amdgpu-flat-work-group-size"="1,64".
define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
; GFX9: ; Occupancy: 10{{$}}
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
; GFX1100W64: ; Occupancy: 14{{$}}
; GFX1100W32: ; Occupancy: 16{{$}}
; GFX9: ; Occupancy: 4{{$}}
; GFX10W64: ; Occupancy: 8{{$}}
; GFX10W32: ; Occupancy: 12{{$}}
; GFX1100W64: ; Occupancy: 8{{$}}
; GFX1100W32: ; Occupancy: 12{{$}}
; Kernel whose only instruction besides the return is a volatile i8 store to
; the addrspace(3) global @lds8252. Attribute group #4 sets
; "amdgpu-flat-work-group-size"="1,96".
define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
; GFX9: ; Occupancy: 10{{$}}
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
; GFX1100W64: ; Occupancy: 14{{$}}
; GFX1100W32: ; Occupancy: 16{{$}}
; GFX9: ; Occupancy: 4{{$}}
; GFX10W64: ; Occupancy: 8{{$}}
; GFX10W32: ; Occupancy: 15{{$}}
; GFX1100W64: ; Occupancy: 8{{$}}
; GFX1100W32: ; Occupancy: 15{{$}}
; Kernel whose only instruction besides the return is a volatile i8 store to
; the addrspace(3) global @lds8252. Attribute group #5 sets
; "amdgpu-flat-work-group-size"="1,128".
define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
; GFX9: ; Occupancy: 6{{$}}
; GFX10W64: ; Occupancy: 12{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 15{{$}}
; GFX1100W64: ; Occupancy: 12{{$}}
; GFX1100W32: ; Occupancy: 15{{$}}
; Kernel whose only instruction besides the return is a volatile i8 store to
; the addrspace(3) global @lds8252.
; NOTE(review): attribute group #6 is defined outside the visible part of the
; file; the kernel name suggests a flat work-group size limit of 192 -- confirm.
define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
; GFX9: ; Occupancy: 7{{$}}
; GFX10W64: ; Occupancy: 15{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
; GFX1100W64: ; Occupancy: 15{{$}}
; GFX1100W32: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
Expand All @@ -427,8 +432,9 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX9: ; Occupancy: 8{{$}}
; GFX1010W32: ; Occupancy: 16{{$}}
; GFX1010W64: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
Expand All @@ -437,17 +443,17 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
}

; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
; GFX9: ; Occupancy: 7{{$}}
; GFX10: ; Occupancy: 7{{$}}
; GFX1100: ; Occupancy: 7{{$}}
; GFX9: ; Occupancy: 2{{$}}
; GFX10: ; Occupancy: 4{{$}}
; GFX1100: ; Occupancy: 4{{$}}
; Kernel whose only instruction besides the return is a volatile i8 store to
; the addrspace(3) global @lds8252.
; NOTE(review): attribute group #10 is defined outside the visible part of the
; file; the kernel name suggests a flat work-group size limit of 32 -- confirm.
define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
store volatile i8 1, ptr addrspace(3) @lds8252
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" "amdgpu-flat-work-group-size"="1,32" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" "amdgpu-flat-work-group-size"="1,32" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/pr51516.mir
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# GCN-LABEL: name: global_sextload_v32i32_to_v32i64
# GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0
# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0

---
name: global_sextload_v32i32_to_v32i64
Expand Down
233 changes: 116 additions & 117 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -446,87 +446,87 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x5000
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v6, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_movk_i32 s0, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v3
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v5
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v5
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v4, vcc
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v5
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v5
; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12]
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v3
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v3
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v3
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v3
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v4, vcc
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v5
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v6, vcc
; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v5
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v5
; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20]
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v6, vcc
; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22]
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v5
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v6, vcc
; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24]
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v5
; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v6, vcc
; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26]
; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x10000, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: s_addk_i32 s1, 0x2000
; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v7, v5
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v8, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffe800, v3
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xfffff000, v3
; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20]
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v9, v21
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v10, v22, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xfffff800, v3
; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v11, v21
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v12, v22, vcc
; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x10000, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(10)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(9)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(8)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v11, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v21
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v22, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v15, v13
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v16, v14, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v17, v13
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v18, v14, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v19, v13
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v20, v14, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v19, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v20, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v21, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v22, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v23, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v24, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v9, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v25, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v26, v4, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v11, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v27, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v28, v4, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
Expand All @@ -540,7 +540,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: clmem_read:
Expand Down Expand Up @@ -574,92 +574,91 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
; GFX900-NEXT: s_movk_i32 s0, 0x5000
; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX900-NEXT: s_movk_i32 s4, 0x7f
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: s_movk_i32 s2, 0xd000
; GFX900-NEXT: s_movk_i32 s3, 0xe000
; GFX900-NEXT: s_movk_i32 s5, 0xf000
; GFX900-NEXT: s_movk_i32 s2, 0x7f
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: s_movk_i32 s0, 0xd000
; GFX900-NEXT: s_movk_i32 s1, 0xe000
; GFX900-NEXT: s_movk_i32 s3, 0xf000
; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT: ; =>This Loop Header: Depth=1
; GFX900-NEXT: ; Child Loop BB1_2 Depth 2
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v3, v1
; GFX900-NEXT: s_mov_b32 s6, 0
; GFX900-NEXT: v_mov_b32_e32 v6, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v1
; GFX900-NEXT: s_mov_b32 s4, 0
; GFX900-NEXT: .LBB1_2: ; %for.body
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v3
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc
; GFX900-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[11:12], v[3:4], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v3
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v5
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[9:10], v[5:6], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[11:12], v[5:6], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v5
; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off
; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc
; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off
; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v3
; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v4, vcc
; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v3
; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v5
; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc
; GFX900-NEXT: s_addk_i32 s6, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v5
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21
; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v6, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s5, v3
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v4, vcc
; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX900-NEXT: global_load_dwordx2 v[13:14], v[3:4], off
; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, 0x10000, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v5
; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off
; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off
; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v5
; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v6, vcc
; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[29:30], v[5:6], off
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0x10000, v5
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX900-NEXT: s_addk_i32 s4, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(8)
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v17, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v18, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19
; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v4, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v15, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v16, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v23, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v24, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v25, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v26, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7
; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v27, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v28, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v8, vcc
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v6, vcc
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v12, v6, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v19, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v20, v4, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v13, v5
; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v14, v6, vcc
; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v29, v3
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v30, v4, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT: s_add_i32 s0, s4, -1
; GFX900-NEXT: s_cmp_eq_u32 s4, 0
; GFX900-NEXT: s_add_i32 s4, s2, -1
; GFX900-NEXT: s_cmp_eq_u32 s2, 0
; GFX900-NEXT: s_cbranch_scc1 .LBB1_5
; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX900-NEXT: s_mov_b32 s4, s0
; GFX900-NEXT: s_mov_b32 s2, s4
; GFX900-NEXT: s_branch .LBB1_1
; GFX900-NEXT: .LBB1_5: ; %while.end
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
; GFX900-NEXT: s_endpgm
;
; GFX10-LABEL: clmem_read:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ define void @test_func() !dbg !6 {
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 10
; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ body: |
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def dead %11
; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def %15, 851978 /* regdef:VGPR_HI16 */, def %16
; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def %21, 851978 /* regdef:VGPR_HI16 */, def %22
; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_HI16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_HI16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_HI16 */, %15, 851977 /* reguse:VGPR_HI16 */, %16, 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_HI16 */, [[DS_READ_B32_gfx9_2]]
; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ body: |
; CHECK-NEXT: undef %0.sub3:vreg_128 = COPY $vgpr9
; CHECK-NEXT: undef %1.sub2:vreg_128 = COPY $vgpr8
; CHECK-NEXT: undef %2.sub1:vreg_128 = COPY $vgpr7
; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1
; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0
; CHECK-NEXT: undef %3.sub0:vreg_128 = COPY $vgpr6
; CHECK-NEXT: undef %4.sub3:vreg_128 = COPY $vgpr5
; CHECK-NEXT: undef %5.sub2:vreg_128 = COPY $vgpr4
; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1
; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0
; CHECK-NEXT: undef %6.sub1:vreg_128 = COPY $vgpr3
; CHECK-NEXT: undef %7.sub0:vreg_128 = COPY $vgpr2
; CHECK-NEXT: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec
; CHECK-NEXT: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec
; CHECK-NEXT: S_BARRIER
; CHECK-NEXT: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec
; CHECK-NEXT: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec
; CHECK-NEXT: S_BARRIER
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
; CHECK-NEXT: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
Expand Down
132 changes: 71 additions & 61 deletions llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@

; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
; which currently looks to the scheduler like an occupancy reduction, even
; though it's not. TODO: Fix!
; which (incorrectly) used to look to the scheduler like an occupancy reduction.

; 6 kB of LDS, allows 10 workgroups
@lds = internal addrspace(3) global [384 x <4 x i32>] undef
Expand All @@ -20,7 +19,7 @@ define internal void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline {
define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" {
; CHECK-LABEL: test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_clause 0xa
; CHECK-NEXT: s_clause 0x1f
; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off
; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
Expand All @@ -32,81 +31,92 @@ define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"
; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:144
; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:160
; CHECK-NEXT: v_mov_b32_e32 v86, 0
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: global_load_b128 v[46:49], v[0:1], off offset:176
; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:240
; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:224
; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:208
; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:192
; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:304
; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:288
; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:272
; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:256
; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:192
; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:208
; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:224
; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:240
; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:256
; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:272
; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:288
; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:304
; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:320
; CHECK-NEXT: global_load_b128 v[86:89], v[0:1], off offset:336
; CHECK-NEXT: global_load_b128 v[90:93], v[0:1], off offset:352
; CHECK-NEXT: global_load_b128 v[94:97], v[0:1], off offset:368
; CHECK-NEXT: global_load_b128 v[98:101], v[0:1], off offset:384
; CHECK-NEXT: global_load_b128 v[102:105], v[0:1], off offset:400
; CHECK-NEXT: global_load_b128 v[106:109], v[0:1], off offset:416
; CHECK-NEXT: global_load_b128 v[110:113], v[0:1], off offset:432
; CHECK-NEXT: global_load_b128 v[114:117], v[0:1], off offset:448
; CHECK-NEXT: global_load_b128 v[118:121], v[0:1], off offset:464
; CHECK-NEXT: global_load_b128 v[122:125], v[0:1], off offset:480
; CHECK-NEXT: global_load_b128 v[126:129], v[0:1], off offset:496
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt vmcnt(31)
; CHECK-NEXT: ds_store_b128 v0, v[2:5]
; CHECK-NEXT: s_waitcnt vmcnt(30)
; CHECK-NEXT: ds_store_b128 v0, v[6:9] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(29)
; CHECK-NEXT: ds_store_b128 v0, v[10:13] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(28)
; CHECK-NEXT: ds_store_b128 v0, v[14:17] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(27)
; CHECK-NEXT: ds_store_b128 v0, v[18:21] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(26)
; CHECK-NEXT: ds_store_b128 v0, v[22:25] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(25)
; CHECK-NEXT: ds_store_b128 v0, v[26:29] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(24)
; CHECK-NEXT: ds_store_b128 v0, v[30:33] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(23)
; CHECK-NEXT: ds_store_b128 v0, v[34:37] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(22)
; CHECK-NEXT: ds_store_b128 v0, v[38:41] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(21)
; CHECK-NEXT: ds_store_b128 v0, v[42:45] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: ds_store_b128 v0, v[46:49] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(19)
; CHECK-NEXT: ds_store_b128 v86, v[2:5]
; CHECK-NEXT: ds_store_b128 v0, v[50:53] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(18)
; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:16
; CHECK-NEXT: ds_store_b128 v0, v[54:57] offset:208
; CHECK-NEXT: s_waitcnt vmcnt(17)
; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:32
; CHECK-NEXT: ds_store_b128 v0, v[58:61] offset:224
; CHECK-NEXT: s_waitcnt vmcnt(16)
; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:48
; CHECK-NEXT: ds_store_b128 v0, v[62:65] offset:240
; CHECK-NEXT: s_waitcnt vmcnt(15)
; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:64
; CHECK-NEXT: ds_store_b128 v0, v[66:69] offset:256
; CHECK-NEXT: s_waitcnt vmcnt(14)
; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:80
; CHECK-NEXT: ds_store_b128 v0, v[70:73] offset:272
; CHECK-NEXT: s_waitcnt vmcnt(13)
; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:96
; CHECK-NEXT: ds_store_b128 v0, v[74:77] offset:288
; CHECK-NEXT: s_waitcnt vmcnt(12)
; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:112
; CHECK-NEXT: ds_store_b128 v0, v[78:81] offset:304
; CHECK-NEXT: s_waitcnt vmcnt(11)
; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:128
; CHECK-NEXT: ds_store_b128 v0, v[82:85] offset:320
; CHECK-NEXT: s_waitcnt vmcnt(10)
; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:144
; CHECK-NEXT: ds_store_b128 v0, v[86:89] offset:336
; CHECK-NEXT: s_waitcnt vmcnt(9)
; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:160
; CHECK-NEXT: s_clause 0xb
; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off offset:368
; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:352
; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:336
; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:320
; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:432
; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:416
; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:400
; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:384
; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:464
; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:448
; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:480
; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:496
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: ds_store_b128 v86, v[46:49] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(16)
; CHECK-NEXT: ds_store_b128 v86, v[62:65] offset:192
; CHECK-NEXT: ds_store_b128 v86, v[58:61] offset:208
; CHECK-NEXT: ds_store_b128 v86, v[54:57] offset:224
; CHECK-NEXT: ds_store_b128 v86, v[50:53] offset:240
; CHECK-NEXT: s_waitcnt vmcnt(12)
; CHECK-NEXT: ds_store_b128 v86, v[78:81] offset:256
; CHECK-NEXT: ds_store_b128 v86, v[74:77] offset:272
; CHECK-NEXT: ds_store_b128 v86, v[70:73] offset:288
; CHECK-NEXT: ds_store_b128 v86, v[66:69] offset:304
; CHECK-NEXT: ds_store_b128 v0, v[90:93] offset:352
; CHECK-NEXT: s_waitcnt vmcnt(8)
; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:320
; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:336
; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:352
; CHECK-NEXT: ds_store_b128 v86, v[2:5] offset:368
; CHECK-NEXT: ds_store_b128 v0, v[94:97] offset:368
; CHECK-NEXT: s_waitcnt vmcnt(7)
; CHECK-NEXT: ds_store_b128 v0, v[98:101] offset:384
; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: ds_store_b128 v0, v[102:105] offset:400
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: ds_store_b128 v0, v[106:109] offset:416
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:384
; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:400
; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:416
; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:432
; CHECK-NEXT: ds_store_b128 v0, v[110:113] offset:432
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: ds_store_b128 v0, v[114:117] offset:448
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:448
; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:464
; CHECK-NEXT: ds_store_b128 v0, v[118:121] offset:464
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:480
; CHECK-NEXT: ds_store_b128 v0, v[122:125] offset:480
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_store_b128 v86, v[82:85] offset:496
; CHECK-NEXT: ds_store_b128 v0, v[126:129] offset:496
; CHECK-NEXT: s_endpgm
call void @copy(ptr addrspace(1) %src, i32 0)
call void @copy(ptr addrspace(1) %src, i32 1)
Expand Down
336 changes: 168 additions & 168 deletions llvm/test/CodeGen/AMDGPU/sdiv.ll

Large diffs are not rendered by default.

132 changes: 66 additions & 66 deletions llvm/test/CodeGen/AMDGPU/sdiv64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -366,96 +366,96 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0
; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3
; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1
; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v4
; GCN-IR-NEXT: v_subb_u32_e32 v12, vcc, v1, v4, vcc
; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3
; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[11:12]
; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2
; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3
; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v11
; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12
; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8
; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13
; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8]
; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1
; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v0, v4
; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v1, v4, vcc
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2
; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v12, v2, v3
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v10
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11
; GCN-IR-NEXT: v_min_u32_e32 v13, v2, v3
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v12, v13
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3]
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7]
; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7]
; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[4:5]
; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[11:12], v7
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[10:11], v2
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc
; GCN-IR-NEXT: v_not_b32_e32 v0, v0
; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14
; GCN-IR-NEXT: v_not_b32_e32 v9, 0
; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13
; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc
; GCN-IR-NEXT: v_not_b32_e32 v9, v12
; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[10:11], v14
; GCN-IR-NEXT: v_not_b32_e32 v8, 0
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v13
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8
; GCN-IR-NEXT: v_or_b32_e32 v0, v14, v0
; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v0
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc
; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9
; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v11
; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7
; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13
; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3
; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2
; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[11:12]
; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13
; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8
; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v17, v10
; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v14
; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v15, vcc
; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10
; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1
; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v12
; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v13, v9
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GCN-IR-NEXT: v_mov_b32_e32 v16, v9
; GCN-IR-NEXT: v_mov_b32_e32 v12, v8
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: .LBB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v3
; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v2
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
; GCN-IR-NEXT: .LBB1_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4
; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6
; GCN-IR-NEXT: v_xor_b32_e32 v3, v9, v0
; GCN-IR-NEXT: v_xor_b32_e32 v2, v10, v1
; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6
; GCN-IR-NEXT: v_xor_b32_e32 v3, v8, v0
; GCN-IR-NEXT: v_xor_b32_e32 v2, v9, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/shift-i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -288,18 +288,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12
Expand Down Expand Up @@ -337,18 +337,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
Expand Down Expand Up @@ -386,18 +386,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -918,20 +918,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13
; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_v4i64:
Expand Down
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/AMDGPU/sra.ll
Original file line number Diff line number Diff line change
Expand Up @@ -614,20 +614,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13
; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: ashr_v4i64:
Expand All @@ -640,20 +640,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7]
; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5]
; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1]
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i64:
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/srl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13
; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v4i64:
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/ssubsat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -755,14 +755,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17
; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
Expand Down Expand Up @@ -874,14 +874,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17
; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/udiv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -862,43 +862,43 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v14, v10, v0
; GCN-NEXT: v_mul_lo_u32 v16, v11, v1
; GCN-NEXT: v_mul_lo_u32 v18, v12, v2
; GCN-NEXT: v_mul_lo_u32 v19, v13, v3
; GCN-NEXT: v_mul_lo_u32 v20, v13, v3
; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14
; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16
; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18
; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v19
; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v20
; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10
; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11
; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12
; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13
; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12
; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
; GCN-NEXT: v_subrev_u32_e32 v18, vcc, v0, v4
; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v0, v4
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1]
; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v1, v5
; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3]
; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v2, v6
; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v3, v7
; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[0:1]
; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10
; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v2, v6
; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5]
; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v3, v7
; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1]
; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3]
; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v17, s[4:5]
; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v14, s[6:7]
; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5]
; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[6:7]
; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm
;
Expand Down
66 changes: 35 additions & 31 deletions llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ define hidden void @blam() {
; GCN-NEXT: v_writelane_b32 v40, s55, 23
; GCN-NEXT: v_writelane_b32 v40, s56, 24
; GCN-NEXT: v_writelane_b32 v40, s57, 25
; GCN-NEXT: v_writelane_b32 v40, s58, 26
; GCN-NEXT: v_writelane_b32 v40, s59, 27
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: s_mov_b32 s46, s15
; GCN-NEXT: s_mov_b32 s47, s14
Expand All @@ -316,23 +318,26 @@ define hidden void @blam() {
; GCN-NEXT: s_mov_b64 s[50:51], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41
; GCN-NEXT: flat_load_dword v44, v[0:1]
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v41
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0
; GCN-NEXT: s_getpc_b64 s[52:53]
; GCN-NEXT: s_add_u32 s52, s52, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s53, s53, spam@rel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v44
; GCN-NEXT: v_cmp_eq_f32_e64 s[54:55], 0, v44
; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v44
; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000
; GCN-NEXT: s_branch .LBB1_2
; GCN-NEXT: .LBB1_1: ; %Flow7
; GCN-NEXT: .LBB1_1: ; %Flow7
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51]
; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51]
; GCN-NEXT: s_cbranch_execz .LBB1_18
; GCN-NEXT: .LBB1_2: ; %bb2
; GCN-NEXT: .LBB1_2: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: flat_load_dword v0, v[42:43]
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0
Expand All @@ -341,18 +346,15 @@ define hidden void @blam() {
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9]
; GCN-NEXT: s_xor_b64 s[56:57], exec, s[8:9]
; GCN-NEXT: s_cbranch_execz .LBB1_12
; GCN-NEXT: ; %bb.3: ; %bb6
; GCN-NEXT: ; %bb.3: ; %bb6
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0
; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45]
; GCN-NEXT: s_and_saveexec_b64 s[58:59], s[44:45]
; GCN-NEXT: s_cbranch_execz .LBB1_11
; GCN-NEXT: ; %bb.4: ; %bb11
; GCN-NEXT: ; %bb.4: ; %bb11
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, spam@rel32@hi+12
; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
Expand All @@ -362,63 +364,63 @@ define hidden void @blam() {
; GCN-NEXT: s_mov_b32 s14, s47
; GCN-NEXT: s_mov_b32 s15, s46
; GCN-NEXT: v_mov_b32_e32 v31, v41
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_swappc_b64 s[30:31], s[52:53]
; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_10
; GCN-NEXT: ; %bb.5: ; %bb14
; GCN-NEXT: ; %bb.5: ; %bb14
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_mov_b64 s[8:9], s[52:53]
; GCN-NEXT: s_mov_b64 s[8:9], s[54:55]
; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43]
; GCN-NEXT: s_cbranch_execz .LBB1_7
; GCN-NEXT: ; %bb.6: ; %bb16
; GCN-NEXT: ; %bb.6: ; %bb16
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec
; GCN-NEXT: .LBB1_7: ; %Flow3
; GCN-NEXT: s_or_b64 s[8:9], s[54:55], exec
; GCN-NEXT: .LBB1_7: ; %Flow3
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9]
; GCN-NEXT: s_xor_b64 s[8:9], exec, s[10:11]
; GCN-NEXT: s_cbranch_execz .LBB1_9
; GCN-NEXT: ; %bb.8: ; %bb17
; GCN-NEXT: ; %bb.8: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
; GCN-NEXT: .LBB1_9: ; %Flow4
; GCN-NEXT: .LBB1_9: ; %Flow4
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
; GCN-NEXT: .LBB1_10: ; %Flow2
; GCN-NEXT: .LBB1_10: ; %Flow2
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec
; GCN-NEXT: s_and_b64 s[8:9], vcc, exec
; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9]
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
; GCN-NEXT: .LBB1_11: ; %Flow1
; GCN-NEXT: .LBB1_11: ; %Flow1
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[56:57]
; GCN-NEXT: s_or_b64 exec, exec, s[58:59]
; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB1_12: ; %Flow
; GCN-NEXT: .LBB1_12: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55]
; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[56:57]
; GCN-NEXT: s_cbranch_execz .LBB1_16
; GCN-NEXT: ; %bb.13: ; %bb8
; GCN-NEXT: ; %bb.13: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_15
; GCN-NEXT: ; %bb.14: ; %bb10
; GCN-NEXT: ; %bb.14: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec
; GCN-NEXT: .LBB1_15: ; %Flow6
; GCN-NEXT: .LBB1_15: ; %Flow6
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
Expand All @@ -427,18 +429,20 @@ define hidden void @blam() {
; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT: .LBB1_16: ; %Flow5
; GCN-NEXT: .LBB1_16: ; %Flow5
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB1_1
; GCN-NEXT: ; %bb.17: ; %bb18
; GCN-NEXT: ; %bb.17: ; %bb18
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_branch .LBB1_1
; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
; GCN-NEXT: s_or_b64 exec, exec, s[50:51]
; GCN-NEXT: v_readlane_b32 s59, v40, 27
; GCN-NEXT: v_readlane_b32 s58, v40, 26
; GCN-NEXT: v_readlane_b32 s57, v40, 25
; GCN-NEXT: v_readlane_b32 s56, v40, 24
; GCN-NEXT: v_readlane_b32 s55, v40, 23
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
# FULL-NEXT: occupancy: 8
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: body:

Expand Down Expand Up @@ -74,7 +74,7 @@
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: occupancy: 8
# SIMPLE-NEXT: body:
name: kernel0
machineFunctionInfo:
Expand Down Expand Up @@ -142,7 +142,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
# FULL-NEXT: occupancy: 8
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: body:

Expand All @@ -161,7 +161,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: occupancy: 8
# SIMPLE-NEXT: body:

name: no_mfi
Expand Down Expand Up @@ -211,7 +211,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
# FULL-NEXT: occupancy: 8
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: body:

Expand All @@ -230,7 +230,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: occupancy: 8
# SIMPLE-NEXT: body:

name: empty_mfi
Expand Down Expand Up @@ -281,7 +281,7 @@ body: |
# FULL-NEXT: fp64-fp16-input-denormals: true
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
# FULL-NEXT: occupancy: 8
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: body:

Expand All @@ -301,7 +301,7 @@ body: |
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: occupancy: 8
# SIMPLE-NEXT: body:

name: empty_mfi_entry_func
Expand Down Expand Up @@ -430,7 +430,7 @@ body: |

---
# ALL-LABEL: name: occupancy_0
# ALL: occupancy: 10
# ALL: occupancy: 8
name: occupancy_0
machineFunctionInfo:
occupancy: 0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: occupancy: 8
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
Expand Down Expand Up @@ -132,7 +132,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: occupancy: 8
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: body:
define void @function() {
Expand Down Expand Up @@ -178,7 +178,7 @@ define void @function() {
; CHECK-NEXT: fp64-fp16-input-denormals: true
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: occupancy: 8
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: body:
define void @function_nsz() #0 {
Expand Down