; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;; Test that the carry-out from a 64-bit add/sub (synthesized from two 32-bit adds/subs) is reused
;; directly (i.e. no additional compare is generated).
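;;
;; The carry-out is exercised in two forms below: the explicit idiom
;; (for %sum = add i64 %a, %b, the unsigned carry-out is icmp ult i64 %sum, %a)
;; and the @llvm.uadd.with.overflow / @llvm.usub.with.overflow intrinsics.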

; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s

%struct.uint96 = type { i64, i32 }
%struct.uint64pair = type { i64, i64 }

declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)

declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

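; uint96 increment: the carry-out of the low 64-bit add is added into the
; upper 32-bit word.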
define %struct.uint96 @v_add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
; CHECK-LABEL: v_add64_32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
; CHECK-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v0, v5
; CHECK-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v6
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %sum64 = add i64 %val64A, %val64B
  %obit = icmp ult i64 %sum64, %val64A
  %obit32 = zext i1 %obit to i32
  %sum32 = add i32 %val32, %obit32
  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
  ret %struct.uint96 %.fca.1.insert
}

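; Vector form: each lane's 64-bit sum is stored and its overflow bit is
; returned sign-extended to i64.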
define <2 x i64> @v_uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_v2i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, v2, v6
; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
; CHECK-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v4
; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v3, v2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

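; Vector subtract: the borrow-out of each lane is checked with v_cmp_gt_u64
; against the original operand.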
define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_v2i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_sub_co_u32_e32 v6, vcc, v2, v6
; CHECK-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
; CHECK-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v4
; CHECK-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v3, v2
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

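; Scalar i64 overflow add: the sum is stored and the overflow bit is returned
; sign-extended to i64.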
define i64 @v_uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

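; Adding the constant 1: overflow occurs exactly when the sum wraps to zero,
; so the carry check folds to a compare against 0 (v_cmp_eq_u64).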
define i64 @v_uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_p1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

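; Adding the constant -1: overflow occurs for any nonzero input, so the check
; folds to v_cmp_ne_u64 against 0 on the original value.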
define i64 @v_uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_n1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v0
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

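; Subtracting 1 is lowered as adding -1; the borrow is checked with
; v_cmp_gt_u64 against the original value.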
define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_p1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v0
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

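; Subtracting -1 is lowered as adding 1.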
define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_n1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v0
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_mov_b32_e32 v1, v0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; test SGPR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

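; SGPR variants: the 64-bit add/sub is done with s_add_u32/s_addc_u32 (carry
; in SCC); where the carry is recomputed into VCC it is moved back to SCC via
; s_cmp_lg_u64 vcc, 0 before the dependent s_addc_u32.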
define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
; CHECK-LABEL: s_add64_32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s6, s0, s2
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    s_addc_u32 s7, s1, s3
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; CHECK-NEXT:    s_mov_b32 s0, s6
; CHECK-NEXT:    s_cmp_lg_u64 vcc, 0
; CHECK-NEXT:    s_addc_u32 s2, s4, 0
; CHECK-NEXT:    s_mov_b32 s1, s7
; CHECK-NEXT:    ; return to shader part epilog
  %sum64 = add i64 %val64A, %val64B
  %obit = icmp ult i64 %sum64, %val64A
  %obit32 = zext i1 %obit to i32
  %sum32 = add i32 %val32, %obit32
  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
  ret %struct.uint96 %.fca.1.insert
}

define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s6, s2, s6
; CHECK-NEXT:    v_mov_b32_e32 v9, s3
; CHECK-NEXT:    s_addc_u32 s7, s3, s7
; CHECK-NEXT:    v_mov_b32_e32 v8, s2
; CHECK-NEXT:    s_add_u32 s4, s0, s4
; CHECK-NEXT:    v_mov_b32_e32 v7, s1
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; CHECK-NEXT:    s_addc_u32 s5, s1, s5
; CHECK-NEXT:    v_mov_b32_e32 v6, s0
; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
; CHECK-NEXT:    v_readfirstlane_b32 s2, v8
; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
; CHECK-NEXT:    v_readfirstlane_b32 s0, v6
; CHECK-NEXT:    v_mov_b32_e32 v2, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, s5
; CHECK-NEXT:    v_mov_b32_e32 v4, s6
; CHECK-NEXT:    v_mov_b32_e32 v5, s7
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_mov_b32 s3, s2
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_sub_u32 s6, s2, s6
; CHECK-NEXT:    v_mov_b32_e32 v9, s3
; CHECK-NEXT:    s_subb_u32 s7, s3, s7
; CHECK-NEXT:    v_mov_b32_e32 v8, s2
; CHECK-NEXT:    s_sub_u32 s4, s0, s4
; CHECK-NEXT:    v_mov_b32_e32 v7, s1
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
; CHECK-NEXT:    s_subb_u32 s5, s1, s5
; CHECK-NEXT:    v_mov_b32_e32 v6, s0
; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
; CHECK-NEXT:    v_readfirstlane_b32 s2, v8
; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
; CHECK-NEXT:    v_readfirstlane_b32 s0, v6
; CHECK-NEXT:    v_mov_b32_e32 v2, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, s5
; CHECK-NEXT:    v_mov_b32_e32 v4, s6
; CHECK-NEXT:    v_mov_b32_e32 v5, s7
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_mov_b32 s3, s2
; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_i64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s2, s0, s2
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    s_addc_u32 s3, s1, s3
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    v_mov_b32_e32 v5, s3
; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v4, s2
; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

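; With a constant operand, the scalar path keeps the overflow check entirely
; on the SALU: s_cmp_eq_u64 / s_cmp_lg_u64 followed by s_cselect_b64.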
define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s0, s0, 1
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_n1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s2, s0, -1
; CHECK-NEXT:    s_addc_u32 s3, s1, -1
; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    v_mov_b32_e32 v3, s3
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s2, s0, -1
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    s_addc_u32 s3, s1, -1
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    v_mov_b32_e32 v5, s3
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v4, s2
; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s2, s0, 1
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    s_addc_u32 s3, s1, 0
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    v_mov_b32_e32 v5, s3
; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v4, s2
; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    s_mov_b32 s1, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ; return to shader part epilog
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}