172 changes: 102 additions & 70 deletions llvm/test/CodeGen/AMDGPU/load-global-i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
Expand All @@ -1645,21 +1645,25 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T5.W, T5.Y, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR * T5.Y, T5.X, literal.x,
; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v4i16_to_v4i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
Expand All @@ -1668,13 +1672,17 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T5.W, T5.Y, literal.x,
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: LSHR * T5.W, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T5.Z, T5.Y, literal.x,
; CM-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.Y, T5.X, literal.x,
; CM-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T5.X, T5.X, literal.x,
; CM-NEXT: AND_INT * T5.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
Expand Down Expand Up @@ -1752,50 +1760,58 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v4i16_to_v4i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X
; CM-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
; CM-NEXT: LSHR * T0.W, T5.Y, literal.x,
; CM-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
; CM-NEXT: LSHR * T0.W, T0.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.Z, T5.X, literal.x,
; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x,
; CM-NEXT: LSHR T0.Z, T0.Y, literal.x,
; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T6.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T5.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <4 x i16>, ptr addrspace(1) %in
%ext = sext <4 x i16> %load to <4 x i32>
Expand Down Expand Up @@ -5772,25 +5788,29 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T6.X, T5.Y, literal.x,
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: LSHR T5.Z, T5.X, literal.y,
; EG-NEXT: AND_INT * T5.X, T5.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T6.W, 0.0,
; EG-NEXT: MOV * T5.W, 0.0,
; EG-NEXT: LSHR T6.Z, T0.Y, literal.y,
; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: MOV T5.W, 0.0,
; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
Expand All @@ -5801,26 +5821,30 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T6.Z, T5.X, literal.x,
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV * T3.X, T5.Y,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, T2.X,
; CM-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T6.X, T5.X, literal.x,
; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: LSHR * T5.Z, T5.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T5.X, T5.Y, literal.x,
; CM-NEXT: AND_INT T5.X, T0.Z, literal.x,
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T6.Z, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T6.X, T0.Y, literal.x,
; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: MOV * T5.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, PV.W, literal.x,
Expand Down Expand Up @@ -5921,7 +5945,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
Expand All @@ -5930,17 +5954,21 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: ASHR * T5.W, T5.X, literal.x,
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PS,
; EG-NEXT: MOV * T0.Z, PV.X,
; EG-NEXT: ASHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: ASHR T5.Z, T5.X, literal.y,
; EG-NEXT: ASHR * T7.W, T5.Y, literal.z,
; EG-NEXT: ASHR T5.Z, T0.Z, literal.y,
; EG-NEXT: ASHR * T7.W, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x,
; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
Expand All @@ -5952,31 +5980,35 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T6.W, T5.Y, literal.y,
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T5.W, PV.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T7.X, PV.Z, literal.x,
; CM-NEXT: ASHR T6.Z, T5.Y, literal.y,
; CM-NEXT: ASHR * T5.W, T5.X, literal.z,
; CM-NEXT: LSHR T6.X, PV.Z, literal.x,
; CM-NEXT: ASHR T5.Z, T0.Z, literal.y,
; CM-NEXT: ASHR * T7.W, T0.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T6.X, T5.Y, 0.0, literal.x,
; CM-NEXT: ASHR * T5.Z, T5.X, literal.x,
; CM-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
; CM-NEXT: ASHR * T7.Z, T0.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
; CM-NEXT: ASHR * T6.Y, PV.X, literal.y,
; CM-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
; CM-NEXT: ASHR * T5.Y, PV.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T5.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T7.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <4 x i16>, ptr addrspace(1) %in
%ext = sext <4 x i16> %load to <4 x i64>
Expand Down
42 changes: 25 additions & 17 deletions llvm/test/CodeGen/AMDGPU/shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -681,52 +681,60 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; EG-LABEL: shl_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.Y, T6.X,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV T4.X, T10.X,
; EG-NEXT: MOV * T5.X, T10.Y,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: MOV T0.Y, PS,
; EG-NEXT: MOV * T2.X, T10.Z,
; EG-NEXT: MOV T3.X, T10.W,
; EG-NEXT: MOV * T0.Z, T6.X,
; EG-NEXT: MOV * T1.Y, T2.X,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, T10.X, PV.W,
; EG-NEXT: LSHL * T1.W, T0.X, PV.W,
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y,
; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
; EG-NEXT: MOV * T6.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR T1.W, T10.Z, literal.x,
; EG-NEXT: LSHR * T2.W, T10.X, literal.x,
; EG-NEXT: MOV * T0.Z, T3.X,
; EG-NEXT: MOV * T6.X, T1.W,
; EG-NEXT: MOV T1.Z, PV.X,
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
; EG-NEXT: LSHR * T2.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV T6.X, PV.W,
; EG-NEXT: MOV * T0.X, T7.X,
; EG-NEXT: AND_INT * T1.W, T10.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, T10.Y, PV.W,
; EG-NEXT: LSHL T1.W, T0.Y, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV * T7.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR T1.W, T10.W, literal.x,
; EG-NEXT: LSHR * T2.W, T10.Y, literal.x,
; EG-NEXT: LSHR T1.W, T0.Z, literal.x,
; EG-NEXT: LSHR * T2.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PS, PV.W,
; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
Expand Down
55 changes: 32 additions & 23 deletions llvm/test/CodeGen/AMDGPU/sra.ll
Original file line number Diff line number Diff line change
Expand Up @@ -323,43 +323,52 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; EG-LABEL: ashr_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 58, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T6.X,
; EG-NEXT: MOV * T9.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: ASHR * T0.W, PV.W, PS,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T4.X, T9.X,
; EG-NEXT: MOV * T5.X, T9.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: MOV T2.X, T9.Z,
; EG-NEXT: MOV * T3.X, T9.W,
; EG-NEXT: MOV * T0.W, T6.X,
; EG-NEXT: MOV T1.Y, T2.X,
; EG-NEXT: BFE_INT * T1.W, T0.Y, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ASHR * T1.W, T1.W, PV.W,
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T6.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T9.X, literal.x,
; EG-NEXT: MOV * T1.Z, T3.X,
; EG-NEXT: MOV * T6.X, T0.W,
; EG-NEXT: MOV T0.W, PV.X,
; EG-NEXT: LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T9.Z, literal.x,
; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: ASHR T1.W, PV.W, PS,
; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT: MOV T6.X, PV.W,
; EG-NEXT: MOV T0.Y, T7.X,
; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.W, literal.y,
; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
Expand All @@ -369,10 +378,10 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T7.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T9.Y, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T9.W, literal.x,
; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/Mips/o32_cc_byval.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ define void @f1() nounwind {
; CHECK-NEXT: sw $1, 16($sp)
; CHECK-NEXT: lw $7, 4($18)
; CHECK-NEXT: lw $6, %lo(f1.s1)($17)
; CHECK-NEXT: lbu $5, 40($sp)
; CHECK-NEXT: lw $25, %call16(callee3)($16)
; CHECK-NEXT: addiu $5, $zero, 11
; CHECK-NEXT: jalr $25
; CHECK-NEXT: move $gp, $16
; CHECK-NEXT: lw $16, 48($sp) # 4-byte Folded Reload
Expand Down Expand Up @@ -234,7 +234,6 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind {
; CHECK-NEXT: addiu $sp, $sp, -32
; CHECK-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
; CHECK-NEXT: addu $gp, $2, $25
; CHECK-NEXT: move $1, $6
; CHECK-NEXT: sw $7, 44($sp)
; CHECK-NEXT: sw $6, 40($sp)
; CHECK-NEXT: sw $5, 20($sp)
Expand All @@ -244,7 +243,7 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind {
; CHECK-NEXT: lw $5, 44($sp)
; CHECK-NEXT: lw $25, %call16(f6)($gp)
; CHECK-NEXT: jalr $25
; CHECK-NEXT: move $4, $1
; CHECK-NEXT: lw $4, 40($sp)
; CHECK-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $sp, $sp, 32
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
Original file line number Diff line number Diff line change
Expand Up @@ -958,33 +958,33 @@ declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4)
; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}}

; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
; 32BIT-DAG: renamable $r3 = LWZ 0, %stack.0.s :: (load (s32) from %stack.0.s, align 8)
; 32BIT-DAG: renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4)
; 32BIT-DAG: renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
; 32BIT-DAG: $r3 = LI 0
; 32BIT-NEXT: BL_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3
; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1

; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct:

; ASM32: stwu 1, -80(1)
; ASM32-DAG: lwz 3, 64(1)
; ASM32-DAG: lwz 4, 68(1)
; ASM32-DAG: lwz 5, 72(1)
; ASM32-DAG: stw 3, 64(1)
; ASM32-NEXT: bl .test_byval_homogeneous_float_struct[PR]
; ASM32-NEXT: nop

; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
; 64BIT: renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x3, 32, 31
; 64BIT-NEXT: $x3 = LI8 0
; 64BIT-DAG: renamable $x3 = LD 0, %stack.0.s :: (load (s64) from %stack.0.s)
; 64BIT-DAG: renamable $x4 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
; 64BIT-DAG: renamable $x4 = RLDICR killed renamable $x4, 32, 31
; 64BIT-NEXT: BL8_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1

; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
; ASM64: stdu 1, -128(1)
; ASM64: lwz 3, 120(1)
; ASM64-NEXT: sldi 4, 3, 32
; ASM64-NEXT: li 3, 0
; ASM64-DAG: ld 3, 112(1)
; ASM64-DAG: lwz 4, 120(1)
; ASM64-DAG: sldi 4, 4, 32
; ASM64-NEXT: bl .test_byval_homogeneous_float_struct[PR]
; ASM64-NEXT: nop
8 changes: 6 additions & 2 deletions llvm/test/CodeGen/PowerPC/byval-lhs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ define void @bar1(i64 %a) nounwind {
; LE-NEXT: stdu r1, -48(r1)
; LE-NEXT: std r0, 64(r1)
; LE-NEXT: std r3, 40(r1)
; LE-NEXT: ld r3, 40(r1)
; LE-NEXT: bl f0
; LE-NEXT: nop
; LE-NEXT: addi r1, r1, 48
Expand All @@ -30,6 +31,7 @@ define void @bar1(i64 %a) nounwind {
; AIX-NEXT: stdu r1, -128(r1)
; AIX-NEXT: std r0, 144(r1)
; AIX-NEXT: std r3, 120(r1)
; AIX-NEXT: ld r3, 120(r1)
; AIX-NEXT: bl .f0[PR]
; AIX-NEXT: nop
; AIX-NEXT: addi r1, r1, 128
Expand All @@ -47,10 +49,11 @@ define void @bar2(i64 %a) nounwind {
; LE: # %bb.0:
; LE-NEXT: mflr r0
; LE-NEXT: stdu r1, -48(r1)
; LE-NEXT: mr r4, r3
; LE-NEXT: std r0, 64(r1)
; LE-NEXT: std r3, 32(r1)
; LE-NEXT: std r3, 40(r1)
; LE-NEXT: ld r4, 40(r1)
; LE-NEXT: ld r3, 32(r1)
; LE-NEXT: bl f1
; LE-NEXT: nop
; LE-NEXT: addi r1, r1, 48
Expand All @@ -62,10 +65,11 @@ define void @bar2(i64 %a) nounwind {
; AIX: # %bb.0:
; AIX-NEXT: mflr r0
; AIX-NEXT: stdu r1, -128(r1)
; AIX-NEXT: mr r4, r3
; AIX-NEXT: std r0, 144(r1)
; AIX-NEXT: std r3, 112(r1)
; AIX-NEXT: std r3, 120(r1)
; AIX-NEXT: ld r4, 120(r1)
; AIX-NEXT: ld r3, 112(r1)
; AIX-NEXT: bl .f1[PR]
; AIX-NEXT: nop
; AIX-NEXT: addi r1, r1, 128
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P8LE-NEXT: stdx r3, 0, r5
; P8LE-NEXT: stb r4, 79(r1)
; P8LE-NEXT: lbz r4, 56(r1)
; P8LE-NEXT: ld r3, 48(r1)
; P8LE-NEXT: bl callee_9
; P8LE-NEXT: nop
; P8LE-NEXT: li r3, 0
Expand All @@ -203,6 +204,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P9LE-NEXT: std r3, 48(r1)
; P9LE-NEXT: stdx r3, 0, r4
; P9LE-NEXT: lbz r4, 56(r1)
; P9LE-NEXT: ld r3, 48(r1)
; P9LE-NEXT: stb r5, 79(r1)
; P9LE-NEXT: bl callee_9
; P9LE-NEXT: nop
Expand All @@ -223,6 +225,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P10LE-NEXT: lbz r5, 56(r1)
; P10LE-NEXT: stdx r3, 0, r4
; P10LE-NEXT: lbz r4, 56(r1)
; P10LE-NEXT: ld r3, 48(r1)
; P10LE-NEXT: stb r5, 79(r1)
; P10LE-NEXT: bl callee_9@notoc
; P10LE-NEXT: li r3, 0
Expand All @@ -243,6 +246,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P8BE-NEXT: stdx r3, 0, r5
; P8BE-NEXT: stb r4, 143(r1)
; P8BE-NEXT: lbz r4, 200(r1)
; P8BE-NEXT: ld r3, 192(r1)
; P8BE-NEXT: bl callee_9
; P8BE-NEXT: nop
; P8BE-NEXT: li r3, 0
Expand All @@ -262,6 +266,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P9BE-NEXT: std r3, 192(r1)
; P9BE-NEXT: stdx r3, 0, r4
; P9BE-NEXT: lbz r4, 200(r1)
; P9BE-NEXT: ld r3, 192(r1)
; P9BE-NEXT: stb r5, 143(r1)
; P9BE-NEXT: bl callee_9
; P9BE-NEXT: nop
Expand All @@ -282,6 +287,7 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat
; P10BE-NEXT: lbz r5, 200(r1)
; P10BE-NEXT: stdx r3, 0, r4
; P10BE-NEXT: lbz r4, 200(r1)
; P10BE-NEXT: ld r3, 192(r1)
; P10BE-NEXT: stb r5, 143(r1)
; P10BE-NEXT: bl callee_9
; P10BE-NEXT: nop
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/fastcc-byval.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ define fastcc i32 @bar() nounwind {
; CHECK: ## %bb.0:
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $1, 8(%esp)
; CHECK-NEXT: movl $1, (%esp)
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: movl %eax, (%esp)
; CHECK-NEXT: calll _foo
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: addl $12, %esp
Expand Down