diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 30a1b31f110218..9c8733bacfe950 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -6,6 +6,129 @@
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
 ; individual elements instead of 128-bit stores.
 
+define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+; SI-LABEL: insertelement_v2f32_0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: insertelement_v2f32_0:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x40a00000
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
+  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+; SI-LABEL: insertelement_v2f32_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s5, 0x40a00000
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: insertelement_v2f32_1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s5, 0x40a00000
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
+  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+; SI-LABEL: insertelement_v2i32_0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0x3e7
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: insertelement_v2i32_0:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_movk_i32 s4, 0x3e7
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
+  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+; SI-LABEL: insertelement_v2i32_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s5, 0x3e7
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: insertelement_v2i32_1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_movk_i32 s5, 0x3e7
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
+  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
+  ret void
+}
 
 ; FIXME: Why is the constant moved into the intermediate register and
 ; not just directly into the vector component?
@@ -1336,19 +1459,19 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
-; SI-NEXT:    s_cbranch_scc0 BB26_2
+; SI-NEXT:    s_cbranch_scc0 BB30_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dword s1, s[6:7], 0x1
 ; SI-NEXT:    s_mov_b64 s[2:3], 0
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccz BB26_3
-; SI-NEXT:    s_branch BB26_4
-; SI-NEXT:  BB26_2:
-; SI-NEXT:  BB26_3: ; %if
+; SI-NEXT:    s_cbranch_vccz BB30_3
+; SI-NEXT:    s_branch BB30_4
+; SI-NEXT:  BB30_2:
+; SI-NEXT:  BB30_3: ; %if
 ; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
-; SI-NEXT:  BB26_4: ; %endif
+; SI-NEXT:  BB30_4: ; %endif
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
@@ -1363,16 +1486,16 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 add
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cbranch_scc0 BB26_2
+; VI-NEXT:    s_cbranch_scc0 BB30_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_load_dword s1, s[6:7], 0x4
-; VI-NEXT:    s_cbranch_execz BB26_3
-; VI-NEXT:    s_branch BB26_4
-; VI-NEXT:  BB26_2:
-; VI-NEXT:  BB26_3: ; %if
+; VI-NEXT:    s_cbranch_execz BB30_3
+; VI-NEXT:    s_branch BB30_4
+; VI-NEXT:  BB30_2:
+; VI-NEXT:  BB30_3: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
-; VI-NEXT:  BB26_4: ; %endif
+; VI-NEXT:  BB30_4: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000