diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll new file mode 100644 index 0000000000000..c8796b839913c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll @@ -0,0 +1,3475 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 + +define void @vmulwev_h_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: 
vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vae = sext <16 x i8> %vas to <16 x i16> + %vbe = sext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_w_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 6 +; CHECK-NEXT: vpickve2gr.h $a2, $vr2, 4 +; CHECK-NEXT: vpickve2gr.h $a3, $vr2, 2 +; CHECK-NEXT: vpickve2gr.h $a4, $vr2, 0 +; CHECK-NEXT: vpickve2gr.h $a5, $vr0, 6 +; CHECK-NEXT: vpickve2gr.h $a6, $vr0, 4 +; CHECK-NEXT: vpickve2gr.h $a7, $vr0, 2 +; CHECK-NEXT: vpickve2gr.h $t0, $vr0, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr1, 14 +; CHECK-NEXT: vpickve2gr.h $t1, $vr0, 6 +; CHECK-NEXT: vpickve2gr.h $t2, $vr0, 4 +; CHECK-NEXT: vpickve2gr.h $t3, $vr0, 2 +; CHECK-NEXT: vpickve2gr.h $t4, $vr0, 0 +; CHECK-NEXT: vpickve2gr.h $t5, $vr1, 6 +; CHECK-NEXT: vpickve2gr.h $t6, $vr1, 4 +; CHECK-NEXT: vpickve2gr.h $t7, $vr1, 2 +; CHECK-NEXT: vpickve2gr.h $t8, $vr1, 0 +; CHECK-NEXT: ext.w.h $t0, $t0 +; CHECK-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; CHECK-NEXT: ext.w.h $a7, $a7 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; CHECK-NEXT: ext.w.h $a6, $a6 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; CHECK-NEXT: ext.w.h $a5, $a5 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; CHECK-NEXT: ext.w.h $a4, $a4 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; CHECK-NEXT: ext.w.h $a3, $a3 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; CHECK-NEXT: ext.w.h $a2, $a2 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; CHECK-NEXT: ext.w.h $a1, $a1 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ext.w.h $a1, $t8 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; CHECK-NEXT: ext.w.h $a1, $t7 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; CHECK-NEXT: ext.w.h $a1, $t6 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; CHECK-NEXT: ext.w.h $a1, $t5 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; CHECK-NEXT: ext.w.h $a1, $t4 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; CHECK-NEXT: ext.w.h $a1, $t3 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; CHECK-NEXT: ext.w.h $a1, $t2 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; CHECK-NEXT: ext.w.h $a1, $t1 +; 
CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2 +; CHECK-NEXT: xvmul.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = sext <8 x i16> %vas to <8 x i32> + %vbe = sext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_w(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_d_w: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA32-NEXT: xvpickve2gr.w $a6, $xr1, 0 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 6 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: srai.w $a1, $t0, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 2 +; LA32-NEXT: srai.w $a1, $a7, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a6, 0 +; LA32-NEXT: srai.w $a1, $a6, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a5, 2 +; LA32-NEXT: srai.w $a1, $a5, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA32-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_d_w: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 0 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 6 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 4 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: vinsgr2vr.d $vr0, $t0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a7, 1 +; LA64-NEXT: vinsgr2vr.d $vr2, $a6, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a5, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = sext <4 x i32> %vas to <4 x i64> + %vbe = sext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, 
$sp, -32 +; LA32-NEXT: st.w $fp, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $t3, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a5, $xr0, 5 +; LA32-NEXT: xvpickve2gr.w $a3, $xr1, 4 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 0 +; LA32-NEXT: xvpickve2gr.w $t4, $xr1, 1 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 5 +; LA32-NEXT: srai.w $t1, $a5, 31 +; LA32-NEXT: srai.w $t5, $t3, 31 +; LA32-NEXT: srai.w $t0, $a7, 31 +; LA32-NEXT: srai.w $t6, $t4, 31 +; LA32-NEXT: mulh.wu $a6, $a2, $a4 +; LA32-NEXT: mul.w $t2, $t3, $a4 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: sltu $t2, $a6, $t2 +; LA32-NEXT: mulh.wu $t7, $t3, $a4 +; LA32-NEXT: add.w $t7, $t7, $t2 +; LA32-NEXT: mul.w $t2, $a2, $t4 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: sltu $t2, $a6, $t2 +; LA32-NEXT: mulh.wu $t8, $a2, $t4 +; LA32-NEXT: add.w $t2, $t8, $t2 +; LA32-NEXT: add.w $t8, $t7, $t2 +; LA32-NEXT: mul.w $fp, $t3, $t4 +; LA32-NEXT: add.w $s0, $fp, $t8 +; LA32-NEXT: mul.w $s1, $a4, $t5 +; LA32-NEXT: mul.w $s2, $t6, $a2 +; LA32-NEXT: add.w $s3, $s2, $s1 +; LA32-NEXT: add.w $t2, $s0, $s3 +; LA32-NEXT: sltu $s4, $t2, $s0 +; LA32-NEXT: sltu $fp, $s0, $fp +; LA32-NEXT: sltu $t7, $t8, $t7 +; LA32-NEXT: mulh.wu $t8, $t3, $t4 +; LA32-NEXT: add.w $t7, $t8, $t7 +; LA32-NEXT: add.w $t7, $t7, $fp +; LA32-NEXT: mulh.wu $t8, $a4, $t5 +; LA32-NEXT: add.w $t8, $t8, $s1 +; LA32-NEXT: mul.w $t4, $t4, $t5 +; LA32-NEXT: add.w $t4, $t8, $t4 +; LA32-NEXT: mul.w $t3, $t6, $t3 +; LA32-NEXT: mulh.wu $t5, $t6, $a2 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: add.w $t3, $t3, $s2 +; LA32-NEXT: add.w $t3, $t3, $t4 +; LA32-NEXT: sltu $t4, $s3, $s2 +; LA32-NEXT: add.w $t3, $t3, $t4 +; LA32-NEXT: add.w $t3, $t7, $t3 +; LA32-NEXT: add.w $t3, $t3, $s4 +; LA32-NEXT: mulh.wu $t4, $a1, $a3 +; LA32-NEXT: mul.w $t5, $a5, $a3 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: sltu $t5, $t4, $t5 +; LA32-NEXT: mulh.wu $t6, $a5, $a3 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: mul.w $t6, $a1, $a7 +; LA32-NEXT: add.w $t4, $t6, $t4 +; LA32-NEXT: sltu $t6, $t4, $t6 +; LA32-NEXT: mulh.wu $t7, $a1, $a7 +; LA32-NEXT: add.w $t6, $t7, $t6 +; LA32-NEXT: add.w $t6, $t5, $t6 +; LA32-NEXT: mul.w $t7, $a5, $a7 +; LA32-NEXT: add.w $t8, $t7, $t6 +; LA32-NEXT: mul.w $fp, $a3, $t1 +; LA32-NEXT: mul.w $s0, $t0, $a1 +; LA32-NEXT: add.w $s1, $s0, $fp +; LA32-NEXT: add.w $s2, $t8, $s1 +; LA32-NEXT: sltu $s3, $s2, $t8 +; LA32-NEXT: sltu $t7, $t8, $t7 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: mulh.wu $t6, $a5, $a7 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t5, $t7 +; LA32-NEXT: mulh.wu $t6, $a3, $t1 +; LA32-NEXT: add.w $t6, $t6, $fp +; LA32-NEXT: mul.w $a7, $a7, $t1 +; LA32-NEXT: add.w $a7, $t6, $a7 +; LA32-NEXT: mul.w $a5, $t0, $a5 +; LA32-NEXT: mulh.wu $t0, $t0, $a1 +; LA32-NEXT: add.w $a5, $t0, $a5 +; LA32-NEXT: add.w $a5, $a5, $s0 +; LA32-NEXT: add.w $a5, $a5, $a7 +; LA32-NEXT: sltu $a7, $s1, $s0 +; LA32-NEXT: add.w $a5, $a5, $a7 +; LA32-NEXT: add.w $a5, $t5, $a5 +; LA32-NEXT: add.w $a5, $a5, $s3 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $t4, $a0, 20 +; 
LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $s2, $a0, 24 +; LA32-NEXT: st.w $t2, $a0, 8 +; LA32-NEXT: st.w $a5, $a0, 28 +; LA32-NEXT: st.w $t3, $a0, 12 +; LA32-NEXT: ld.w $s4, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 2 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 0 +; LA64-NEXT: mul.d $a5, $a2, $a4 +; LA64-NEXT: mulh.d $a2, $a2, $a4 +; LA64-NEXT: mul.d $a4, $a1, $a3 +; LA64-NEXT: mulh.d $a1, $a1, $a3 +; LA64-NEXT: st.d $a1, $a0, 24 +; LA64-NEXT: st.d $a4, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 8 +; LA64-NEXT: st.d $a5, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vae = sext <2 x i64> %vas to <2 x i128> + %vbe = sext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwod_h_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 
7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vae = sext <16 x i8> %vas to <16 x i16> + %vbe = sext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_w_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 7 +; CHECK-NEXT: vpickve2gr.h $a2, $vr2, 5 +; CHECK-NEXT: vpickve2gr.h $a3, $vr2, 3 +; CHECK-NEXT: vpickve2gr.h $a4, $vr2, 1 +; CHECK-NEXT: vpickve2gr.h $a5, $vr0, 7 +; CHECK-NEXT: vpickve2gr.h $a6, $vr0, 5 +; CHECK-NEXT: vpickve2gr.h $a7, $vr0, 3 +; CHECK-NEXT: vpickve2gr.h $t0, $vr0, 1 +; CHECK-NEXT: xvpermi.d $xr0, $xr1, 14 +; CHECK-NEXT: vpickve2gr.h $t1, $vr0, 7 +; CHECK-NEXT: vpickve2gr.h $t2, $vr0, 5 +; CHECK-NEXT: vpickve2gr.h $t3, $vr0, 3 +; CHECK-NEXT: vpickve2gr.h $t4, $vr0, 1 +; CHECK-NEXT: vpickve2gr.h $t5, $vr1, 7 +; CHECK-NEXT: vpickve2gr.h $t6, $vr1, 5 +; CHECK-NEXT: vpickve2gr.h $t7, $vr1, 3 +; CHECK-NEXT: vpickve2gr.h $t8, $vr1, 1 +; CHECK-NEXT: ext.w.h $t0, $t0 +; CHECK-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; CHECK-NEXT: ext.w.h $a7, $a7 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; CHECK-NEXT: ext.w.h $a6, $a6 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; CHECK-NEXT: ext.w.h $a5, $a5 +; CHECK-NEXT: 
vinsgr2vr.w $vr0, $a5, 3 +; CHECK-NEXT: ext.w.h $a4, $a4 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; CHECK-NEXT: ext.w.h $a3, $a3 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; CHECK-NEXT: ext.w.h $a2, $a2 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; CHECK-NEXT: ext.w.h $a1, $a1 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ext.w.h $a1, $t8 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; CHECK-NEXT: ext.w.h $a1, $t7 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; CHECK-NEXT: ext.w.h $a1, $t6 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; CHECK-NEXT: ext.w.h $a1, $t5 +; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; CHECK-NEXT: ext.w.h $a1, $t4 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; CHECK-NEXT: ext.w.h $a1, $t3 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; CHECK-NEXT: ext.w.h $a1, $t2 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; CHECK-NEXT: ext.w.h $a1, $t1 +; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2 +; CHECK-NEXT: xvmul.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = sext <8 x i16> %vas to <8 x i32> + %vbe = sext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_w(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_d_w: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 3 +; LA32-NEXT: xvpickve2gr.w $a6, $xr1, 1 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 5 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: srai.w $a1, $t0, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 2 +; LA32-NEXT: srai.w $a1, $a7, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a6, 0 +; LA32-NEXT: srai.w $a1, $a6, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a5, 2 +; LA32-NEXT: srai.w $a1, $a5, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA32-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_d_w: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 3 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 1 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 5 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: 
vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: vinsgr2vr.d $vr0, $t0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a7, 1 +; LA64-NEXT: vinsgr2vr.d $vr2, $a6, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a5, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = sext <4 x i32> %vas to <4 x i64> + %vbe = sext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $fp, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $t3, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a5, $xr0, 7 +; LA32-NEXT: xvpickve2gr.w $a3, $xr1, 6 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 2 +; LA32-NEXT: xvpickve2gr.w $t4, $xr1, 3 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA32-NEXT: srai.w $t1, $a5, 31 +; LA32-NEXT: srai.w $t5, $t3, 31 +; LA32-NEXT: srai.w $t0, $a7, 31 +; LA32-NEXT: srai.w $t6, $t4, 31 +; LA32-NEXT: mulh.wu $a6, $a2, $a4 +; LA32-NEXT: mul.w $t2, $t3, $a4 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: sltu $t2, $a6, $t2 +; LA32-NEXT: mulh.wu $t7, $t3, $a4 +; LA32-NEXT: add.w $t7, $t7, $t2 +; LA32-NEXT: mul.w $t2, $a2, $t4 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: sltu $t2, $a6, $t2 +; LA32-NEXT: mulh.wu $t8, $a2, $t4 +; LA32-NEXT: add.w $t2, $t8, $t2 +; LA32-NEXT: add.w $t8, $t7, $t2 +; LA32-NEXT: mul.w $fp, $t3, $t4 +; LA32-NEXT: add.w $s0, $fp, $t8 +; LA32-NEXT: mul.w $s1, $a4, $t5 +; LA32-NEXT: mul.w $s2, $t6, $a2 +; LA32-NEXT: add.w $s3, $s2, $s1 +; LA32-NEXT: add.w $t2, $s0, $s3 +; LA32-NEXT: sltu $s4, $t2, $s0 +; LA32-NEXT: sltu $fp, $s0, $fp +; LA32-NEXT: sltu $t7, $t8, $t7 +; LA32-NEXT: mulh.wu $t8, $t3, $t4 +; LA32-NEXT: add.w $t7, $t8, $t7 +; LA32-NEXT: add.w $t7, $t7, $fp +; LA32-NEXT: mulh.wu $t8, $a4, $t5 +; LA32-NEXT: add.w $t8, $t8, $s1 +; LA32-NEXT: mul.w $t4, $t4, $t5 +; LA32-NEXT: add.w $t4, $t8, $t4 +; LA32-NEXT: mul.w $t3, $t6, $t3 +; LA32-NEXT: mulh.wu $t5, $t6, $a2 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: add.w $t3, $t3, $s2 +; LA32-NEXT: add.w $t3, $t3, $t4 +; LA32-NEXT: sltu $t4, $s3, $s2 +; LA32-NEXT: add.w $t3, $t3, $t4 +; LA32-NEXT: add.w $t3, $t7, $t3 +; LA32-NEXT: add.w $t3, $t3, $s4 +; LA32-NEXT: mulh.wu $t4, $a1, $a3 +; LA32-NEXT: mul.w $t5, $a5, $a3 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: sltu $t5, $t4, $t5 +; LA32-NEXT: mulh.wu $t6, $a5, $a3 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: mul.w $t6, $a1, $a7 +; LA32-NEXT: add.w $t4, $t6, $t4 +; LA32-NEXT: sltu $t6, $t4, $t6 +; LA32-NEXT: mulh.wu $t7, $a1, $a7 +; LA32-NEXT: add.w $t6, $t7, $t6 +; LA32-NEXT: add.w $t6, $t5, $t6 +; LA32-NEXT: mul.w $t7, $a5, $a7 +; LA32-NEXT: add.w $t8, $t7, $t6 +; LA32-NEXT: mul.w $fp, $a3, $t1 +; LA32-NEXT: 
mul.w $s0, $t0, $a1 +; LA32-NEXT: add.w $s1, $s0, $fp +; LA32-NEXT: add.w $s2, $t8, $s1 +; LA32-NEXT: sltu $s3, $s2, $t8 +; LA32-NEXT: sltu $t7, $t8, $t7 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: mulh.wu $t6, $a5, $a7 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t5, $t7 +; LA32-NEXT: mulh.wu $t6, $a3, $t1 +; LA32-NEXT: add.w $t6, $t6, $fp +; LA32-NEXT: mul.w $a7, $a7, $t1 +; LA32-NEXT: add.w $a7, $t6, $a7 +; LA32-NEXT: mul.w $a5, $t0, $a5 +; LA32-NEXT: mulh.wu $t0, $t0, $a1 +; LA32-NEXT: add.w $a5, $t0, $a5 +; LA32-NEXT: add.w $a5, $a5, $s0 +; LA32-NEXT: add.w $a5, $a5, $a7 +; LA32-NEXT: sltu $a7, $s1, $s0 +; LA32-NEXT: add.w $a5, $a5, $a7 +; LA32-NEXT: add.w $a5, $t5, $a5 +; LA32-NEXT: add.w $a5, $a5, $s3 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $t4, $a0, 20 +; LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $s2, $a0, 24 +; LA32-NEXT: st.w $t2, $a0, 8 +; LA32-NEXT: st.w $a5, $a0, 28 +; LA32-NEXT: st.w $t3, $a0, 12 +; LA32-NEXT: ld.w $s4, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 3 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 1 +; LA64-NEXT: mul.d $a5, $a2, $a4 +; LA64-NEXT: mulh.d $a2, $a2, $a4 +; LA64-NEXT: mul.d $a4, $a1, $a3 +; LA64-NEXT: mulh.d $a1, $a1, $a3 +; LA64-NEXT: st.d $a1, $a0, 24 +; LA64-NEXT: st.d $a4, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 8 +; LA64-NEXT: st.d $a5, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vae = sext <2 x i64> %vas to <2 x i128> + %vbe = sext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: 
vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vae = zext <16 x i8> %vas to <16 x i16> + %vbe = zext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_w_hu: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, 
$xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA32-NEXT: bstrpick.w $t0, $t0, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: bstrpick.w $a7, $a7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: bstrpick.w $a6, $a6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: bstrpick.w $a5, $a5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: bstrpick.w $a4, $a4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: bstrpick.w $a3, $a3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: bstrpick.w $a2, $a2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: bstrpick.w $a1, $t8, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: bstrpick.w $a1, $t4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_w_hu: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA64-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA64-NEXT: bstrpick.d $t0, $t0, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: bstrpick.d $a7, $a7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: bstrpick.d $a4, $a4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: 
bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: bstrpick.d $a1, $t8, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: bstrpick.d $a1, $t4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = zext <8 x i16> %vas to <8 x i32> + %vbe = zext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_d_wu: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvrepli.b $xr1, 0 +; LA32-NEXT: xvld $xr2, $a2, 0 +; LA32-NEXT: xvori.b $xr3, $xr1, 0 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 0 +; LA32-NEXT: xvpickve.w $xr4, $xr0, 2 +; LA32-NEXT: xvinsve0.w $xr3, $xr4, 2 +; LA32-NEXT: xvpickve.w $xr4, $xr0, 4 +; LA32-NEXT: xvinsve0.w $xr3, $xr4, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 6 +; LA32-NEXT: xvinsve0.w $xr1, $xr2, 0 +; LA32-NEXT: xvpickve.w $xr0, $xr2, 2 +; LA32-NEXT: xvinsve0.w $xr1, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr0, $xr2, 4 +; LA32-NEXT: xvinsve0.w $xr1, $xr0, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr2, 6 +; LA32-NEXT: xvinsve0.w $xr1, $xr0, 6 +; LA32-NEXT: xvmul.d $xr0, $xr3, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_d_wu: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 0 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 6 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 4 +; LA64-NEXT: bstrpick.d $a4, $a4, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: bstrpick.d $a1, $a1, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: bstrpick.d $a1, $t0, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a7, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $a6, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a5, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x 
i32> %vb, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = zext <4 x i32> %vas to <4 x i64> + %vbe = zext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 5 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 4 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 5 +; LA32-NEXT: xvpickve2gr.w $a6, $xr1, 4 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 1 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 0 +; LA32-NEXT: mulh.wu $t1, $a4, $t0 +; LA32-NEXT: mul.w $t2, $a3, $t0 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: sltu $t2, $t1, $t2 +; LA32-NEXT: mulh.wu $t3, $a3, $t0 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: mul.w $t3, $a4, $a7 +; LA32-NEXT: add.w $t1, $t3, $t1 +; LA32-NEXT: sltu $t3, $t1, $t3 +; LA32-NEXT: mulh.wu $t4, $a4, $a7 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: add.w $t3, $t2, $t3 +; LA32-NEXT: mul.w $t4, $a3, $a7 +; LA32-NEXT: add.w $t5, $t4, $t3 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: mulh.wu $a3, $a3, $a7 +; LA32-NEXT: add.w $a3, $a3, $t2 +; LA32-NEXT: add.w $a3, $a3, $t4 +; LA32-NEXT: mulh.wu $a7, $a2, $a6 +; LA32-NEXT: mul.w $t2, $a1, $a6 +; LA32-NEXT: add.w $a7, $t2, $a7 +; LA32-NEXT: sltu $t2, $a7, $t2 +; LA32-NEXT: mulh.wu $t3, $a1, $a6 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: mul.w $t3, $a2, $a5 +; LA32-NEXT: add.w $a7, $t3, $a7 +; LA32-NEXT: sltu $t3, $a7, $t3 +; LA32-NEXT: mulh.wu $t4, $a2, $a5 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: add.w $t3, $t2, $t3 +; LA32-NEXT: mul.w $t4, $a1, $a5 +; LA32-NEXT: add.w $t6, $t4, $t3 +; LA32-NEXT: sltu $t4, $t6, $t4 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: mulh.wu $a1, $a1, $a5 +; LA32-NEXT: add.w $a1, $a1, $t2 +; LA32-NEXT: add.w $a1, $a1, $t4 +; LA32-NEXT: mul.w $a4, $a4, $t0 +; LA32-NEXT: mul.w $a2, $a2, $a6 +; LA32-NEXT: st.w $a2, $a0, 16 +; LA32-NEXT: st.w $a4, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 20 +; LA32-NEXT: st.w $t1, $a0, 4 +; LA32-NEXT: st.w $t6, $a0, 24 +; LA32-NEXT: st.w $t5, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 28 +; LA32-NEXT: st.w $a3, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 2 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 0 +; LA64-NEXT: mul.d $a5, $a2, $a4 +; LA64-NEXT: mulh.du $a2, $a2, $a4 +; LA64-NEXT: mul.d $a4, $a1, $a3 +; LA64-NEXT: mulh.du $a1, $a1, $a3 +; LA64-NEXT: st.d $a1, $a0, 24 +; LA64-NEXT: st.d $a4, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 8 +; LA64-NEXT: st.d $a5, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vae = zext <2 x i64> %vas to <2 x i128> + %vbe = zext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: 
vpickve2gr.b $a1, $vr3, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: andi $a1, 
$a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> + %vae = zext <16 x i8> %vas to <16 x i16> + %vbe = zext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_w_hu: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA32-NEXT: bstrpick.w $t0, $t0, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: bstrpick.w $a7, $a7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: bstrpick.w $a6, $a6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: bstrpick.w $a5, $a5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: bstrpick.w $a4, $a4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: bstrpick.w $a3, $a3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: bstrpick.w $a2, $a2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: bstrpick.w $a1, $t8, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: bstrpick.w $a1, $t4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_w_hu: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA64-NEXT: vpickve2gr.h $a6, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA64-NEXT: 
xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA64-NEXT: bstrpick.d $t0, $t0, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: bstrpick.d $a7, $a7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: bstrpick.d $a4, $a4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: bstrpick.d $a1, $t8, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: bstrpick.d $a1, $t4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> + %vae = zext <8 x i16> %vas to <8 x i32> + %vbe = zext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_d_wu: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvrepli.b $xr2, 0 +; LA32-NEXT: xvpickve.w $xr3, $xr0, 1 +; LA32-NEXT: xvori.b $xr4, $xr2, 0 +; LA32-NEXT: xvinsve0.w $xr4, $xr3, 0 +; LA32-NEXT: xvpickve.w $xr3, $xr0, 3 +; LA32-NEXT: xvinsve0.w $xr4, $xr3, 2 +; LA32-NEXT: xvpickve.w $xr3, $xr0, 5 +; LA32-NEXT: xvinsve0.w $xr4, $xr3, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr0, 7 +; LA32-NEXT: xvinsve0.w $xr4, $xr0, 6 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 1 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 0 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 3 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 5 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 7 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 6 +; LA32-NEXT: xvmul.d $xr0, $xr4, $xr2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_d_wu: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 3 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 1 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA64-NEXT: 
xvpickve2gr.w $t0, $xr1, 5 +; LA64-NEXT: bstrpick.d $a4, $a4, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: bstrpick.d $a1, $a1, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: bstrpick.d $a1, $t0, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a7, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $a6, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a5, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> + %vae = zext <4 x i32> %vas to <4 x i64> + %vbe = zext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 7 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 7 +; LA32-NEXT: xvpickve2gr.w $a6, $xr1, 6 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 3 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 2 +; LA32-NEXT: mulh.wu $t1, $a4, $t0 +; LA32-NEXT: mul.w $t2, $a3, $t0 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: sltu $t2, $t1, $t2 +; LA32-NEXT: mulh.wu $t3, $a3, $t0 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: mul.w $t3, $a4, $a7 +; LA32-NEXT: add.w $t1, $t3, $t1 +; LA32-NEXT: sltu $t3, $t1, $t3 +; LA32-NEXT: mulh.wu $t4, $a4, $a7 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: add.w $t3, $t2, $t3 +; LA32-NEXT: mul.w $t4, $a3, $a7 +; LA32-NEXT: add.w $t5, $t4, $t3 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: mulh.wu $a3, $a3, $a7 +; LA32-NEXT: add.w $a3, $a3, $t2 +; LA32-NEXT: add.w $a3, $a3, $t4 +; LA32-NEXT: mulh.wu $a7, $a2, $a6 +; LA32-NEXT: mul.w $t2, $a1, $a6 +; LA32-NEXT: add.w $a7, $t2, $a7 +; LA32-NEXT: sltu $t2, $a7, $t2 +; LA32-NEXT: mulh.wu $t3, $a1, $a6 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: mul.w $t3, $a2, $a5 +; LA32-NEXT: add.w $a7, $t3, $a7 +; LA32-NEXT: sltu $t3, $a7, $t3 +; LA32-NEXT: mulh.wu $t4, $a2, $a5 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: add.w $t3, $t2, $t3 +; LA32-NEXT: mul.w $t4, $a1, $a5 +; LA32-NEXT: add.w $t6, $t4, $t3 +; LA32-NEXT: sltu $t4, $t6, $t4 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: mulh.wu $a1, $a1, $a5 +; LA32-NEXT: add.w $a1, $a1, $t2 +; LA32-NEXT: add.w $a1, $a1, $t4 +; LA32-NEXT: mul.w $a4, $a4, $t0 +; LA32-NEXT: mul.w $a2, $a2, $a6 +; LA32-NEXT: st.w $a2, $a0, 16 +; LA32-NEXT: st.w $a4, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 20 +; LA32-NEXT: st.w $t1, $a0, 4 +; LA32-NEXT: st.w $t6, $a0, 24 +; LA32-NEXT: st.w $t5, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 28 +; LA32-NEXT: st.w $a3, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 
1 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 3 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 1 +; LA64-NEXT: mul.d $a5, $a2, $a4 +; LA64-NEXT: mulh.du $a2, $a2, $a4 +; LA64-NEXT: mul.d $a4, $a1, $a3 +; LA64-NEXT: mulh.du $a1, $a1, $a3 +; LA64-NEXT: st.d $a1, $a0, 24 +; LA64-NEXT: st.d $a4, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 8 +; LA64-NEXT: st.d $a5, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vae = zext <2 x i64> %vas to <2 x i128> + %vbe = zext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT:
vpickve2gr.b $a1, $vr0, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vae = zext <16 x i8> %vas to <16 x i16> + %vbe = sext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_w_hu_h: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA32-NEXT: bstrpick.w $t0, $t0, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: bstrpick.w $a7, $a7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: bstrpick.w $a6, $a6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: bstrpick.w $a5, $a5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: bstrpick.w $a4, $a4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: bstrpick.w $a3, $a3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: bstrpick.w $a2, $a2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: ext.w.h $a1, $t8 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: ext.w.h $a1, $t7 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: ext.w.h $a1, $t6 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: ext.w.h $a1, $t5 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: ext.w.h
$a1, $t4 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: ext.w.h $a1, $t3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: ext.w.h $a1, $t2 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: ext.w.h $a1, $t1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_w_hu_h: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA64-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA64-NEXT: bstrpick.d $t0, $t0, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: bstrpick.d $a7, $a7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: bstrpick.d $a4, $a4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: ext.w.h $a1, $t8 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: ext.w.h $a1, $t7 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: ext.w.h $a1, $t6 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: ext.w.h $a1, $t5 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: ext.w.h $a1, $t4 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: ext.w.h $a1, $t3 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: ext.w.h $a1, $t2 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: ext.w.h $a1, $t1 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = zext <8 x i16> %vas to <8 x i32> + %vbe = sext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_d_wu_w: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a2, 0 +; LA32-NEXT: xvld $xr1, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 0 +; LA32-NEXT: xvpickve.w $xr2, $xr1, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 2 +; LA32-NEXT: xvpickve.w $xr2, $xr1, 4 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 4 +;
LA32-NEXT: xvpickve.w $xr1, $xr1, 6 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 6 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA32-NEXT: xvmul.d $xr0, $xr0, $xr2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_d_wu_w: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 0 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 6 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 4 +; LA64-NEXT: bstrpick.d $a4, $a4, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: bstrpick.d $a1, $a1, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: vinsgr2vr.d $vr0, $t0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a7, 1 +; LA64-NEXT: vinsgr2vr.d $vr2, $a6, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a5, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = zext <4 x i32> %vas to <4 x i64> + %vbe = sext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $fp, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 5 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA32-NEXT: xvpickve2gr.w $a6, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 4 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 0 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 1 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 5 +; LA32-NEXT: srai.w $t1, $t0, 31 +; LA32-NEXT: srai.w $t2, $a7, 31 +; LA32-NEXT: mulh.wu $t3, $a2, $a5 +; LA32-NEXT: mul.w $t4, $a6, $a5 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: sltu $t4, $t3, $t4 +; LA32-NEXT: mulh.wu $t5, $a6, $a5 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: mul.w $t5, $a2, $a7 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: sltu $t5, $t3, $t5 +; LA32-NEXT: mulh.wu $t6, $a2, $a7 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t4, $t5 +; LA32-NEXT: mul.w $t6, $a6, $a7 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: mul.w $t8, $t2, $a2 +; LA32-NEXT: add.w $fp, $t7, $t8 +; LA32-NEXT: sltu $s0, $fp, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: mulh.wu $a7, $a6, $a7 +; LA32-NEXT: add.w $a7, $a7,
$t4 +; LA32-NEXT: add.w $a7, $a7, $t6 +; LA32-NEXT: mul.w $a6, $t2, $a6 +; LA32-NEXT: mulh.wu $t2, $t2, $a2 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: add.w $a6, $a6, $t8 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: add.w $a6, $a6, $s0 +; LA32-NEXT: mulh.wu $a7, $a1, $a4 +; LA32-NEXT: mul.w $t2, $a3, $a4 +; LA32-NEXT: add.w $a7, $t2, $a7 +; LA32-NEXT: sltu $t2, $a7, $t2 +; LA32-NEXT: mulh.wu $t4, $a3, $a4 +; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: mul.w $t4, $a1, $t0 +; LA32-NEXT: add.w $a7, $t4, $a7 +; LA32-NEXT: sltu $t4, $a7, $t4 +; LA32-NEXT: mulh.wu $t5, $a1, $t0 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: add.w $t4, $t2, $t4 +; LA32-NEXT: mul.w $t5, $a3, $t0 +; LA32-NEXT: add.w $t6, $t5, $t4 +; LA32-NEXT: mul.w $t7, $t1, $a1 +; LA32-NEXT: add.w $t8, $t6, $t7 +; LA32-NEXT: sltu $s0, $t8, $t6 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: sltu $t2, $t4, $t2 +; LA32-NEXT: mulh.wu $t0, $a3, $t0 +; LA32-NEXT: add.w $t0, $t0, $t2 +; LA32-NEXT: add.w $t0, $t0, $t5 +; LA32-NEXT: mul.w $a3, $t1, $a3 +; LA32-NEXT: mulh.wu $t1, $t1, $a1 +; LA32-NEXT: add.w $a3, $t1, $a3 +; LA32-NEXT: add.w $a3, $a3, $t7 +; LA32-NEXT: add.w $a3, $t0, $a3 +; LA32-NEXT: add.w $a3, $a3, $s0 +; LA32-NEXT: mul.w $a2, $a2, $a5 +; LA32-NEXT: mul.w $a1, $a1, $a4 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 20 +; LA32-NEXT: st.w $t3, $a0, 4 +; LA32-NEXT: st.w $t8, $a0, 24 +; LA32-NEXT: st.w $fp, $a0, 8 +; LA32-NEXT: st.w $a3, $a0, 28 +; LA32-NEXT: st.w $a6, $a0, 12 +; LA32-NEXT: ld.w $s0, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 0 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 2 +; LA64-NEXT: srai.d $a5, $a4, 63 +; LA64-NEXT: srai.d $a6, $a3, 63 +; LA64-NEXT: mulh.du $a7, $a2, $a3 +; LA64-NEXT: mul.d $a6, $a2, $a6 +; LA64-NEXT: add.d $a6, $a7, $a6 +; LA64-NEXT: mulh.du $a7, $a1, $a4 +; LA64-NEXT: mul.d $a5, $a1, $a5 +; LA64-NEXT: add.d $a5, $a7, $a5 +; LA64-NEXT: mul.d $a2, $a2, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a4 +; LA64-NEXT: st.d $a1, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 0 +; LA64-NEXT: st.d $a5, $a0, 24 +; LA64-NEXT: st.d $a6, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vae = zext <2 x i64> %vas to <2 x i128> + %vbe = sext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b
$a1, $vr3, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vbs = shufflevector <32 x i8>
%vb, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vae = zext <16 x i8> %vas to <16 x i16> + %vbe = sext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_w_hu_h: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA32-NEXT: bstrpick.w $t0, $t0, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: bstrpick.w $a7, $a7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: bstrpick.w $a6, $a6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: bstrpick.w $a5, $a5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: bstrpick.w $a4, $a4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: bstrpick.w $a3, $a3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: bstrpick.w $a2, $a2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: ext.w.h $a1, $t8 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: ext.w.h $a1, $t7 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: ext.w.h $a1, $t6 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: ext.w.h $a1, $t5 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: ext.w.h $a1, $t4 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: ext.w.h $a1, $t3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: ext.w.h $a1, $t2 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: ext.w.h $a1, $t1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_w_hu_h: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA64-NEXT: vpickve2gr.h $a6, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA64-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA64-NEXT: bstrpick.d $t0, $t0, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: bstrpick.d $a7, $a7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0,
$a6, 2 +; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: bstrpick.d $a4, $a4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: ext.w.h $a1, $t8 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: ext.w.h $a1, $t7 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: ext.w.h $a1, $t6 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: ext.w.h $a1, $t5 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: ext.w.h $a1, $t4 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: ext.w.h $a1, $t3 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: ext.w.h $a1, $t2 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: ext.w.h $a1, $t1 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = zext <8 x i16> %vas to <8 x i32> + %vbe = sext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_d_wu_w: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a2, 0 +; LA32-NEXT: xvld $xr1, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 1 +; LA32-NEXT: xvrepli.b $xr2, 0 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 0 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 3 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 5 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 7 +; LA32-NEXT: xvinsve0.w $xr2, $xr0, 6 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA32-NEXT: xvmul.d $xr0, $xr2, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_d_wu_w: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 3 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 1 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 5 +; LA64-NEXT: bstrpick.d $a4, $a4, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: bstrpick.d $a3, $a3, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: bstrpick.d $a2, $a2, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: bstrpick.d $a1, $a1, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +;
LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: vinsgr2vr.d $vr0, $t0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a7, 1 +; LA64-NEXT: vinsgr2vr.d $vr2, $a6, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a5, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = zext <4 x i32> %vas to <4 x i64> + %vbe = sext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $fp, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a6, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 6 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA32-NEXT: xvpickve2gr.w $a7, $xr1, 3 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 7 +; LA32-NEXT: srai.w $t1, $t0, 31 +; LA32-NEXT: srai.w $t2, $a7, 31 +; LA32-NEXT: mulh.wu $t3, $a2, $a5 +; LA32-NEXT: mul.w $t4, $a6, $a5 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: sltu $t4, $t3, $t4 +; LA32-NEXT: mulh.wu $t5, $a6, $a5 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: mul.w $t5, $a2, $a7 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: sltu $t5, $t3, $t5 +; LA32-NEXT: mulh.wu $t6, $a2, $a7 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t4, $t5 +; LA32-NEXT: mul.w $t6, $a6, $a7 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: mul.w $t8, $t2, $a2 +; LA32-NEXT: add.w $fp, $t7, $t8 +; LA32-NEXT: sltu $s0, $fp, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: mulh.wu $a7, $a6, $a7 +; LA32-NEXT: add.w $a7, $a7, $t4 +; LA32-NEXT: add.w $a7, $a7, $t6 +; LA32-NEXT: mul.w $a6, $t2, $a6 +; LA32-NEXT: mulh.wu $t2, $t2, $a2 +; LA32-NEXT: add.w $a6, $t2, $a6 +; LA32-NEXT: add.w $a6, $a6, $t8 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: add.w $a6, $a6, $s0 +; LA32-NEXT: mulh.wu $a7, $a1, $a4 +; LA32-NEXT: mul.w $t2, $a3, $a4 +; LA32-NEXT: add.w $a7, $t2, $a7 +; LA32-NEXT: sltu $t2, $a7, $t2 +; LA32-NEXT: mulh.wu $t4, $a3, $a4 +; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: mul.w $t4, $a1, $t0 +; LA32-NEXT: add.w $a7, $t4, $a7 +; LA32-NEXT: sltu $t4, $a7, $t4 +; LA32-NEXT: mulh.wu $t5, $a1, $t0 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: add.w $t4, $t2, $t4 +; LA32-NEXT: mul.w $t5, $a3, $t0 +; LA32-NEXT: add.w $t6, $t5, $t4 +; LA32-NEXT: mul.w $t7, $t1, $a1 +; LA32-NEXT: add.w $t8, $t6, $t7 +; LA32-NEXT: sltu $s0, $t8, $t6 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: sltu $t2, $t4, $t2 +; LA32-NEXT: mulh.wu $t0, $a3, $t0 +; LA32-NEXT: add.w $t0, $t0, $t2 +; LA32-NEXT: add.w $t0, $t0, $t5 +; LA32-NEXT: mul.w $a3, $t1, $a3 +; LA32-NEXT: mulh.wu $t1, $t1, $a1 +; LA32-NEXT: add.w $a3, $t1, $a3 +; LA32-NEXT: add.w $a3, $a3, $t7 +; LA32-NEXT: add.w $a3, $t0, $a3 +; LA32-NEXT: add.w $a3, $a3, $s0 +; LA32-NEXT: mul.w $a2, $a2, $a5 +; LA32-NEXT: mul.w $a1, $a1, $a4 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 20 +; LA32-NEXT: st.w $t3, $a0, 4 +; LA32-NEXT: st.w $t8, $a0, 24
+; LA32-NEXT: st.w $fp, $a0, 8 +; LA32-NEXT: st.w $a3, $a0, 28 +; LA32-NEXT: st.w $a6, $a0, 12 +; LA32-NEXT: ld.w $s0, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 1 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 3 +; LA64-NEXT: srai.d $a5, $a4, 63 +; LA64-NEXT: srai.d $a6, $a3, 63 +; LA64-NEXT: mulh.du $a7, $a2, $a3 +; LA64-NEXT: mul.d $a6, $a2, $a6 +; LA64-NEXT: add.d $a6, $a7, $a6 +; LA64-NEXT: mulh.du $a7, $a1, $a4 +; LA64-NEXT: mul.d $a5, $a1, $a5 +; LA64-NEXT: add.d $a5, $a7, $a5 +; LA64-NEXT: mul.d $a2, $a2, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a4 +; LA64-NEXT: st.d $a1, $a0, 16 +; LA64-NEXT: st.d $a2, $a0, 0 +; LA64-NEXT: st.d $a5, $a0, 24 +; LA64-NEXT: st.d $a6, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vae = zext <2 x i64> %vas to <2 x i128> + %vbe = sext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0 +; CHECK-NEXT:
andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 0 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 2 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 4 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 6 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 8 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 10 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 12 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 14 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vae = sext <16 x i8> %vas to <16 x i16> + %vbe = zext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_w_hu_h_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA32-NEXT: ext.w.h $t0, $t0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: ext.w.h $a7, $a7 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: ext.w.h $a6, $a6 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: ext.w.h $a5, $a5 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: ext.w.h $a4,
$a4 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: ext.w.h $a3, $a3 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: ext.w.h $a2, $a2 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: ext.w.h $a1, $a1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: bstrpick.w $a1, $t8, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: bstrpick.w $a1, $t4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_w_hu_h_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 6 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 4 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 2 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 0 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $a6, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 0 +; LA64-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 6 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 4 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 2 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 0 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 6 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 4 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 2 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 0 +; LA64-NEXT: ext.w.h $t0, $t0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: ext.w.h $a7, $a7 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: ext.w.h $a6, $a6 +; LA64-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA64-NEXT: ext.w.h $a5, $a5 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: ext.w.h $a4, $a4 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: ext.w.h $a3, $a3 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: ext.w.h $a2, $a2 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: ext.w.h $a1, $a1 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: bstrpick.d $a1, $t8, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: bstrpick.d $a1, $t4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = sext <8 x i16> %vas to <8 x i32> + %vbe = zext <8 x i16> %vbs to <8 x
i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_d_wu_w_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 0 +; LA32-NEXT: xvpickve.w $xr3, $xr1, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr3, 2 +; LA32-NEXT: xvpickve.w $xr3, $xr1, 4 +; LA32-NEXT: xvinsve0.w $xr0, $xr3, 4 +; LA32-NEXT: xvpickve.w $xr1, $xr1, 6 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 6 +; LA32-NEXT: xvmul.d $xr0, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_d_wu_w_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 6 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 4 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 2 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 0 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 6 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 4 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: bstrpick.d $a1, $t0, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a7, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $a6, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a5, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = sext <4 x i32> %vas to <4 x i64> + %vbe = zext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du_d_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $fp, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 +; LA32-NEXT: xvpickve2gr.w $a6, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a7, $xr0, 5 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 5 +; LA32-NEXT: xvpickve2gr.w $a3, $xr1, 4 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 1 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 0 +; LA32-NEXT: srai.w $t1, $a7, 31 +; LA32-NEXT: srai.w $t2, $a6, 31 +; LA32-NEXT: mulh.wu $t3, $a2, $a4
+; LA32-NEXT: mul.w $t4, $a6, $a4 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: sltu $t4, $t3, $t4 +; LA32-NEXT: mulh.wu $t5, $a6, $a4 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: mul.w $t5, $a2, $t0 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: sltu $t5, $t3, $t5 +; LA32-NEXT: mulh.wu $t6, $a2, $t0 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t4, $t5 +; LA32-NEXT: mul.w $t6, $a6, $t0 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: mul.w $t8, $a4, $t2 +; LA32-NEXT: add.w $fp, $t7, $t8 +; LA32-NEXT: sltu $s0, $fp, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: mulh.wu $a6, $a6, $t0 +; LA32-NEXT: add.w $a6, $a6, $t4 +; LA32-NEXT: add.w $a6, $a6, $t6 +; LA32-NEXT: mulh.wu $t4, $a4, $t2 +; LA32-NEXT: add.w $t4, $t4, $t8 +; LA32-NEXT: mul.w $t0, $t0, $t2 +; LA32-NEXT: add.w $t0, $t4, $t0 +; LA32-NEXT: add.w $a6, $a6, $t0 +; LA32-NEXT: add.w $a6, $a6, $s0 +; LA32-NEXT: mulh.wu $t0, $a1, $a3 +; LA32-NEXT: mul.w $t2, $a7, $a3 +; LA32-NEXT: add.w $t0, $t2, $t0 +; LA32-NEXT: sltu $t2, $t0, $t2 +; LA32-NEXT: mulh.wu $t4, $a7, $a3 +; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: mul.w $t4, $a1, $a5 +; LA32-NEXT: add.w $t0, $t4, $t0 +; LA32-NEXT: sltu $t4, $t0, $t4 +; LA32-NEXT: mulh.wu $t5, $a1, $a5 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: add.w $t4, $t2, $t4 +; LA32-NEXT: mul.w $t5, $a7, $a5 +; LA32-NEXT: add.w $t6, $t5, $t4 +; LA32-NEXT: mul.w $t7, $a3, $t1 +; LA32-NEXT: add.w $t8, $t6, $t7 +; LA32-NEXT: sltu $s0, $t8, $t6 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: sltu $t2, $t4, $t2 +; LA32-NEXT: mulh.wu $a7, $a7, $a5 +; LA32-NEXT: add.w $a7, $a7, $t2 +; LA32-NEXT: add.w $a7, $a7, $t5 +; LA32-NEXT: mulh.wu $t2, $a3, $t1 +; LA32-NEXT: add.w $t2, $t2, $t7 +; LA32-NEXT: mul.w $a5, $a5, $t1 +; LA32-NEXT: add.w $a5, $t2, $a5 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: add.w $a5, $a5, $s0 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $t0, $a0, 20 +; LA32-NEXT: st.w $t3, $a0, 4 +; LA32-NEXT: st.w $t8, $a0, 24 +; LA32-NEXT: st.w $fp, $a0, 8 +; LA32-NEXT: st.w $a5, $a0, 28 +; LA32-NEXT: st.w $a6, $a0, 12 +; LA32-NEXT: ld.w $s0, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du_d_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 2 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 0 +; LA64-NEXT: srai.d $a5, $a2, 63 +; LA64-NEXT: srai.d $a6, $a1, 63 +; LA64-NEXT: mulh.du $a7, $a1, $a4 +; LA64-NEXT: mul.d $a6, $a6, $a4 +; LA64-NEXT: add.d $a6, $a7, $a6 +; LA64-NEXT: mulh.du $a7, $a2, $a3 +; LA64-NEXT: mul.d $a5, $a5, $a3 +; LA64-NEXT: add.d $a5, $a7, $a5 +; LA64-NEXT: mul.d $a1, $a1, $a4 +; LA64-NEXT: mul.d $a2, $a2, $a3 +; LA64-NEXT: st.d $a2, $a0, 16 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a5, $a0, 24 +; LA64-NEXT: st.d $a6, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 0, i32 2> + %vae = sext <2 x i64> %vas to <2 x i128> + %vbe = zext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu_b_1(ptr %res,
ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr3, $a1, 0 +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvpermi.d $xr2, $xr3, 14 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr3, 15 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14 +; CHECK-NEXT: ext.w.b $a1, $a1 +; CHECK-NEXT: vinsgr2vr.h $vr3, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 4 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr4, $a1, 7 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 1 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 3 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 5 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 7 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 9 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4 +; 
CHECK-NEXT: vpickve2gr.b $a1, $vr2, 11 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 13 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6 +; CHECK-NEXT: vpickve2gr.b $a1, $vr2, 15 +; CHECK-NEXT: andi $a1, $a1, 255 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 +; CHECK-NEXT: xvpermi.q $xr1, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr4, $xr0, 2 +; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %vas = shufflevector <32 x i8> %va, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vbs = shufflevector <32 x i8> %vb, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %vae = sext <16 x i8> %vas to <16 x i16> + %vbe = zext <16 x i8> %vbs to <16 x i16> + %mul = mul <16 x i16> %vae, %vbe + store <16 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_w_hu_h_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA32-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA32-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA32-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA32-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA32-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $a6, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA32-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA32-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA32-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA32-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA32-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA32-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA32-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA32-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA32-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA32-NEXT: ext.w.h $t0, $t0 +; LA32-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA32-NEXT: ext.w.h $a7, $a7 +; LA32-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA32-NEXT: ext.w.h $a6, $a6 +; LA32-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA32-NEXT: ext.w.h $a5, $a5 +; LA32-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA32-NEXT: ext.w.h $a4, $a4 +; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA32-NEXT: ext.w.h $a3, $a3 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: ext.w.h $a2, $a2 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA32-NEXT: ext.w.h $a1, $a1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: bstrpick.w $a1, $t8, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t7, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t6, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t5, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: bstrpick.w $a1, $t4, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: bstrpick.w $a1, $t3, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: bstrpick.w $a1, $t2, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: bstrpick.w $a1, $t1, 15, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_w_hu_h_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpermi.d $xr2, $xr0, 14 +; LA64-NEXT: vpickve2gr.h $a1, $vr2, 7 +; LA64-NEXT: vpickve2gr.h $a2, $vr2, 5 +; LA64-NEXT: vpickve2gr.h $a3, $vr2, 3 +; LA64-NEXT: vpickve2gr.h $a4, $vr2, 1 +; LA64-NEXT: vpickve2gr.h $a5, $vr0, 7 +; LA64-NEXT:
vpickve2gr.h $a6, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $a7, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t0, $vr0, 1 +; LA64-NEXT: xvpermi.d $xr0, $xr1, 14 +; LA64-NEXT: vpickve2gr.h $t1, $vr0, 7 +; LA64-NEXT: vpickve2gr.h $t2, $vr0, 5 +; LA64-NEXT: vpickve2gr.h $t3, $vr0, 3 +; LA64-NEXT: vpickve2gr.h $t4, $vr0, 1 +; LA64-NEXT: vpickve2gr.h $t5, $vr1, 7 +; LA64-NEXT: vpickve2gr.h $t6, $vr1, 5 +; LA64-NEXT: vpickve2gr.h $t7, $vr1, 3 +; LA64-NEXT: vpickve2gr.h $t8, $vr1, 1 +; LA64-NEXT: ext.w.h $t0, $t0 +; LA64-NEXT: vinsgr2vr.w $vr0, $t0, 0 +; LA64-NEXT: ext.w.h $a7, $a7 +; LA64-NEXT: vinsgr2vr.w $vr0, $a7, 1 +; LA64-NEXT: ext.w.h $a6, $a6 +; LA64-NEXT: vinsgr2vr.w $vr0, $a6, 2 +; LA64-NEXT: ext.w.h $a5, $a5 +; LA64-NEXT: vinsgr2vr.w $vr0, $a5, 3 +; LA64-NEXT: ext.w.h $a4, $a4 +; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 0 +; LA64-NEXT: ext.w.h $a3, $a3 +; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA64-NEXT: ext.w.h $a2, $a2 +; LA64-NEXT: vinsgr2vr.w $vr1, $a2, 2 +; LA64-NEXT: ext.w.h $a1, $a1 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: bstrpick.d $a1, $t8, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t7, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t6, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t5, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: bstrpick.d $a1, $t4, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $t3, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $t2, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: bstrpick.d $a1, $t1, 15, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvmul.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %vas = shufflevector <16 x i16> %va, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i16> %vb, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = sext <8 x i16> %vas to <8 x i32> + %vbe = zext <8 x i16> %vbs to <8 x i32> + %mul = mul <8 x i32> %vae, %vbe + store <8 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_d_wu_w_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 0 +; LA32-NEXT: srai.w $a4, $a4, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a4, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 2 +; LA32-NEXT: srai.w $a3, $a3, 31 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 0 +; LA32-NEXT: srai.w $a2, $a2, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: srai.w $a1, $a1, 31 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 1 +; LA32-NEXT: xvrepli.b $xr3, 0 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 0 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 3 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 5 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 4 +; LA32-NEXT: xvpickve.w $xr0, $xr1, 7 +; LA32-NEXT: xvinsve0.w $xr3, $xr0, 6 +; LA32-NEXT: xvmul.d $xr0, $xr2, $xr3 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_d_wu_w_1: +; LA64: # %bb.0: # %entry +;
LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7 +; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5 +; LA64-NEXT: xvpickve2gr.w $a5, $xr1, 3 +; LA64-NEXT: xvpickve2gr.w $a6, $xr1, 1 +; LA64-NEXT: xvpickve2gr.w $a7, $xr1, 7 +; LA64-NEXT: xvpickve2gr.w $t0, $xr1, 5 +; LA64-NEXT: vinsgr2vr.d $vr0, $a4, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a3, 1 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: bstrpick.d $a1, $t0, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a7, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; LA64-NEXT: bstrpick.d $a1, $a6, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 +; LA64-NEXT: bstrpick.d $a1, $a5, 31, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvmul.d $xr0, $xr1, $xr2 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vas = shufflevector <8 x i32> %va, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i32> %vb, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = sext <4 x i32> %vas to <4 x i64> + %vbe = zext <4 x i32> %vbs to <4 x i64> + %mul = mul <4 x i64> %vae, %vbe + store <4 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du_d_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $fp, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a6, $xr0, 3 +; LA32-NEXT: xvpickve2gr.w $a7, $xr0, 7 +; LA32-NEXT: xvpickve2gr.w $a5, $xr1, 7 +; LA32-NEXT: xvpickve2gr.w $a3, $xr1, 6 +; LA32-NEXT: xvpickve2gr.w $t0, $xr1, 3 +; LA32-NEXT: xvpickve2gr.w $a4, $xr1, 2 +; LA32-NEXT: srai.w $t1, $a7, 31 +; LA32-NEXT: srai.w $t2, $a6, 31 +; LA32-NEXT: mulh.wu $t3, $a2, $a4 +; LA32-NEXT: mul.w $t4, $a6, $a4 +; LA32-NEXT: add.w $t3, $t4, $t3 +; LA32-NEXT: sltu $t4, $t3, $t4 +; LA32-NEXT: mulh.wu $t5, $a6, $a4 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: mul.w $t5, $a2, $t0 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: sltu $t5, $t3, $t5 +; LA32-NEXT: mulh.wu $t6, $a2, $t0 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t4, $t5 +; LA32-NEXT: mul.w $t6, $a6, $t0 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: mul.w $t8, $a4, $t2 +; LA32-NEXT: add.w $fp, $t7, $t8 +; LA32-NEXT: sltu $s0, $fp, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: mulh.wu $a6, $a6, $t0 +; LA32-NEXT: add.w $a6, $a6, $t4 +; LA32-NEXT: add.w $a6, $a6, $t6 +; LA32-NEXT: mulh.wu $t4, $a4, $t2 +; LA32-NEXT: add.w $t4, $t4, $t8 +; LA32-NEXT: mul.w $t0, $t0, $t2 +; LA32-NEXT: add.w $t0, $t4, $t0 +; LA32-NEXT: add.w $a6, $a6, $t0 +; LA32-NEXT: add.w $a6, $a6, $s0 +; LA32-NEXT: mulh.wu $t0, $a1, $a3 +; LA32-NEXT: mul.w $t2, $a7, $a3 +; LA32-NEXT: add.w $t0, $t2, $t0 +; LA32-NEXT: sltu $t2, $t0, $t2 +; LA32-NEXT: mulh.wu $t4, $a7, $a3 +; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: mul.w $t4, $a1, $a5 +; LA32-NEXT: add.w $t0, $t4, $t0 +; LA32-NEXT: sltu $t4, $t0, $t4 +; LA32-NEXT: mulh.wu $t5, $a1, $a5 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: add.w $t4, $t2, $t4 +; LA32-NEXT: mul.w $t5, $a7, $a5 +; LA32-NEXT: add.w 
$t6, $t5, $t4 +; LA32-NEXT: mul.w $t7, $a3, $t1 +; LA32-NEXT: add.w $t8, $t6, $t7 +; LA32-NEXT: sltu $s0, $t8, $t6 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: sltu $t2, $t4, $t2 +; LA32-NEXT: mulh.wu $a7, $a7, $a5 +; LA32-NEXT: add.w $a7, $a7, $t2 +; LA32-NEXT: add.w $a7, $a7, $t5 +; LA32-NEXT: mulh.wu $t2, $a3, $t1 +; LA32-NEXT: add.w $t2, $t2, $t7 +; LA32-NEXT: mul.w $a5, $a5, $t1 +; LA32-NEXT: add.w $a5, $t2, $a5 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: add.w $a5, $a5, $s0 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 16 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $t0, $a0, 20 +; LA32-NEXT: st.w $t3, $a0, 4 +; LA32-NEXT: st.w $t8, $a0, 24 +; LA32-NEXT: st.w $fp, $a0, 8 +; LA32-NEXT: st.w $a5, $a0, 28 +; LA32-NEXT: st.w $a6, $a0, 12 +; LA32-NEXT: ld.w $s0, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du_d_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a3, $xr1, 3 +; LA64-NEXT: xvpickve2gr.d $a4, $xr1, 1 +; LA64-NEXT: srai.d $a5, $a2, 63 +; LA64-NEXT: srai.d $a6, $a1, 63 +; LA64-NEXT: mulh.du $a7, $a1, $a4 +; LA64-NEXT: mul.d $a6, $a6, $a4 +; LA64-NEXT: add.d $a6, $a7, $a6 +; LA64-NEXT: mulh.du $a7, $a2, $a3 +; LA64-NEXT: mul.d $a5, $a5, $a3 +; LA64-NEXT: add.d $a5, $a7, $a5 +; LA64-NEXT: mul.d $a1, $a1, $a4 +; LA64-NEXT: mul.d $a2, $a2, $a3 +; LA64-NEXT: st.d $a2, $a0, 16 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a5, $a0, 24 +; LA64-NEXT: st.d $a6, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vas = shufflevector <4 x i64> %va, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i64> %vb, <4 x i64> poison, <2 x i32> <i32 1, i32 3> + %vae = sext <2 x i64> %vas to <2 x i128> + %vbe = zext <2 x i64> %vbs to <2 x i128> + %mul = mul <2 x i128> %vae, %vbe + store <2 x i128> %mul, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll new file mode 100644 index 0000000000000..cd83c1dff652f --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll @@ -0,0 +1,1145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 + +define void @vmulwev_h_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vslli.h $vr0, $vr0, 8 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 +; CHECK-NEXT: vslli.h $vr1, $vr1, 8 +; CHECK-NEXT: vsrai.h $vr1, $vr1, 8 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = sext <8 x i8> %vas to <8 x i16> + %vbe = sext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_h(ptr %res, ptr %a, ptr %b) 
nounwind { +; CHECK-LABEL: vmulwev_w_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vslli.w $vr0, $vr0, 16 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 +; CHECK-NEXT: vslli.w $vr1, $vr1, 16 +; CHECK-NEXT: vsrai.w $vr1, $vr1, 16 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = sext <4 x i16> %vas to <4 x i32> + %vbe = sext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_d_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vslli.d $vr0, $vr0, 32 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 +; CHECK-NEXT: vslli.d $vr1, $vr1, 32 +; CHECK-NEXT: vsrai.d $vr1, $vr1, 32 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vae = sext <2 x i32> %vas to <2 x i64> + %vbe = sext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 0 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 1 +; LA32-NEXT: srai.w $a5, $a2, 31 +; LA32-NEXT: srai.w $a6, $a4, 31 +; LA32-NEXT: mulh.wu $a7, $a1, $a3 +; LA32-NEXT: mul.w $t0, $a2, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: sltu $t0, $a7, $t0 +; LA32-NEXT: mulh.wu $t1, $a2, $a3 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: mul.w $t1, $a1, $a4 +; LA32-NEXT: add.w $a7, $t1, $a7 +; LA32-NEXT: sltu $t1, $a7, $t1 +; LA32-NEXT: mulh.wu $t2, $a1, $a4 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: add.w $t1, $t0, $t1 +; LA32-NEXT: mul.w $t2, $a2, $a4 +; LA32-NEXT: add.w $t3, $t2, $t1 +; LA32-NEXT: mul.w $t4, $a3, $a5 +; LA32-NEXT: mul.w $t5, $a6, $a1 +; LA32-NEXT: add.w $t6, $t5, $t4 +; LA32-NEXT: add.w $t7, $t3, $t6 +; LA32-NEXT: sltu $t8, $t7, $t3 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: sltu $t0, $t1, $t0 +; LA32-NEXT: mulh.wu $t1, $a2, $a4 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $t0, $t2 +; LA32-NEXT: mulh.wu $t1, $a3, $a5 +; LA32-NEXT: add.w $t1, $t1, $t4 +; LA32-NEXT: mul.w $a4, $a4, $a5 +; LA32-NEXT: add.w $a4, $t1, $a4 +; LA32-NEXT: mul.w $a2, $a6, $a2 +; LA32-NEXT: mulh.wu $a5, $a6, $a1 +; LA32-NEXT: add.w $a2, $a5, $a2 +; LA32-NEXT: add.w $a2, $a2, $t5 +; LA32-NEXT: add.w $a2, $a2, $a4 +; LA32-NEXT: sltu $a4, $t6, $t5 +; LA32-NEXT: add.w $a2, $a2, $a4 +; LA32-NEXT: add.w $a2, $t0, $a2 +; LA32-NEXT: add.w $a2, $a2, $t8 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 4 +; LA32-NEXT: st.w $t7, $a0, 8 +; LA32-NEXT: st.w $a2, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 0 +; LA64-NEXT: ld.d $a2, $a2, 0 +; LA64-NEXT: mul.d $a3, $a1, $a2 +; LA64-NEXT: 
mulh.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 8 +; LA64-NEXT: st.d $a3, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 0 + %be = extractelement <2 x i64> %vb, i32 0 + %ax = sext i64 %ae to i128 + %bx = sext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwod_h_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 49 +; CHECK-NEXT: vslli.h $vr0, $vr0, 8 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 +; CHECK-NEXT: vshuf4i.b $vr1, $vr1, 49 +; CHECK-NEXT: vslli.h $vr1, $vr1, 8 +; CHECK-NEXT: vsrai.h $vr1, $vr1, 8 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = sext <8 x i8> %vas to <8 x i16> + %vbe = sext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_w_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 49 +; CHECK-NEXT: vslli.w $vr0, $vr0, 16 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 +; CHECK-NEXT: vshuf4i.h $vr1, $vr1, 49 +; CHECK-NEXT: vslli.w $vr1, $vr1, 16 +; CHECK-NEXT: vsrai.w $vr1, $vr1, 16 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = sext <4 x i16> %vas to <4 x i32> + %vbe = sext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_d_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 49 +; CHECK-NEXT: vslli.d $vr0, $vr0, 32 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 +; CHECK-NEXT: vshuf4i.w $vr1, $vr1, 49 +; CHECK-NEXT: vslli.d $vr1, $vr1, 32 +; CHECK-NEXT: vsrai.d $vr1, $vr1, 32 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vae = sext <2 x i32> %vas to <2 x i64> + %vbe = sext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 3 +; LA32-NEXT: srai.w $a5, $a2, 31 +; LA32-NEXT: srai.w $a6, $a4, 31 +; LA32-NEXT: mulh.wu $a7, $a1, $a3 +; LA32-NEXT: mul.w $t0, $a2, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: sltu $t0, $a7, $t0 +; 
LA32-NEXT: mulh.wu $t1, $a2, $a3 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: mul.w $t1, $a1, $a4 +; LA32-NEXT: add.w $a7, $t1, $a7 +; LA32-NEXT: sltu $t1, $a7, $t1 +; LA32-NEXT: mulh.wu $t2, $a1, $a4 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: add.w $t1, $t0, $t1 +; LA32-NEXT: mul.w $t2, $a2, $a4 +; LA32-NEXT: add.w $t3, $t2, $t1 +; LA32-NEXT: mul.w $t4, $a3, $a5 +; LA32-NEXT: mul.w $t5, $a6, $a1 +; LA32-NEXT: add.w $t6, $t5, $t4 +; LA32-NEXT: add.w $t7, $t3, $t6 +; LA32-NEXT: sltu $t8, $t7, $t3 +; LA32-NEXT: sltu $t2, $t3, $t2 +; LA32-NEXT: sltu $t0, $t1, $t0 +; LA32-NEXT: mulh.wu $t1, $a2, $a4 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $t0, $t2 +; LA32-NEXT: mulh.wu $t1, $a3, $a5 +; LA32-NEXT: add.w $t1, $t1, $t4 +; LA32-NEXT: mul.w $a4, $a4, $a5 +; LA32-NEXT: add.w $a4, $t1, $a4 +; LA32-NEXT: mul.w $a2, $a6, $a2 +; LA32-NEXT: mulh.wu $a5, $a6, $a1 +; LA32-NEXT: add.w $a2, $a5, $a2 +; LA32-NEXT: add.w $a2, $a2, $t5 +; LA32-NEXT: add.w $a2, $a2, $a4 +; LA32-NEXT: sltu $a4, $t6, $t5 +; LA32-NEXT: add.w $a2, $a2, $a4 +; LA32-NEXT: add.w $a2, $t0, $a2 +; LA32-NEXT: add.w $a2, $a2, $t8 +; LA32-NEXT: mul.w $a1, $a1, $a3 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a7, $a0, 4 +; LA32-NEXT: st.w $t7, $a0, 8 +; LA32-NEXT: st.w $a2, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: ld.d $a2, $a2, 8 +; LA64-NEXT: mul.d $a3, $a1, $a2 +; LA64-NEXT: mulh.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 8 +; LA64-NEXT: st.d $a3, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 1 + %be = extractelement <2 x i64> %vb, i32 1 + %ax = sext i64 %ae to i128 + %bx = sext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI8_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI8_0) +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.b $vr0, $vr3, $vr0, $vr1 +; CHECK-NEXT: vshuf.b $vr1, $vr3, $vr2, $vr1 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = zext <8 x i8> %vas to <8 x i16> + %vbe = zext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_w_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI9_0) +; CHECK-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI9_0) +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vori.b $vr4, $vr0, 0 +; CHECK-NEXT: vshuf.h $vr4, $vr3, $vr1 +; CHECK-NEXT: vshuf.h $vr0, $vr3, $vr2 +; CHECK-NEXT: vmul.w $vr0, $vr4, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = zext <4 x i16> %vas to <4 x i32> + %vbe = zext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> 
%vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_d_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) +; CHECK-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI10_0) +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vori.b $vr4, $vr0, 0 +; CHECK-NEXT: vshuf.w $vr4, $vr3, $vr1 +; CHECK-NEXT: vshuf.w $vr0, $vr3, $vr2 +; CHECK-NEXT: vmul.d $vr0, $vr4, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vae = zext <2 x i32> %vas to <2 x i64> + %vbe = zext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 0 +; LA32-NEXT: mulh.wu $a5, $a2, $a4 +; LA32-NEXT: mul.w $a6, $a1, $a4 +; LA32-NEXT: add.w $a5, $a6, $a5 +; LA32-NEXT: sltu $a6, $a5, $a6 +; LA32-NEXT: mulh.wu $a7, $a1, $a4 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: mul.w $a7, $a2, $a3 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: sltu $a7, $a5, $a7 +; LA32-NEXT: mulh.wu $t0, $a2, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: add.w $a7, $a6, $a7 +; LA32-NEXT: mul.w $t0, $a1, $a3 +; LA32-NEXT: add.w $t1, $t0, $a7 +; LA32-NEXT: sltu $t0, $t1, $t0 +; LA32-NEXT: sltu $a6, $a7, $a6 +; LA32-NEXT: mulh.wu $a1, $a1, $a3 +; LA32-NEXT: add.w $a1, $a1, $a6 +; LA32-NEXT: add.w $a1, $a1, $t0 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a5, $a0, 4 +; LA32-NEXT: st.w $t1, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 0 +; LA64-NEXT: ld.d $a2, $a2, 0 +; LA64-NEXT: mul.d $a3, $a1, $a2 +; LA64-NEXT: mulh.du $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 8 +; LA64-NEXT: st.d $a3, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 0 + %be = extractelement <2 x i64> %vb, i32 0 + %ax = zext i64 %ae to i128 + %bx = zext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vpackod.b $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = zext <8 x i8> %vas to <8 x i16> + %vbe = zext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_w_hu: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vpackod.h $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = zext <4 x i16> %vas to <4 x i32> + %vbe = zext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_d_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vpackod.w $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vae = zext <2 x i32> %vas to <2 x i64> + %vbe = zext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 2 +; LA32-NEXT: mulh.wu $a5, $a2, $a4 +; LA32-NEXT: mul.w $a6, $a1, $a4 +; LA32-NEXT: add.w $a5, $a6, $a5 +; LA32-NEXT: sltu $a6, $a5, $a6 +; LA32-NEXT: mulh.wu $a7, $a1, $a4 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: mul.w $a7, $a2, $a3 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: sltu $a7, $a5, $a7 +; LA32-NEXT: mulh.wu $t0, $a2, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: add.w $a7, $a6, $a7 +; LA32-NEXT: mul.w $t0, $a1, $a3 +; LA32-NEXT: add.w $t1, $t0, $a7 +; LA32-NEXT: sltu $t0, $t1, $t0 +; LA32-NEXT: sltu $a6, $a7, $a6 +; LA32-NEXT: mulh.wu $a1, $a1, $a3 +; LA32-NEXT: add.w $a1, $a1, $a6 +; LA32-NEXT: add.w $a1, $a1, $t0 +; LA32-NEXT: mul.w $a2, $a2, $a4 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a5, $a0, 4 +; LA32-NEXT: st.w $t1, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: ld.d $a2, $a2, 8 +; LA64-NEXT: mul.d $a3, $a1, $a2 +; LA64-NEXT: mulh.du $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 8 +; LA64-NEXT: st.d $a3, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 1 + %be = extractelement <2 x i64> %vb, i32 1 + %ax = zext i64 %ae to i128 + %bx = zext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI16_0) +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.b $vr0, $vr3, $vr0, $vr1 +; CHECK-NEXT: vslli.h $vr1, $vr2, 8 +; CHECK-NEXT: 
vsrai.h $vr1, $vr1, 8 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = zext <8 x i8> %vas to <8 x i16> + %vbe = sext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_w_hu_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI17_0) +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.h $vr1, $vr3, $vr0 +; CHECK-NEXT: vslli.w $vr0, $vr2, 16 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 +; CHECK-NEXT: vmul.w $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = zext <4 x i16> %vas to <4 x i32> + %vbe = sext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_d_wu_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI18_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI18_0) +; CHECK-NEXT: vld $vr2, $a2, 0 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.w $vr1, $vr3, $vr0 +; CHECK-NEXT: vslli.d $vr0, $vr2, 32 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 +; CHECK-NEXT: vmul.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vae = zext <2 x i32> %vas to <2 x i64> + %vbe = sext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 0 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 1 +; LA32-NEXT: srai.w $a5, $a4, 31 +; LA32-NEXT: mulh.wu $a6, $a2, $a3 +; LA32-NEXT: mul.w $a7, $a1, $a3 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: sltu $a7, $a6, $a7 +; LA32-NEXT: mulh.wu $t0, $a1, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: mul.w $t0, $a2, $a4 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: sltu $t0, $a6, $t0 +; LA32-NEXT: mulh.wu $t1, $a2, $a4 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $a7, $t0 +; LA32-NEXT: mul.w $t1, $a1, $a4 +; LA32-NEXT: add.w $t2, $t1, $t0 +; LA32-NEXT: mul.w $t3, $a5, $a2 +; LA32-NEXT: add.w $t4, $t2, $t3 +; LA32-NEXT: sltu $t5, $t4, $t2 +; LA32-NEXT: sltu $t1, $t2, $t1 +; LA32-NEXT: sltu $a7, $t0, $a7 +; LA32-NEXT: mulh.wu $a4, $a1, $a4 +; LA32-NEXT: add.w $a4, $a4, $a7 +; LA32-NEXT: add.w $a4, $a4, $t1 +; LA32-NEXT: mul.w $a1, $a5, $a1 +; LA32-NEXT: mulh.wu $a5, $a5, $a2 +; LA32-NEXT: add.w $a1, $a5, $a1 +; LA32-NEXT: add.w $a1, $a1, $t3 +; 
LA32-NEXT: add.w $a1, $a4, $a1 +; LA32-NEXT: add.w $a1, $a1, $t5 +; LA32-NEXT: mul.w $a2, $a2, $a3 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $t4, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a2, $a2, 0 +; LA64-NEXT: ld.d $a1, $a1, 0 +; LA64-NEXT: srai.d $a3, $a2, 63 +; LA64-NEXT: mulh.du $a4, $a1, $a2 +; LA64-NEXT: mul.d $a3, $a1, $a3 +; LA64-NEXT: add.d $a3, $a4, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a3, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 0 + %be = extractelement <2 x i64> %vb, i32 0 + %ax = zext i64 %ae to i128 + %bx = sext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vshuf4i.b $vr1, $vr1, 49 +; CHECK-NEXT: vslli.h $vr1, $vr1, 8 +; CHECK-NEXT: vsrai.h $vr1, $vr1, 8 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = zext <8 x i8> %vas to <8 x i16> + %vbe = sext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_w_hu_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vshuf4i.h $vr1, $vr1, 49 +; CHECK-NEXT: vslli.w $vr1, $vr1, 16 +; CHECK-NEXT: vsrai.w $vr1, $vr1, 16 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = zext <4 x i16> %vas to <4 x i32> + %vbe = sext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_d_wu_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vshuf4i.w $vr1, $vr1, 49 +; CHECK-NEXT: vslli.d $vr1, $vr1, 32 +; CHECK-NEXT: vsrai.d $vr1, $vr1, 32 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vae = zext <2 x i32> %vas to <2 x i64> + %vbe = sext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du_d(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: 
vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 3 +; LA32-NEXT: srai.w $a5, $a4, 31 +; LA32-NEXT: mulh.wu $a6, $a2, $a3 +; LA32-NEXT: mul.w $a7, $a1, $a3 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: sltu $a7, $a6, $a7 +; LA32-NEXT: mulh.wu $t0, $a1, $a3 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: mul.w $t0, $a2, $a4 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: sltu $t0, $a6, $t0 +; LA32-NEXT: mulh.wu $t1, $a2, $a4 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $a7, $t0 +; LA32-NEXT: mul.w $t1, $a1, $a4 +; LA32-NEXT: add.w $t2, $t1, $t0 +; LA32-NEXT: mul.w $t3, $a5, $a2 +; LA32-NEXT: add.w $t4, $t2, $t3 +; LA32-NEXT: sltu $t5, $t4, $t2 +; LA32-NEXT: sltu $t1, $t2, $t1 +; LA32-NEXT: sltu $a7, $t0, $a7 +; LA32-NEXT: mulh.wu $a4, $a1, $a4 +; LA32-NEXT: add.w $a4, $a4, $a7 +; LA32-NEXT: add.w $a4, $a4, $t1 +; LA32-NEXT: mul.w $a1, $a5, $a1 +; LA32-NEXT: mulh.wu $a5, $a5, $a2 +; LA32-NEXT: add.w $a1, $a5, $a1 +; LA32-NEXT: add.w $a1, $a1, $t3 +; LA32-NEXT: add.w $a1, $a4, $a1 +; LA32-NEXT: add.w $a1, $a1, $t5 +; LA32-NEXT: mul.w $a2, $a2, $a3 +; LA32-NEXT: st.w $a2, $a0, 0 +; LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $t4, $a0, 8 +; LA32-NEXT: st.w $a1, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a2, $a2, 8 +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: srai.d $a3, $a2, 63 +; LA64-NEXT: mulh.du $a4, $a1, $a2 +; LA64-NEXT: mul.d $a3, $a1, $a3 +; LA64-NEXT: add.d $a3, $a4, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a3, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 1 + %be = extractelement <2 x i64> %vb, i32 1 + %ax = zext i64 %ae to i128 + %bx = sext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwev_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_h_bu_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI24_0) +; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI24_0) +; CHECK-NEXT: vslli.h $vr0, $vr0, 8 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.b $vr1, $vr3, $vr1, $vr2 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vae = sext <8 x i8> %vas to <8 x i16> + %vbe = zext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwev_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_w_hu_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) +; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI25_0) +; CHECK-NEXT: vslli.w $vr0, $vr0, 16 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.h $vr2, $vr3, $vr1 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = 
shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vae = sext <4 x i16> %vas to <4 x i32> + %vbe = zext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwev_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwev_d_wu_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI26_0) +; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI26_0) +; CHECK-NEXT: vslli.d $vr0, $vr0, 32 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 +; CHECK-NEXT: vrepli.b $vr3, 0 +; CHECK-NEXT: vshuf.w $vr2, $vr3, $vr1 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 0, i32 2> + %vae = sext <2 x i32> %vas to <2 x i64> + %vbe = zext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwev_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwev_q_du_d_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 0 +; LA32-NEXT: srai.w $a5, $a2, 31 +; LA32-NEXT: mulh.wu $a6, $a1, $a4 +; LA32-NEXT: mul.w $a7, $a2, $a4 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: sltu $a7, $a6, $a7 +; LA32-NEXT: mulh.wu $t0, $a2, $a4 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: mul.w $t0, $a1, $a3 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: sltu $t0, $a6, $t0 +; LA32-NEXT: mulh.wu $t1, $a1, $a3 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $a7, $t0 +; LA32-NEXT: mul.w $t1, $a2, $a3 +; LA32-NEXT: add.w $t2, $t1, $t0 +; LA32-NEXT: mul.w $t3, $a4, $a5 +; LA32-NEXT: add.w $t4, $t2, $t3 +; LA32-NEXT: sltu $t5, $t4, $t2 +; LA32-NEXT: sltu $t1, $t2, $t1 +; LA32-NEXT: sltu $a7, $t0, $a7 +; LA32-NEXT: mulh.wu $a2, $a2, $a3 +; LA32-NEXT: add.w $a2, $a2, $a7 +; LA32-NEXT: add.w $a2, $a2, $t1 +; LA32-NEXT: mulh.wu $a7, $a4, $a5 +; LA32-NEXT: add.w $a7, $a7, $t3 +; LA32-NEXT: mul.w $a3, $a3, $a5 +; LA32-NEXT: add.w $a3, $a7, $a3 +; LA32-NEXT: add.w $a2, $a2, $a3 +; LA32-NEXT: add.w $a2, $a2, $t5 +; LA32-NEXT: mul.w $a1, $a1, $a4 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $t4, $a0, 8 +; LA32-NEXT: st.w $a2, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwev_q_du_d_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 0 +; LA64-NEXT: ld.d $a2, $a2, 0 +; LA64-NEXT: srai.d $a3, $a1, 63 +; LA64-NEXT: mulh.du $a4, $a1, $a2 +; LA64-NEXT: mul.d $a3, $a3, $a2 +; LA64-NEXT: add.d $a3, $a4, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a3, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 0 + %be = extractelement <2 x i64> %vb, i32 0 + %ax = sext i64 %ae to i128 + %bx = zext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +} + +define void @vmulwod_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_h_bu_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld 
$vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 49 +; CHECK-NEXT: vslli.h $vr0, $vr0, 8 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.b $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %vas = shufflevector <16 x i8> %va, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vbs = shufflevector <16 x i8> %vb, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %vae = sext <8 x i8> %vas to <8 x i16> + %vbe = zext <8 x i8> %vbs to <8 x i16> + %mul = mul <8 x i16> %vae, %vbe + store <8 x i16> %mul, ptr %res + ret void +} + +define void @vmulwod_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_w_hu_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 49 +; CHECK-NEXT: vslli.w $vr0, $vr0, 16 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.h $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %vas = shufflevector <8 x i16> %va, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vbs = shufflevector <8 x i16> %vb, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %vae = sext <4 x i16> %vas to <4 x i32> + %vbe = zext <4 x i16> %vbs to <4 x i32> + %mul = mul <4 x i32> %vae, %vbe + store <4 x i32> %mul, ptr %res + ret void +} + +define void @vmulwod_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vmulwod_d_wu_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 49 +; CHECK-NEXT: vslli.d $vr0, $vr0, 32 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 +; CHECK-NEXT: vrepli.b $vr2, 0 +; CHECK-NEXT: vpackod.w $vr1, $vr2, $vr1 +; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %vas = shufflevector <4 x i32> %va, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vbs = shufflevector <4 x i32> %vb, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + %vae = sext <2 x i32> %vas to <2 x i64> + %vbe = zext <2 x i32> %vbs to <2 x i64> + %mul = mul <2 x i64> %vae, %vbe + store <2 x i64> %mul, ptr %res + ret void +} + +define void @vmulwod_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind { +; LA32-LABEL: vmulwod_q_du_d_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr1, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr1, 2 +; LA32-NEXT: srai.w $a5, $a2, 31 +; LA32-NEXT: mulh.wu $a6, $a1, $a4 +; LA32-NEXT: mul.w $a7, $a2, $a4 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: sltu $a7, $a6, $a7 +; LA32-NEXT: mulh.wu $t0, $a2, $a4 +; LA32-NEXT: add.w $a7, $t0, $a7 +; LA32-NEXT: mul.w $t0, $a1, $a3 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: sltu $t0, $a6, $t0 +; LA32-NEXT: mulh.wu $t1, $a1, $a3 +; LA32-NEXT: add.w $t0, $t1, $t0 +; LA32-NEXT: add.w $t0, $a7, $t0 +; LA32-NEXT: mul.w $t1, $a2, $a3 +; LA32-NEXT: add.w $t2, $t1, $t0 +; LA32-NEXT: mul.w $t3, $a4, $a5 +; LA32-NEXT: add.w $t4, $t2, $t3 +; LA32-NEXT: sltu $t5, $t4, $t2 +; LA32-NEXT: sltu $t1, $t2, $t1 +; LA32-NEXT: sltu $a7, $t0, $a7 +; LA32-NEXT: mulh.wu $a2, $a2, $a3 +; LA32-NEXT: add.w $a2, $a2, $a7 +; LA32-NEXT: add.w $a2, $a2, $t1 +; LA32-NEXT: mulh.wu $a7, $a4, $a5 +; LA32-NEXT: add.w $a7, $a7, $t3 +; 
LA32-NEXT: mul.w $a3, $a3, $a5 +; LA32-NEXT: add.w $a3, $a7, $a3 +; LA32-NEXT: add.w $a2, $a2, $a3 +; LA32-NEXT: add.w $a2, $a2, $t5 +; LA32-NEXT: mul.w $a1, $a1, $a4 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a6, $a0, 4 +; LA32-NEXT: st.w $t4, $a0, 8 +; LA32-NEXT: st.w $a2, $a0, 12 +; LA32-NEXT: ret +; +; LA64-LABEL: vmulwod_q_du_d_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: ld.d $a2, $a2, 8 +; LA64-NEXT: srai.d $a3, $a1, 63 +; LA64-NEXT: mulh.du $a4, $a1, $a2 +; LA64-NEXT: mul.d $a3, $a3, $a2 +; LA64-NEXT: add.d $a3, $a4, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a3, $a0, 8 +; LA64-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ae = extractelement <2 x i64> %va, i32 1 + %be = extractelement <2 x i64> %vb, i32 1 + %ax = sext i64 %ae to i128 + %bx = zext i64 %be to i128 + %mul = mul i128 %ax, %bx + store i128 %mul, ptr %res + ret void +}