diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 3fd19702923a1c..cf767fc1c7e451 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2298,6 +2298,23 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, LTDest.second, LTSrc.second)) return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); + // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for + // sitofp. + if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && + 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { + Type *ExtSrc = Src->getWithNewBitWidth(32); + unsigned ExtOpc = + (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; + + // For scalar loads the extend would be free. + InstructionCost ExtCost = 0; + if (!(Src->isIntegerTy() && I && isa(I->getOperand(0)))) + ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); + + return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, + TTI::CastContextHint::None, CostKind); + } + return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll index b979f658f4a5db..69af3a273a3362 100644 --- a/llvm/test/Analysis/CostModel/X86/sitofp.ll +++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll @@ -1,32 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ ; -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1 define i32 @sitofp_i8_double() { -; SSE-LABEL: 'sitofp_i8_double' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'sitofp_i8_double' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'sitofp_i8_double' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sitofp_i8_double' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sitofp_i8_double' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sitofp_i8_double' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'sitofp_i8_double' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = sitofp i8 undef to double ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> @@ -40,22 +54,36 @@ define i32 @sitofp_i8_double() { } define i32 @sitofp_i16_double() { -; SSE-LABEL: 'sitofp_i16_double' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'sitofp_i16_double' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'sitofp_i16_double' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sitofp_i16_double' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sitofp_i16_double' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sitofp_i16_double' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'sitofp_i16_double' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = sitofp i16 undef to double ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> @@ -155,24 +183,40 @@ define i32 @sitofp_i64_double() { } define i32 @sitofp_i8_float() { -; SSE-LABEL: 'sitofp_i8_float' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'sitofp_i8_float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'sitofp_i8_float' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sitofp_i8_float' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sitofp_i8_float' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sitofp_i8_float' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'sitofp_i8_float' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = sitofp i8 undef to float ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> @@ -188,24 +232,40 @@ define i32 @sitofp_i8_float() { } define i32 @sitofp_i16_float() { -; SSE-LABEL: 'sitofp_i16_float' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'sitofp_i16_float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'sitofp_i16_float' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sitofp_i16_float' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sitofp_i16_float' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sitofp_i16_float' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'sitofp_i16_float' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = sitofp i16 undef to float ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> diff --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll index 159649c11bea5f..e7dedf74ff0f7b 100644 --- a/llvm/test/Analysis/CostModel/X86/uitofp.ll +++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll @@ -1,32 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ ; -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1 define i32 @uitofp_i8_double() { -; SSE-LABEL: 'uitofp_i8_double' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'uitofp_i8_double' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'uitofp_i8_double' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'uitofp_i8_double' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'uitofp_i8_double' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'uitofp_i8_double' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'uitofp_i8_double' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f64 = uitofp i8 undef to double ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> @@ -40,22 +54,36 @@ define i32 @uitofp_i8_double() { } define i32 @uitofp_i16_double() { -; SSE-LABEL: 'uitofp_i16_double' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'uitofp_i16_double' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'uitofp_i16_double' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'uitofp_i16_double' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'uitofp_i16_double' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'uitofp_i16_double' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'uitofp_i16_double' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f64 = uitofp i16 undef to double ; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> @@ -155,24 +183,40 @@ define i32 @uitofp_i64_double() { } define i32 @uitofp_i8_float() { -; SSE-LABEL: 'uitofp_i8_float' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'uitofp_i8_float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'uitofp_i8_float' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'uitofp_i8_float' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'uitofp_i8_float' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'uitofp_i8_float' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'uitofp_i8_float' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i8_f32 = uitofp i8 undef to float ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> @@ -188,24 +232,40 @@ define i32 @uitofp_i8_float() { } define i32 @uitofp_i16_float() { -; SSE-LABEL: 'uitofp_i16_float' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX-LABEL: 'uitofp_i16_float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'uitofp_i16_float' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'uitofp_i16_float' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'uitofp_i16_float' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'uitofp_i16_float' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'uitofp_i16_float' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_i16_f32 = uitofp i16 undef to float ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index c5977a72302ecd..44d0f17182694d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( @@ -197,81 +197,24 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { -; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> -; SSE-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> -; SSE-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; SSE-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R72]] -; -; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x float> -; SLM-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x float> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R72]] -; -; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; AVX-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; AVX-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; AVX-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 -; AVX-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; AVX-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; AVX-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; AVX-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; AVX-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float -; AVX-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; AVX-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; AVX512-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; AVX512-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; AVX512-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 -; AVX512-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; AVX512-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; AVX512-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; AVX512-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float -; AVX512-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; AVX512-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8( +; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> +; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> +; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R72]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index 72940f93eba7f5..5b537ac6683f45 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; CHECK-LABEL: @sitofp_uitofp( @@ -197,81 +197,24 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { -; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> -; SSE-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> -; SSE-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; SSE-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R72]] -; -; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x float> -; SLM-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x float> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R72]] -; -; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; AVX-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; AVX-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; AVX-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 -; AVX-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; AVX-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; AVX-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; AVX-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; AVX-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float -; AVX-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; AVX-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; AVX512-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 -; AVX512-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 -; AVX512-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 -; AVX512-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; AVX512-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float -; AVX512-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float -; AVX512-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float -; AVX512-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; AVX512-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8( +; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> +; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> +; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R72]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll index b230a211ea53fb..c0b1bae204ad5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -321,14 +321,20 @@ define void @sitofp_8i32_8f64() #0 { } define void @sitofp_2i16_2f64() #0 { -; CHECK-LABEL: @sitofp_2i16_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i16_2f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sitofp_2i16_2f64( +; AVX-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; AVX-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; AVX-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double +; AVX-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double +; AVX-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -341,18 +347,12 @@ define void @sitofp_2i16_2f64() #0 { define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( @@ -378,30 +378,18 @@ define void @sitofp_4i16_4f64() #0 { define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( @@ -447,14 +435,20 @@ define void @sitofp_8i16_8f64() #0 { } define void @sitofp_2i8_2f64() #0 { -; CHECK-LABEL: @sitofp_2i8_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i8_2f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sitofp_2i8_2f64( +; AVX-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double +; AVX-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double +; AVX-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -467,18 +461,12 @@ define void @sitofp_2i8_2f64() #0 { define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( @@ -504,30 +492,18 @@ define void @sitofp_4i8_4f64() #0 { define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i8 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll index ffa821ab5e81cf..f0cb3e54287585 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -321,14 +321,20 @@ define void @sitofp_8i32_8f64() #0 { } define void @sitofp_2i16_2f64() #0 { -; CHECK-LABEL: @sitofp_2i16_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i16_2f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sitofp_2i16_2f64( +; AVX-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; AVX-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; AVX-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double +; AVX-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double +; AVX-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -341,18 +347,12 @@ define void @sitofp_2i16_2f64() #0 { define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f64( @@ -378,30 +378,18 @@ define void @sitofp_4i16_4f64() #0 { define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( @@ -447,14 +435,20 @@ define void @sitofp_8i16_8f64() #0 { } define void @sitofp_2i8_2f64() #0 { -; CHECK-LABEL: @sitofp_2i8_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i8_2f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sitofp_2i8_2f64( +; AVX-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double +; AVX-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double +; AVX-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 @@ -467,18 +461,12 @@ define void @sitofp_2i8_2f64() #0 { define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i8_4f64( @@ -504,30 +492,18 @@ define void @sitofp_4i8_4f64() #0 { define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = sitofp i8 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll index b5d10d99358980..67feab3d4875f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256DQ @@ -258,14 +258,20 @@ define void @uitofp_8i32_8f64() #0 { } define void @uitofp_2i16_2f64() #0 { -; CHECK-LABEL: @uitofp_2i16_2f64( -; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; CHECK-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @uitofp_2i16_2f64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: ret void +; +; AVX-LABEL: @uitofp_2i16_2f64( +; AVX-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 +; AVX-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double +; AVX-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double +; AVX-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 @@ -278,18 +284,12 @@ define void @uitofp_2i16_2f64() #0 { define void @uitofp_4i16_4f64() #0 { ; SSE-LABEL: @uitofp_4i16_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i16_4f64( @@ -315,30 +315,18 @@ define void @uitofp_4i16_4f64() #0 { define void @uitofp_8i16_8f64() #0 { ; SSE-LABEL: @uitofp_8i16_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i16_8f64( @@ -385,22 +373,28 @@ define void @uitofp_8i16_8f64() #0 { define void @uitofp_2i8_2f64() #0 { ; SSE-LABEL: @uitofp_2i8_2f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @uitofp_2i8_2f64( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: ret void +; AVX1-LABEL: @uitofp_2i8_2f64( +; AVX1-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX1-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX1-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX1-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX1-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX1-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @uitofp_2i8_2f64( +; AVX2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX2-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX2-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX2-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX2-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @uitofp_2i8_2f64( ; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 @@ -425,18 +419,12 @@ define void @uitofp_2i8_2f64() #0 { define void @uitofp_4i8_4f64() #0 { ; SSE-LABEL: @uitofp_4i8_4f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i8_4f64( @@ -462,30 +450,18 @@ define void @uitofp_4i8_4f64() #0 { define void @uitofp_8i8_8f64() #0 { ; SSE-LABEL: @uitofp_8i8_8f64( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double -; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double -; SSE-NEXT: [[CVT4:%.*]] = uitofp i8 [[LD4]] to double -; SSE-NEXT: [[CVT5:%.*]] = uitofp i8 [[LD5]] to double -; SSE-NEXT: [[CVT6:%.*]] = uitofp i8 [[LD6]] to double -; SSE-NEXT: [[CVT7:%.*]] = uitofp i8 [[LD7]] to double -; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16 -; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32 -; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16 -; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> +; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i8_8f64(