diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 44059fcba73a4c..4937b48acdec6a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2763,6 +2763,30 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
       if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
         return new BitCastInst(InsElt->getOperand(1), DestTy);
     }
+
+    // Convert an artificial vector insert into more analyzable bitwise logic.
+    unsigned BitWidth = DestTy->getScalarSizeInBits();
+    Value *X, *Y;
+    uint64_t IndexC;
+    if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))),
+                                        m_Value(Y), m_ConstantInt(IndexC)))) &&
+        DestTy->isIntegerTy() && X->getType() == DestTy &&
+        isDesirableIntType(BitWidth)) {
+      // Adjust for big endian - the LSBs are at the high index.
+      if (DL.isBigEndian())
+        IndexC = SrcVTy->getNumElements() - 1 - IndexC;
+
+      // We only handle (endian-normalized) insert to index 0. Any other insert
+      // would require a left-shift, so that is an extra instruction.
+      if (IndexC == 0) {
+        // bitcast (inselt (bitcast X), Y, 0) --> or (and X, MaskC), (zext Y)
+        unsigned EltWidth = Y->getType()->getScalarSizeInBits();
+        APInt MaskC = APInt::getHighBitsSet(BitWidth, BitWidth - EltWidth);
+        Value *AndX = Builder.CreateAnd(X, MaskC);
+        Value *ZextY = Builder.CreateZExt(Y, DestTy);
+        return BinaryOperator::CreateOr(AndX, ZextY);
+      }
+    }
   }
 
   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
index 2c3c59fdc68b6b..464a438f286f7c 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
@@ -1,15 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL,LE
 
 declare void @use(<2 x i8>)
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert0_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert0_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert0_v2i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert0_v2i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; LE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 0
@@ -17,12 +26,21 @@ define i16 @insert0_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert1_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert1_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert1_v2i8(
+; BE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; BE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; BE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert1_v2i8(
+; LE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; LE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
+; LE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 1
@@ -30,12 +48,21 @@ define i16 @insert1_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i32 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i32 @insert0_v4i8(i32 %x, i8 %y) {
-; ALL-LABEL: @insert0_v4i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
-; ALL-NEXT:    ret i32 [[R]]
+; BE-LABEL: @insert0_v4i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+; LE-LABEL: @insert0_v4i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
+; LE-NEXT:    [[R:%.*]] = or i32 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i32 [[R]]
 ;
   %v = bitcast i32 %x to <4 x i8>
   %i = insertelement <4 x i8> %v, i8 %y, i8 0
@@ -43,12 +70,21 @@ define i32 @insert0_v4i8(i32 %x, i8 %y) {
   ret i32 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert0_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert0_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert0_v4i16(
+; BE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert0_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; LE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; LE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 0
@@ -56,6 +92,8 @@ define i64 @insert0_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - shifts needed for both endians.
+
 define i64 @insert1_v4i16(i64 %x, i16 %y) {
 ; ALL-LABEL: @insert1_v4i16(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
@@ -69,12 +107,21 @@ define i64 @insert1_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert3_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert3_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert3_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; BE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; BE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert3_v4i16(
+; LE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; LE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
+; LE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 3
@@ -82,6 +129,8 @@ define i64 @insert3_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - i128 is not a legal type, so we do not convert based on the data layout.
+
 define i128 @insert0_v4i32(i128 %x, i32 %y) {
 ; ALL-LABEL: @insert0_v4i32(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i128 [[X:%.*]] to <4 x i32>
@@ -95,6 +144,8 @@ define i128 @insert0_v4i32(i128 %x, i32 %y) {
   ret i128 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use1(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
@@ -110,6 +161,8 @@ define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use2(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use2(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>