diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 8b1235b50cc6f..1faf451e8b495 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -517,6 +517,19 @@ Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder,
 llvm::SmallVector<mlir::Value> genExtentsVector(mlir::Location loc,
                                                 fir::FirOpBuilder &builder,
                                                 Entity entity);
+/// Generate an hlfir.designate that produces a 1D section
+/// of \p array using \p oneBasedIndices and \p dim:
+///   i = oneBasedIndices
+///   result => array(i(1), ..., i(dim-1), :, i(dim+1), ..., i(n))
+///
+/// The caller provides the pre-computed \p lbounds, \p extents
+/// and \p typeParams of the array.
+Entity gen1DSection(mlir::Location loc, fir::FirOpBuilder &builder,
+                    Entity array, int64_t dim,
+                    mlir::ArrayRef<mlir::Value> lbounds,
+                    mlir::ArrayRef<mlir::Value> extents,
+                    mlir::ValueRange oneBasedIndices,
+                    mlir::ArrayRef<mlir::Value> typeParams);
 } // namespace hlfir
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 8993065c2bb64..f4967ed3852b9 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1535,3 +1535,52 @@ hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder,
   shape.getDefiningOp()->erase();
   return extents;
 }
+
+hlfir::Entity hlfir::gen1DSection(mlir::Location loc,
+                                  fir::FirOpBuilder &builder,
+                                  hlfir::Entity array, int64_t dim,
+                                  mlir::ArrayRef<mlir::Value> lbounds,
+                                  mlir::ArrayRef<mlir::Value> extents,
+                                  mlir::ValueRange oneBasedIndices,
+                                  mlir::ArrayRef<mlir::Value> typeParams) {
+  assert(array.isVariable() && "array must be a variable");
+  assert(dim > 0 && dim <= array.getRank() && "invalid dim number");
+  mlir::Value one =
+      builder.createIntegerConstant(loc, builder.getIndexType(), 1);
+  hlfir::DesignateOp::Subscripts subscripts;
+  unsigned indexId = 0;
+  for (int i = 0; i < array.getRank(); ++i) {
+    if (i == dim - 1) {
+      mlir::Value ubound = genUBound(loc, builder, lbounds[i], extents[i], one);
+      subscripts.emplace_back(
+          hlfir::DesignateOp::Triplet{lbounds[i], ubound, one});
+    } else {
+      mlir::Value index =
+          genUBound(loc, builder, lbounds[i], oneBasedIndices[indexId++], one);
+      subscripts.emplace_back(index);
+    }
+  }
+  mlir::Value sectionShape =
+      builder.create<fir::ShapeOp>(loc, extents[dim - 1]);
+
+  // The result type is one of:
+  //   !fir.box/class<!fir.array<N x ElementType>>
+  //   !fir.box/class<!fir.array<? x ElementType>>
+  //
+  // We could use !fir.ref<!fir.array<N x ElementType>> when the whole
+  // dimension's size is known and it is the leading dimension, but keep
+  // it simple for the time being.
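+  // For example (illustrative types only): with dim == 1, a section of a
+  // !fir.box<!fir.array<10x20xf32>> array gets the type
+  // !fir.box<!fir.array<10xf32>>, while a section of an assumed-shape
+  // !fir.box<!fir.array<?x?xf32>> array gets !fir.box<!fir.array<?xf32>>.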
+ auto seqType = + mlir::cast(array.getElementOrSequenceType()); + int64_t dimExtent = seqType.getShape()[dim - 1]; + mlir::Type sectionType = + fir::SequenceType::get({dimExtent}, seqType.getEleTy()); + sectionType = fir::wrapInClassOrBoxType(sectionType, array.isPolymorphic()); + + auto designate = builder.create( + loc, sectionType, array, /*component=*/"", /*componentShape=*/nullptr, + subscripts, + /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, + sectionShape, typeParams); + return hlfir::Entity{designate.getResult()}; +} diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index c1c3839c47e11..bac10121a881b 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -415,15 +415,13 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { } }; -class CShiftAsElementalConversion - : public mlir::OpRewritePattern { +class CShiftConversion : public mlir::OpRewritePattern { public: using mlir::OpRewritePattern::OpRewritePattern; llvm::LogicalResult matchAndRewrite(hlfir::CShiftOp cshift, mlir::PatternRewriter &rewriter) const override { - using Fortran::common::maxRank; hlfir::ExprType expr = mlir::dyn_cast(cshift.getType()); assert(expr && @@ -445,31 +443,88 @@ class CShiftAsElementalConversion if (dimVal <= 0 || dimVal > arrayRank) return rewriter.notifyMatchFailure(cshift, "Invalid DIM for CSHIFT"); - mlir::Location loc = cshift.getLoc(); - fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; - mlir::Type elementType = expr.getElementType(); + // When DIM==1 and the contiguity of the input array is not statically + // known, try to exploit the fact that the leading dimension might be + // contiguous. We can do this now using hlfir.eval_in_mem with + // a dynamic check for the leading dimension contiguity. + // Otherwise, convert hlfir.cshift to hlfir.elemental. + // + // Note that the hlfir.elemental can be inlined into other hlfir.elemental, + // while hlfir.eval_in_mem prevents this, and we will end up creating + // a temporary array for the result. We may need to come up with + // a more sophisticated logic for picking the most efficient + // representation. hlfir::Entity array = hlfir::Entity{cshift.getArray()}; - mlir::Value arrayShape = hlfir::genShape(loc, builder, array); - llvm::SmallVector arrayExtents = - hlfir::getExplicitExtentsFromShape(arrayShape, builder); - llvm::SmallVector typeParams; - hlfir::genLengthParameters(loc, builder, array, typeParams); + mlir::Type elementType = array.getFortranElementType(); + if (dimVal == 1 && fir::isa_trivial(elementType) && + // genInMemCShift() only works for variables currently. + array.isVariable()) + rewriter.replaceOp(cshift, genInMemCShift(rewriter, cshift, dimVal)); + else + rewriter.replaceOp(cshift, genElementalCShift(rewriter, cshift, dimVal)); + return mlir::success(); + } + +private: + /// Generate MODULO(\p shiftVal, \p extent). + static mlir::Value normalizeShiftValue(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::Value shiftVal, + mlir::Value extent, + mlir::Type calcType) { + shiftVal = builder.createConvert(loc, calcType, shiftVal); + extent = builder.createConvert(loc, calcType, extent); + // Make sure that we do not divide by zero. When the dimension + // has zero size, turn the extent into 1. 
Note that the computed + // MODULO value won't be used in this case, so it does not matter + // which extent value we use. + mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0); + mlir::Value one = builder.createIntegerConstant(loc, calcType, 1); + mlir::Value isZero = builder.create( + loc, mlir::arith::CmpIPredicate::eq, extent, zero); + extent = builder.create(loc, isZero, one, extent); + shiftVal = fir::IntrinsicLibrary{builder, loc}.genModulo( + calcType, {shiftVal, extent}); + return builder.createConvert(loc, calcType, shiftVal); + } + + /// Convert \p cshift into an hlfir.elemental using + /// the pre-computed constant \p dimVal. + static mlir::Operation *genElementalCShift(mlir::PatternRewriter &rewriter, + hlfir::CShiftOp cshift, + int64_t dimVal) { + using Fortran::common::maxRank; hlfir::Entity shift = hlfir::Entity{cshift.getShift()}; + hlfir::Entity array = hlfir::Entity{cshift.getArray()}; + + mlir::Location loc = cshift.getLoc(); + fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; // The new index computation involves MODULO, which is not implemented // for IndexType, so use I64 instead. mlir::Type calcType = builder.getI64Type(); + // All the indices arithmetic used below does not overflow + // signed and unsigned I64. + builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw | + mlir::arith::IntegerOverflowFlags::nuw); - mlir::Value one = builder.createIntegerConstant(loc, calcType, 1); + mlir::Value arrayShape = hlfir::genShape(loc, builder, array); + llvm::SmallVector arrayExtents = + hlfir::getExplicitExtentsFromShape(arrayShape, builder); + llvm::SmallVector typeParams; + hlfir::genLengthParameters(loc, builder, array, typeParams); + mlir::Value shiftDimExtent = + builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]); mlir::Value shiftVal; if (shift.isScalar()) { shiftVal = hlfir::loadTrivialScalar(loc, builder, shift); - shiftVal = builder.createConvert(loc, calcType, shiftVal); + shiftVal = + normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); } auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder, mlir::ValueRange inputIndices) -> hlfir::Entity { llvm::SmallVector indices{inputIndices}; - if (!shift.isScalar()) { + if (!shiftVal) { // When the array is not a vector, section // (s(1), s(2), ..., s(dim-1), :, s(dim+1), ..., s(n) // of the result has a value equal to: @@ -482,35 +537,281 @@ class CShiftAsElementalConversion hlfir::Entity shiftElement = hlfir::getElementAt(loc, builder, shift, shiftIndices); shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement); - shiftVal = builder.createConvert(loc, calcType, shiftVal); + shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, + calcType); } // Element i of the result (1-based) is element - // 'MODULO(i + SH - 1, SIZE(ARRAY)) + 1' (1-based) of the original + // 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original // ARRAY (or its section, when ARRAY is not a vector). + + // Compute the index into the original array using the normalized + // shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)): + // newIndex = + // i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM)) + // + // Such index computation allows for further loop vectorization + // in LLVM. 
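+      // For example, with SIZE(ARRAY,DIM) = 5 and a normalized SH = 2,
+      // result elements 1, 2, 3 read array elements 3, 4, 5 (shift by SH),
+      // while result elements 4, 5 wrap around to array elements 1, 2
+      // (shift by SH - SIZE(ARRAY,DIM) = -3).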
+ mlir::Value wrapBound = + builder.create(loc, shiftDimExtent, shiftVal); + mlir::Value adjustedShiftVal = + builder.create(loc, shiftVal, shiftDimExtent); mlir::Value index = builder.createConvert(loc, calcType, inputIndices[dimVal - 1]); - mlir::Value extent = arrayExtents[dimVal - 1]; + mlir::Value wrapCheck = builder.create( + loc, mlir::arith::CmpIPredicate::sle, index, wrapBound); + mlir::Value actualShift = builder.create( + loc, wrapCheck, shiftVal, adjustedShiftVal); mlir::Value newIndex = - builder.create(loc, index, shiftVal); - newIndex = builder.create(loc, newIndex, one); - newIndex = fir::IntrinsicLibrary{builder, loc}.genModulo( - calcType, {newIndex, builder.createConvert(loc, calcType, extent)}); - newIndex = builder.create(loc, newIndex, one); + builder.create(loc, index, actualShift); newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex); - indices[dimVal - 1] = newIndex; hlfir::Entity element = hlfir::getElementAt(loc, builder, array, indices); return hlfir::loadTrivialScalar(loc, builder, element); }; + mlir::Type elementType = array.getFortranElementType(); hlfir::ElementalOp elementalOp = hlfir::genElementalOp( loc, builder, elementType, arrayShape, typeParams, genKernel, /*isUnordered=*/true, array.isPolymorphic() ? static_cast(array) : nullptr, cshift.getResult().getType()); - rewriter.replaceOp(cshift, elementalOp); - return mlir::success(); + return elementalOp.getOperation(); + } + + /// Convert \p cshift into an hlfir.eval_in_mem using the pre-computed + /// constant \p dimVal. + /// The converted code looks like this: + /// do i=1,SH + /// result(i + (SIZE(ARRAY,DIM) - SH)) = array(i) + /// end + /// do i=1,SIZE(ARRAY,DIM) - SH + /// result(i) = array(i + SH) + /// end + /// + /// When \p dimVal is 1, we generate the same code twice + /// under a dynamic check for the contiguity of the leading + /// dimension. In the code corresponding to the contiguous + /// leading dimension, the shift dimension is represented + /// as a contiguous slice of the original array. + /// This allows recognizing the above two loops as memcpy + /// loop idioms in LLVM. + static mlir::Operation *genInMemCShift(mlir::PatternRewriter &rewriter, + hlfir::CShiftOp cshift, + int64_t dimVal) { + using Fortran::common::maxRank; + hlfir::Entity shift = hlfir::Entity{cshift.getShift()}; + hlfir::Entity array = hlfir::Entity{cshift.getArray()}; + assert(array.isVariable() && "array must be a variable"); + assert(!array.isPolymorphic() && + "genInMemCShift does not support polymorphic types"); + mlir::Location loc = cshift.getLoc(); + fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; + // The new index computation involves MODULO, which is not implemented + // for IndexType, so use I64 instead. + mlir::Type calcType = builder.getI64Type(); + // All the indices arithmetic used below does not overflow + // signed and unsigned I64. 
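+    // (The normalized shift lies in [0, SIZE(ARRAY,DIM)), so the sums and
+    // differences below, such as i + SH and SIZE(ARRAY,DIM) - SH, are
+    // non-negative and bounded by 2 * SIZE(ARRAY,DIM).)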
+ builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw | + mlir::arith::IntegerOverflowFlags::nuw); + + mlir::Value arrayShape = hlfir::genShape(loc, builder, array); + llvm::SmallVector arrayExtents = + hlfir::getExplicitExtentsFromShape(arrayShape, builder); + llvm::SmallVector typeParams; + hlfir::genLengthParameters(loc, builder, array, typeParams); + mlir::Value shiftDimExtent = + builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]); + mlir::Value shiftVal; + if (shift.isScalar()) { + shiftVal = hlfir::loadTrivialScalar(loc, builder, shift); + shiftVal = + normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); + } + + hlfir::EvaluateInMemoryOp evalOp = + builder.create( + loc, mlir::cast(cshift.getType()), arrayShape); + builder.setInsertionPointToStart(&evalOp.getBody().front()); + + mlir::Value resultArray = evalOp.getMemory(); + mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType()); + resultArray = builder.createBox(loc, fir::BoxType::get(arrayType), + resultArray, arrayShape, /*slice=*/nullptr, + typeParams, /*tdesc=*/nullptr); + + // This is a generator of the dimension shift code. + // The code is inserted inside a loop nest over the other dimensions + // (if any). If exposeContiguity is true, the array's section + // array(s(1), ..., s(dim-1), :, s(dim+1), ..., s(n)) is represented + // as a contiguous 1D array. + // shiftVal is the normalized shift value that satisfies (SH >= 0 && SH < + // SIZE(ARRAY,DIM)). + // + auto genDimensionShift = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::Value shiftVal, bool exposeContiguity, + mlir::ValueRange oneBasedIndices) + -> llvm::SmallVector { + // Create a vector of indices (s(1), ..., s(dim-1), nullptr, s(dim+1), + // ..., s(n)) so that we can update the dimVal index as needed. + llvm::SmallVector srcIndices( + oneBasedIndices.begin(), oneBasedIndices.begin() + (dimVal - 1)); + srcIndices.push_back(nullptr); + srcIndices.append(oneBasedIndices.begin() + (dimVal - 1), + oneBasedIndices.end()); + llvm::SmallVector dstIndices(srcIndices); + + hlfir::Entity srcArray = array; + if (exposeContiguity && mlir::isa(srcArray.getType())) { + assert(dimVal == 1 && "can expose contiguity only for dim 1"); + llvm::SmallVector arrayLbounds = + hlfir::genLowerbounds(loc, builder, arrayShape, array.getRank()); + hlfir::Entity section = + hlfir::gen1DSection(loc, builder, srcArray, dimVal, arrayLbounds, + arrayExtents, oneBasedIndices, typeParams); + mlir::Value addr = hlfir::genVariableRawAddress(loc, builder, section); + mlir::Value shape = hlfir::genShape(loc, builder, section); + mlir::Type boxType = fir::wrapInClassOrBoxType( + hlfir::getFortranElementOrSequenceType(section.getType()), + section.isPolymorphic()); + srcArray = hlfir::Entity{ + builder.createBox(loc, boxType, addr, shape, /*slice=*/nullptr, + /*lengths=*/{}, /*tdesc=*/nullptr)}; + // When shifting the dimension as a 1D section of the original + // array, we only need one index for addressing. 
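+        // E.g., for a rank-3 array and dim == 1, the section array(:, i2, i3)
+        // is rebased as a rank-1 contiguous array, so its elements are
+        // addressed by the shift-loop index alone.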
+ srcIndices.resize(1); + } + + // Copy first portion of the array: + // do i=1,SH + // result(i + (SIZE(ARRAY,DIM) - SH)) = array(i) + // end + auto genAssign1 = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange index, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + assert(index.size() == 1 && "expected single loop"); + mlir::Value srcIndex = builder.createConvert(loc, calcType, index[0]); + srcIndices[dimVal - 1] = srcIndex; + hlfir::Entity srcElementValue = + hlfir::loadElementAt(loc, builder, srcArray, srcIndices); + mlir::Value dstIndex = builder.create( + loc, srcIndex, + builder.create(loc, shiftDimExtent, shiftVal)); + dstIndices[dimVal - 1] = dstIndex; + hlfir::Entity dstElement = hlfir::getElementAt( + loc, builder, hlfir::Entity{resultArray}, dstIndices); + builder.create(loc, srcElementValue, dstElement); + return {}; + }; + + // Generate the first loop. + hlfir::genLoopNestWithReductions(loc, builder, {shiftVal}, + /*reductionInits=*/{}, genAssign1, + /*isUnordered=*/true); + + // Copy second portion of the array: + // do i=1,SIZE(ARRAY,DIM)-SH + // result(i) = array(i + SH) + // end + auto genAssign2 = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange index, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + assert(index.size() == 1 && "expected single loop"); + mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]); + mlir::Value srcIndex = + builder.create(loc, dstIndex, shiftVal); + srcIndices[dimVal - 1] = srcIndex; + hlfir::Entity srcElementValue = + hlfir::loadElementAt(loc, builder, srcArray, srcIndices); + dstIndices[dimVal - 1] = dstIndex; + hlfir::Entity dstElement = hlfir::getElementAt( + loc, builder, hlfir::Entity{resultArray}, dstIndices); + builder.create(loc, srcElementValue, dstElement); + return {}; + }; + + // Generate the second loop. + mlir::Value bound = + builder.create(loc, shiftDimExtent, shiftVal); + hlfir::genLoopNestWithReductions(loc, builder, {bound}, + /*reductionInits=*/{}, genAssign2, + /*isUnordered=*/true); + return {}; + }; + + // A wrapper around genDimensionShift that computes the normalized + // shift value and manages the insertion of the multiple versions + // of the shift based on the dynamic check of the leading dimension's + // contiguity (when dimVal == 1). + auto genShiftBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + // Copy the dimension with a shift: + // SH is either SHIFT (if scalar) or SHIFT(oneBasedIndices). + if (!shiftVal) { + assert(!oneBasedIndices.empty() && "scalar shift must be precomputed"); + hlfir::Entity shiftElement = + hlfir::getElementAt(loc, builder, shift, oneBasedIndices); + shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement); + shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, + calcType); + } + + // If we can fetch the byte stride of the leading dimension, + // and the byte size of the element, then we can generate + // a dynamic contiguity check and expose the leading dimension's + // contiguity in FIR, making memcpy loop idiom recognition + // possible. 
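+      // The dynamic check below compares the element byte size with the
+      // byte stride of dimension 1: they are equal exactly when the elements
+      // of the leading dimension are laid out back to back (e.g. for a whole
+      // contiguous array, or a section like a(:, 2:n:3) of a contiguous a),
+      // and differ for sections such as a(1:n:2, :).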
+ mlir::Value elemSize; + mlir::Value stride; + if (dimVal == 1 && mlir::isa(array.getType())) { + mlir::Type indexType = builder.getIndexType(); + elemSize = + builder.create(loc, indexType, array.getBase()); + mlir::Value dimIdx = + builder.createIntegerConstant(loc, indexType, dimVal - 1); + auto boxDim = builder.create( + loc, indexType, indexType, indexType, array.getBase(), dimIdx); + stride = boxDim.getByteStride(); + } + + if (array.isSimplyContiguous() || !elemSize || !stride) { + genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/false, + oneBasedIndices); + return {}; + } + + mlir::Value isContiguous = builder.create( + loc, mlir::arith::CmpIPredicate::eq, elemSize, stride); + builder.genIfOp(loc, {}, isContiguous, /*withElseRegion=*/true) + .genThen([&]() { + genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/true, + oneBasedIndices); + }) + .genElse([&]() { + genDimensionShift(loc, builder, shiftVal, + /*exposeContiguity=*/false, oneBasedIndices); + }); + + return {}; + }; + + // For 1D case, generate a single loop. + // For ND case, generate a loop nest over the other dimensions + // with a single loop inside (generated separately). + llvm::SmallVector newExtents(arrayExtents); + newExtents.erase(newExtents.begin() + (dimVal - 1)); + if (!newExtents.empty()) + hlfir::genLoopNestWithReductions(loc, builder, newExtents, + /*reductionInits=*/{}, genShiftBody, + /*isUnordered=*/true); + else + genShiftBody(loc, builder, {}, {}); + + return evalOp.getOperation(); } }; @@ -1181,7 +1482,7 @@ class SimplifyHLFIRIntrinsics mlir::RewritePatternSet patterns(context); patterns.insert(context); patterns.insert(context); - patterns.insert(context); + patterns.insert(context); patterns.insert>(context); // If forceMatmulAsElemental is false, then hlfir.matmul inlining diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir index d21d7755062ba..35530c66f4038 100644 --- a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir @@ -1,4 +1,4 @@ -// Test hlfir.cshift simplification to hlfir.elemental: +// Test hlfir.cshift simplification to hlfir.elemental and hlfir.eval_in_mem: // RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s func.func @cshift_vector(%arg0: !fir.box>, %arg1: !fir.ref) -> !hlfir.expr{ @@ -6,39 +6,114 @@ func.func @cshift_vector(%arg0: !fir.box>, %arg1: !fir.ref } // CHECK-LABEL: func.func @cshift_vector( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref) -> !hlfir.expr { -// CHECK: %[[VAL_26:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_16:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1> -// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref -// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64 -// CHECK: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr { -// CHECK: ^bb0(%[[VAL_9:.*]]: index): -// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64 -// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_5]] : i64 -// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64 -// 
CHECK: %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1 -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64 -// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_5]] : i64 -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index -// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index -// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index -// CHECK: %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]]) : (!fir.box>, index) -> !fir.ref -// CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref -// CHECK: hlfir.yield_element %[[VAL_30]] : i32 +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref) -> !hlfir.expr { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1> +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64 +// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_4]] : i64 +// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[VAL_3]], %[[VAL_8]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1 +// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow : i64 +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_18:.*]] = hlfir.eval_in_mem shape %[[VAL_7]] : (!fir.shape<1>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_19:.*]]: !fir.ref>): +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_7]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>) -> index +// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) + +// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_21]]#2 : index +// CHECK: fir.if %[[VAL_22]] { +// CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1> +// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]:%[[VAL_6]]#1:%[[VAL_2]]) shape %[[VAL_23]] : (!fir.box>, index, index, index, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_27:.*]] = 
fir.convert %[[VAL_17]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_28:.*]] = %[[VAL_2]] to %[[VAL_27]] step %[[VAL_2]] unordered { +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64 +// CHECK: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_29]] : (i64) -> index +// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_31]], %[[VAL_32]] overflow : index +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_33]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref +// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_29]], %[[VAL_36]] overflow : i64 +// CHECK: %[[VAL_38:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (i64) -> index +// CHECK: %[[VAL_40:.*]] = arith.subi %[[VAL_38]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_39]], %[[VAL_40]] overflow : index +// CHECK: %[[VAL_42:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_41]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_42]] : i32, !fir.ref +// CHECK: } +// CHECK: %[[VAL_43:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_45:.*]] = %[[VAL_2]] to %[[VAL_44]] step %[[VAL_2]] unordered { +// CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_45]] : (index) -> i64 +// CHECK: %[[VAL_47:.*]] = arith.addi %[[VAL_46]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_48:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (i64) -> index +// CHECK: %[[VAL_50:.*]] = arith.subi %[[VAL_48]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_51:.*]] = arith.addi %[[VAL_49]], %[[VAL_50]] overflow : index +// CHECK: %[[VAL_52:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_51]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_53:.*]] = fir.load %[[VAL_52]] : !fir.ref +// CHECK: %[[VAL_54:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_46]] : (i64) -> index +// CHECK: %[[VAL_56:.*]] = arith.subi %[[VAL_54]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_57:.*]] = arith.addi %[[VAL_55]], %[[VAL_56]] overflow : index +// CHECK: %[[VAL_58:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_57]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_53]] to %[[VAL_58]] : i32, !fir.ref +// CHECK: } +// CHECK: } else { +// CHECK: %[[VAL_59:.*]] = fir.convert %[[VAL_17]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_60:.*]] = %[[VAL_2]] to %[[VAL_59]] step %[[VAL_2]] unordered { +// CHECK: %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (index) -> i64 +// CHECK: %[[VAL_62:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_63:.*]] = fir.convert %[[VAL_61]] : (i64) -> index +// CHECK: %[[VAL_64:.*]] = arith.subi %[[VAL_62]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_65:.*]] = arith.addi %[[VAL_63]], %[[VAL_64]] overflow : index +// CHECK: %[[VAL_66:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_65]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_67:.*]] = 
fir.load %[[VAL_66]] : !fir.ref +// CHECK: %[[VAL_68:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_69:.*]] = arith.addi %[[VAL_61]], %[[VAL_68]] overflow : i64 +// CHECK: %[[VAL_70:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_71:.*]] = fir.convert %[[VAL_69]] : (i64) -> index +// CHECK: %[[VAL_72:.*]] = arith.subi %[[VAL_70]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_73:.*]] = arith.addi %[[VAL_71]], %[[VAL_72]] overflow : index +// CHECK: %[[VAL_74:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_73]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_67]] to %[[VAL_74]] : i32, !fir.ref +// CHECK: } +// CHECK: %[[VAL_75:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_76:.*]] = fir.convert %[[VAL_75]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_77:.*]] = %[[VAL_2]] to %[[VAL_76]] step %[[VAL_2]] unordered { +// CHECK: %[[VAL_78:.*]] = fir.convert %[[VAL_77]] : (index) -> i64 +// CHECK: %[[VAL_79:.*]] = arith.addi %[[VAL_78]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_80:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_81:.*]] = fir.convert %[[VAL_79]] : (i64) -> index +// CHECK: %[[VAL_82:.*]] = arith.subi %[[VAL_80]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_83:.*]] = arith.addi %[[VAL_81]], %[[VAL_82]] overflow : index +// CHECK: %[[VAL_84:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_83]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_85:.*]] = fir.load %[[VAL_84]] : !fir.ref +// CHECK: %[[VAL_86:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_78]] : (i64) -> index +// CHECK: %[[VAL_88:.*]] = arith.subi %[[VAL_86]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_89:.*]] = arith.addi %[[VAL_87]], %[[VAL_88]] overflow : index +// CHECK: %[[VAL_90:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_89]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_85]] to %[[VAL_90]] : i32, !fir.ref +// CHECK: } +// CHECK: } // CHECK: } -// CHECK: return +// CHECK: return %[[VAL_18]] : !hlfir.expr // CHECK: } func.func @cshift_2d_by_scalar(%arg0: !fir.box>, %arg1: !fir.ref) -> !hlfir.expr { @@ -47,43 +122,47 @@ func.func @cshift_2d_by_scalar(%arg0: !fir.box>, %arg1: !fir return %res : !hlfir.expr } // CHECK-LABEL: func.func @cshift_2d_by_scalar( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref) -> !hlfir.expr { -// CHECK: %[[VAL_20:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2> +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref) -> !hlfir.expr { +// CHECK: %[[ONE:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, 
index) -> (index, index, index) +// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 // CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64 -// CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr { -// CHECK: ^bb0(%[[VAL_12:.*]]: index, %[[VAL_13:.*]]: index): -// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 -// CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_10]] : i64 -// CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_8]] : i64 -// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 -// CHECK: %[[VAL_18:.*]] = arith.remsi %[[VAL_16]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_19:.*]] = arith.xori %[[VAL_16]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_21:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1 -// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_18]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_18]] : i64 -// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_8]] : i64 +// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64 +// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1 +// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow : i64 +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_18:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_19:.*]]: index, %[[VAL_20:.*]]: index): +// CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_17]], %[[VAL_8]] overflow : i64 +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (index) -> i64 +// CHECK: %[[VAL_24:.*]] = arith.cmpi sle, %[[VAL_23]], %[[VAL_21]] : i64 +// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_24]], %[[VAL_17]], %[[VAL_22]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_23]], %[[VAL_25]] overflow : i64 // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i64) -> index +// CHECK: %[[VAL_28:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) // CHECK: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_5]] : index -// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_12]], %[[VAL_33]] : index -// CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_5]] : index -// CHECK: %[[VAL_36:.*]] = arith.addi %[[VAL_27]], %[[VAL_35]] : index -// CHECK: %[[VAL_37:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]], %[[VAL_36]]) : (!fir.box>, index, 
index) -> !fir.ref -// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_37]] : !fir.ref -// CHECK: hlfir.yield_element %[[VAL_38]] : i32 +// CHECK: %[[VAL_30:.*]] = arith.subi %[[VAL_28]]#0, %[[VAL_3]] overflow : index +// CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_19]], %[[VAL_30]] overflow : index +// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] overflow : index +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_27]], %[[VAL_32]] overflow : index +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_31]], %[[VAL_33]]) : (!fir.box>, index, index) -> !fir.ref +// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref +// CHECK: hlfir.yield_element %[[VAL_35]] : i32 // CHECK: } -// CHECK: return +// CHECK: return %[[VAL_18]] : !hlfir.expr // CHECK: } func.func @cshift_2d_by_vector(%arg0: !fir.box>, %arg1: !fir.box>) -> !hlfir.expr { @@ -92,47 +171,51 @@ func.func @cshift_2d_by_vector(%arg0: !fir.box>, %arg1: !fir return %res : !hlfir.expr } // CHECK-LABEL: func.func @cshift_2d_by_vector( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.box>) -> !hlfir.expr { -// CHECK: %[[VAL_26:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2> +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> !hlfir.expr { +// CHECK: %[[ONE:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 // CHECK: %[[VAL_9:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr { // CHECK: ^bb0(%[[VAL_10:.*]]: index, %[[VAL_11:.*]]: index): -// CHECK: %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]]#0, %[[VAL_5]] : index -// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_10]], %[[VAL_15]] : index -// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_16]]) : (!fir.box>, index) -> !fir.ref -// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref -// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64 -// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_11]] : (index) -> i64 -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_19]] : i64 -// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_8]] : i64 -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 -// CHECK: %[[VAL_24:.*]] = arith.remsi %[[VAL_22]], %[[VAL_23]] : i64 -// CHECK: %[[VAL_25:.*]] = arith.xori %[[VAL_22]], %[[VAL_23]] : i64 -// CHECK: %[[VAL_27:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_26]] : i64 -// CHECK: 
%[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_24]], %[[VAL_26]] : i64 -// CHECK: %[[VAL_29:.*]] = arith.andi %[[VAL_28]], %[[VAL_27]] : i1 -// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_24]], %[[VAL_23]] : i64 -// CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_24]] : i64 -// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_31]], %[[VAL_8]] : i64 -// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i64) -> index -// CHECK: %[[VAL_35:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_37:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_39:.*]] = arith.subi %[[VAL_35]]#0, %[[VAL_5]] : index -// CHECK: %[[VAL_40:.*]] = arith.addi %[[VAL_10]], %[[VAL_39]] : index -// CHECK: %[[VAL_41:.*]] = arith.subi %[[VAL_37]]#0, %[[VAL_5]] : index -// CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_33]], %[[VAL_41]] : index -// CHECK: %[[VAL_43:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_40]], %[[VAL_42]]) : (!fir.box>, index, index) -> !fir.ref -// CHECK: %[[VAL_44:.*]] = fir.load %[[VAL_43]] : !fir.ref -// CHECK: hlfir.yield_element %[[VAL_44]] : i32 +// CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_12]]#0, %[[VAL_3]] overflow : index +// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_10]], %[[VAL_13]] overflow : index +// CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_14]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> i64 +// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64 +// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64 +// CHECK: %[[VAL_18:.*]] = arith.remsi %[[VAL_17]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_19:.*]] = arith.xori %[[VAL_17]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1 +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_18]], %[[EXTENT]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_8]], %[[VAL_24]] overflow : i64 +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_24]], %[[VAL_8]] overflow : i64 +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_11]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = arith.cmpi sle, %[[VAL_27]], %[[VAL_25]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_24]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_27]], %[[VAL_29]] overflow : i64 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i64) -> index +// CHECK: %[[VAL_32:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_33:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]]#0, %[[VAL_3]] overflow : index +// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_10]], %[[VAL_34]] overflow : index +// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_33]]#0, %[[VAL_3]] overflow : index +// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_31]], %[[VAL_36]] overflow : index +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_35]], %[[VAL_37]]) : (!fir.box>, index, index) -> !fir.ref 
+// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_38]] : !fir.ref +// CHECK: hlfir.yield_element %[[VAL_39]] : i32 // CHECK: } -// CHECK: return +// CHECK: return %[[VAL_9]] : !hlfir.expr // CHECK: } func.func @cshift_vector_char(%arg0: !fir.box>>, %arg1: !fir.ref) -> !hlfir.expr> { @@ -140,43 +223,47 @@ func.func @cshift_vector_char(%arg0: !fir.box>>, %ar return %res : !hlfir.expr> } // CHECK-LABEL: func.func @cshift_vector_char( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref) -> !hlfir.expr> { -// CHECK: %[[VAL_32:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_6:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) -// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1> -// CHECK: %[[VAL_5:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index -// CHECK: %[[VAL_7:.*]] = arith.divsi %[[VAL_5]], %[[VAL_6]] : index -// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref -// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64 -// CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_4]] typeparams %[[VAL_7]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { -// CHECK: ^bb0(%[[VAL_12:.*]]: index): -// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 -// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_10]] : i64 -// CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_8]] : i64 -// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64 -// CHECK: %[[VAL_17:.*]] = arith.remsi %[[VAL_15]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_18:.*]] = arith.xori %[[VAL_15]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i64 -// CHECK: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_19]] : i64 -// CHECK: %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1 -// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_17]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_8]] : i64 -// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i64) -> index -// CHECK: %[[VAL_27:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index -// CHECK: %[[VAL_29:.*]] = arith.divsi %[[VAL_27]], %[[VAL_6]] : index -// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) -// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_32]] : index -// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_26]], %[[VAL_33]] : index -// CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]]) typeparams %[[VAL_29]] : (!fir.box>>, index, index) -> !fir.boxchar<2> -// CHECK: hlfir.yield_element %[[VAL_35]] : !fir.boxchar<2> +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref) -> !hlfir.expr> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[ONE:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) 
-> !fir.shape<1> +// CHECK: %[[VAL_8:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index +// CHECK: %[[VAL_9:.*]] = arith.divsi %[[VAL_8]], %[[VAL_4]] : index +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64 +// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_3]] : i64 +// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_10]] : i64 +// CHECK: %[[VAL_13:.*]] = arith.remsi %[[VAL_12]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_14:.*]] = arith.xori %[[VAL_12]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_17:.*]] = arith.andi %[[VAL_16]], %[[VAL_15]] : i1 +// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_13]], %[[EXTENT]] overflow : i64 +// CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_13]] : i64 +// CHECK: %[[VAL_20:.*]] = hlfir.elemental %[[VAL_7]] typeparams %[[VAL_9]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_21:.*]]: index): +// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_10]], %[[VAL_19]] overflow : i64 +// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_19]], %[[VAL_10]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (index) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.cmpi sle, %[[VAL_24]], %[[VAL_22]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_19]], %[[VAL_23]] : i64 +// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow : i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i64) -> index +// CHECK: %[[VAL_29:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index +// CHECK: %[[VAL_30:.*]] = arith.divsi %[[VAL_29]], %[[VAL_4]] : index +// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_28]], %[[VAL_32]] overflow : index +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_33]]) typeparams %[[VAL_30]] : (!fir.box>>, index, index) -> !fir.boxchar<2> +// CHECK: hlfir.yield_element %[[VAL_34]] : !fir.boxchar<2> // CHECK: } -// CHECK: return +// CHECK: return %[[VAL_20]] : !hlfir.expr> // CHECK: } func.func @cshift_vector_poly(%arg0: !fir.class>>, %arg1: i32) -> !hlfir.expr?> { @@ -184,37 +271,41 @@ func.func @cshift_vector_poly(%arg0: !fir.class?> } // CHECK-LABEL: func.func @cshift_vector_poly( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.class>>, -// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !hlfir.expr?> { -// CHECK: %[[VAL_25:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_15:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class>>, index) -> (index, index, index) -// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1> -// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 -// CHECK: %[[VAL_7:.*]] = hlfir.elemental %[[VAL_4]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class>>) -> !hlfir.expr?> { -// CHECK: ^bb0(%[[VAL_8:.*]]: index): -// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (index) -> i64 -// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_11:.*]] = 
arith.subi %[[VAL_10]], %[[VAL_5]] : i64 -// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64 -// CHECK: %[[VAL_13:.*]] = arith.remsi %[[VAL_11]], %[[VAL_12]] : i64 -// CHECK: %[[VAL_14:.*]] = arith.xori %[[VAL_11]], %[[VAL_12]] : i64 -// CHECK: %[[VAL_16:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_15]] : i64 -// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_15]] : i64 -// CHECK: %[[VAL_18:.*]] = arith.andi %[[VAL_17]], %[[VAL_16]] : i1 -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : i64 -// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_5]] : i64 -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_21]] : (i64) -> index -// CHECK: %[[VAL_24:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class>>, index) -> (index, index, index) -// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_24]]#0, %[[VAL_25]] : index -// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_22]], %[[VAL_26]] : index +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class>>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i32) -> !hlfir.expr?> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[ONE:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64 +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_3]] : i64 +// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_9:.*]] = arith.remsi %[[VAL_8]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_10:.*]] = arith.xori %[[VAL_8]], %[[EXTENT]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.cmpi slt, %[[VAL_10]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_12]], %[[VAL_11]] : i1 +// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_9]], %[[EXTENT]] overflow : i64 +// CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_16:.*]] = hlfir.elemental %[[VAL_6]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class>>) -> !hlfir.expr?> { +// CHECK: ^bb0(%[[VAL_17:.*]]: index): +// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_7]], %[[VAL_15]] overflow : i64 +// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] overflow : i64 +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (index) -> i64 +// CHECK: %[[VAL_21:.*]] = arith.cmpi sle, %[[VAL_20]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_15]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_20]], %[[VAL_22]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> index +// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_2]] overflow : index +// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow : index // CHECK: %[[VAL_28:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_27]]) : (!fir.class>>, index) -> !fir.class> // CHECK: hlfir.yield_element %[[VAL_28]] : !fir.class> // CHECK: } -// CHECK: 
return +// CHECK: return %[[VAL_16]] : !hlfir.expr?> // CHECK: } // negative: non-constant dim argument @@ -243,36 +334,13 @@ func.func @cshift_vector_assumed_dim_1(%arg0: !fir.box>, %arg1 return %res : !hlfir.expr } // CHECK-LABEL: func.func @cshift_vector_assumed_dim_1( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>, -// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !hlfir.expr { -// CHECK: %[[VAL_26:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_16:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1> -// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 -// CHECK: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr { -// CHECK: ^bb0(%[[VAL_9:.*]]: index): -// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64 -// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64 -// CHECK: %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1 -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64 -// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index -// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) -// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index -// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index -// CHECK: %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]]) : (!fir.box>, index) -> !fir.ref -// CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref -// CHECK: hlfir.yield_element %[[VAL_30]] : i32 -// CHECK: } -// CHECK: return -// CHECK: } +// CHECK-NOT: hlfir.cshift + +// Check that hlfir.cshift is converted to hlfir.elemental +// when the argument is an array expression: +func.func @cshift_vector_expr(%arg0: !hlfir.expr, %arg1: !fir.ref) -> !hlfir.expr{ + %res = hlfir.cshift %arg0 %arg1 : (!hlfir.expr, !fir.ref) -> !hlfir.expr + return %res : !hlfir.expr +} +// CHECK-LABEL: func.func @cshift_vector_expr( +// CHECK: hlfir.elemental