diff --git a/lib/HLSL/HLLowerUDT.cpp b/lib/HLSL/HLLowerUDT.cpp
index 4d68940444..1d9abc56b2 100644
--- a/lib/HLSL/HLLowerUDT.cpp
+++ b/lib/HLSL/HLLowerUDT.cpp
@@ -408,7 +408,7 @@ void hlsl::ReplaceUsesForLoweredUDT(Value *V, Value *NewV) {
 
         std::vector<Instruction*> DeadInsts;
         HLMatrixSubscriptUseReplacer UseReplacer(
-          CI, NewV, ElemIndices, /*AllowLoweredPtrGEPs*/true, DeadInsts);
+          CI, NewV, /*TempLoweredMatrix*/nullptr, ElemIndices, /*AllowLoweredPtrGEPs*/true, DeadInsts);
         DXASSERT(CI->use_empty(),
                  "Expected all matrix subscript uses to have been replaced.");
         CI->eraseFromParent();
diff --git a/lib/HLSL/HLMatrixLowerPass.cpp b/lib/HLSL/HLMatrixLowerPass.cpp
index 6b9f760a1c..55e7f4be56 100644
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@@ -406,7 +406,7 @@ Value *HLMatrixLowerPass::tryGetLoweredPtrOperand(Value *Ptr, IRBuilder<> &Build
     RootPtr = GEP->getPointerOperand();
 
   Argument *Arg = dyn_cast<Argument>(RootPtr);
-  bool IsNonShaderArg = Arg != nullptr && !m_pHLModule->IsGraphicsShader(Arg->getParent());
+  bool IsNonShaderArg = Arg != nullptr && !m_pHLModule->IsEntryThatUsesSignatures(Arg->getParent());
   if (IsNonShaderArg || isa<AllocaInst>(RootPtr)) {
     // Bitcast the matrix pointer to its lowered equivalent.
     // The HLMatrixBitcast pass will take care of this later.
@@ -1474,17 +1474,32 @@ void HLMatrixLowerPass::lowerHLMatSubscript(CallInst *Call, Value *MatPtr, Small
 
   IRBuilder<> CallBuilder(Call);
   Value *LoweredPtr = tryGetLoweredPtrOperand(MatPtr, CallBuilder);
-  if (LoweredPtr == nullptr) return;
+  Value *LoweredMatrix = nullptr;
+  Value *RootPtr = LoweredPtr? LoweredPtr: MatPtr;
+  while (GEPOperator *GEP = dyn_cast<GEPOperator>(RootPtr))
+    RootPtr = GEP->getPointerOperand();
 
+  if (LoweredPtr == nullptr) {
+    if (!isa<Argument>(RootPtr))
+      return;
+
+    // For a shader input, load the matrix into a lowered ptr
+    // The load will be handled by LowerSignature
+    HLMatLoadStoreOpcode Opcode = (HLSubscriptOpcode)GetHLOpcode(Call) == HLSubscriptOpcode::RowMatSubscript ?
+                                   HLMatLoadStoreOpcode::RowMatLoad : HLMatLoadStoreOpcode::ColMatLoad;
+    HLMatrixType MatTy = HLMatrixType::cast(MatPtr->getType()->getPointerElementType());
+    LoweredMatrix = callHLFunction(
+      *m_pModule, HLOpcodeGroup::HLMatLoadStore, static_cast<unsigned>(Opcode),
+      MatTy.getLoweredVectorTypeForReg(), { CallBuilder.getInt32((uint32_t)Opcode), MatPtr },
+      Call->getCalledFunction()->getAttributes().getFnAttributes(), CallBuilder);
+  }
   // For global variables, we can GEP directly into the lowered vector pointer.
   // This is necessary to support group shared memory atomics and the likes.
-  Value *RootPtr = LoweredPtr;
-  while (GEPOperator *GEP = dyn_cast<GEPOperator>(RootPtr))
-    RootPtr = GEP->getPointerOperand();
   bool AllowLoweredPtrGEPs = isa<GlobalVariable>(RootPtr);
   
   // Just constructing this does all the work
-  HLMatrixSubscriptUseReplacer UseReplacer(Call, LoweredPtr, ElemIndices, AllowLoweredPtrGEPs, m_deadInsts);
+  HLMatrixSubscriptUseReplacer UseReplacer(Call, LoweredPtr, LoweredMatrix,
+                                           ElemIndices, AllowLoweredPtrGEPs, m_deadInsts);
 
   DXASSERT(Call->use_empty(), "Expected all matrix subscript uses to have been replaced.");
   addToDeadInsts(Call);
diff --git a/lib/HLSL/HLMatrixSubscriptUseReplacer.cpp b/lib/HLSL/HLMatrixSubscriptUseReplacer.cpp
index ba9eeb99f2..d4696ae7f6 100644
--- a/lib/HLSL/HLMatrixSubscriptUseReplacer.cpp
+++ b/lib/HLSL/HLMatrixSubscriptUseReplacer.cpp
@@ -19,9 +19,10 @@
 using namespace llvm;
 using namespace hlsl;
 
-HLMatrixSubscriptUseReplacer::HLMatrixSubscriptUseReplacer(CallInst* Call, Value *LoweredPtr,
+HLMatrixSubscriptUseReplacer::HLMatrixSubscriptUseReplacer(CallInst* Call, Value *LoweredPtr, Value *TempLoweredMatrix,
   SmallVectorImpl<Value*> &ElemIndices, bool AllowLoweredPtrGEPs, std::vector<Instruction*> &DeadInsts)
-  : LoweredPtr(LoweredPtr), ElemIndices(ElemIndices), DeadInsts(DeadInsts), AllowLoweredPtrGEPs(AllowLoweredPtrGEPs)
+  : LoweredPtr(LoweredPtr), ElemIndices(ElemIndices), DeadInsts(DeadInsts),
+    AllowLoweredPtrGEPs(AllowLoweredPtrGEPs), TempLoweredMatrix(TempLoweredMatrix)
 {
   HasScalarResult = !Call->getType()->getPointerElementType()->isVectorTy();
 
@@ -32,6 +33,11 @@ HLMatrixSubscriptUseReplacer::HLMatrixSubscriptUseReplacer(CallInst* Call, Value
     }
   }
 
+  if (TempLoweredMatrix)
+    LoweredTy = TempLoweredMatrix->getType();
+  else
+    LoweredTy = LoweredPtr->getType()->getPointerElementType();
+
   replaceUses(Call, /* GEPIdx */ nullptr);
 }
 
@@ -162,7 +168,8 @@ void HLMatrixSubscriptUseReplacer::cacheLoweredMatrix(bool ForDynamicIndexing, I
 
   // Load without memory to register representation conversion,
   // since the point is to mimic pointer semantics
-  TempLoweredMatrix = Builder.CreateLoad(LoweredPtr);
+  if (!TempLoweredMatrix)
+    TempLoweredMatrix = Builder.CreateLoad(LoweredPtr);
 
   if (!ForDynamicIndexing) return;
 
@@ -238,7 +245,6 @@ Value *HLMatrixSubscriptUseReplacer::loadVector(IRBuilder<> &Builder) {
 
   // Otherwise load elements one by one
   // Lowered form may be array when AllowLoweredPtrGEPs == true.
-  Type* LoweredTy = LoweredPtr->getType()->getPointerElementType();
   Type* ElemTy = LoweredTy->isVectorTy() ? LoweredTy->getScalarType() :
               cast<ArrayType>(LoweredTy)->getArrayElementType();
   VectorType *VecTy = VectorType::get(ElemTy, static_cast<unsigned>(ElemIndices.size()));
@@ -270,7 +276,7 @@ void HLMatrixSubscriptUseReplacer::flushLoweredMatrix(IRBuilder<> &Builder) {
     // First re-create the vector from the temporary array
     DXASSERT_NOMSG(LazyTempElemArrayAlloca != nullptr);
 
-    VectorType *LoweredMatrixTy = cast<VectorType>(LoweredPtr->getType()->getPointerElementType());
+    VectorType *LoweredMatrixTy = cast<VectorType>(LoweredTy);
     TempLoweredMatrix = UndefValue::get(LoweredMatrixTy);
     Value *GEPIndices[2] = { Builder.getInt32(0), nullptr };
     for (unsigned ElemIdx = 0; ElemIdx < LoweredMatrixTy->getNumElements(); ++ElemIdx) {
@@ -284,4 +290,4 @@ void HLMatrixSubscriptUseReplacer::flushLoweredMatrix(IRBuilder<> &Builder) {
   // Store back the lowered matrix to its pointer
   Builder.CreateStore(TempLoweredMatrix, LoweredPtr);
   TempLoweredMatrix = nullptr;
-}
\ No newline at end of file
+}
diff --git a/lib/HLSL/HLMatrixSubscriptUseReplacer.h b/lib/HLSL/HLMatrixSubscriptUseReplacer.h
index afe38704f2..b49aafdb0c 100644
--- a/lib/HLSL/HLMatrixSubscriptUseReplacer.h
+++ b/lib/HLSL/HLMatrixSubscriptUseReplacer.h
@@ -30,7 +30,7 @@ namespace hlsl {
 class HLMatrixSubscriptUseReplacer {
 public:
   // The constructor does everything
-  HLMatrixSubscriptUseReplacer(llvm::CallInst* Call, llvm::Value *LoweredPtr,
+  HLMatrixSubscriptUseReplacer(llvm::CallInst* Call, llvm::Value *LoweredPtr, llvm::Value *TempLoweredMatrix,
     llvm::SmallVectorImpl<llvm::Value*> &ElemIndices, bool AllowLoweredPtrGEPs,
     std::vector<llvm::Instruction*> &DeadInsts);
 
@@ -51,6 +51,7 @@ class HLMatrixSubscriptUseReplacer {
   bool AllowLoweredPtrGEPs = false;
   bool HasScalarResult = false;
   bool HasDynamicElemIndex = false;
+  llvm::Type *LoweredTy = nullptr;
 
   // The entire lowered matrix as loaded from LoweredPtr,
   // nullptr if we copied it to a temporary array.
@@ -64,4 +65,4 @@ class HLMatrixSubscriptUseReplacer {
   // so we can dynamically index the level 1 indices.
   llvm::AllocaInst *LazyTempElemIndicesArrayAlloca = nullptr;
 };
-} // namespace hlsl
\ No newline at end of file
+} // namespace hlsl
diff --git a/lib/HLSL/HLSignatureLower.cpp b/lib/HLSL/HLSignatureLower.cpp
index e53241ffc3..583e7a9050 100644
--- a/lib/HLSL/HLSignatureLower.cpp
+++ b/lib/HLSL/HLSignatureLower.cpp
@@ -588,6 +588,7 @@ Value *GenerateLdInput(Function *loadInput, ArrayRef<Value *> args,
   }
 }
 
+
 Value *replaceLdWithLdInput(Function *loadInput, LoadInst *ldInst,
                             unsigned cols, MutableArrayRef<Value *> args,
                             bool bCast) {
@@ -654,6 +655,96 @@ Value *replaceLdWithLdInput(Function *loadInput, LoadInst *ldInst,
   }
 }
 
+
+void replaceMatStWithStOutputs(CallInst *CI, HLMatLoadStoreOpcode matOp,
+                               Function *ldStFunc, Constant *OpArg, Constant *ID,
+                               Constant *columnConsts[],Value *vertexOrPrimID,
+                               Value *idxVal) {
+  IRBuilder<> LocalBuilder(CI);
+  Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
+  HLMatrixType MatTy = HLMatrixType::cast(
+                                          CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
+                                          ->getType()->getPointerElementType());
+
+  Val = MatTy.emitLoweredRegToMem(Val, LocalBuilder);
+
+  if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
+    for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
+      Constant *constColIdx = LocalBuilder.getInt32(c);
+      Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
+
+      for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
+        unsigned matIdx = MatTy.getColumnMajorIndex(r, c);
+        Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+        LocalBuilder.CreateCall(ldStFunc,
+                                { OpArg, ID, colIdx, columnConsts[r], Elt });
+      }
+    }
+  } else {
+    for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
+      Constant *constRowIdx = LocalBuilder.getInt32(r);
+      Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
+        unsigned matIdx = MatTy.getRowMajorIndex(r, c);
+        Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+        LocalBuilder.CreateCall(ldStFunc,
+                                { OpArg, ID, rowIdx, columnConsts[c], Elt });
+      }
+    }
+  }
+  CI->eraseFromParent();
+}
+
+
+void replaceMatLdWithLdInputs(CallInst *CI, HLMatLoadStoreOpcode matOp,
+                              Function *ldStFunc, Constant *OpArg, Constant *ID,
+                              Constant *columnConsts[],Value *vertexOrPrimID,
+                              Value *idxVal) {
+  IRBuilder<> LocalBuilder(CI);
+  HLMatrixType MatTy = HLMatrixType::cast(
+                                          CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
+                                          ->getType()->getPointerElementType());
+  std::vector<Value *> matElts(MatTy.getNumElements());
+
+  if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
+    for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
+      Constant *constRowIdx = LocalBuilder.getInt32(c);
+      Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
+        SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
+        if (vertexOrPrimID)
+          args.emplace_back(vertexOrPrimID);
+
+        Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+        unsigned matIdx = MatTy.getColumnMajorIndex(r, c);
+        matElts[matIdx] = input;
+      }
+    }
+  } else {
+    for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
+      Constant *constRowIdx = LocalBuilder.getInt32(r);
+      Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
+        SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
+        if (vertexOrPrimID)
+          args.emplace_back(vertexOrPrimID);
+
+        Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+        unsigned matIdx = MatTy.getRowMajorIndex(r, c);
+        matElts[matIdx] = input;
+      }
+    }
+  }
+
+  Value *newVec =
+    HLMatrixLower::BuildVector(matElts[0]->getType(), matElts, LocalBuilder);
+  newVec = MatTy.emitLoweredMemToReg(newVec, LocalBuilder);
+
+  CI->replaceAllUsesWith(newVec);
+  CI->eraseFromParent();
+}
+
+
 void replaceDirectInputParameter(Value *param, Function *loadInput,
                                  unsigned cols, MutableArrayRef<Value *> args,
                                  bool bCast, OP *hlslOP, IRBuilder<> &Builder) {
@@ -964,84 +1055,11 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
     switch (matOp) {
     case HLMatLoadStoreOpcode::ColMatLoad:
     case HLMatLoadStoreOpcode::RowMatLoad: {
-      IRBuilder<> LocalBuilder(CI);
-      HLMatrixType MatTy = HLMatrixType::cast(
-        CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
-          ->getType()->getPointerElementType());
-      std::vector<Value *> matElts(MatTy.getNumElements());
-
-      if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
-        for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
-          Constant *constRowIdx = LocalBuilder.getInt32(c);
-          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-          for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
-            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
-            if (vertexOrPrimID)
-              args.emplace_back(vertexOrPrimID);
-
-            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-            unsigned matIdx = MatTy.getColumnMajorIndex(r, c);
-            matElts[matIdx] = input;
-          }
-        }
-      } else {
-        for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
-          Constant *constRowIdx = LocalBuilder.getInt32(r);
-          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-          for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
-            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
-            if (vertexOrPrimID)
-              args.emplace_back(vertexOrPrimID);
-
-            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-            unsigned matIdx = MatTy.getRowMajorIndex(r, c);
-            matElts[matIdx] = input;
-          }
-        }
-      }
-
-      Value *newVec =
-          HLMatrixLower::BuildVector(matElts[0]->getType(), matElts, LocalBuilder);
-      newVec = MatTy.emitLoweredMemToReg(newVec, LocalBuilder);
-
-      CI->replaceAllUsesWith(newVec);
-      CI->eraseFromParent();
+      replaceMatLdWithLdInputs(CI, matOp, ldStFunc, OpArg, ID, columnConsts, vertexOrPrimID, idxVal);
     } break;
     case HLMatLoadStoreOpcode::ColMatStore:
     case HLMatLoadStoreOpcode::RowMatStore: {
-      IRBuilder<> LocalBuilder(CI);
-      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      HLMatrixType MatTy = HLMatrixType::cast(
-        CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
-          ->getType()->getPointerElementType());
-
-      Val = MatTy.emitLoweredRegToMem(Val, LocalBuilder);
-
-      if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
-        for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
-          Constant *constColIdx = LocalBuilder.getInt32(c);
-          Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
-
-          for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
-            unsigned matIdx = MatTy.getColumnMajorIndex(r, c);
-            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-            LocalBuilder.CreateCall(ldStFunc,
-              { OpArg, ID, colIdx, columnConsts[r], Elt });
-          }
-        }
-      } else {
-        for (unsigned r = 0; r < MatTy.getNumRows(); r++) {
-          Constant *constRowIdx = LocalBuilder.getInt32(r);
-          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-          for (unsigned c = 0; c < MatTy.getNumColumns(); c++) {
-            unsigned matIdx = MatTy.getRowMajorIndex(r, c);
-            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-            LocalBuilder.CreateCall(ldStFunc,
-              { OpArg, ID, rowIdx, columnConsts[c], Elt });
-          }
-        }
-      }
-      CI->eraseFromParent();
+      replaceMatStWithStOutputs(CI, matOp, ldStFunc, OpArg, ID, columnConsts, vertexOrPrimID, idxVal);
     } break;
     }
   } else {
@@ -1386,6 +1404,14 @@ void HLSignatureLower::GenerateDxilPatchConstantFunctionInputs() {
   Type *i1Ty = Type::getInt1Ty(constZero->getContext());
   Type *i32Ty = constZero->getType();
 
+  Constant *columnConsts[] = {
+      hlslOP->GetU8Const(0),  hlslOP->GetU8Const(1),  hlslOP->GetU8Const(2),
+      hlslOP->GetU8Const(3),  hlslOP->GetU8Const(4),  hlslOP->GetU8Const(5),
+      hlslOP->GetU8Const(6),  hlslOP->GetU8Const(7),  hlslOP->GetU8Const(8),
+      hlslOP->GetU8Const(9),  hlslOP->GetU8Const(10), hlslOP->GetU8Const(11),
+      hlslOP->GetU8Const(12), hlslOP->GetU8Const(13), hlslOP->GetU8Const(14),
+      hlslOP->GetU8Const(15)};
+
   for (Argument &arg : patchConstantFunc->args()) {
     DxilParameterAnnotation &paramAnnotation =
         patchFuncAnnotation->GetParameterAnnotation(arg.getArgNo());
@@ -1422,11 +1448,21 @@ void HLSignatureLower::GenerateDxilPatchConstantFunctionInputs() {
       collectInputOutputAccessInfo(&arg, constZero, accessInfoList,
                                    /*hasVertexOrPrimID*/ true, true, bRowMajor, false);
       for (InputOutputAccessInfo &info : accessInfoList) {
+        Constant *OpArg = hlslOP->GetU32Const((unsigned)opcode);
         if (LoadInst *ldInst = dyn_cast<LoadInst>(info.user)) {
-          Constant *OpArg = hlslOP->GetU32Const((unsigned)opcode);
           Value *args[] = {OpArg, inputID, info.idx, info.vectorIdx,
                            info.vertexOrPrimID};
           replaceLdWithLdInput(dxilLdFunc, ldInst, cols, args, bI1Cast);
+        } else if (CallInst *CI = dyn_cast<CallInst>(info.user)) {
+          HLOpcodeGroup group = GetHLOpcodeGroupByName(CI->getCalledFunction());
+          // Intrinsic will be translated later.
+          if (group == HLOpcodeGroup::HLIntrinsic || group == HLOpcodeGroup::NotHL)
+            return;
+          unsigned opcode = GetHLOpcode(CI);
+          DXASSERT_NOMSG(group == HLOpcodeGroup::HLMatLoadStore);
+          HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
+          if (matOp == HLMatLoadStoreOpcode::ColMatLoad || matOp == HLMatLoadStoreOpcode::RowMatLoad)
+            replaceMatLdWithLdInputs(CI, matOp, dxilLdFunc, OpArg, inputID, columnConsts, info.vertexOrPrimID, info.idx);
         } else {
           DXASSERT(0, "input should only be ld");
         }
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript.hlsl
new file mode 100644
index 0000000000..ec543822bc
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript.hlsl
@@ -0,0 +1,69 @@
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T lib_6_3 %s | FileCheck %s
+
+// Test for general subscript operations on matrix arrays.
+// Specifically focused on shader inputs which failed to lower previously
+
+float3 GetRow(const float3x3 m, const int j)
+{
+  return m[j];
+}
+
+float3x3 g[2];
+groupshared float3x3 gs[2];
+
+struct JustMtx {
+  float3x3 mtx;
+};
+
+struct MtxArray {
+  float3x3 mtx[2];
+};
+
+[shader("pixel")]
+float3 main(const int i : I, const int j : J, const float3x3 m[2]: M, JustMtx jm[2] : JM, MtxArray ma : A) : SV_Target
+{
+  float3 ret = 0.0;
+
+  // CHECK: call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32
+  // CHECK: call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32
+  // CHECK: call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32
+  ret += g[MIDX][VIDX];
+
+  // CHECK: load float, float addrspace(3)*
+  // CHECK: load float, float addrspace(3)*
+  // CHECK: load float, float addrspace(3)*
+  ret += gs[MIDX][VIDX];
+
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 2, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 2, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 2, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  ret += m[MIDX][VIDX];
+
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 3, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 3, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 3, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  ret += jm[MIDX].mtx[VIDX];
+
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 4, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 4, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 4, i32 {{%?[0-9]*}}, i8 2, i32 undef)
+  ret += ma.mtx[MIDX][VIDX];
+
+  ret += GetRow(g[MIDX], VIDX);
+  ret += GetRow(gs[MIDX], VIDX);
+  ret += GetRow(m[MIDX], VIDX);
+  ret += GetRow(jm[MIDX].mtx, VIDX);
+  ret += GetRow(ma.mtx[MIDX], VIDX);
+
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %{{.*}})
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %{{.*}})
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %{{.*}})
+  return ret;
+}
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_ds.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_ds.hlsl
new file mode 100644
index 0000000000..7ddabc3702
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_ds.hlsl
@@ -0,0 +1,40 @@
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T ds_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T ds_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T ds_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T ds_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T lib_6_3 %s | FileCheck %s
+
+// Specific test for subscript operation on OutputPatch matrix data
+
+struct MatStruct {
+ float4x4 mtx : M;
+};
+
+float4 GetRow(const OutputPatch<MatStruct, 3> tri, int i, int j)
+{
+  return tri[MIDX].mtx[VIDX];
+}
+
+[domain("tri")]
+[shader("domain")]
+float4 main(int i : I, int j : J, const OutputPatch<MatStruct, 3> tri) : SV_Position {
+
+  float4 ret = 0;
+
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 {{%?[0-9]*}}, i8 2, i32 {{%?[0-9]*}})
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 {{%?[0-9]*}}, i8 2, i32 {{%?[0-9]*}})
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 {{%?[0-9]*}}, i8 2, i32 {{%?[0-9]*}})
+  // CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 {{%?[0-9]*}}, i8 2, i32 {{%?[0-9]*}})
+  ret += tri[MIDX].mtx[VIDX];
+
+  ret += GetRow(tri, MIDX, VIDX);
+
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %{{.*}})
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %{{.*}})
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %{{.*}})
+  // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %{{.*}})
+  return ret;
+}
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_hs.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_hs.hlsl
new file mode 100644
index 0000000000..ed210634c7
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/types/matrix/matrix_subscript_hs.hlsl
@@ -0,0 +1,62 @@
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T hs_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T hs_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T hs_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T hs_6_0 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=2 -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=1 -DVIDX=j -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -DMIDX=i -DVIDX=j -T lib_6_3 %s | FileCheck %s
+
+// Specific test for subscript operation on matrix array inputs in patch functions
+
+struct MatStruct {
+  int2 uv : TEXCOORD0;
+  float3x4  m_ObjectToWorld : TEXCOORD1;
+};
+
+struct Output {
+  float edges[3] : SV_TessFactor;
+  float inside : SV_InsideTessFactor;
+};
+
+// Instruction order here is a bit inconsistent.
+// So we can't test for all the outputs
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call void @dx.op.storePatchConstant.f32
+// CHECK: call void @dx.op.storePatchConstant.f32
+Output Patch(InputPatch<MatStruct, 3> inputs)
+{
+  Output ret;
+  int i = inputs[0].uv.x;
+  int j = inputs[0].uv.y;
+
+  ret.edges[0] = inputs[MIDX].m_ObjectToWorld[VIDX][0];
+  ret.edges[1] = inputs[MIDX].m_ObjectToWorld[VIDX][1];
+  ret.edges[2] = inputs[MIDX].m_ObjectToWorld[VIDX][2];
+  ret.inside = 1.0f;
+  return ret;
+}
+
+
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call float @dx.op.loadInput.f32
+// CHECK: call void @dx.op.storeOutput.f32
+// CHECK: call void @dx.op.storeOutput.f32
+// CHECK: call void @dx.op.storeOutput.f32
+// CHECK: call void @dx.op.storeOutput.f32
+[domain("tri")]
+[partitioning("fractional_odd")]
+[outputtopology("triangle_cw")]
+[patchconstantfunc("Patch")]
+[outputcontrolpoints(3)]
+[shader("hull")]
+float4 main(InputPatch<MatStruct, 3> inputs) : SV_Position
+{
+  int i = inputs[0].uv.x;
+  int j = inputs[0].uv.y;
+  return inputs[MIDX].m_ObjectToWorld[VIDX];
+}