diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index fc09a068fe285..dddb25618f5c0 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -188,6 +188,8 @@ def SparseTensor_LexInsertOp : SparseTensor_Op<"lex_insert", []>,
     is solely defined by side-effects and not SSA values. The semantics
     may be refined over time as our sparse abstractions evolve.
 
+    Example:
+
     ```mlir
     sparse_tensor.lex_insert %tensor, %indices, %val
       : tensor<1024x1024xf64, #CSR>, memref<?xindex>, f64
@@ -385,7 +387,8 @@ def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [NoSideEffect]>,
     would be equivalent to a union operation where non-overlapping values
     in the inputs are copied to the output unchanged.
 
-    Example of isEqual applied to intersecting elements only.
+    Example of isEqual applied to intersecting elements only:
+
     ```mlir
     %C = bufferization.alloc_tensor...
     %0 = linalg.generic #trait
@@ -405,8 +408,8 @@ def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [NoSideEffect]>,
     } -> tensor<?xi8, #SparseVector>
     ```
 
-    Example of A+B in upper triangle, A-B in lower triangle
-    (not working yet, but construct will be available soon).
+    Example of A+B in upper triangle, A-B in lower triangle:
+
     ```mlir
     %C = bufferization.alloc_tensor...
     %1 = linalg.generic #trait
@@ -438,7 +441,8 @@ def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [NoSideEffect]>,
 
     Example of set difference. Returns a copy of A where its sparse structure
     is *not* overlapped by B. The element type of B can be different than A
-    because we never use its values, only its sparse structure.
+    because we never use its values, only its sparse structure:
+
     ```mlir
     %C = bufferization.alloc_tensor...
     %2 = linalg.generic #trait
@@ -486,6 +490,7 @@ def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [NoSideEffect]>,
     region does not contribute to the output.
 
     Example of A+1, restricted to existing elements:
+
     ```mlir
     %C = bufferization.alloc_tensor...
     %0 = linalg.generic #trait
@@ -546,6 +551,7 @@ def SparseTensor_YieldOp : SparseTensor_Op<"yield", [NoSideEffect, Terminator]>,
     Yields a value from within a `binary` or `unary` block.
 
     Example:
+
     ```
     %0 = sparse_tensor.unary %a : i64 to i64 {
       ^bb0(%arg0: i64):
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index b6aa7b99e54a4..d0b6758d00ab8 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -881,9 +881,8 @@ static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
 }
 
 /// Generates an index value.
-static Value genIndexValue(Merger &merger, CodeGen &codegen, OpBuilder &builder,
-                           unsigned exp, unsigned ldx) {
-  unsigned idx = merger.exp(exp).index;
+static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
+                           unsigned ldx) {
   Value ival = codegen.loops[idx];
   Type itype = ival.getType();
   // During vectorization, we either encounter:
@@ -913,6 +912,25 @@ static Value genIndexValue(Merger &merger, CodeGen &codegen, OpBuilder &builder,
   return ival;
 }
 
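+// For illustration (hypothetical kernel, not part of this patch): a semi-ring
+// branch may capture a linalg.index value defined in the enclosing
+// linalg.generic body, e.g.
+//
+//   %i = linalg.index 0 : index
+//   %0 = sparse_tensor.unary %a : f64 to f64
+//     present={
+//       ^bb0(%x: f64):
+//         %c = arith.index_cast %i : index to i64  // relinked after inlining
+//         %f = arith.sitofp %c : i64 to f64
+//         %r = arith.addf %x, %f : f64
+//         sparse_tensor.yield %r : f64
+//     }
+//     absent={}
+//
+// Once such a branch body is cloned into the generated loop nest, the index
+// computation must be rewired to the value of the corresponding loop index,
+// which is what relinkBranch below does by means of genIndexValue.
+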
+/// Semi-ring branches are simply inlined by the sparse compiler. Prior
+/// analysis has verified that all computations are "local" to the inlined
+/// branch or otherwise invariantly defined outside the loop nest, with the
+/// exception of index computations, which need to be relinked to actual
+/// inlined cloned code.
+static Value relinkBranch(CodeGen &codegen, RewriterBase &rewriter,
+                          Block *block, Value e, unsigned ldx) {
+  if (Operation *def = e.getDefiningOp()) {
+    if (auto indexOp = dyn_cast<linalg::IndexOp>(def))
+      return genIndexValue(codegen, rewriter, indexOp.dim(), ldx);
+    if (def->getBlock() == block) {
+      for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
+        def->setOperand(
+            i, relinkBranch(codegen, rewriter, block, def->getOperand(i), ldx));
+    }
+  }
+  return e;
+}
+
 /// Recursively generates tensor expression.
 static Value genExp(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                     linalg::GenericOp op, unsigned exp, unsigned ldx) {
@@ -924,12 +942,17 @@ static Value genExp(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
   if (merger.exp(exp).kind == Kind::kInvariant)
     return genInvariantValue(merger, codegen, rewriter, exp);
   if (merger.exp(exp).kind == Kind::kIndex)
-    return genIndexValue(merger, codegen, rewriter, exp, ldx);
+    return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx);
   Value v0 =
       genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e0, ldx);
   Value v1 =
       genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e1, ldx);
-  return merger.buildExp(rewriter, loc, exp, v0, v1);
+  Value ee = merger.buildExp(rewriter, loc, exp, v0, v1);
+  if (ee && (merger.exp(exp).kind == Kind::kUnary ||
+             merger.exp(exp).kind == Kind::kBinary ||
+             merger.exp(exp).kind == Kind::kBinaryBranch))
+    ee = relinkBranch(codegen, rewriter, ee.getParentBlock(), ee, ldx);
+  return ee;
 }
 
 /// Determines if affine expression is invariant.
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 78771d1a90835..ed19cf7a18690 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -798,7 +798,9 @@ unsigned Merger::buildLattices(unsigned e, unsigned i) {
 }
 
 Optional<unsigned> Merger::buildTensorExpFromLinalg(linalg::GenericOp op) {
+  // Build the linalg semantics backward from yield.
   Operation *yield = op.region().front().getTerminator();
+  assert(isa<linalg::YieldOp>(yield));
   return buildTensorExp(op, yield->getOperand(0));
 }
 
@@ -832,6 +834,37 @@ Type Merger::inferType(unsigned e, Value src) {
   return dtp;
 }
 
+/// Ensures that sparse compiler can generate code for expression.
+static bool isAdmissableBranchExp(Operation *op, Block *block, Value v) {
+  // Arguments are always admissable.
+  if (auto arg = v.dyn_cast<BlockArgument>())
+    return true;
+  // Accept index anywhere.
+  Operation *def = v.getDefiningOp();
+  if (isa<linalg::IndexOp>(def))
+    return true;
+  // Operation defined outside branch.
+  if (def->getBlock() != block) {
+    return def->getBlock() != op->getBlock(); // invariant?
+  }
+  // Operation defined within branch. Anything is accepted,
+  // as long as all subexpressions are admissable.
+  for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
+    if (!isAdmissableBranchExp(op, block, def->getOperand(i)))
+      return false;
+  return true;
+}
+
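+// For illustration (hypothetical kernel, not part of this patch): the check
+// above rejects a branch that captures a value computed elsewhere in the
+// linalg.generic body, since such a value is neither local to the branch,
+// a linalg.index, nor invariantly defined outside the loop nest:
+//
+//   ^bb0(%a: f64, %b: f64, %out: f64):
+//     %t = arith.mulf %a, %b : f64        // defined in the linalg body...
+//     %0 = sparse_tensor.binary %a, %b : f64, f64 to f64
+//       overlap={
+//         ^bb0(%x: f64, %y: f64):
+//           %r = arith.addf %x, %t : f64  // ...captured here: rejected
+//           sparse_tensor.yield %r : f64
+//       }
+//       left={}
+//       right={}
+//     linalg.yield %0 : f64
+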
+/// Ensures that sparse compiler can generate code for branch.
+static bool isAdmissableBranch(Operation *op, Region &region) {
+  if (region.empty())
+    return true;
+  // Build the semi-ring branch semantics backward from yield.
+  Operation *yield = region.front().getTerminator();
+  assert(isa<YieldOp>(yield));
+  return isAdmissableBranchExp(op, &region.front(), yield->getOperand(0));
+}
+
 Optional<unsigned> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
   if (auto arg = v.dyn_cast<BlockArgument>()) {
     unsigned argN = arg.getArgNumber();
@@ -920,8 +953,11 @@ Optional<unsigned> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
         return addExp(kCRe, e);
       if (isa<arith::BitcastOp>(def))
         return addExp(kBitCast, e, v);
-      if (isa<sparse_tensor::UnaryOp>(def))
-        return addExp(kUnary, e, Value(), def);
+      if (auto unop = dyn_cast<sparse_tensor::UnaryOp>(def)) {
+        if (isAdmissableBranch(unop, unop.presentRegion()) &&
+            isAdmissableBranch(unop, unop.absentRegion()))
+          return addExp(kUnary, e, Value(), def);
+      }
     }
   }
   // Construct binary operations if subexpressions can be built.
@@ -971,8 +1007,14 @@ Optional<unsigned> Merger::buildTensorExp(linalg::GenericOp op, Value v) {
         return addExp(kShrU, e0, e1);
       if (isa<arith::ShLIOp>(def) && isInvariant(e1))
         return addExp(kShlI, e0, e1);
-      if (isa<sparse_tensor::BinaryOp>(def))
-        return addExp(kBinary, e0, e1, Value(), def);
+      if (auto binop = dyn_cast<sparse_tensor::BinaryOp>(def)) {
+        if (isAdmissableBranch(binop, binop.overlapRegion()) &&
+            (binop.left_identity() ||
+             isAdmissableBranch(binop, binop.leftRegion())) &&
+            (binop.right_identity() ||
+             isAdmissableBranch(binop, binop.rightRegion())))
+          return addExp(kBinary, e0, e1, Value(), def);
+      }
     }
   }
   // Cannot build.
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_triangular_bin.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_triangular_bin.mlir
new file mode 100644
index 0000000000000..f07f4e9903469
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_triangular_bin.mlir
@@ -0,0 +1,95 @@
+// RUN: mlir-opt %s --sparse-compiler | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+#SparseMatrix = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
+
+#trait_op = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // A
+    affine_map<(i,j) -> (i,j)>,  // B
+    affine_map<(i,j) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel","parallel"],
+  doc = "X(i,j) = A(i,j) OP B(i,j)"
+}
+
+module {
+  // Performs triangular add/sub operation (using semi-ring binary op).
+  func.func @triangular(%A: tensor<4x4xf64, #SparseMatrix>,
+                        %B: tensor<4x4xf64, #SparseMatrix>) -> tensor<4x4xf64, #SparseMatrix> {
+    %C = bufferization.alloc_tensor() : tensor<4x4xf64, #SparseMatrix>
+    %0 = linalg.generic #trait_op
+      ins(%A, %B: tensor<4x4xf64, #SparseMatrix>,
+                  tensor<4x4xf64, #SparseMatrix>)
+      outs(%C: tensor<4x4xf64, #SparseMatrix>) {
+      ^bb0(%a: f64, %b: f64, %c: f64) :
+        %row = linalg.index 0 : index
+        %col = linalg.index 1 : index
+        %result = sparse_tensor.binary %a, %b : f64, f64 to f64
+          overlap={
+            ^bb0(%x: f64, %y: f64):
+              %cmp = arith.cmpi "uge", %col, %row : index
+              %upperTriangleResult = arith.addf %x, %y : f64
+              %lowerTriangleResult = arith.subf %x, %y : f64
+              %ret = arith.select %cmp, %upperTriangleResult, %lowerTriangleResult : f64
+              sparse_tensor.yield %ret : f64
+          }
+          left=identity
+          right={
+            ^bb0(%y: f64):
+              %cmp = arith.cmpi "uge", %col, %row : index
+              %lowerTriangleResult = arith.negf %y : f64
+              %ret = arith.select %cmp, %y, %lowerTriangleResult : f64
+              sparse_tensor.yield %ret : f64
+          }
+        linalg.yield %result : f64
+    } -> tensor<4x4xf64, #SparseMatrix>
+    return %0 : tensor<4x4xf64, #SparseMatrix>
+  }
+
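+  // Elementwise, with "present" meaning stored in the corresponding sparse
+  // operand, the kernel above computes:
+  //   a and b both present : a + b when j >= i, a - b when j < i
+  //   only a present       : a (left=identity passes it through unchanged)
+  //   only b present       : b when j >= i, -b when j < i
+  // Entries absent from both operands remain absent in the result.
+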
+  // Driver method to call and verify triangular kernel.
+  func.func @entry() {
+    %c0 = arith.constant 0 : index
+    %du = arith.constant -1.0 : f64
+
+    %am = arith.constant dense<
+        [ [ 1.0, 0.0, 3.0, 0.0],
+          [ 0.0, 2.0, 0.0, 0.0],
+          [ 0.0, 0.0, 0.0, 4.0],
+          [ 3.0, 4.0, 0.0, 0.0] ]> : tensor<4x4xf64>
+    %bm = arith.constant dense<
+        [ [ 1.0, 0.0, 1.0, 1.0],
+          [ 0.0, 0.5, 0.0, 0.0],
+          [ 1.0, 5.0, 2.0, 0.0],
+          [ 2.0, 0.0, 0.0, 0.0] ]> : tensor<4x4xf64>
+
+    %a = sparse_tensor.convert %am : tensor<4x4xf64> to tensor<4x4xf64, #SparseMatrix>
+    %b = sparse_tensor.convert %bm : tensor<4x4xf64> to tensor<4x4xf64, #SparseMatrix>
+    %0 = call @triangular(%a, %b) : (tensor<4x4xf64, #SparseMatrix>,
+                                     tensor<4x4xf64, #SparseMatrix>) -> tensor<4x4xf64, #SparseMatrix>
+
+    //
+    // Verify the results.
+    //
+    // CHECK:      ( ( 2, 0, 4, 1 ), ( 0, 2.5, 0, 0 ), ( -1, -5, 2, 4 ), ( 1, 4, 0, 0 ) )
+    // CHECK-NEXT: ( 2, 4, 1, 2.5, -1, -5, 2, 4, 1, 4, -1, -1, -1, -1, -1, -1 )
+    //
+    %c = sparse_tensor.convert %0 : tensor<4x4xf64, #SparseMatrix> to tensor<4x4xf64>
+    %m = bufferization.to_memref %c : memref<4x4xf64>
+    %v = vector.transfer_read %m[%c0, %c0], %du: memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v : vector<4x4xf64>
+    %1 = sparse_tensor.values %0 : tensor<4x4xf64, #SparseMatrix> to memref<?xf64>
+    %2 = vector.transfer_read %1[%c0], %du: memref<?xf64>, vector<16xf64>
+    vector.print %2 : vector<16xf64>
+
+    // Release the resources.
+    memref.dealloc %m : memref<4x4xf64>
+    sparse_tensor.release %a : tensor<4x4xf64, #SparseMatrix>
+    sparse_tensor.release %b : tensor<4x4xf64, #SparseMatrix>
+    sparse_tensor.release %0 : tensor<4x4xf64, #SparseMatrix>
+    return
+  }
+}
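
For reference (not part of the patch): since elements absent from a sparse
operand act as explicit zeros in this kernel (a + 0 = a, a - 0 = a, 0 + b = b,
0 - b = -b, which is exactly what left=identity and the custom right region
express), the same result can be computed by a dense kernel. A minimal sketch,
with hypothetical names:

```mlir
#trait_dense = {
  indexing_maps = [
    affine_map<(i,j) -> (i,j)>,  // A
    affine_map<(i,j) -> (i,j)>,  // B
    affine_map<(i,j) -> (i,j)>   // X (out)
  ],
  iterator_types = ["parallel","parallel"],
  doc = "X(i,j) = A(i,j) OP B(i,j)"
}

// Dense reference: A+B in the upper triangle, A-B in the lower triangle.
func.func @triangular_dense(%A: tensor<4x4xf64>,
                            %B: tensor<4x4xf64>) -> tensor<4x4xf64> {
  // Destination tensor; every element is overwritten by the parallel loops.
  %C = arith.constant dense<0.0> : tensor<4x4xf64>
  %0 = linalg.generic #trait_dense
    ins(%A, %B: tensor<4x4xf64>, tensor<4x4xf64>)
    outs(%C: tensor<4x4xf64>) {
    ^bb0(%a: f64, %b: f64, %c: f64):
      %row = linalg.index 0 : index
      %col = linalg.index 1 : index
      %cmp = arith.cmpi "uge", %col, %row : index
      %add = arith.addf %a, %b : f64
      %sub = arith.subf %a, %b : f64
      %ret = arith.select %cmp, %add, %sub : f64
      linalg.yield %ret : f64
  } -> tensor<4x4xf64>
  return %0 : tensor<4x4xf64>
}
```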