diff --git a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
index b7853634bc44e..f8112bbdd548f 100644
--- a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
+++ b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_conversion_library(MLIRSCFToGPU
   MLIRConversionPassIncGen
 
   LINK_LIBS PUBLIC
+  MLIRAffineAnalysis
   MLIRAffineDialect
   MLIRAffineToStandard
   MLIRArithDialect
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
index c816356cf3f96..4c1fd59bb9042 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h"
 
 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -29,11 +30,33 @@ namespace {
 struct ForLoopMapper
     : public impl::ConvertAffineForToGPUPassBase<ForLoopMapper> {
   using Base::Base;
+  /// Returns true when each of the outermost `numMappedLoops` loops of the nest
+  /// rooted at 'forOp' is parallel according to affine::isLoopParallel.
+  static bool areMappedLoopsParallel(affine::AffineForOp forOp,
+                                     unsigned numMappedLoops) {
+    affine::AffineForOp loop = forOp;
+    for (unsigned depth = 0; depth != numMappedLoops; ++depth) {
+      if (!affine::isLoopParallel(loop))
+        return false;
+      // Nothing to descend into once the innermost mapped loop is checked.
+      if (depth + 1 == numMappedLoops)
+        break;
+      // Follow only the first nested loop, as in a perfect nest. A nest that
+      // is too shallow is accepted so the conversion fails on it later.
+      auto inner = loop.getBody()->getOps<affine::AffineForOp>();
+      if (inner.empty())
+        break;
+      loop = *inner.begin();
+    }
+    return true;
+  }
 
   void runOnOperation() override {
     for (Operation &op : llvm::make_early_inc_range(
              getOperation().getFunctionBody().getOps())) {
       if (auto forOp = dyn_cast<affine::AffineForOp>(&op)) {
+        if (!areMappedLoopsParallel(forOp, numBlockDims + numThreadDims))
+          continue;
         if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                     numThreadDims)))
           signalPassFailure();
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index 79eef8ae7eb85..107a6c3838010 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -21,14 +21,13 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
   // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
   affine.for %i = 0 to 42 {
   // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
-  // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
+  // CHECK-THREADS-NEXT: affine.load %{{.*}}[%[[INDEX]]]
   // CHECK-BLOCKS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[B0]]
-  // CHECK-BLOCKS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
-    %0 = memref.load %A[%i] : memref<?xf32>
-    memref.store %0, %B[%i] : memref<?xf32>
+  // CHECK-BLOCKS-NEXT: affine.load %{{.*}}[%[[INDEX]]]
+    %0 = affine.load %A[%i] : memref<?xf32>
+    affine.store %0, %B[%i] : memref<?xf32>
     // CHECK-THREADS: gpu.terminator
     // CHECK-BLOCKS: gpu.terminator
   }
   return
 }
-
diff --git a/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir
new file mode 100644
index 0000000000000..16770ccba58c3
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir
@@ -0,0 +1,105 @@
+// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(convert-affine-for-to-gpu{gpu-block-dims=1 gpu-thread-dims=1}))" %s | FileCheck %s
+
+/// Test parallelization legality checks in affine-for-to-gpu conversion.
+/// The pass is configured to map the first 2 loops to GPU block (depth 0) and
+/// GPU thread (depth 1) respectively.
+
+// CHECK-LABEL: func @map_to_gpu_inner_dep_unmapped
+// CHECK-SAME: %[[MEM:.*]]: memref<10x10x10xf32>
+func.func @map_to_gpu_inner_dep_unmapped(%mem: memref<10x10x10xf32>) {
+  /// The inner loop 'k' (depth=2) carries a dependency. However, since the
+  /// mapping only covers depth 0 and 1, 'k' remains sequential inside the
+  /// GPU kernel. The outer loops are dependency-free and safe to map.
+
+  // CHECK: gpu.launch
+  affine.for %i = 0 to 10 {
+    affine.for %j = 0 to 10 {
+      // CHECK: affine.for %{{.*}} = 1 to 10
+      // CHECK: affine.load %[[MEM]]
+      // CHECK: affine.store %{{.*}}, %[[MEM]]
+      affine.for %k = 1 to 10 {
+         %0 = affine.load %mem[%i, %j, %k - 1] : memref<10x10x10xf32>
+         affine.store %0, %mem[%i, %j, %k] : memref<10x10x10xf32>
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_block_dep
+func.func @negative_map_to_gpu_block_dep(%mem: memref<10xf32>) {
+  /// The loop 'i' is mapped to a block dimension (depth=0).
+  /// The loop-carried dependency makes parallelization unsafe.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 1 to 10 {
+     %0 = affine.load %mem[%i - 1] : memref<10xf32>
+     affine.store %0, %mem[%i] : memref<10xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_thread_dep
+func.func @negative_map_to_gpu_thread_dep(%mem: memref<10x10xf32>) {
+  /// The inner loop 'j' is mapped to a thread dimension (depth=1).
+  /// A dependency in any mapped loop invalidates the entire nest conversion.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j = 1 to 10 {
+       %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_imperfect_nest_dep
+func.func @negative_map_to_gpu_imperfect_nest_dep(%mem: memref<10x10xf32>) {
+  /// Imperfect nest: The first inner loop 'j' has a dependency and is mapped
+  /// to a thread dimension. This prevents conversion of the whole loop nest.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j = 1 to 10 {
+       %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+    // CHECK: affine.for
+    affine.for %k = 0 to 10 {
+       %1 = affine.load %mem[%i, %k] : memref<10x10xf32>
+       affine.store %1, %mem[%i, %k] : memref<10x10xf32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @mixed_parallel_and_seq_siblings
+func.func @mixed_parallel_and_seq_siblings(%mem: memref<10x10xf32>) {
+  /// Sibling top-level loops are analyzed independently. The first nest is
+  /// safe; the second has a dependency in a mapped loop (thread dim).
+
+  // CHECK: gpu.launch
+  affine.for %i = 0 to 10 {
+    affine.for %j = 0 to 10 {
+       %0 = affine.load %mem[%i, %j] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+  }
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i2 = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j2 = 1 to 10 {
+       %1 = affine.load %mem[%i2, %j2 - 1] : memref<10x10xf32>
+       affine.store %1, %mem[%i2, %j2] : memref<10x10xf32>
+    }
+  }
+  return
+}
diff --git a/mlir/test/Conversion/SCFToGPU/step_one.mlir b/mlir/test/Conversion/SCFToGPU/step_one.mlir
index be6fadfbd0ad3..3f8a1a847bc22 100644
--- a/mlir/test/Conversion/SCFToGPU/step_one.mlir
+++ b/mlir/test/Conversion/SCFToGPU/step_one.mlir
@@ -64,12 +64,12 @@ func.func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
           // CHECK-22-NEXT:   %[[jj:.*]] = arith.addi %{{.*}}, %{{.*}} : index
 
           // Using remapped values instead of loop iterators.
-          // CHECK-11:        {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          // CHECK-22:        {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          %0 = memref.load %A[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
-          // CHECK-11-NEXT:   memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          // CHECK-22-NEXT:   memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          memref.store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
+          // CHECK-11:        {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          // CHECK-22:        {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          %0 = affine.load %A[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
+          // CHECK-11-NEXT:   affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          // CHECK-22-NEXT:   affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          affine.store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
 
           // CHECK-11: gpu.terminator
           // CHECK-22: gpu.terminator
@@ -79,4 +79,3 @@ func.func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
   }
   return
 }
-
diff --git a/mlir/test/Conversion/SCFToGPU/step_positive.mlir b/mlir/test/Conversion/SCFToGPU/step_positive.mlir
index 84e8454e56171..4da3a458a9152 100644
--- a/mlir/test/Conversion/SCFToGPU/step_positive.mlir
+++ b/mlir/test/Conversion/SCFToGPU/step_positive.mlir
@@ -18,10 +18,10 @@ func.func @step_var(%A : memref<?x?xf32>, %B : memref<?x?xf32>) {
       // CHECK-NEXT: %[[prod_j:.*]] = arith.muli %{{.*}}, %{{.*}} : index
       // CHECK-NEXT: %[[j:.*]] = arith.addi %{{.*}}, %[[prod_j]] : index
 
-      // CHECK:     {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
-      %0 = memref.load %A[%i, %j] : memref<?x?xf32>
-      // CHECK:     memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
-      memref.store %0, %B[%i, %j] : memref<?x?xf32>
+      // CHECK:     {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
+      %0 = affine.load %A[%i, %j] : memref<?x?xf32>
+      // CHECK:     affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
+      affine.store %0, %B[%i, %j] : memref<?x?xf32>
     }
   }
   return