Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 21 additions & 14 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,31 +73,38 @@ native/ops/matmul/
├── common/ # Shared utilities
│ └── aligned_copy_sm120.cuh
├── gemm/ # GEMM kernels (M > 1)
│ └── {input_dtype}/{output_dtype}/{arch}/{compute}_{suffix}.{cu,cuh}
│ └── {w_dtype}_{a_dtype}_{out_dtype}/{arch}/{kernel}.{cu,cuh}
├── gemv/ # GEMV kernels (M = 1)
│ └── {input_dtype}/{output_dtype}/{arch}/{compute}_{suffix}.{cu,cuh}
│ └── {w_dtype}_{a_dtype}_{out_dtype}/{arch}/{kernel}.{cu,cuh}
├── cublaslt.cuh # cuBLASLt wrapper
├── matmul.cu # Main dispatcher
└── matmul_cutlass.cu # CUTLASS dispatcher
```

**Path Convention:** `{gemm|gemv}/{input_dtype}/{output_dtype}/{arch}/{compute}_{suffix}.cu`
**Path Convention:** `{gemm|gemv}/w{weight}a{act}_{out}/{arch}/{kernel}.cu` for quantized kernels (e.g. `w8a16_bf16`); full-precision kernels use `{w_dtype}_{a_dtype}` instead (e.g. `bf16_bf16`, `f32_f32`)

| Component | Values | Examples |
|-----------|--------|----------|
| `input_dtype` | `f32`, `bf16`, `fp8`, `nvf4` | Input tensor dtype |
| `output_dtype` | `f32`, `bf16`, `fp8` | Output tensor dtype |
| Component | Values | Description |
|-----------|--------|-------------|
| `w_dtype` | `w4`, `w8`, `bf16`, `f32`, `int4`, `int8` | Weight dtype (w=weight) |
| `a_dtype` | `a4`, `a8`, `a16`, `bf16`, `f32`, `int4`, `int8` | Activation dtype (a=act) |
| `out_dtype` | `bf16`, `f32` | Output dtype |
| `arch` | `generic`, `sm80`, `sm90`, `sm100`, `sm120` | Target architecture |
| `compute` | `naive`, `wmma`, `mma`, `cutlass` | Compute method |
| `suffix` | `blockwise`, `kernels`, etc. | Variant identifier |

**Naming Rationale (Issue #122 Option 2):**
- `w8a16_bf16`: FP8 weights, BF16 activations, BF16 output (W8A16 GEMM)
- `w4a16_bf16`: NVF4 weights, BF16 activations, BF16 output (NVF4 GEMV)
- `w8a8_bf16`: FP8 weights, FP8 activations, BF16 output (pure FP8)
- `bf16_bf16`: BF16 weights, BF16 activations (no quantization)
- `f32_f32`: FP32 weights, FP32 activations (baseline)

**Examples:**
```
gemm/bf16/bf16/sm120/bf16_cutlass.cuh # BF16->BF16 GEMM, SM120, CUTLASS
gemm/fp8/f32/sm90/fp8_cutlass.cu # FP8->F32 GEMM, SM90, CUTLASS
gemm/nvf4/bf16/sm120/nvf4_cutlass.cu # NVF4->BF16 GEMM, SM120, CUTLASS
gemv/bf16/bf16/sm120/nvf4.cu # NVF4->BF16 GEMV, SM120
gemm/f32/f32/generic/tf32_mma.cuh # TF32 GEMM, generic (SM80+)
gemm/bf16_bf16/sm80/bf16_cutlass.cuh # BF16 GEMM, SM80, CUTLASS
gemm/w8a8_f32/sm90/fp8_cutlass.cu # FP8->F32 GEMM, SM90, CUTLASS
gemm/w4a16_bf16/sm120/nvf4_cutlass.cu # NVF4 weights, BF16 act->BF16, SM120
gemv/w4a16_bf16/sm120/nvf4.cu # NVF4 GEMV, SM120
gemv/w8a16_bf16/sm120/fp8_opt_kernels.cu # FP8 weight, BF16 act GEMV, SM120
gemm/f32_f32/generic/tf32_mma.cuh # TF32 GEMM, generic (SM80+)
```

### Module Separation Policy
Expand Down
48 changes: 24 additions & 24 deletions native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -153,30 +153,30 @@ pybind11_add_module(${MODULE_NAME}
ops/reduction/reduction.cu
ops/matmul/matmul.cu
ops/matmul/matmul_cutlass.cu
# GEMM kernels
ops/matmul/gemm/f32/f32/generic/f32_ampere.cu
ops/matmul/gemm/fp8/f32/sm90/fp8_cutlass.cu
ops/matmul/gemm/fp8/f32/sm100/fp8_blockwise.cu
ops/matmul/gemm/fp8/bf16/sm120/fp8_blockwise.cu
ops/matmul/gemm/fp8/bf16/sm120/w8a16_gemm.cu
ops/matmul/gemm/fp8/bf16/sm120/w8a16_cutlass.cu
ops/matmul/gemm/fp8/bf16/sm120/grouped_gemm.cu
ops/matmul/gemm/fp8/fp8/sm120/fp8_cutlass.cu
ops/matmul/gemm/fp8/fp8/sm120/fp8_cutlass_v2.cu
ops/matmul/gemm/fp8/fp8/sm120/fp8_cutlass_v3.cu
ops/matmul/gemm/int8/int8/sm120/int8_native.cu
ops/matmul/gemm/int4/int4/sm120/int4_via_int8.cu
ops/matmul/gemm/nvf4/bf16/sm120/nvf4_cutlass.cu
ops/matmul/gemm/nvf4/bf16/sm120/nvf4_nvf4_cutlass.cu
# GEMV kernels
ops/matmul/gemv/bf16/bf16/sm120/nvf4.cu
ops/matmul/gemv/bf16/bf16/sm120/nvf4_kernels.cu
ops/matmul/gemv/bf16/bf16/sm120/fp8_opt_kernels.cu
ops/matmul/gemv/bf16/bf16/sm120/bf16_opt.cu
ops/matmul/gemv/fp8/fp8/sm120/fp8_gemv.cu
ops/matmul/gemv/fp8/fp8/sm120/fp8_accurate.cu
ops/matmul/gemv/nvf4/nvf4/sm120/nvf4_gemv.cu
ops/matmul/gemv/int4/int4/sm120/int4_gemv.cu
# GEMM kernels (Issue #122: Reorganized with w{weight}a{act}_{out} naming)
ops/matmul/gemm/f32_f32/generic/f32_ampere.cu
ops/matmul/gemm/w8a8_f32/sm90/fp8_cutlass.cu
ops/matmul/gemm/w8a8_f32/sm100/fp8_blockwise.cu
ops/matmul/gemm/w8a16_bf16/sm120/fp8_blockwise.cu
ops/matmul/gemm/w8a16_bf16/sm120/w8a16_gemm.cu
ops/matmul/gemm/w8a16_bf16/sm120/w8a16_cutlass.cu
ops/matmul/gemm/w8a16_bf16/sm120/grouped_gemm.cu
ops/matmul/gemm/w8a8_bf16/sm120/fp8_cutlass.cu
ops/matmul/gemm/w8a8_bf16/sm120/fp8_cutlass_v2.cu
ops/matmul/gemm/w8a8_bf16/sm120/fp8_cutlass_v3.cu
ops/matmul/gemm/int8_int8/sm120/int8_native.cu
ops/matmul/gemm/int4_int4/sm120/int4_via_int8.cu
ops/matmul/gemm/w4a16_bf16/sm120/nvf4_cutlass.cu
ops/matmul/gemm/w4a16_bf16/sm120/nvf4_nvf4_cutlass.cu
# GEMV kernels (Issue #122: Reorganized with w{weight}a{act}_{out} naming)
ops/matmul/gemv/w4a16_bf16/sm120/nvf4.cu
ops/matmul/gemv/w4a16_bf16/sm120/nvf4_kernels.cu
ops/matmul/gemv/w8a16_bf16/sm120/fp8_opt_kernels.cu
ops/matmul/gemv/bf16_bf16/sm120/bf16_opt.cu
ops/matmul/gemv/w8a8_bf16/sm120/fp8_gemv.cu
ops/matmul/gemv/w8a8_bf16/sm120/fp8_accurate.cu
ops/matmul/gemv/w4a4_bf16/sm120/nvf4_gemv.cu
ops/matmul/gemv/int4_int4/sm120/int4_gemv.cu
ops/nn/nn.cu
ops/quantize/quantize.cu
ops/attention/paged_attention.cu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <cuda.h>
#include <cuda_runtime.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#include <cuda.h>
#include <cuda_runtime.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "../../../../../../core/cuda_graph.hpp"
#include "../../../../../core/cuda_graph.hpp"

namespace pygpukit {
namespace ops {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#include "cutlass/util/device_memory.h"

#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
// Provides alignment-safe LDSM operations for Issue #2902 workaround
// ============================================================================
#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#include "cutlass/util/device_memory.h"

#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

// Alignment patch for Issue #2902 workaround
#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "cutlass/util/device_memory.h"

#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include "cutlass/util/device_memory.h"

#define PYGPUKIT_PATCH_CUTLASS_LDSM_POST 1
#include "../../../../common/aligned_copy_sm120.cuh"
#include "../../../common/aligned_copy_sm120.cuh"

using namespace cute;

Expand Down
18 changes: 9 additions & 9 deletions native/ops/matmul/matmul.cu
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
/**
* Matrix multiplication dispatch
*/
#include "gemm/f32/f32/generic/f32_naive.cuh"
#include "gemm/f32_f32/generic/f32_naive.cuh"
#include "../common/error.cuh"
#include "../common/device.cuh"
#include "../../core/memory.hpp"
#include "../../core/cuda_graph.hpp"
#include "../ops.cuh" // For transpose()

// Include existing optimized kernels
#include "gemm/f32/f32/generic/f32_ampere.cuh"
#include "gemm/f32/f32/generic/tf32_wmma.cuh"
#include "gemm/f32/f32/generic/tf32_mma.cuh"
#include "gemm/bf16/bf16/generic/bf16_naive.cuh"
#include "gemm/bf16/bf16/generic/bf16_wmma.cuh"
#include "gemm/bf16/bf16/generic/bf16_wmma_generic.cuh"
// Include existing optimized kernels (Issue #122: Updated paths)
#include "gemm/f32_f32/generic/f32_ampere.cuh"
#include "gemm/f32_f32/generic/tf32_wmma.cuh"
#include "gemm/f32_f32/generic/tf32_mma.cuh"
#include "gemm/bf16_bf16/generic/bf16_naive.cuh"
#include "gemm/bf16_bf16/generic/bf16_wmma.cuh"
#include "gemm/bf16_bf16/generic/bf16_wmma_generic.cuh"
#include "cublaslt.cuh"
#include "gemm/bf16/bf16/sm80/bf16_cutlass.cuh"
#include "gemm/bf16_bf16/sm80/bf16_cutlass.cuh"

#include <cstdlib>
#include <algorithm>
Expand Down
2 changes: 1 addition & 1 deletion native/ops/matmul/matmul_cutlass.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

#if PYGPUKIT_HAS_CUTLASS

#include "gemm/bf16/bf16/sm80/bf16_cutlass.cuh"
#include "gemm/bf16_bf16/sm80/bf16_cutlass.cuh"

namespace pygpukit {
namespace ops {
Expand Down