Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,28 @@ jobs:

- name: Install CUDA Toolkit
uses: Jimver/cuda-toolkit@v0.2.19
id: cuda-toolkit
with:
cuda: "12.6.0"
method: network
use-github-cache: true
use-local-cache: false

- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: cuda-build-${{ runner.os }}
max-size: 500M

- name: Configure CMake
run: |
cd native
mkdir -p build && cd build
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES="80;86;89;90" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-Dpybind11_DIR=$(python -c "import pybind11; print(pybind11.get_cmake_dir())")

- name: Build native module
Expand Down
48 changes: 27 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,22 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea

---

## v0.2 Features (NEW)
## v0.2.2 Features (NEW)

### Ampere-Optimized SGEMM
| Feature | Description |
|---------|-------------|
| **cp.async Pipeline** | 4-stage software pipeline with async memory transfers |
| **Vectorized Loads** | float4 (16-byte) loads for A and B matrices |
| **Shared Memory Tiling** | BM=128, BN=128, BK=16 with 8x8 thread tiles |
| **SM 80+ Required** | Ampere architecture (RTX 30XX+) required |

### Performance (RTX 3090 Ti)
| Matrix Size | TFLOPS | Efficiency | vs NumPy |
|-------------|--------|------------|----------|
| 2048x2048 | 7.6 | 19% | 10x |
| 4096x4096 | 13.2 | 33% | 16x |
| 8192x8192 | **18.2** | 46% | **22x** |

### Core Infrastructure (Rust)
| Feature | Description |
Expand All @@ -40,15 +55,6 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea
| **Pinned Memory** | Page-locked host memory with pooling |
| **Kernel Cache** | PTX caching, LRU eviction, TTL |
| **GPU Partitioning** | Resource isolation, multi-tenant support |
| **Tiled Matmul** | Shared memory + double buffering |

### Performance (RTX 3090 Ti)
| Matrix Size | Performance | vs NumPy |
|-------------|-------------|----------|
| 512x512 | 1262 GFLOPS | 11.6x |
| 1024x1024 | 1350 GFLOPS | 2.2x |
| 2048x2048 | 4417 GFLOPS | 6.1x |
| 4096x4096 | **6555 GFLOPS** | 7.9x |

---

Expand Down Expand Up @@ -320,17 +326,17 @@ PyGPUkit/
- [x] Tiled Matmul (shared memory)
- [x] 106 Rust tests

### **v0.2.1 — Stabilization Phase**
- [ ] Admission / QoS spec finalization
- [ ] Python API inconsistency fixes
- [ ] Rust error propagation unification
- [ ] 24h stress test script

### **v0.2.2 — Performance Phase**
- [ ] 64x64 tile kernel refinement
- [ ] TensorCore (Ampere+) availability check
- [ ] Pinned Memory fragmentation test
- [ ] Async Engine 3-stream support
### **v0.2.1 — Stabilization Phase (Released)**
- [x] Admission / QoS spec finalization
- [x] Python API inconsistency fixes
- [x] Rust error propagation unification

### **v0.2.2 — Performance Phase (Released)**
- [x] Ampere-optimized SGEMM with cp.async pipeline
- [x] 4-stage software pipelining for latency hiding
- [x] float4 vectorized memory loads
- [x] 18.2 TFLOPS on RTX 3090 Ti (46% efficiency)
- [x] SM 80+ (Ampere) architecture requirement

### **v0.2.3 — Reliability Phase**
- [ ] Kernel cache LRU completion
Expand Down
99 changes: 99 additions & 0 deletions benchmark_ampere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Benchmark Ampere-optimized GEMM kernel."""
import os
import time

import numpy as np

# Setup CUDA DLL path (if CUDA is installed)
cuda_path = os.environ.get(
"CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
)
cuda_bin = os.path.join(cuda_path, "bin")
if os.path.isdir(cuda_bin):
if cuda_bin not in os.environ.get("PATH", ""):
os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "")
if hasattr(os, "add_dll_directory"):
os.add_dll_directory(cuda_bin)

# Import native module
try:
import _pygpukit_native as native
except ImportError:
from pygpukit import _pygpukit_native as native

props = native.get_device_properties(0)
print(f"GPU: {props.name}")
print()


def verify_correctness(m, n, k):
"""Verify kernel correctness."""
A = np.random.randn(m, k).astype(np.float32)
B = np.random.randn(k, n).astype(np.float32)

A_gpu = native.from_numpy(A)
B_gpu = native.from_numpy(B)
C_gpu = native.matmul(A_gpu, B_gpu)
C_result = C_gpu.to_numpy()

C_expected = A @ B
rel_error = np.max(np.abs(C_result - C_expected)) / np.max(np.abs(C_expected))
return rel_error


def benchmark_matmul(m, n, k, warmup=3, iterations=10):
"""Benchmark matmul and return median time and TFLOPS."""
A_np = np.random.randn(m, k).astype(np.float32)
B_np = np.random.randn(k, n).astype(np.float32)

# Pre-allocate GPU arrays
A_gpu = native.from_numpy(A_np)
B_gpu = native.from_numpy(B_np)

# Warmup
for _ in range(warmup):
_ = native.matmul(A_gpu, B_gpu)

# Benchmark (reuse same input arrays)
times = []
for _ in range(iterations):
start = time.perf_counter()
_ = native.matmul(A_gpu, B_gpu)
elapsed = time.perf_counter() - start
times.append(elapsed)

median_time = np.median(times)
min_time = np.min(times)
flops = 2 * m * n * k
tflops_median = flops / median_time / 1e12
tflops_max = flops / min_time / 1e12
return median_time, tflops_median, tflops_max


# First verify correctness
print("=== Correctness Verification ===")
for size in [256, 512, 1024, 2048, 4096]:
error = verify_correctness(size, size, size)
status = "PASS" if error < 1e-4 else "FAIL"
print(f"{size}x{size}: relative error = {error:.2e} [{status}]")

print()

# Benchmark different sizes
sizes = [
(2048, 2048, 2048),
(4096, 4096, 4096),
(8192, 8192, 8192),
]

print("=== Ampere-Optimized GEMM Benchmark ===")
print()
for m, n, k in sizes:
iters = 5 if m >= 8192 else 10
time_ms, tflops_med, tflops_max = benchmark_matmul(m, n, k, warmup=5, iterations=iters)
status = "PASS" if tflops_med >= 22.0 else "FAIL"
print(f"{m}x{n}x{k}: {tflops_med:.1f} TFLOPS (max: {tflops_max:.1f}) - {time_ms*1000:.2f} ms [{status}]")

print()
print("Target: 22-32 TFLOPS (62-90% efficiency on RTX 3090 Ti)")
print("Minimum: 22 TFLOPS to beat PyTorch baseline")
40 changes: 40 additions & 0 deletions benchmark_pytorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Benchmark PyTorch cuBLAS for comparison with PyGPUkit."""
import time
import numpy as np

try:
import torch
except ImportError:
print("PyTorch not installed")
exit(0)

# Check PyTorch CUDA
print("PyTorch CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
print("GPU:", torch.cuda.get_device_name(0))

# Benchmark
sizes = [2048, 4096, 8192]
for size in sizes:
A = torch.randn(size, size, device="cuda", dtype=torch.float32)
B = torch.randn(size, size, device="cuda", dtype=torch.float32)

# Warmup
for _ in range(3):
C = torch.matmul(A, B)
torch.cuda.synchronize()

# Benchmark
times = []
iterations = 10 if size < 8192 else 5
for _ in range(iterations):
torch.cuda.synchronize()
start = time.perf_counter()
C = torch.matmul(A, B)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start
times.append(elapsed)

median_time = np.median(times)
tflops = 2 * size**3 / median_time / 1e12
print(f"{size}x{size}: {tflops:.1f} TFLOPS ({median_time*1000:.2f} ms)")
Loading