Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions .github/workflows/gpu_ci_trigger.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
# SETUP INSTRUCTIONS:
# ------------------
# This workflow synchronizes the code to GitLab via SSH to trigger GPU-enabled CI.
#
# 1. GENERATE SSH KEY PAIR (on your local machine):
# ssh-keygen -t ed25519 -f ~/.ssh/gitlab_sync_key -N "" -C "github-to-gitlab-sync"
#
# 2. CONFIGURE GITLAB (The Target):
# - Go to GitLab project > Settings > Repository > Deploy keys.
# - Add the content of '~/.ssh/gitlab_sync_key.pub'.
# - IMPORTANT: Check "Allow write access to this repository".
#
# 3. CONFIGURE GITHUB (The Source):
# - Go to GitHub repo > Settings > Secrets and variables > Actions.
# - Add new Repository Secrets:
# - Name: GITLAB_SSH_PRIVATE_KEY
# Value: Paste the entire content of '~/.ssh/gitlab_sync_key'.
# - Name: GITLAB_TOKEN
# Value: Your GitLab Personal Access Token (with 'api' and 'read_repository' scopes).
#

name: Sync to GitLab and Run GPU CI

on:
Expand All @@ -22,17 +43,21 @@ jobs:
ssh-private-key: ${{ secrets.GITLAB_SSH_PRIVATE_KEY }}

- name: Push to GitLab via SSH & Provide Link
id: push
run: |
# 1. Setup SSH known hosts
mkdir -p ~/.ssh
ssh-keyscan gitlab.mpcdf.mpg.de >> ~/.ssh/known_hosts

# 2. Determine target branch
if [ "${{ github.event_name }}" == "pull_request" ]; then
TARGET_BRANCH="pr-${{ github.event.number }}"
TARGET_BRANCH="gpu-test-pr-${{ github.event.number }}"
else
TARGET_BRANCH="${{ github.ref_name }}"
SOURCE_REF="${{ github.ref_name }}"
SAFE_REF="${SOURCE_REF//\//-}"
TARGET_BRANCH="gpu-test-${SAFE_REF}"
fi
echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV

# 3. Add GitLab SSH remote
git remote add gitlab git@gitlab.mpcdf.mpg.de:maxlin/cunumpy.git
Expand All @@ -41,9 +66,16 @@ jobs:
git push -f gitlab HEAD:refs/heads/$TARGET_BRANCH

# 5. Provide the direct link
# We construct the URL manually since the push triggers the pipeline automatically
PIPELINE_URL="https://gitlab.mpcdf.mpg.de/maxlin/cunumpy/-/pipelines?ref=$TARGET_BRANCH"

echo "::notice::GitLab GPU CI Pipeline started automatically via Push!"
echo "::notice::View Pipeline: $PIPELINE_URL"

- name: Wait for GitLab Pipeline
uses: docker://gitlab/glab:latest
env:
GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }}
GITLAB_HOST: gitlab.mpcdf.mpg.de
with:
entrypoint: glab
args: ci status --live --branch ${{ env.TARGET_BRANCH }} --repo maxlin/cunumpy
8 changes: 7 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ gpu_tests:
before_script:
- module load python-waterboa/2025.06
- module load nvhpcsdk/26
- module load fftw-serial/3.3.10
script:
- echo "--- CUDA Sanity Check ---"
- nvidia-smi
Expand All @@ -31,10 +32,15 @@ gpu_tests:
# The MPCDF image likely has a specific python environment.
# We install our dependencies into the user directory or a virtualenv.
- python3 -m pip install --user cupy-cuda12x
- python3 -m pip install --user nvidia-cublas-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12
- python3 -m pip install --user -e .

# Add the user bin to PATH for pytest
- export PATH="$HOME/.local/bin:$PATH"

# Try to find libcublas and other libraries in the HPC environment
- export LD_LIBRARY_PATH=$(find /mpcdf/soft /opt/nvidia -name libcublas.so.12 -exec dirname {} \; 2>/dev/null | head -n 1):$LD_LIBRARY_PATH

- export ARRAY_BACKEND=cupy

- python3 -m pytest tests/unit/
- pytest -xvs .
2 changes: 2 additions & 0 deletions src/cunumpy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# cunumpy/__init__.py
from . import xp
from .xp import (
cupy_available,
get_backend,
is_cpu,
is_gpu,
Expand All @@ -14,6 +15,7 @@

__all__ = [
"xp",
"cupy_available",
"to_numpy",
"to_cupy",
"to_cunumpy",
Expand Down
1 change: 1 addition & 0 deletions src/cunumpy/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ from . import xp
def to_numpy(array: Any) -> np.ndarray: ...
def to_cupy(array: Any) -> Any: ...
def to_cunumpy(array: Any) -> Any: ...
def cupy_available() -> bool: ...
def get_backend(array: Any) -> str: ...
def is_gpu(array: Any) -> bool: ...
def is_cpu(array: Any) -> bool: ...
Expand Down
38 changes: 29 additions & 9 deletions src/cunumpy/xp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,26 @@
BackendType = Literal["numpy", "cupy"]


# Memoized outcome of the CuPy probe; ``None`` means "not probed yet".
_CUPY_AVAILABLE_CACHE = None


def cupy_available() -> bool:
    """Return whether CuPy can be imported and reports itself as available.

    The probe runs at most once per process: its outcome is stored in the
    module-level ``_CUPY_AVAILABLE_CACHE`` and returned directly on every
    later call.

    Returns:
        ``True`` when ``import cupy`` succeeds and ``cupy.is_available()`` is
        truthy; ``False`` when either step raises or the check is falsy.
    """
    global _CUPY_AVAILABLE_CACHE
    if _CUPY_AVAILABLE_CACHE is None:
        try:
            import cupy as cp

            _CUPY_AVAILABLE_CACHE = bool(cp.is_available())
        except Exception:
            # Deliberately broad: ImportError is already an Exception
            # subclass, and any other failure while importing or probing
            # CuPy is treated the same way, as "not usable".
            _CUPY_AVAILABLE_CACHE = False
    return _CUPY_AVAILABLE_CACHE


class ArrayBackend:
def __init__(
self,
Expand All @@ -27,13 +47,13 @@ def __init__(

def _load_backend(self, backend: BackendType, verbose: bool = False) -> ModuleType:
if backend == "cupy":
try:
if cupy_available():
import cupy as cp

return cp
except ImportError:
else:
if verbose:
print("CuPy not available.")
print("CuPy not available or not functional.")
return np
import numpy as np_mod

Expand Down Expand Up @@ -123,17 +143,17 @@ def to_numpy(array: Any) -> np.ndarray:

def to_cupy(array: Any) -> Any:
"""Convert an array to a CuPy array."""
try:
import cupy as cp
if not cupy_available():
raise ImportError("CuPy is not available or not functional.")

import cupy as cp

return cp.asarray(array)
except ImportError:
raise ImportError("CuPy is not available.")
return cp.asarray(array)


def to_cunumpy(array: Any) -> Any:
"""Convert an array to the currently active backend."""
if array_backend.backend == "cupy":
if array_backend.backend == "cupy" and cupy_available():
return to_cupy(array)
return to_numpy(array)

Expand Down
80 changes: 80 additions & 0 deletions tests/unit/test_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import time

import numpy as np
import pytest

import cunumpy as xp


@pytest.mark.skipif(
    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
)
def test_benchmark_matmul():
    """Time a float32 matrix product on both backends and compare them.

    Two ``size x size`` random matrices are multiplied once under the
    "numpy" backend and once under the "cupy" backend. The CuPy run is
    preceded by an untimed warm-up product and is followed by
    ``xp.synchronize()`` before the clock is stopped. The test fails when
    the CuPy timing is not strictly smaller than the NumPy timing.
    """
    size = 2000

    # --- Benchmark NumPy ---
    with xp.use_backend("numpy"):
        a_np = xp.random.rand(size, size).astype(xp.float32)
        b_np = xp.random.rand(size, size).astype(xp.float32)

        start_np = time.perf_counter()
        # Only the elapsed time matters; the product itself is discarded.
        _ = a_np @ b_np
        # No sync needed for NumPy as it is synchronous
        end_np = time.perf_counter()
        t_np = end_np - start_np

    # --- Benchmark CuPy ---
    with xp.use_backend("cupy"):
        a_cp = xp.random.rand(size, size).astype(xp.float32)
        b_cp = xp.random.rand(size, size).astype(xp.float32)

        # Warm up so one-time start-up cost is not part of the timed run
        _ = a_cp @ b_cp
        xp.synchronize()

        start_cp = time.perf_counter()
        _ = a_cp @ b_cp
        xp.synchronize()  # CRITICAL for benchmarking GPU
        end_cp = time.perf_counter()
        t_cp = end_cp - start_cp

    print(f"\n[Benchmark] Size: {size}x{size}")
    print(f"NumPy time: {t_np:.4f}s")
    print(f"CuPy time: {t_cp:.4f}s")
    print(f"Speedup: {t_np/t_cp:.2f}x")

    # Only require CuPy to beat NumPy outright. No minimum speedup factor is
    # enforced, so the check does not depend on a particular GPU model.
    assert t_cp < t_np, f"CuPy ({t_cp:.4f}s) was not faster than NumPy ({t_np:.4f}s)"


@pytest.mark.skipif(
    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
)
def test_benchmark_fft():
    """Time a 1-D complex64 FFT on both backends and compare them.

    A random vector of ``2**22`` elements is transformed once under the
    "numpy" backend and once under the "cupy" backend. The CuPy run is
    preceded by an untimed warm-up transform and is followed by
    ``xp.synchronize()`` before the clock is stopped. The test fails when
    the CuPy timing is not strictly smaller than the NumPy timing.
    """
    size = 2**22  # ~4 million elements

    with xp.use_backend("numpy"):
        data_np = xp.random.rand(size).astype(xp.complex64)
        start = time.perf_counter()
        _ = xp.fft.fft(data_np)
        t_np = time.perf_counter() - start

    with xp.use_backend("cupy"):
        data_cp = xp.random.rand(size).astype(xp.complex64)
        # Warm up so one-time start-up cost is not part of the timed run
        _ = xp.fft.fft(data_cp)
        xp.synchronize()

        start = time.perf_counter()
        _ = xp.fft.fft(data_cp)
        # Synchronize before reading the clock so the timing covers the FFT
        xp.synchronize()
        t_cp = time.perf_counter() - start

    print(f"\n[Benchmark] FFT Size: {size}")
    print(f"NumPy time: {t_np:.4f}s")
    print(f"CuPy time: {t_cp:.4f}s")
    print(f"Speedup: {t_np/t_cp:.2f}x")
    # Same failure-message format as the matmul benchmark, so a failing run
    # reports both timings instead of a bare AssertionError.
    assert t_cp < t_np, f"CuPy ({t_cp:.4f}s) was not faster than NumPy ({t_np:.4f}s)"
24 changes: 10 additions & 14 deletions tests/unit/test_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@


def test_to_cupy_available():
try:
import cupy as cp
except ImportError:
pytest.skip("CuPy not installed")
if not xp.cupy_available():
pytest.skip("CuPy not installed or not functional")

import cupy as cp

with xp.use_backend("cupy"):
arr = np.array([1, 2, 3])
Expand All @@ -17,12 +17,8 @@ def test_to_cupy_available():


def test_to_cupy_not_available():
try:
import cupy

pytest.skip("CuPy is installed, cannot test missing cupy error")
except ImportError:
pass
if xp.cupy_available():
pytest.skip("CuPy is installed and functional, cannot test missing cupy error")

with xp.use_backend("cupy"):
arr = np.array([1, 2, 3])
Expand All @@ -42,10 +38,10 @@ def test_synchronize():


def test_xp_array_cupy():
try:
import cupy as cp
except ImportError:
pytest.skip("CuPy not installed")
if not xp.cupy_available():
pytest.skip("CuPy not installed or not functional")

import cupy as cp

with xp.use_backend("cupy"):
arr = xp.array([1, 2])
Expand Down
Loading
Loading