max-models · max-models · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/gpu_ci_trigger.yml b/.github/workflows/gpu_ci_trigger.yml
@@ -1,3 +1,24 @@
+# SETUP INSTRUCTIONS:
+# ------------------
+# This workflow pushes the code to GitLab via SSH to trigger GPU-enabled CI, then waits for the GitLab pipeline result.
+# 
+# 1. GENERATE SSH KEY PAIR (on your local machine):
+#    ssh-keygen -t ed25519 -f ~/.ssh/gitlab_sync_key -N "" -C "github-to-gitlab-sync"
+#
+# 2. CONFIGURE GITLAB (The Target):
+#    - Go to GitLab project > Settings > Repository > Deploy keys.
+#    - Add the content of '~/.ssh/gitlab_sync_key.pub'.
+#    - IMPORTANT: Check "Grant write permissions to this key" (the workflow force-pushes branches to GitLab).
+#
+# 3. CONFIGURE GITHUB (The Source):
+#    - Go to GitHub repo > Settings > Secrets and variables > Actions.
+#    - Add new Repository Secrets:
+#      - Name: GITLAB_SSH_PRIVATE_KEY
+#        Value: Paste the entire content of '~/.ssh/gitlab_sync_key'.
+#      - Name: GITLAB_TOKEN
+#        Value: Your GitLab Personal Access Token (with 'api' and 'read_repository' scopes).
+#
+
 name: Sync to GitLab and Run GPU CI
 
 on:
@@ -22,17 +43,21 @@ jobs:
           ssh-private-key: ${{ secrets.GITLAB_SSH_PRIVATE_KEY }}
 
       - name: Push to GitLab via SSH & Provide Link
+        id: push
         run: |
           # 1. Setup SSH known hosts
           mkdir -p ~/.ssh
           ssh-keyscan gitlab.mpcdf.mpg.de >> ~/.ssh/known_hosts
 
           # 2. Determine target branch
           if [ "${{ github.event_name }}" == "pull_request" ]; then
-            TARGET_BRANCH="pr-${{ github.event.number }}"
+            TARGET_BRANCH="gpu-test-pr-${{ github.event.number }}"
           else
-            TARGET_BRANCH="${{ github.ref_name }}"
+            SOURCE_REF="${{ github.ref_name }}"
+            SAFE_REF="${SOURCE_REF//\//-}"
+            TARGET_BRANCH="gpu-test-${SAFE_REF}"
           fi
+          echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV
 
           # 3. Add GitLab SSH remote
           git remote add gitlab git@gitlab.mpcdf.mpg.de:maxlin/cunumpy.git
@@ -41,9 +66,16 @@ jobs:
           git push -f gitlab HEAD:refs/heads/$TARGET_BRANCH
 
           # 5. Provide the direct link
-          # We construct the URL manually since the push triggers the pipeline automatically
           PIPELINE_URL="https://gitlab.mpcdf.mpg.de/maxlin/cunumpy/-/pipelines?ref=$TARGET_BRANCH"
 
           echo "::notice::GitLab GPU CI Pipeline started automatically via Push!"
           echo "::notice::View Pipeline: $PIPELINE_URL"
 
+      - name: Wait for GitLab Pipeline
+        uses: docker://gitlab/glab:latest
+        env:
+          GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }}
+          GITLAB_HOST: gitlab.mpcdf.mpg.de
+        with:
+          entrypoint: glab
+          args: ci status --live --branch ${{ env.TARGET_BRANCH }} --repo maxlin/cunumpy
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -15,6 +15,7 @@ gpu_tests:
   before_script:
     - module load python-waterboa/2025.06
     - module load nvhpcsdk/26
+    - module load fftw-serial/3.3.10
   script:
     - echo "--- CUDA Sanity Check ---"
     - nvidia-smi
@@ -31,10 +32,15 @@ gpu_tests:
     # The MPCDF image likely has a specific python environment. 
     # We install our dependencies into the user directory or a virtualenv.
     - python3 -m pip install --user cupy-cuda12x
+    - python3 -m pip install --user nvidia-cublas-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12
     - python3 -m pip install --user -e .
 
     # Add the user bin to PATH for pytest
     - export PATH="$HOME/.local/bin:$PATH"
+
+    # Locate the directory containing libcublas.so.12 in the HPC environment and put it first on LD_LIBRARY_PATH
+    - export LD_LIBRARY_PATH=$(find /mpcdf/soft /opt/nvidia -name libcublas.so.12 -exec dirname {} \; 2>/dev/null | head -n 1):$LD_LIBRARY_PATH
+
     - export ARRAY_BACKEND=cupy
 
-    - python3 -m pytest tests/unit/
+    - pytest -xvs .
diff --git a/src/cunumpy/__init__.py b/src/cunumpy/__init__.py
@@ -1,6 +1,7 @@
 # cunumpy/__init__.py
 from . import xp
 from .xp import (
+    cupy_available,
     get_backend,
     is_cpu,
     is_gpu,
@@ -14,6 +15,7 @@
 
 __all__ = [
     "xp",
+    "cupy_available",
     "to_numpy",
     "to_cupy",
     "to_cunumpy",

diff --git a/src/cunumpy/__init__.pyi b/src/cunumpy/__init__.pyi
@@ -12,6 +12,7 @@ from . import xp
 def to_numpy(array: Any) -> np.ndarray: ...
 def to_cupy(array: Any) -> Any: ...
 def to_cunumpy(array: Any) -> Any: ...
+def cupy_available() -> bool: ...
 def get_backend(array: Any) -> str: ...
 def is_gpu(array: Any) -> bool: ...
 def is_cpu(array: Any) -> bool: ...

diff --git a/src/cunumpy/xp.py b/src/cunumpy/xp.py
@@ -8,6 +8,26 @@
 BackendType = Literal["numpy", "cupy"]
 
 
+_CUPY_AVAILABLE_CACHE = None
+
+
+def cupy_available() -> bool:
+    """Check if CuPy is available and functional."""
+    global _CUPY_AVAILABLE_CACHE
+    if _CUPY_AVAILABLE_CACHE is not None:
+        return _CUPY_AVAILABLE_CACHE
+
+    try:
+        import cupy as cp
+
+        # Check if a GPU is available
+        _CUPY_AVAILABLE_CACHE = cp.is_available()
+        return _CUPY_AVAILABLE_CACHE
+    except (ImportError, Exception):
+        _CUPY_AVAILABLE_CACHE = False
+        return False
+
+
 class ArrayBackend:
     def __init__(
         self,
@@ -27,13 +47,13 @@ def __init__(
 
     def _load_backend(self, backend: BackendType, verbose: bool = False) -> ModuleType:
         if backend == "cupy":
-            try:
+            if cupy_available():
                 import cupy as cp
 
                 return cp
-            except ImportError:
+            else:
                 if verbose:
-                    print("CuPy not available.")
+                    print("CuPy not available or not functional.")
                 return np
         import numpy as np_mod
 
@@ -123,17 +143,17 @@ def to_numpy(array: Any) -> np.ndarray:
 
 def to_cupy(array: Any) -> Any:
     """Convert an array to a CuPy array."""
-    try:
-        import cupy as cp
+    if not cupy_available():
+        raise ImportError("CuPy is not available or not functional.")
+
+    import cupy as cp
 
-        return cp.asarray(array)
-    except ImportError:
-        raise ImportError("CuPy is not available.")
+    return cp.asarray(array)
 
 
 def to_cunumpy(array: Any) -> Any:
     """Convert an array to the currently active backend."""
-    if array_backend.backend == "cupy":
+    if array_backend.backend == "cupy" and cupy_available():
         return to_cupy(array)
     return to_numpy(array)
 

diff --git a/tests/unit/test_benchmarks.py b/tests/unit/test_benchmarks.py
@@ -0,0 +1,80 @@
+import time
+
+import numpy as np
+import pytest
+
+import cunumpy as xp
+
+
+@pytest.mark.skipif(
+    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
+)
+def test_benchmark_matmul():
+    """Benchmark matrix multiplication to show CuPy performance gain."""
+    size = 2000
+
+    # --- Benchmark NumPy ---
+    with xp.use_backend("numpy"):
+        a_np = xp.random.rand(size, size).astype(xp.float32)
+        b_np = xp.random.rand(size, size).astype(xp.float32)
+
+        start_np = time.perf_counter()
+        c_np = a_np @ b_np
+        # No sync needed for NumPy as it is synchronous
+        end_np = time.perf_counter()
+        t_np = end_np - start_np
+
+    # --- Benchmark CuPy ---
+    with xp.use_backend("cupy"):
+        a_cp = xp.random.rand(size, size).astype(xp.float32)
+        b_cp = xp.random.rand(size, size).astype(xp.float32)
+
+        # Warm up
+        _ = a_cp @ b_cp
+        xp.synchronize()
+
+        start_cp = time.perf_counter()
+        c_cp = a_cp @ b_cp
+        xp.synchronize()  # CRITICAL for benchmarking GPU
+        end_cp = time.perf_counter()
+        t_cp = end_cp - start_cp
+
+    print(f"\n[Benchmark] Size: {size}x{size}")
+    print(f"NumPy time: {t_np:.4f}s")
+    print(f"CuPy time:  {t_cp:.4f}s")
+    print(f"Speedup:    {t_np/t_cp:.2f}x")
+
+    # On a real GPU (A100/A30), CuPy should be significantly faster
+    # We only require CuPy to beat NumPy (no fixed speedup factor) so the test can pass on various hardware
+    assert t_cp < t_np, f"CuPy ({t_cp:.4f}s) was not faster than NumPy ({t_np:.4f}s)"
+
+
+@pytest.mark.skipif(
+    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
+)
+def test_benchmark_fft():
+    """Benchmark FFT performance."""
+    size = 2**22  # ~4 million elements
+
+    with xp.use_backend("numpy"):
+        data_np = xp.random.rand(size).astype(xp.complex64)
+        start = time.perf_counter()
+        _ = xp.fft.fft(data_np)
+        t_np = time.perf_counter() - start
+
+    with xp.use_backend("cupy"):
+        data_cp = xp.random.rand(size).astype(xp.complex64)
+        # Warm up
+        _ = xp.fft.fft(data_cp)
+        xp.synchronize()
+
+        start = time.perf_counter()
+        _ = xp.fft.fft(data_cp)
+        xp.synchronize()
+        t_cp = time.perf_counter() - start
+
+    print(f"\n[Benchmark] FFT Size: {size}")
+    print(f"NumPy time: {t_np:.4f}s")
+    print(f"CuPy time:  {t_cp:.4f}s")
+    print(f"Speedup:    {t_np/t_cp:.2f}x")
+    assert t_cp < t_np
diff --git a/tests/unit/test_cupy.py b/tests/unit/test_cupy.py
@@ -5,10 +5,10 @@
 
 
 def test_to_cupy_available():
-    try:
-        import cupy as cp
-    except ImportError:
-        pytest.skip("CuPy not installed")
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    import cupy as cp
 
     with xp.use_backend("cupy"):
         arr = np.array([1, 2, 3])
@@ -17,12 +17,8 @@ def test_to_cupy_available():
 
 
 def test_to_cupy_not_available():
-    try:
-        import cupy
-
-        pytest.skip("CuPy is installed, cannot test missing cupy error")
-    except ImportError:
-        pass
+    if xp.cupy_available():
+        pytest.skip("CuPy is installed and functional, cannot test missing cupy error")
 
     with xp.use_backend("cupy"):
         arr = np.array([1, 2, 3])
@@ -42,10 +38,10 @@ def test_synchronize():
 
 
 def test_xp_array_cupy():
-    try:
-        import cupy as cp
-    except ImportError:
-        pytest.skip("CuPy not installed")
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    import cupy as cp
 
     with xp.use_backend("cupy"):
         arr = xp.array([1, 2])