diff --git a/.github/workflows/gpu_ci_trigger.yml b/.github/workflows/gpu_ci_trigger.yml
index 9ed33d7..4418074 100644
--- a/.github/workflows/gpu_ci_trigger.yml
+++ b/.github/workflows/gpu_ci_trigger.yml
@@ -1,3 +1,24 @@
+# SETUP INSTRUCTIONS:
+# ------------------
+# This workflow pushes the code to GitLab via SSH to trigger GPU-enabled CI, then watches the GitLab pipeline.
+#
+# 1. GENERATE SSH KEY PAIR (on your local machine):
+#    ssh-keygen -t ed25519 -f ~/.ssh/gitlab_sync_key -N "" -C "github-to-gitlab-sync"
+#
+# 2. CONFIGURE GITLAB (The Target):
+#    - Go to GitLab project > Settings > Repository > Deploy keys.
+#    - Add the content of '~/.ssh/gitlab_sync_key.pub'.
+#    - IMPORTANT: Check "Allow write access to this repository".
+#
+# 3. CONFIGURE GITHUB (The Source):
+#    - Go to GitHub repo > Settings > Secrets and variables > Actions.
+#    - Add new Repository Secrets:
+#      - Name: GITLAB_SSH_PRIVATE_KEY
+#        Value: Paste the entire content of '~/.ssh/gitlab_sync_key'.
+#      - Name: GITLAB_TOKEN
+#        Value: Your GitLab Personal Access Token (with 'api' and 'read_repository' scopes).
+#
+
 name: Sync to GitLab and Run GPU CI
 
 on:
@@ -22,6 +43,7 @@ jobs:
           ssh-private-key: ${{ secrets.GITLAB_SSH_PRIVATE_KEY }}
 
       - name: Push to GitLab via SSH & Provide Link
+        id: push
         run: |
           # 1. Setup SSH known hosts
           mkdir -p ~/.ssh
@@ -29,10 +51,13 @@ jobs:
 
           # 2. Determine target branch
           if [ "${{ github.event_name }}" == "pull_request" ]; then
-            TARGET_BRANCH="pr-${{ github.event.number }}"
+            TARGET_BRANCH="gpu-test-pr-${{ github.event.number }}"
           else
-            TARGET_BRANCH="${{ github.ref_name }}"
+            SOURCE_REF="$GITHUB_REF_NAME"  # runner-provided env var; the ref name is never pasted into the script text
+            SAFE_REF="${SOURCE_REF//\//-}"
+            TARGET_BRANCH="gpu-test-${SAFE_REF}"
           fi
+          echo "TARGET_BRANCH=$TARGET_BRANCH" >> "$GITHUB_ENV"
 
           # 3. Add GitLab SSH remote
           git remote add gitlab git@gitlab.mpcdf.mpg.de:maxlin/cunumpy.git
@@ -41,9 +66,16 @@ jobs:
           git push -f gitlab HEAD:refs/heads/$TARGET_BRANCH
 
           # 5. Provide the direct link
-          # We construct the URL manually since the push triggers the pipeline automatically
           PIPELINE_URL="https://gitlab.mpcdf.mpg.de/maxlin/cunumpy/-/pipelines?ref=$TARGET_BRANCH"
 
           echo "::notice::GitLab GPU CI Pipeline started automatically via Push!"
           echo "::notice::View Pipeline: $PIPELINE_URL"
 
+      - name: Wait for GitLab Pipeline
+        uses: docker://gitlab/glab:latest  # NOTE(review): floating tag; consider pinning a release or digest
+        env:
+          GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }}  # the PAT described in the setup notes at the top of this file
+          GITLAB_HOST: gitlab.mpcdf.mpg.de
+        with:
+          entrypoint: glab  # NOTE(review): confirm glab's exit code reflects a failed pipeline
+          args: ci status --live --branch ${{ env.TARGET_BRANCH }} --repo maxlin/cunumpy  # TARGET_BRANCH is exported to GITHUB_ENV by the push step
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b76c09b..79d4881 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,6 +15,7 @@ gpu_tests:
   before_script:
     - module load python-waterboa/2025.06
     - module load nvhpcsdk/26
+    - module load fftw-serial/3.3.10
   script:
     - echo "--- CUDA Sanity Check ---"
     - nvidia-smi
@@ -31,10 +32,15 @@ gpu_tests:
     # The MPCDF image likely has a specific python environment. 
     # We install our dependencies into the user directory or a virtualenv.
     - python3 -m pip install --user cupy-cuda12x
+    - python3 -m pip install --user nvidia-cublas-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12
     - python3 -m pip install --user -e .
     
     # Add the user bin to PATH for pytest
     - export PATH="$HOME/.local/bin:$PATH"
+    
+    # Prepend the directory holding libcublas.so.12 only when one is found: an empty LD_LIBRARY_PATH entry makes ld.so search the CWD
+    - CUBLAS_DIR=$(find /mpcdf/soft /opt/nvidia -name libcublas.so.12 -exec dirname {} \; 2>/dev/null | head -n 1 || true); if [ -n "$CUBLAS_DIR" ]; then export LD_LIBRARY_PATH="$CUBLAS_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"; fi
+    
     - export ARRAY_BACKEND=cupy
     
-    - python3 -m pytest tests/unit/
+    - pytest -xvs .
diff --git a/src/cunumpy/__init__.py b/src/cunumpy/__init__.py
index c6592bd..383f32a 100644
--- a/src/cunumpy/__init__.py
+++ b/src/cunumpy/__init__.py
@@ -1,6 +1,7 @@
 # cunumpy/__init__.py
 from . import xp
 from .xp import (
+    cupy_available,
     get_backend,
     is_cpu,
     is_gpu,
@@ -14,6 +15,7 @@
 
 __all__ = [
     "xp",
+    "cupy_available",
     "to_numpy",
     "to_cupy",
     "to_cunumpy",
diff --git a/src/cunumpy/__init__.pyi b/src/cunumpy/__init__.pyi
index 2a4fb33..8f51ee6 100644
--- a/src/cunumpy/__init__.pyi
+++ b/src/cunumpy/__init__.pyi
@@ -12,6 +12,7 @@ from . import xp
 def to_numpy(array: Any) -> np.ndarray: ...
 def to_cupy(array: Any) -> Any: ...
 def to_cunumpy(array: Any) -> Any: ...
+def cupy_available() -> bool: ...
 def get_backend(array: Any) -> str: ...
 def is_gpu(array: Any) -> bool: ...
 def is_cpu(array: Any) -> bool: ...
diff --git a/src/cunumpy/xp.py b/src/cunumpy/xp.py
index 36d3b9c..8e92e9a 100644
--- a/src/cunumpy/xp.py
+++ b/src/cunumpy/xp.py
@@ -8,6 +8,26 @@
 BackendType = Literal["numpy", "cupy"]
 
 
+_CUPY_AVAILABLE_CACHE = None
+
+
+def cupy_available() -> bool:
+    """Return True if CuPy imports and ``cp.is_available()`` is true; the result is cached."""
+    global _CUPY_AVAILABLE_CACHE
+    if _CUPY_AVAILABLE_CACHE is not None:
+        return _CUPY_AVAILABLE_CACHE
+
+    try:
+        import cupy as cp
+
+        # Check if a GPU is available
+        _CUPY_AVAILABLE_CACHE = cp.is_available()
+    except Exception:  # ImportError is an Exception subclass; covers import and probe failures alike
+        _CUPY_AVAILABLE_CACHE = False
+
+    return _CUPY_AVAILABLE_CACHE
+
+
 class ArrayBackend:
     def __init__(
         self,
@@ -27,13 +47,13 @@ def __init__(
 
     def _load_backend(self, backend: BackendType, verbose: bool = False) -> ModuleType:
         if backend == "cupy":
-            try:
+            if cupy_available():
                 import cupy as cp
 
                 return cp
-            except ImportError:
+            else:
                 if verbose:
-                    print("CuPy not available.")
+                    print("CuPy not available or not functional.")
                 return np
         import numpy as np_mod
 
@@ -123,17 +143,17 @@ def to_numpy(array: Any) -> np.ndarray:
 
 def to_cupy(array: Any) -> Any:
     """Convert an array to a CuPy array."""
-    try:
-        import cupy as cp
+    if not cupy_available():
+        raise ImportError("CuPy is not available or not functional.")
+
+    import cupy as cp
 
-        return cp.asarray(array)
-    except ImportError:
-        raise ImportError("CuPy is not available.")
+    return cp.asarray(array)
 
 
 def to_cunumpy(array: Any) -> Any:
     """Convert an array to the currently active backend."""
-    if array_backend.backend == "cupy":
+    if array_backend.backend == "cupy" and cupy_available():
         return to_cupy(array)
     return to_numpy(array)
 
diff --git a/tests/unit/test_benchmarks.py b/tests/unit/test_benchmarks.py
new file mode 100644
index 0000000..f9a9e35
--- /dev/null
+++ b/tests/unit/test_benchmarks.py
@@ -0,0 +1,80 @@
+import time
+
+import numpy as np
+import pytest
+
+import cunumpy as xp
+
+
+@pytest.mark.skipif(
+    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
+)
+def test_benchmark_matmul():
+    """Time one float32 matmul under NumPy and one (after warm-up) under CuPy; assert CuPy is faster."""
+    size = 2000
+
+    # --- Benchmark NumPy ---
+    with xp.use_backend("numpy"):
+        a_np = xp.random.rand(size, size).astype(xp.float32)
+        b_np = xp.random.rand(size, size).astype(xp.float32)
+
+        start_np = time.perf_counter()
+        c_np = a_np @ b_np
+        # No sync needed for NumPy as it is synchronous
+        end_np = time.perf_counter()
+        t_np = end_np - start_np
+
+    # --- Benchmark CuPy ---
+    with xp.use_backend("cupy"):
+        a_cp = xp.random.rand(size, size).astype(xp.float32)
+        b_cp = xp.random.rand(size, size).astype(xp.float32)
+
+        # Warm up: one untimed product so one-time startup cost is not measured
+        _ = a_cp @ b_cp
+        xp.synchronize()
+
+        start_cp = time.perf_counter()
+        c_cp = a_cp @ b_cp
+        xp.synchronize()  # CRITICAL for benchmarking GPU
+        end_cp = time.perf_counter()
+        t_cp = end_cp - start_cp
+
+    print(f"\n[Benchmark] Size: {size}x{size}")
+    print(f"NumPy time: {t_np:.4f}s")
+    print(f"CuPy time:  {t_cp:.4f}s")
+    print(f"Speedup:    {t_np/t_cp:.2f}x")
+
+    # On a real GPU (A100/A30), CuPy should be significantly faster
+    # The check only requires t_cp < t_np; no minimum speedup ratio is enforced (single wall-clock sample each)
+    assert t_cp < t_np, f"CuPy ({t_cp:.4f}s) was not faster than NumPy ({t_np:.4f}s)"
+
+
+@pytest.mark.skipif(
+    not xp.cupy_available(), reason="CuPy/GPU not available or not functional"
+)
+def test_benchmark_fft():
+    """Time a 1-D FFT under NumPy and under CuPy (after a warm-up) and assert CuPy is faster."""
+    size = 2**22  # ~4 million elements
+
+    with xp.use_backend("numpy"):
+        data_np = xp.random.rand(size).astype(xp.complex64)
+        start = time.perf_counter()
+        _ = xp.fft.fft(data_np)
+        t_np = time.perf_counter() - start  # single sample, no warm-up run
+
+    with xp.use_backend("cupy"):
+        data_cp = xp.random.rand(size).astype(xp.complex64)
+        # Warm up: one untimed transform followed by a synchronize
+        _ = xp.fft.fft(data_cp)
+        xp.synchronize()
+
+        start = time.perf_counter()
+        _ = xp.fft.fft(data_cp)
+        xp.synchronize()  # presumably blocks until queued GPU work finishes, so the timer covers it
+        t_cp = time.perf_counter() - start
+
+    print(f"\n[Benchmark] FFT Size: {size}")
+    print(f"NumPy time: {t_np:.4f}s")
+    print(f"CuPy time:  {t_cp:.4f}s")
+    print(f"Speedup:    {t_np/t_cp:.2f}x")
+    assert t_cp < t_np  # NOTE(review): wall-clock comparison from one sample each; may be noisy
diff --git a/tests/unit/test_cupy.py b/tests/unit/test_cupy.py
index 935a27a..a8e804d 100644
--- a/tests/unit/test_cupy.py
+++ b/tests/unit/test_cupy.py
@@ -5,10 +5,10 @@
 
 
 def test_to_cupy_available():
-    try:
-        import cupy as cp
-    except ImportError:
-        pytest.skip("CuPy not installed")
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    import cupy as cp
 
     with xp.use_backend("cupy"):
         arr = np.array([1, 2, 3])
@@ -17,12 +17,8 @@ def test_to_cupy_available():
 
 
 def test_to_cupy_not_available():
-    try:
-        import cupy
-
-        pytest.skip("CuPy is installed, cannot test missing cupy error")
-    except ImportError:
-        pass
+    if xp.cupy_available():
+        pytest.skip("CuPy is installed and functional, cannot test missing cupy error")
 
     with xp.use_backend("cupy"):
         arr = np.array([1, 2, 3])
@@ -42,10 +38,10 @@ def test_synchronize():
 
 
 def test_xp_array_cupy():
-    try:
-        import cupy as cp
-    except ImportError:
-        pytest.skip("CuPy not installed")
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    import cupy as cp
 
     with xp.use_backend("cupy"):
         arr = xp.array([1, 2])
diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py
new file mode 100644
index 0000000..1e0aced
--- /dev/null
+++ b/tests/unit/test_features.py
@@ -0,0 +1,156 @@
+import numpy as np
+import pytest
+
+import cunumpy as xp
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_matrix_multiplication(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        # 2x2 product through the @ operator on backend arrays
+        a = xp.array([[1, 2], [3, 4]], dtype=float)
+        b = xp.array([[5, 6], [7, 8]], dtype=float)
+        c = a @ b
+
+        expected = np.array([[19, 22], [43, 50]])
+        assert np.array_equal(xp.to_numpy(c), expected)  # both operands are host arrays, so compare with NumPy
+
+        # Default linalg.norm of the matrix must agree with NumPy's value for the same data
+        norm = xp.linalg.norm(a)
+        assert np.isclose(float(norm), np.linalg.norm([[1, 2], [3, 4]]))
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_reductions_and_axes(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        a = xp.array([[1, 10, 100], [2, 20, 200]], dtype=float)  # 2 rows x 3 columns
+
+        assert xp.sum(a) == 333  # full reduction over all six elements
+        assert np.array_equal(xp.to_numpy(xp.max(a, axis=0)), [2, 20, 200])  # column-wise maximum
+        assert np.array_equal(xp.to_numpy(xp.min(a, axis=1)), [1, 2])  # row-wise minimum
+        assert xp.mean(a) == 333 / 6  # 55.5, exactly representable in binary floating point
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_complex_elementwise(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        a = xp.array([-1, 0, 1], dtype=float)  # real-valued input; no complex dtype is involved
+
+        # Exponential, compared against NumPy (no log is exercised here)
+        exp_a = xp.exp(a)
+        assert np.allclose(xp.to_numpy(exp_a), np.exp([-1, 0, 1]))
+
+        # Trig: cos(0) = 1 and cos(pi/2) is only approximately 0, hence the absolute tolerance
+        b = xp.array([0, xp.pi / 2], dtype=float)
+        assert np.allclose(xp.to_numpy(xp.cos(b)), [1, 0], atol=1e-7)
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_broadcasting_logic(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        # 3D * 1D broadcasting: shape (2, 3, 4) times shape (4,) along the last axis
+        a = xp.ones((2, 3, 4))
+        b = xp.arange(4)
+        c = a * b
+
+        assert c.shape == (2, 3, 4)
+        assert np.array_equal(xp.to_numpy(c[0, 0]), [0, 1, 2, 3])  # every last-axis slice equals b
+        assert np.array_equal(xp.to_numpy(c[1, 2]), [0, 1, 2, 3])
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_fft_parity(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        # Create a signal with two frequencies (5 and 20 cycles over t in [0, 1], 128 samples)
+        t = xp.linspace(0, 1, 128)
+        sig = xp.sin(2 * xp.pi * 5 * t) + 0.5 * xp.sin(2 * xp.pi * 20 * t)
+
+        freqs = xp.fft.fft(sig)
+        inv = xp.fft.ifft(freqs)
+
+        # Round trip: the real part of ifft(fft(x)) must match x within allclose tolerances
+        assert np.allclose(xp.to_numpy(inv.real), xp.to_numpy(sig))
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_realistic_normalization_workflow(backend):
+    """Workflow: Create data -> Compute Stats -> Normalize -> Mask Outliers."""
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        # 1. Create dummy data with clear outliers
+        data = xp.array([1.0, 2.0, 3.0, 4.0, 100.0, -100.0])
+
+        # 2. Normalize
+        mean = xp.mean(data)
+        std = xp.std(data)
+        norm_data = (data - mean) / std
+
+        # 3. Keep only values whose normalized magnitude is below 1.0 (drops the outliers in this small set)
+        mask = xp.abs(norm_data) < 1.0
+        clean_data = data[mask]
+
+        # Verify: -100 and 100 should be gone
+        res = xp.to_numpy(xp.sort(clean_data))
+        assert np.array_equal(res, [1.0, 2.0, 3.0, 4.0])
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_stacking_and_concatenation(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        a = xp.array([1, 2, 3])
+        b = xp.array([4, 5, 6])
+
+        res_cat = xp.concatenate([a, b])  # joins end to end into one length-6 vector
+        assert np.array_equal(xp.to_numpy(res_cat), [1, 2, 3, 4, 5, 6])
+
+        res_stack = xp.stack([a, b])  # adds a new leading axis: one row per input
+        assert res_stack.shape == (2, 3)
+        assert np.array_equal(xp.to_numpy(res_stack[1]), [4, 5, 6])
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_advanced_indexing(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        a = xp.arange(10).reshape(2, 5)  # rows are [0..4] and [5..9]
+
+        # Pick specific elements: (0,1) and (1,3), i.e. the values 1 and 8
+        rows = xp.array([0, 1])
+        cols = xp.array([1, 3])
+
+        indexed = a[rows, cols]  # paired integer-array indexing, one element per (row, col) pair
+        assert np.array_equal(xp.to_numpy(indexed), [1, 8])
+
+
+@pytest.mark.parametrize("backend", ["numpy", "cupy"])
+def test_random_generation(backend):
+    if backend == "cupy" and not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    with xp.use_backend(backend):
+        # Smoke test of random.normal through the proxy; no seed is set, so only shape and a loose mean bound are checked
+        a = xp.random.normal(0, 1, size=(100, 100))
+        assert a.shape == (100, 100)
+        assert xp.abs(xp.mean(a)) < 0.5  # Basic statistical sanity
diff --git a/tests/unit/test_integration.py b/tests/unit/test_integration.py
new file mode 100644
index 0000000..d13764f
--- /dev/null
+++ b/tests/unit/test_integration.py
@@ -0,0 +1,78 @@
+import numpy as np
+import pytest
+
+import cunumpy as xp
+
+
+def test_data_movement_chain():
+    """Test CPU -> GPU -> CPU multi-hop movement with a GPU computation in between."""
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    # 1. Start on CPU
+    data_orig = np.random.rand(100, 100).astype(np.float32)
+
+    # 2. Move to GPU
+    data_gpu = xp.to_cupy(data_orig)
+    assert xp.is_gpu(data_gpu)
+
+    # 3. Do operation on GPU (sin^2 + cos^2 should be 1 for every element)
+    with xp.use_backend("cupy"):
+        res_gpu = xp.sin(data_gpu) ** 2 + xp.cos(data_gpu) ** 2
+
+    # 4. Move back to CPU and check the identity held within allclose tolerances
+    res_cpu = xp.to_numpy(res_gpu)
+    assert isinstance(res_cpu, np.ndarray)
+    assert np.allclose(res_cpu, 1.0)
+
+
+def test_synchronize_logic():
+    """Smoke test: synchronize() runs on the ambient backend and, if usable, under CuPy."""
+    # No error path is provoked here; the call simply must not raise
+    xp.synchronize()
+
+    if xp.cupy_available():
+        # CuPy is reached only through the xp proxy; cupy_available() above
+        # has already confirmed that it imports and reports itself available.
+        with xp.use_backend("cupy"):
+            a = xp.random.rand(100)
+            xp.synchronize()
+            assert xp.is_gpu(a)
+
+
+def test_fft_interop():
+    """Forward FFT on a GPU array via xp, inverse FFT on the CPU via NumPy; the round trip must match the input."""
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    # Create signal on CPU
+    sig_cpu = np.random.rand(1024).astype(np.complex128)
+
+    # Move to GPU and transform
+    sig_gpu = xp.to_cupy(sig_cpu)
+    freq_gpu = xp.fft.fft(sig_gpu)  # NOTE(review): runs under the ambient backend, not an explicit use_backend("cupy")
+
+    # Move frequencies to CPU and transform back
+    freq_cpu = xp.to_numpy(freq_gpu)
+    sig_reconstructed = np.fft.ifft(freq_cpu)
+
+    assert np.allclose(sig_cpu, sig_reconstructed)
+
+
+def test_mixed_backend_errors():
+    """Verify that mixing CPU and GPU operands raises, and that to_cunumpy under the CuPy backend resolves it."""
+    if not xp.cupy_available():
+        pytest.skip("CuPy not installed or not functional")
+
+    a_cpu = np.array([1, 2, 3])
+    a_gpu = xp.to_cupy(a_cpu)
+
+    # This should fail because you can't add CPU and GPU arrays directly
+    with pytest.raises(Exception):
+        _ = a_cpu + a_gpu
+
+    # to_cunumpy converts to the *active* backend, so it must run inside the CuPy context
+    with xp.use_backend("cupy"):
+        a_gpu_fixed = xp.to_cunumpy(a_cpu)
+        res = a_gpu + a_gpu_fixed
+        assert xp.is_gpu(res)