<a href="https://colab.research.google.com/github/nujudaly/QIS25/blob/main/CUDA_Q.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Intro to CUDA-Q NAVIDIA

In [1]:
!pip install cudaq


Collecting cudaq
  Downloading cudaq-0.9.1.tar.gz (9.0 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting cuda-quantum-cu12==0.9.1 (from cudaq)
  Downloading cuda_quantum_cu12-0.9.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (8.7 kB)
Collecting astpretty~=3.0 (from cuda-quantum-cu12==0.9.1->cudaq)
  Downloading astpretty-3.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting cuquantum-python-cu12>=24.11 (from cuda-quantum-cu12==0.9.1->cudaq)
  Downloading cuquantum_python_cu12-24.11.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting custatevec-cu12~=1.7 (from cuquantum-python-cu12>=24.11->cuda-quantum-cu12==0.9.1->cudaq)
  Downloading custatevec_cu12-1.7.0-py3-none-manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting cutensornet-cu12~=2.6 (from cuquantum-python-cu12>=24.11->cuda-quantum-cu12==0.9.1->cudaq)
  Downloading cutensorne

In [2]:
!pip install qutip


Collecting qutip
  Downloading qutip-5.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Downloading qutip-5.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.1/30.1 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: qutip
Successfully installed qutip-5.1.1


In [3]:
import cudaq
import qutip


In [4]:

# إنشاء الكيرنل
kernel = cudaq.make_kernel()
qubits = kernel.qalloc(2)

# تطبيق بوابات هادامارد و CNOT
kernel.h(qubits[0])
kernel.cx(qubits[0], qubits[1])

# قياس الكيوبتات
kernel.mz(qubits)

# تنفيذ الكيرنل وأخذ العينات
result = cudaq.sample(kernel)
print(result)


{ 00:513 11:487 }



In [5]:

# تحديد عدد الكيوبتات
qubit_count = 3

# تعريف الكيرنل
@cudaq.kernel
def ghz_kernel(qubit_count: int):
    # تخصيص الكيوبتات
    qvector = cudaq.qvector(qubit_count)

    # وضع أول كيوبت في حالة التراكب
    h(qvector[0])

    # تطبيق بوابة CNOT بين الكيوبتات
    for i in range(qubit_count - 1):
        x.ctrl(qvector[i], qvector[i + 1])

    # قياس الكيوبتات
    mz(qvector)


In [6]:
# تحديد عدد الكيوبتات
qubit_count = 3

# تشغيل الكيرنل وجمع النتائج
results = cudaq.sample(ghz_kernel, qubit_count)

# عرض النتائج
print(results)


{ 000:490 111:510 }



###Experminating with NAVIDIA **GPU**

In [7]:
pip install cudaq




In [8]:
pip install --upgrade cudaq




In [9]:
import cudaq
import timeit

# تعريف الكيرنل
@cudaq.kernel
def kernel():
    qubit_count = 25  # عدد الكيوبتات
    qubits = kernel.qalloc(qubit_count)  # تخصيص الكيوبتات

    # وضع أول كيوبت في حالة التراكب
    kernel.h(qubits[0])

    # تطبيق بوابات CNOT لإنشاء حالة GHZ
    for i in range(qubit_count - 1):
        kernel.cx(qubits[i], qubits[i + 1])

    # قياس الكيوبتات
    kernel.mz(qubits)

# إعداد الكود المراد تنفيذه
code_to_time = 'cudaq.sample(kernel, shots_count=1000000)'

# تنفيذ الكيرنل على المعالج المركزي (CPU)
try:
    cudaq.set_target('qpp-cpu')  # تعيين الهدف إلى CPU
    print('CPU time:')
    cpu_time = timeit.timeit(stmt=code_to_time, globals=globals(), number=1)
    print(f'{cpu_time:.5f} seconds')
except Exception as e:
    print(f"Error running on CPU: {e}")

# التحقق من وجود GPU وتنفيذ الكيرنل عليه
try:
    if cudaq.num_available_gpus() > 0:  # التحقق من عدد وحدات GPU المتاحة
        cudaq.set_target('nvidia')  # تعيين الهدف إلى GPU
        print('GPU time:')
        gpu_time = timeit.timeit(stmt=code_to_time, globals=globals(), number=1)
        print(f'{gpu_time:.5f} seconds')
    else:
        print("No GPU available.")
except Exception as e:
    print(f"Error running on GPU: {e}")


CPU time:
[1mError running on CPU: <ipython-input-9-0e32737f6b93>:8: [91merror: [0m[1mInvalid function call - 'kernel' is unknown.
	 (offending source -> kernel.qalloc)[0m
GPU time:
[1mError running on GPU: <ipython-input-9-0e32737f6b93>:8: [91merror: [0m[1mInvalid function call - 'kernel' is unknown.
	 (offending source -> kernel.qalloc)[0m


In [10]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available.")


GPU is available!
GPU Name: Tesla T4


In [11]:
import cudaq

num_gpus = cudaq.num_available_gpus()
print(f"Number of GPUs available: {num_gpus}")
if num_gpus > 0:
    print("CUDA-Q GPU is available!")
else:
    print("No GPU available for CUDA-Q.")


Number of GPUs available: 1
CUDA-Q GPU is available!


In [12]:
import cudaq
print(cudaq.num_available_gpus())


1


In [13]:
cudaq.set_target('nvidia')


In [14]:
import cudaq

# إنشاء الكيرنل باستخدام make_kernel
kernel = cudaq.make_kernel()

# تخصيص كيوبت واحد
qubits = kernel.qalloc(1)

# تطبيق بوابة هادامارد على الكيوبت الأول
kernel.h(qubits[0])

# قياس الكيوبت
kernel.mz(qubits)

# تنفيذ الكيرنل
result = cudaq.sample(kernel, shots_count=1000)
print(result)


{ 0:490 1:510 }



In [15]:
print(cudaq.num_available_gpus())


1


In [16]:
import cudaq
import timeit

# إعداد الكيرنل
kernel = cudaq.make_kernel()
qubits = kernel.qalloc(25)  # تخصيص 25 كيوبت
kernel.h(qubits[0])  # وضع أول كيوبت في حالة التراكب
for i in range(24):  # تطبيق بوابات CNOT لإنشاء حالة GHZ
    kernel.cx(qubits[i], qubits[i + 1])
kernel.mz(qubits)  # قياس الكيوبتات

# إعداد الكود لاختبار الوقت
def execute_kernel():
    return cudaq.sample(kernel, shots_count=1_000_000)

# تشغيل الكيرنل على CPU
try:
    cudaq.set_target('qpp-cpu')  # تعيين الهدف إلى CPU
    print("Running on CPU...")
    cpu_time = timeit.timeit(execute_kernel, number=1)
    print(f"CPU time: {cpu_time:.2f} seconds")
except Exception as e:
    print(f"Error running on CPU: {e}")

# تشغيل الكيرنل على GPU إذا كان متوفراً
try:
    if cudaq.num_available_gpus() > 0:  # التحقق من وجود GPU
        cudaq.set_target('nvidia')  # تعيين الهدف إلى GPU
        print("Running on GPU...")
        gpu_time = timeit.timeit(execute_kernel, number=1)
        print(f"GPU time: {gpu_time:.2f} seconds")
    else:
        print("No GPU available.")
except Exception as e:
    print(f"Error running on GPU: {e}")


Running on CPU...
CPU time: 679.28 seconds
Running on GPU...
GPU time: 0.91 seconds


###Qiskit **GPU**

In [17]:
pip install qiskit

Collecting qiskit
  Downloading qiskit-1.3.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting rustworkx>=0.15.0 (from qiskit)
  Downloading rustworkx-0.16.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting dill>=0.3 (from qiskit)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting stevedore>=3.0.0 (from qiskit)
  Downloading stevedore-5.4.0-py3-none-any.whl.metadata (2.3 kB)
Collecting symengine<0.14,>=0.11 (from qiskit)
  Downloading symengine-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pbr>=2.0.0 (from stevedore>=3.0.0->qiskit)
  Downloading pbr-6.1.0-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading qiskit-1.3.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.9-py3-none-any.whl (119 k

In [18]:
pip install qiskit-ibm-runtime

Collecting qiskit-ibm-runtime
  Downloading qiskit_ibm_runtime-0.34.0-py3-none-any.whl.metadata (3.0 kB)
Collecting requests-ntlm>=1.1.0 (from qiskit-ibm-runtime)
  Downloading requests_ntlm-1.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting ibm-platform-services>=0.22.6 (from qiskit-ibm-runtime)
  Downloading ibm_platform_services-0.59.1-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic<2.10,>=2.5.0 (from qiskit-ibm-runtime)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting ibm_cloud_sdk_core<4.0.0,>=3.22.0 (from ibm-platform-services>=0.22.6->qiskit-ibm-runtime)
  Downloading ibm_cloud_sdk_core-3.22.1-py3-none-any.whl.metadata (8.6 kB)
Collecting pydantic-core==2.23.4 (from pydantic<2.10,>=2.5.0->qiskit-ibm-runtime)
  Downloading pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collect

In [19]:
pip install qiskit-aer

Collecting qiskit-aer
  Downloading qiskit_aer-0.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading qiskit_aer-0.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: qiskit-aer
Successfully installed qiskit-aer-0.16.0


In [20]:
pip install qiskit-aer-gpu


Collecting qiskit-aer-gpu
  Downloading qiskit_aer_gpu-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting cuquantum-cu12>=23.3.0 (from qiskit-aer-gpu)
  Downloading cuquantum_cu12-24.11.0-py3-none-manylinux2014_x86_64.whl.metadata (2.8 kB)
Downloading qiskit_aer_gpu-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.8/18.8 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cuquantum_cu12-24.11.0-py3-none-manylinux2014_x86_64.whl (7.1 kB)
Installing collected packages: cuquantum-cu12, qiskit-aer-gpu
Successfully installed cuquantum-cu12-24.11.0 qiskit-aer-gpu-0.15.1


In [21]:
from qiskit import QuantumCircuit, transpile
from qiskit_aer import Aer
from qiskit_aer import AerSimulator
import time

# إنشاء دائرة GHZ مع 25 كيوبت
qc = QuantumCircuit(25)
qc.h(0)  # بوابة هادامارد على الكيوبت الأول
for i in range(24):  # تطبيق بوابات CNOT لإنشاء حالة GHZ
    qc.cx(i, i + 1)
qc.measure_all()  # قياس الكيوبتات

# إعداد المحاكي (Aer Simulator)
backend = AerSimulator()
# ترجمة الدائرة لتكون جاهزة للتنفيذ
transpiled_circuit = transpile(qc, backend)

# تشغيل الكيرنل على CPU أو GPU
try:
    print("Running Qiskit on Simulator...")
    start_time = time.time()
    job = backend.run(transpiled_circuit, shots=1_000_000)
    result = job.result()
    counts = result.get_counts()
    qiskit_time = time.time() - start_time
    print(f"Qiskit Results: {counts}")
    print(f"Qiskit Execution Time: {qiskit_time:.2f} seconds")
except Exception as e:
    print(f"Error running Qiskit: {e}")

# تحقق من دعم GPU
print(backend.configuration().to_dict())

Running Qiskit on Simulator...
Qiskit Results: {'1111111111111111111111111': 499635, '0000000000000000000000000': 500365}
Qiskit Execution Time: 25.45 seconds
{'backend_name': 'aer_simulator', 'backend_version': '0.15.1', 'n_qubits': 29, 'basis_gates': ['ccx', 'ccz', 'cp', 'crx', 'cry', 'crz', 'cswap', 'csx', 'cu', 'cu1', 'cu2', 'cu3', 'cx', 'cy', 'cz', 'diagonal', 'ecr', 'h', 'id', 'mcp', 'mcphase', 'mcr', 'mcrx', 'mcry', 'mcrz', 'mcswap', 'mcsx', 'mcu', 'mcu1', 'mcu2', 'mcu3', 'mcx', 'mcx_gray', 'mcy', 'mcz', 'multiplexer', 'p', 'pauli', 'r', 'roerror', 'rx', 'rxx', 'ry', 'ryy', 'rz', 'rzx', 'rzz', 's', 'sdg', 'store', 'swap', 'sx', 'sxdg', 't', 'tdg', 'u', 'u1', 'u2', 'u3', 'unitary', 'x', 'y', 'z', 'break_loop', 'continue_loop', 'delay', 'for_loop', 'if_else', 'initialize', 'kraus', 'qerror_loc', 'quantum_channel', 'reset', 'roerror', 'save_amplitudes', 'save_amplitudes_sq', 'save_clifford', 'save_density_matrix', 'save_expval', 'save_expval_var', 'save_matrix_product_state', 'save

###Pennylane **GPU**

In [22]:
pip install pennylane


Collecting pennylane
  Downloading PennyLane-0.40.0-py3-none-any.whl.metadata (10 kB)
Collecting tomlkit (from pennylane)
  Downloading tomlkit-0.13.2-py3-none-any.whl.metadata (2.7 kB)
Collecting appdirs (from pennylane)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting autoray>=0.6.11 (from pennylane)
  Downloading autoray-0.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pennylane-lightning>=0.40 (from pennylane)
  Downloading PennyLane_Lightning-0.40.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (27 kB)
Collecting diastatic-malt (from pennylane)
  Downloading diastatic_malt-2.15.2-py3-none-any.whl.metadata (2.6 kB)
Collecting scipy-openblas32>=0.3.26 (from pennylane-lightning>=0.40->pennylane)
  Downloading scipy_openblas32-0.3.29.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.1/56.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading PennyLane-0.4

In [23]:
pip install pennylane-lightning




In [24]:
!pip install pennylane-lightning[gpu]


Collecting pennylane-lightning-gpu (from pennylane-lightning[gpu])
  Downloading PennyLane_Lightning_GPU-0.40.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Downloading PennyLane_Lightning_GPU-0.40.0-cp311-cp311-manylinux_2_28_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pennylane-lightning-gpu
Successfully installed pennylane-lightning-gpu-0.40.0


In [25]:
!pip install --upgrade pip
!pip install pennylane pennylane-lightning[gpu] cuquantum-cu12

import pennylane as qml
from pennylane import numpy as np
import time

# إعداد الجهاز
n_qubits = 25  # عدد الكيوبتات
device = qml.device("lightning.gpu", wires=n_qubits, shots=1)  # لقطة واحدة لكل تشغيل

# تعريف الدائرة الكمومية
@qml.qnode(device)
def ghz_circuit():
    qml.Hadamard(wires=0)  # بوابة هادامارد على أول كيوبت
    for i in range(n_qubits - 1):  # بوابات CNOT
        qml.CNOT(wires=[i, i + 1])
    return qml.sample(wires=range(n_qubits))  # قياس جميع الكيوبتات

# تشغيل الدائرة وحساب الزمن
print("Running PennyLane...")
start_time = time.time()
shots = 1_000 # عدد اللقطات
counts = {"0" * n_qubits: 0, "1" * n_qubits: 0}  # إعداد العدادات

for _ in range(shots):
    result = ghz_circuit()  # تشغيل الدائرة
    result_str = "".join(map(str, result))  # تحويل النتيجة إلى سلسلة
    if result_str in counts:
        counts[result_str] += 1

execution_time = time.time() - start_time

# طباعة النتائج
print(f"PennyLane Results: {counts}")
print(f"PennyLane Execution Time: {execution_time:.2f} seconds")


Collecting pip
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0
Running PennyLane...
PennyLane Results: {'0000000000000000000000000': 508, '1111111111111111111111111': 492}
PennyLane Execution Time: 72.84 seconds


#Technical Results and Analysis

1. NVIDIA CUDA-Q

    Execution Time:
        GPU: 0.91 seconds for 1,000,000 shots.
        CPU: 690 seconds for 1,000,000 shots.
    Results:
        Near-equal distribution between "000...0" and "111...1".
    Analysis:
        The performance on GPU NVIDIA was extremely fast due to efficient parallel computing.
        CUDA-Q is highly optimized for handling large numbers of shots.

2. Qiskit (Aer Simulator)

    Execution Time:
        CPU/GPU (Aer Simulator): 25.45 seconds for 1,000,000 shots.
    Results:
        Near-equal distribution between "000...0" and "111...1".
    Analysis:
        Qiskit was slower than CUDA-Q on GPU, but faster than CUDA-Q on CPU.
        Aer Simulator leverages software optimizations but doesn’t utilize GPU as effectively as CUDA-Q.

3. PennyLane

    Execution Time:
        lightning.gpu (GPU): 72.84 seconds for 1000 shots.
        
    Results:
        Results were as follows:
         '000...0': 508
         '111...1': 492
         The results are balanced, as expected for a GHZ state, which alternates equally between these two states.
    Analysis:
       Pennylane (lightning.gpu) is designed for hybrid quantum-classical workflows and provides flexibility for various quantum hardware. However, it is not optimized purely for speed in quantum circuit execution.
       The slower performance could be due to:
       1- Overhead in preparing and managing the GPU resources.
       2- Lack of specific optimizations for high-throughput quantum circuit sampling.

In [28]:
from prettytable import PrettyTable

# Create the PrettyTable instance
table = PrettyTable()

# Define the columns
table.field_names = [
    "Library",
    "Device",
    "Execution Time (1M Shots)",
    "Result Distribution",
    "Accuracy",
    "Strengths",
    "Weaknesses",
    "Use Case Suitability",
    "Remarks"
]

# Add rows with data
table.add_row([
    "Pennylane",
    "GPU (T4)",
    "72.84 seconds",
    "Balanced (508/492)",
    "High",
    "Flexible for hybrid workflows and quantum ML.",
    "Slower compared to CUDA-Q and Qiskit.",
    "Quantum ML and prototyping.",
    "Excellent for hybrid ML-quantum applications but slower."
])

table.add_row([
    "CUDA-Q",
    "GPU (T4)",
    "0.91 seconds",
    "Balanced",
    "High",
    "Fastest execution for large-scale circuits.",
    "Requires NVIDIA GPU for full optimization.",
    "High-performance quantum circuit execution.",
    "Best for fast and scalable simulations."
])

table.add_row([
    "CUDA-Q",
    "CPU",
    "690 seconds",
    "Balanced",
    "High",
    "Handles large circuits but slow.",
    "Extremely slow on CPU.",
    "Fallback option when no GPU is available.",
    "Not suitable for high-performance tasks."
])

table.add_row([
    "Qiskit Aer",
    "Aer Simulator (CPU/GPU)",
    "25.45 seconds",
    "Balanced (500,694/499,306)",
    "High",
    "Reliable simulation with extensive ecosystem.",
    "GPU utilization is limited compared to CUDA-Q.",
    "Research and general simulation needs.",
    "Good for prototyping but slower than CUDA-Q."
])

# Align columns
table.align = "l"

# Print the table
print(table)


+------------+-------------------------+---------------------------+----------------------------+----------+-----------------------------------------------+------------------------------------------------+---------------------------------------------+----------------------------------------------------------+
| Library    | Device                  | Execution Time (1M Shots) | Result Distribution        | Accuracy | Strengths                                     | Weaknesses                                     | Use Case Suitability                        | Remarks                                                  |
+------------+-------------------------+---------------------------+----------------------------+----------+-----------------------------------------------+------------------------------------------------+---------------------------------------------+----------------------------------------------------------+
| Pennylane  | GPU (T4)                | 72.84 seconds             