# **Imports**

In [1]:
from numba import cuda
import numpy as np
import time

In [2]:
!pip install numba==0.57.0  # Try a slightly older version
!pip install llvmlite==0.40.0  # Numba often has a corresponding llvmlite version



**CUDA Kernel**

In [3]:
@cuda.jit
def fibonacci_cuda_kernel(out):
    """CUDA kernel that stores the Fibonacci number F(i) in ``out[i]``.

    Every thread derives its own term from scratch with an iterative loop.
    Reading ``out[idx - 1]`` / ``out[idx - 2]`` instead would be a data race:
    those slots are written by other threads, with no guaranteed ordering.

    Args:
        out: 1-D device array to fill; one thread handles one index.

    Note:
        Results are limited by the integer width of ``out``. With a 64-bit
        signed buffer, terms beyond F(92) do not fit and overflow.
    """
    idx = cuda.grid(1)
    # Guard: the grid may contain more threads than there are elements.
    if idx < out.shape[0]:
        prev = 0  # F(0)
        curr = 1  # F(1)
        # After k iterations, prev == F(k); run idx iterations to reach F(idx).
        for _ in range(idx):
            prev, curr = curr, prev + curr
        out[idx] = prev

def fibonacci_cuda(n):
    """Launches the CUDA kernel to compute the first ``n`` Fibonacci numbers.

    Args:
        n: Number of sequence elements to compute.

    Returns:
        A 1-D ``np.int64`` host array of length ``n``; an empty ``np.int64``
        array when ``n <= 0``.
    """
    if n <= 0:
        # Same dtype as the non-empty result (np.array([]) would be float64).
        return np.empty(0, dtype=np.int64)
    out_device = cuda.device_array(n, dtype=np.int64)
    threadsperblock = 256
    # Ceiling division so every index in [0, n) is covered by a thread.
    blockspergrid = (n + (threadsperblock - 1)) // threadsperblock
    fibonacci_cuda_kernel[blockspergrid, threadsperblock](out_device)
    return out_device.copy_to_host()

**Sequential Implementation**

In [4]:
# NOTE: this cell defines fibonacci_cuda a second time; being the later
# definition, it is the one in effect for the cells below.
def fibonacci_cuda(n):
    """Launches the CUDA kernel to compute the first ``n`` Fibonacci numbers.

    Args:
        n: Number of sequence elements to compute.

    Returns:
        A 1-D ``np.int64`` host array of length ``n``; an empty ``np.int64``
        array when ``n <= 0``.
    """
    if n <= 0:
        # Same dtype as the non-empty result (np.array([]) would be float64).
        return np.empty(0, dtype=np.int64)
    out_device = cuda.device_array(n, dtype=np.int64)
    threadsperblock = 256
    # Ceiling division so every index in [0, n) is covered by a thread.
    blockspergrid = (n + (threadsperblock - 1)) // threadsperblock
    fibonacci_cuda_kernel[blockspergrid, threadsperblock](out_device)
    return out_device.copy_to_host()



In [5]:
# --- Sequential Implementation ---
def fibonacci_sequential(n):
    """Build the first ``n`` Fibonacci numbers with a plain Python loop.

    Args:
        n: How many terms to produce.

    Returns:
        A list starting at 0; empty when ``n <= 0``.
    """
    if n <= 0:
        return []
    if n == 1:
        return [0]

    sequence = [0, 1]
    # Grow the list one term at a time until it holds n entries.
    while len(sequence) < n:
        second_last, last = sequence[-2:]
        sequence.append(last + second_last)
    return sequence



**Main Execution and Comparison**

In [6]:
# --- Main Execution and Comparison ---
if __name__ == "__main__":
    N = 220
    # F(92) is the largest Fibonacci number that fits in a signed 64-bit
    # integer, so only the first 93 terms (indices 0..92) can be compared
    # against the int64 CUDA result without overflow.
    INT64_SAFE_COUNT = 93

    # Sequential Execution
    # perf_counter() is the high-resolution clock meant for short intervals.
    start_time_seq = time.perf_counter()
    fib_seq = fibonacci_sequential(N)
    end_time_seq = time.perf_counter()
    sequential_time = end_time_seq - start_time_seq
    print(f"Sequential Fibonacci ({N} numbers) took: {sequential_time:.6f} seconds")

    # CUDA Execution
    cuda_succeeded = False
    try:
        # Warm-up launch: the first kernel call triggers JIT compilation, which
        # would otherwise dominate the measured CUDA time.
        fibonacci_cuda(1)

        start_time_cuda = time.perf_counter()
        fib_cuda = fibonacci_cuda(N)
        end_time_cuda = time.perf_counter()
        cuda_time = end_time_cuda - start_time_cuda
        print(f"CUDA Fibonacci ({N} numbers) took: {cuda_time:.6f} seconds")
        # Guard against a zero interval on very coarse clocks.
        speedup = sequential_time / cuda_time if cuda_time > 0 else float("inf")
        print(f"\nSpeedup (Sequential / CUDA): {speedup:.2f}x")
        cuda_succeeded = True

    except Exception as e:
        print(f"\nCUDA Error: {e}")
        print("Make sure you have a CUDA-enabled GPU and the necessary drivers installed and compatible.")
        print("Falling back to CPU execution for comparison.")
        fib_cuda = fib_seq
        speedup = 1.0

    # Verification: only meaningful when the CUDA path actually ran; after a
    # fallback fib_cuda is fib_seq itself and would trivially "match".
    if cuda_succeeded:
        check_count = min(N, INT64_SAFE_COUNT)
        expected = np.array(fib_seq[:check_count], dtype=np.int64)
        if np.array_equal(expected, fib_cuda[:check_count]):
            print("\nResults from sequential and CUDA implementations match.")
        else:
            print("\nResults from sequential and CUDA implementations DO NOT match!")

Sequential Fibonacci (220 numbers) took: 0.000036 seconds


ERROR:numba.cuda.cudadrv.driver:Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION



CUDA Error: [222] Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION
ptxas application ptx input, line 9; fatal   : Unsupported .version 8.5; current version is '8.4'
Make sure you have a CUDA-enabled GPU and the necessary drivers installed and compatible.
Falling back to CPU execution for comparison.


## --- Observation ---

For a relatively small number of elements like N = 220, the overhead of transferring data to and from the GPU, as well as the kernel launch and grid/block setup, can often outweigh the benefits of parallel computation. In this scenario, you might observe that the sequential implementation is faster or has a similar execution time to the CUDA implementation.

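However, as the value of N increases significantly, the parallel nature of the CUDA kernel *could* become more advantageous once the PTX compatibility issue is resolved: the error output above shows that the generated PTX targets version 8.5, while the current environment only supports version 8.4. Keep in mind that Fibonacci is a recurrence, so an element computed from `out[idx - 1]` and `out[idx - 2]` depends on values produced by other threads; this limits parallelism and requires synchronization to be correct. More advanced CUDA techniques would be needed for better GPU utilization.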

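**Important Note:** This code pins older versions of Numba (0.57.0) and llvmlite (0.40.0). Because `numba` is imported before the install cell runs, restart the kernel after installing so that the pinned versions are the ones actually loaded.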

In [None]:
# Print the installed CUDA toolkit (nvcc) version; useful when diagnosing the PTX version error above.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
