# Phase 1: Setup Verification

## Learning Objectives
- Verify CUDA is available in Colab
- Check GPU specifications
- Understand compute capability
- Run your first CUDA code

## Before You Start
**IMPORTANT**: Make sure you've enabled GPU runtime
- Click `Runtime` → `Change runtime type`
- Select `T4 GPU` from Hardware accelerator
- Click `Save`

## Step 1: Check NVIDIA Driver and GPU

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * CUDA_CHECK(call): evaluates a CUDA runtime call once and, if the result is
 * not cudaSuccess, prints the file, line and error string to stderr and ends
 * the program with exit(EXIT_FAILURE).
 *
 * The argument is parenthesized so any expression can be passed, and the
 * temporary has a macro-specific name so that an argument which itself
 * mentions a variable called `err` is not captured by the macro's own local.
 * The do { ... } while(0) wrapper makes `CUDA_CHECK(x);` a single statement.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_status_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * Doubles, in place, each of the first n floats of `data`.
 *
 * Every thread handles at most one element, at index
 * blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is n or larger
 * return without touching memory, so the launch may cover more than n threads.
 */
__global__ void kernel(float *data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    data[i] *= 2.0f;
}

/*
 * Host driver: doubles one million floats on the GPU and reports the kernel
 * time and the resulting memory throughput.
 *
 * Steps: fill a host buffer with 0..n-1, copy it to the device, time a single
 * launch of `kernel` with CUDA events, copy the result back, compare it with
 * the expected values, and print the timing.
 *
 * Returns 0 on success. Returns EXIT_FAILURE if the host allocation fails or
 * the data copied back is wrong; any failing CUDA call ends the program
 * through CUDA_CHECK.
 */
int main() {
    printf("=== 00-Setup-Verification ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // round up so every element is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A launch returns no status of its own; a rejected launch is reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    // Every index below 2^24 and its double are exactly representable as
    // float, so an exact comparison is valid for this n.
    int mismatch = -1;
    for (int i = 0; i < n; i++) {
        if (h_data[i] != 2.0f * (float)i) {
            mismatch = i;
            break;
        }
    }

    if (mismatch >= 0) {
        fprintf(stderr, "Verification failed at index %d: got %f, expected %f\n",
                mismatch, h_data[mismatch], 2.0f * (float)mismatch);
    } else {
        printf("Processed %d elements in %.2f ms\n", n, ms);
        // Each element is read once and written once, hence size * 2 bytes moved.
        printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));
    }

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return mismatch >= 0 ? EXIT_FAILURE : 0;
}

### What to Look For (run `!nvidia-smi` in a code cell to see these):
- GPU name (e.g., Tesla T4, V100)
- Driver version
- CUDA version
- GPU memory (e.g., 15GB for T4)

## Step 2: Check CUDA Compiler Version

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * CUDA_CHECK(call): evaluates a CUDA runtime call once and, if the result is
 * not cudaSuccess, prints the file, line and error string to stderr and ends
 * the program with exit(EXIT_FAILURE).
 *
 * The argument is parenthesized so any expression can be passed, and the
 * temporary has a macro-specific name so that an argument which itself
 * mentions a variable called `err` is not captured by the macro's own local.
 * The do { ... } while(0) wrapper makes `CUDA_CHECK(x);` a single statement.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_status_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * Doubles, in place, each of the first n floats of `data`.
 *
 * Every thread handles at most one element, at index
 * blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is n or larger
 * return without touching memory, so the launch may cover more than n threads.
 */
__global__ void kernel(float *data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    data[i] *= 2.0f;
}

/*
 * Host driver: doubles one million floats on the GPU and reports the kernel
 * time and the resulting memory throughput.
 *
 * Steps: fill a host buffer with 0..n-1, copy it to the device, time a single
 * launch of `kernel` with CUDA events, copy the result back, compare it with
 * the expected values, and print the timing.
 *
 * Returns 0 on success. Returns EXIT_FAILURE if the host allocation fails or
 * the data copied back is wrong; any failing CUDA call ends the program
 * through CUDA_CHECK.
 */
int main() {
    printf("=== 00-Setup-Verification ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // round up so every element is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A launch returns no status of its own; a rejected launch is reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    // Every index below 2^24 and its double are exactly representable as
    // float, so an exact comparison is valid for this n.
    int mismatch = -1;
    for (int i = 0; i < n; i++) {
        if (h_data[i] != 2.0f * (float)i) {
            mismatch = i;
            break;
        }
    }

    if (mismatch >= 0) {
        fprintf(stderr, "Verification failed at index %d: got %f, expected %f\n",
                mismatch, h_data[mismatch], 2.0f * (float)mismatch);
    } else {
        printf("Processed %d elements in %.2f ms\n", n, ms);
        // Each element is read once and written once, hence size * 2 bytes moved.
        printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));
    }

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return mismatch >= 0 ? EXIT_FAILURE : 0;
}

## Step 3: Install CUDA Extension for Jupyter

In [None]:
# Install nvcc plugin for Jupyter notebooks
# pip fetches the package directly from its GitHub repository.
# NOTE(review): presumably this package is what provides the `%%cu` cell magic
# used throughout this notebook, yet the `%%cu` cells in Steps 1 and 2 come
# before this install cell and no cell loads the extension (`%load_ext ...`).
# Confirm the load command and magic name for the installed nvcc4jupyter
# version, and run both before any `%%cu` cell.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * CUDA_CHECK(call): evaluates a CUDA runtime call once and, if the result is
 * not cudaSuccess, prints the file, line and error string to stderr and ends
 * the program with exit(EXIT_FAILURE).
 *
 * The argument is parenthesized so any expression can be passed, and the
 * temporary has a macro-specific name so that an argument which itself
 * mentions a variable called `err` is not captured by the macro's own local.
 * The do { ... } while(0) wrapper makes `CUDA_CHECK(x);` a single statement.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_status_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * Doubles, in place, each of the first n floats of `data`.
 *
 * Every thread handles at most one element, at index
 * blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is n or larger
 * return without touching memory, so the launch may cover more than n threads.
 */
__global__ void kernel(float *data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    data[i] *= 2.0f;
}

/*
 * Host driver: doubles one million floats on the GPU and reports the kernel
 * time and the resulting memory throughput.
 *
 * Steps: fill a host buffer with 0..n-1, copy it to the device, time a single
 * launch of `kernel` with CUDA events, copy the result back, compare it with
 * the expected values, and print the timing.
 *
 * Returns 0 on success. Returns EXIT_FAILURE if the host allocation fails or
 * the data copied back is wrong; any failing CUDA call ends the program
 * through CUDA_CHECK.
 */
int main() {
    printf("=== 00-Setup-Verification ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // round up so every element is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A launch returns no status of its own; a rejected launch is reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    // Every index below 2^24 and its double are exactly representable as
    // float, so an exact comparison is valid for this n.
    int mismatch = -1;
    for (int i = 0; i < n; i++) {
        if (h_data[i] != 2.0f * (float)i) {
            mismatch = i;
            break;
        }
    }

    if (mismatch >= 0) {
        fprintf(stderr, "Verification failed at index %d: got %f, expected %f\n",
                mismatch, h_data[mismatch], 2.0f * (float)mismatch);
    } else {
        printf("Processed %d elements in %.2f ms\n", n, ms);
        // Each element is read once and written once, hence size * 2 bytes moved.
        printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));
    }

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return mismatch >= 0 ? EXIT_FAILURE : 0;
}

## Step 4: Your First CUDA Program - Hello World!

In [None]:
%%cu
#include <stdio.h>

/*
 * Kernel in which every launched thread prints one greeting line containing
 * its own threadIdx.x. It takes no arguments and writes no memory.
 */
__global__ void helloFromGPU() {
    const unsigned int self = threadIdx.x;
    printf("Hello from GPU thread %d!\n", self);
}

/*
 * Prints a greeting from the host, launches helloFromGPU with 1 block of 10
 * threads, waits for it to finish, and prints a closing line.
 *
 * Returns 0 on success. If the launch is rejected or the kernel fails while
 * running, prints the CUDA error string to stderr and returns 1 instead of
 * continuing as if the GPU had done its work.
 */
int main() {
    printf("Hello from CPU!\n");

    // Launch kernel with 1 block and 10 threads
    helloFromGPU<<<1, 10>>>();

    // The launch itself returns nothing, so ask the runtime whether it was
    // accepted, then wait for the GPU to finish and collect any runtime error.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("Back to CPU!\n");
    return 0;
}

### Understanding the Code

- `__global__`: Indicates this function runs on GPU and is called from CPU
- `<<<1, 10>>>`: Launch configuration (1 block, 10 threads)
- `threadIdx.x`: Built-in variable for thread index
- `cudaDeviceSynchronize()`: Wait for GPU to complete

## Step 5: Query GPU Properties in Detail

In [None]:
%%cu
#include <stdio.h>

/* Prints which CUDA query failed and why; returns the exit code for main. */
static int reportCudaFailure(const char *what, cudaError_t status) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Prints the number of CUDA devices and, for each one, a summary of its
 * properties: compute capability, memory sizes, thread and grid limits,
 * multiprocessor count, clocks, bus width and L2 size.
 *
 * The two clock rates are read with cudaDeviceGetAttribute instead of the
 * clockRate / memoryClockRate fields of cudaDeviceProp, because those struct
 * fields are deprecated and missing from the newest toolkits. The attributes
 * are reported in kHz, hence the division by 1e6 to print GHz.
 *
 * Returns 0 on success, 1 if any CUDA query fails.
 */
int main() {
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        return reportCudaFailure("cudaGetDeviceCount", status);
    }

    printf("Number of CUDA devices: %d\n\n", deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        status = cudaGetDeviceProperties(&prop, i);
        if (status != cudaSuccess) {
            return reportCudaFailure("cudaGetDeviceProperties", status);
        }

        int clockKHz = 0;
        status = cudaDeviceGetAttribute(&clockKHz, cudaDevAttrClockRate, i);
        if (status != cudaSuccess) {
            return reportCudaFailure("cudaDeviceGetAttribute(cudaDevAttrClockRate)", status);
        }

        int memoryClockKHz = 0;
        status = cudaDeviceGetAttribute(&memoryClockKHz, cudaDevAttrMemoryClockRate, i);
        if (status != cudaSuccess) {
            return reportCudaFailure("cudaDeviceGetAttribute(cudaDevAttrMemoryClockRate)", status);
        }

        printf("Device %d: %s\n", i, prop.name);
        printf("========================================\n");
        printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("Total Global Memory: %.2f GB\n", prop.totalGlobalMem / 1e9);
        printf("Shared Memory per Block: %.2f KB\n", prop.sharedMemPerBlock / 1024.0);
        printf("Registers per Block: %d\n", prop.regsPerBlock);
        printf("Warp Size: %d\n", prop.warpSize);
        printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("Max Threads Dimensions: (%d, %d, %d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Max Grid Dimensions: (%d, %d, %d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("Number of Multiprocessors: %d\n", prop.multiProcessorCount);
        printf("Clock Rate: %.2f GHz\n", clockKHz / 1e6);
        printf("Memory Clock Rate: %.2f GHz\n", memoryClockKHz / 1e6);
        printf("Memory Bus Width: %d bits\n", prop.memoryBusWidth);
        printf("L2 Cache Size: %.2f MB\n", prop.l2CacheSize / 1e6);
        printf("Max Threads per Multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("\n");
    }

    return 0;
}

### Key Properties to Remember

- **Compute Capability**: Determines which CUDA features are available
- **Multiprocessors (SMs)**: Number of parallel processing units
- **Max Threads per Block**: Usually 1024
- **Warp Size**: 32 on all current NVIDIA GPUs (threads execute in warps)
- **Shared Memory**: Fast on-chip memory shared by threads in a block

## Step 6: Understanding Thread Hierarchy

In [None]:
%%cu
#include <stdio.h>

/*
 * Kernel in which every thread prints its block index, its index within the
 * block, and the flat index block * blockDim.x + thread that combines the two.
 * Only the x dimension of the launch is used.
 */
__global__ void printThreadInfo() {
    const int withinBlock = threadIdx.x;
    const int block = blockIdx.x;
    const int flat = withinBlock + block * blockDim.x;

    printf("Block %d, Thread %d, Global ID %d\n", block, withinBlock, flat);
}

/*
 * Announces the launch shape, runs printThreadInfo with 3 blocks of 4 threads,
 * and waits for the kernel to finish.
 *
 * Returns 0 on success. If the launch is rejected or the kernel fails while
 * running, prints the CUDA error string to stderr and returns 1.
 */
int main() {
    printf("Launching kernel with 3 blocks and 4 threads per block\n");
    printf("Total threads: 3 * 4 = 12\n\n");

    printThreadInfo<<<3, 4>>>();

    // The launch itself returns nothing, so ask the runtime whether it was
    // accepted, then wait for the GPU and collect any error raised meanwhile.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

### Thread Hierarchy Explained

```
Grid (All blocks)
  ├─ Block 0
  │   ├─ Thread 0 (Global ID: 0)
  │   ├─ Thread 1 (Global ID: 1)
  │   ├─ Thread 2 (Global ID: 2)
  │   └─ Thread 3 (Global ID: 3)
  ├─ Block 1
  │   ├─ Thread 0 (Global ID: 4)
  │   ├─ Thread 1 (Global ID: 5)
  │   ├─ Thread 2 (Global ID: 6)
  │   └─ Thread 3 (Global ID: 7)
  └─ Block 2
      ├─ Thread 0 (Global ID: 8)
      ├─ Thread 1 (Global ID: 9)
      ├─ Thread 2 (Global ID: 10)
      └─ Thread 3 (Global ID: 11)
```

## Step 7: Error Checking (Important!)

In [None]:
%%cu
#include <stdio.h>

/*
 * CHECK_CUDA_ERROR(call): evaluates a CUDA runtime call once and, if the
 * result is not cudaSuccess, prints the file, line and error string, then
 * ends the program with exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK_CUDA_ERROR(x);`
 * is exactly one statement; a bare { ... } block followed by `;` breaks an
 * unbraced `if (...) CHECK_CUDA_ERROR(x); else ...`. The argument is
 * parenthesized, and the temporary has a macro-specific name so that an
 * argument mentioning a variable called `err` is not captured by the macro.
 */
#define CHECK_CUDA_ERROR(call) do { \
    cudaError_t check_cuda_error_status_ = (call); \
    if (check_cuda_error_status_ != cudaSuccess) { \
        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(check_cuda_error_status_)); \
        exit(1); \
    } \
} while (0)

/*
 * Minimal kernel for the error-checking demo: every launched thread prints
 * the same greeting line. It takes no arguments and writes no memory.
 */
__global__ void kernel() {
    printf("Hello from kernel!\n");
}

/*
 * Walks through three checked CUDA operations: a device allocation, a kernel
 * launch followed by a synchronize, and the matching free. Each call goes
 * through CHECK_CUDA_ERROR, which ends the program on the first failure.
 * Returns 0 when every call succeeds.
 */
int main() {
    // Example 1: a device allocation whose return code is checked.
    const size_t bytes = 1000 * sizeof(float);
    float *buffer;
    CHECK_CUDA_ERROR(cudaMalloc(&buffer, bytes));
    printf("Memory allocated successfully\n");

    // Example 2: a launch has no return value, so check the last error right
    // after it, then check the synchronize that waits for the kernel.
    kernel<<<1, 32>>>();
    CHECK_CUDA_ERROR(cudaGetLastError());
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
    printf("Kernel executed successfully\n");

    // Release the device allocation; the free is checked as well.
    CHECK_CUDA_ERROR(cudaFree(buffer));

    return 0;
}

## ✅ Checklist: You're Ready to Proceed If...

- [ ] You can see GPU information with `nvidia-smi`
- [ ] You successfully ran the Hello World kernel
- [ ] You understand the thread hierarchy (blocks and threads)
- [ ] You can query GPU properties
- [ ] You know how to check for CUDA errors

## 🎉 Congratulations!

You've successfully set up your CUDA learning environment and run your first GPU code!

## Next Steps

Continue to:
- `01-hello-world.ipynb` - More kernel launch patterns
- `02-device-query.ipynb` - Deep dive into GPU architecture
- `03-thread-indexing.ipynb` - Master thread indexing

## Notes Section (Your Learning Journal)

Use this space to write notes about what you learned:

---

*Add your notes here*

---