# Thomas algorithm on CUDA

In [None]:
# make sure CUDA is installed
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# make sure you have a GPU runtime (if this fails go to runtime -> change runtime type)
!nvidia-smi

Tue Apr 30 03:32:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# CUDA in Jupyter helpers
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
# to learn about how to do more fancy things with CUDA using this API see:
# https://nvcc4jupyter.readthedocs.io/en/latest/index.html

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpu0do3dx2".


### Thomas solver benchmark for tridiagonal systems of size n

In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>

/*
 * Fill four caller-provided arrays of length n with a random tridiagonal system.
 *
 *   a[i] : off-diagonal entry, an integer in [1, 100] for i > 0, and 0 at i = 0
 *   c[i] : off-diagonal entry, an integer in [1, 100] for i < n-1, and 0 at i = n-1
 *   b[i] : main diagonal, a[i] + c[i] plus an integer in [50, 149], so every row is
 *          strictly diagonally dominant
 *   d[i] : right-hand side, an integer in [1, 100]
 *
 * NOTE(review): thomas_kernel in this cell uses a[i] as the coefficient of x[i-1]
 * (sub-diagonal) and c[i] as the coefficient of x[i+1] (super-diagonal).
 *
 * Values come from rand(); seed with srand() beforehand. For n <= 0 nothing is written.
 */
__host__
void generate_tridiagonal_system(int n, double a[], double b[], double c[], double d[]) {
    const int last = n - 1;
    for (int row = 0; row < n; ++row) {
        // First row has no left neighbour.
        if (row != 0) {
            a[row] = rand() % 100 + 1;
        } else {
            a[row] = 0;
        }

        // Last row has no right neighbour.
        if (row != last) {
            c[row] = rand() % 100 + 1;
        } else {
            c[row] = 0;
        }

        // Diagonal exceeds the sum of the off-diagonals by at least 50.
        b[row] = a[row] + c[row] + rand() % 100 + 50;
        d[row] = rand() % 100 + 1;
    }
}

/*
 * Generate a random tridiagonal system into the caller's buffers and print it.
 *
 * a, b, c and d must each point to at least n doubles owned by the caller.
 * The previous version overwrote its (by-value) pointer parameters with fresh
 * malloc() results, so the generated system never reached the caller and all
 * four allocations leaked. The only call site visible in this cell (commented
 * out in main) passes buffers that main has already allocated, so this version
 * fills those buffers directly.
 *
 * The printed labels keep the notebook's naming ("upper" for a, "lower" for c).
 * NOTE(review): thomas_kernel treats a[i] as the coefficient of x[i-1] and c[i]
 * as the coefficient of x[i+1].
 */
__host__
void setup_matrix(int n, double *a, double *b, double *c, double *d) {
    // With n > 0 the buffers are dereferenced, so refuse NULL instead of crashing.
    if (n > 0 && (a == NULL || b == NULL || c == NULL || d == NULL)) {
        fprintf(stderr, "setup_matrix: all four buffers must be allocated by the caller.\n");
        return;
    }

    // Generate the tridiagonal system
    generate_tridiagonal_system(n, a, b, c, d);

    // Output the generated tridiagonal system
    printf("Generated tridiagonal matrix system:\n");
    printf("Upper diagonal a: ");
    for (int i = 1; i < n; i++) {  // a[0] is always 0 and not part of the matrix
        printf("%.2f ", a[i]);
    }
    printf("\nMain diagonal b: ");
    for (int i = 0; i < n; i++) {
        printf("%.2f ", b[i]);
    }
    printf("\nLower diagonal c: ");
    for (int i = 0; i < n - 1; i++) {  // c[n-1] is always 0 and not part of the matrix
        printf("%.2f ", c[i]);
    }
    printf("\nRight-hand side d: ");
    for (int i = 0; i < n; i++) {
        printf("%.2f ", d[i]);
    }
    printf("\n");
}

/*
 * Solve one tridiagonal system in place with the Thomas algorithm.
 *
 * Row i of the system is
 *     a[i] * x[i-1] + b[i] * x[i] + c[i] * x[i+1] = rhs[i]
 * where a[0] and c[n-1] are never read.
 *
 *   n : number of unknowns; n <= 0 is a no-op
 *   x : on entry the right-hand side, on exit the solution (length n)
 *   a : coefficients of x[i-1], read only (length n)
 *   b : main diagonal, OVERWRITTEN with the eliminated pivots (length n)
 *   c : coefficients of x[i+1], read only (length n)
 *
 * All pointers must be device pointers. The recurrence is sequential, so exactly
 * one thread (thread 0 of block 0) does the work whatever the launch shape;
 * every other thread exits immediately. No pivoting is performed, so each pivot
 * b[i] must stay non-zero (guaranteed for diagonally dominant systems).
 */
__global__
void thomas_kernel(int n, double* x, double* a, double* b, double* c) {
    // Checking only threadIdx.x would let thread 0 of every block run the
    // in-place elimination concurrently on the same arrays.
    const bool is_solver_thread =
        blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
        threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
    if (!is_solver_thread) return;

    // An empty system would otherwise touch x[-1] and b[-1].
    if (n <= 0) return;

    // Forward elimination: remove the a[i] entry of row i using row i-1.
    for (int i = 1; i < n; i++) {
        double m = a[i] / b[i - 1];
        b[i] = b[i] - m * c[i - 1];
        x[i] = x[i] - m * x[i - 1];
    }

    // Backward substitution, starting from the last (now single-unknown) row.
    x[n - 1] = x[n - 1] / b[n - 1];
    for (int i = n - 2; i >= 0; i--) {
        x[i] = (x[i] - c[i] * x[i + 1]) / b[i];
    }
}

// Print a message and terminate if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver: for each system size, solve num_systems + 1 random
 * tridiagonal systems with thomas_kernel and report the mean kernel time.
 *
 * The first solve of every size (i == 0) is a warm-up and is excluded from the
 * average. Only the kernel is timed (CUDA events around the launch); host
 * generation and transfers are not.
 *
 * Output: "execution_times.csv" with one "size,average ms" row per size, plus
 * the same figures on stdout. Returns -1 if the CSV cannot be opened; exits
 * with EXIT_FAILURE on allocation or CUDA errors.
 */
int main() {
    // Open a CSV file to save the execution times
    FILE *fp = fopen("execution_times.csv", "w");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open file for writing.\n");
        return -1;
    }
    fprintf(fp, "System Size,Execution Time (ms)\n"); // Write the CSV header

    int sizes[] = {2, 5, 10, 100, 1000, 2000, 5000, 10000};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);  // Number of different system sizes

    int num_systems = 1000;  // Timed solves per size (one extra warm-up solve is added)

    // Seed once: reseeding per size with a one-second clock can replay the
    // same random stream for sizes that start within the same second.
    srand(time(NULL));

    // One event pair is reused for every measurement.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    for (int idx = 0; idx < num_sizes; idx++) {
        int n = sizes[idx];  // System size for this iteration
        size_t bytes = (size_t) n * sizeof(double);
        printf("System size: %d\n", n);

        // Host and device buffers are allocated once per size and reused for
        // every system of that size; they are fully overwritten each iteration.
        double *h_a = (double *) malloc(bytes);
        double *h_b = (double *) malloc(bytes);
        double *h_c = (double *) malloc(bytes);
        double *h_x = (double *) malloc(bytes);  // Right-hand side in, solution out
        if (h_a == NULL || h_b == NULL || h_c == NULL || h_x == NULL) {
            fprintf(stderr, "Out of host memory for system size %d.\n", n);
            exit(EXIT_FAILURE);
        }

        double *d_a, *d_b, *d_c, *d_x;
        check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
        check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
        check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
        check_cuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");

        float totaltime = 0;
        for (int i = 0; i <= num_systems; i++) {
            // Generate the tridiagonal system
            generate_tridiagonal_system(n, h_a, h_b, h_c, h_x);

            check_cuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy a to device");
            check_cuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy b to device");
            check_cuda(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy c to device");
            check_cuda(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice), "copy x to device");

            // Launch kernel with a single block and a single thread
            check_cuda(cudaEventRecord(start), "cudaEventRecord(start)");
            thomas_kernel<<<1, 1>>>(n, d_x, d_a, d_b, d_c);
            check_cuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
            // Checked after the stop record so the query sits outside the timed span.
            check_cuda(cudaGetLastError(), "thomas_kernel launch");

            check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
            float milliseconds = 0;
            check_cuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

            // Skip the warm-up solve.
            if (i != 0) {
                totaltime += milliseconds;
            }

            // Fetch the solution; this also surfaces any fault raised inside the kernel.
            check_cuda(cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost), "copy x to host");
        }

        check_cuda(cudaFree(d_a), "cudaFree(d_a)");
        check_cuda(cudaFree(d_b), "cudaFree(d_b)");
        check_cuda(cudaFree(d_c), "cudaFree(d_c)");
        check_cuda(cudaFree(d_x), "cudaFree(d_x)");
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_x);

        printf("Average runtime: %f\n", totaltime/num_systems);
        fprintf(fp,"%d,%f\n", n,totaltime/num_systems);
    }

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    fclose(fp);

    return 0;
}


System size: 2
Average runtime: 0.007882
System size: 5
Average runtime: 0.009039
System size: 10
Average runtime: 0.011225
System size: 100
Average runtime: 0.045132
System size: 1000
Average runtime: 0.375791
System size: 2000
Average runtime: 0.744818
System size: 5000
Average runtime: 1.899918
System size: 10000
Average runtime: 3.898109



## Accuracy testing

Write the generated systems and the Thomas solver's solutions to a file (`results.txt`) for accuracy testing.

In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>

/*
 * Build a random, strictly diagonally dominant tridiagonal system of size n
 * in the caller's arrays (each must hold at least n doubles).
 *
 * Per row i, in this order of rand() draws:
 *   a[i] = 0 for the first row, otherwise an integer in [1, 100]
 *   c[i] = 0 for the last row, otherwise an integer in [1, 100]
 *   b[i] = a[i] + c[i] + (integer in [0, 99]) + 50
 *   d[i] = integer in [1, 100]   (right-hand side)
 *
 * NOTE(review): thomas_kernel in this cell multiplies a[i] with x[i-1] and
 * c[i] with x[i+1], i.e. a is the sub-diagonal and c the super-diagonal.
 */
__host__
void generate_tridiagonal_system(int n, double a[], double b[], double c[], double d[]) {
    for (int k = 0; k < n; ++k) {
        const bool has_left = (k > 0);
        const bool has_right = (k < n - 1);

        a[k] = 0;
        if (has_left) {
            a[k] = rand() % 100 + 1;
        }

        c[k] = 0;
        if (has_right) {
            c[k] = rand() % 100 + 1;
        }

        // The +50 margin keeps |b| strictly above |a| + |c|.
        b[k] = a[k] + c[k] + rand() % 100 + 50;
        d[k] = rand() % 100 + 1;
    }
}

/*
 * Fill the caller's buffers with a random tridiagonal system and print it.
 *
 * a, b, c and d must each point to at least n doubles owned by the caller.
 * Previously the function reassigned its by-value pointer parameters to new
 * malloc() blocks: the caller's buffers were left untouched and the four new
 * blocks leaked on every call. The commented-out call in this cell's main
 * passes buffers that are already allocated, so they are now filled directly.
 *
 * Printed labels are unchanged from the notebook ("upper" for a, "lower" for c).
 * NOTE(review): thomas_kernel uses a[i] with x[i-1] and c[i] with x[i+1].
 */
__host__
void setup_matrix(int n, double *a, double *b, double *c, double *d) {
    // With n > 0 the buffers are dereferenced, so refuse NULL instead of crashing.
    if (n > 0 && (a == NULL || b == NULL || c == NULL || d == NULL)) {
        fprintf(stderr, "setup_matrix: all four buffers must be allocated by the caller.\n");
        return;
    }

    // Generate the tridiagonal system
    generate_tridiagonal_system(n, a, b, c, d);

    // Output the generated tridiagonal system
    printf("Generated tridiagonal matrix system:\n");
    printf("Upper diagonal a: ");
    for (int i = 1; i < n; i++) {  // a[0] is always 0 and not part of the matrix
        printf("%.2f ", a[i]);
    }
    printf("\nMain diagonal b: ");
    for (int i = 0; i < n; i++) {
        printf("%.2f ", b[i]);
    }
    printf("\nLower diagonal c: ");
    for (int i = 0; i < n - 1; i++) {  // c[n-1] is always 0 and not part of the matrix
        printf("%.2f ", c[i]);
    }
    printf("\nRight-hand side d: ");
    for (int i = 0; i < n; i++) {
        printf("%.2f ", d[i]);
    }
    printf("\n");
}

/*
 * Solve one tridiagonal system in place with the Thomas algorithm.
 *
 * Row i of the system is
 *     a[i] * x[i-1] + b[i] * x[i] + c[i] * x[i+1] = rhs[i]
 * where a[0] and c[n-1] are never read.
 *
 *   n : number of unknowns; n <= 0 is a no-op
 *   x : on entry the right-hand side, on exit the solution (length n)
 *   a : coefficients of x[i-1], read only (length n)
 *   b : main diagonal, OVERWRITTEN with the eliminated pivots (length n)
 *   c : coefficients of x[i+1], read only (length n)
 *
 * All pointers must be device pointers. The recurrence is sequential, so exactly
 * one thread (thread 0 of block 0) does the work whatever the launch shape;
 * every other thread exits immediately. No pivoting is performed, so each pivot
 * b[i] must stay non-zero (guaranteed for diagonally dominant systems).
 */
__global__
void thomas_kernel(int n, double* x, double* a, double* b, double* c) {
    // Checking only threadIdx.x would let thread 0 of every block run the
    // in-place elimination concurrently on the same arrays.
    const bool is_solver_thread =
        blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
        threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
    if (!is_solver_thread) return;

    // An empty system would otherwise touch x[-1] and b[-1].
    if (n <= 0) return;

    // Forward elimination: remove the a[i] entry of row i using row i-1.
    for (int i = 1; i < n; i++) {
        double m = a[i] / b[i - 1];
        b[i] = b[i] - m * c[i - 1];
        x[i] = x[i] - m * x[i - 1];
    }

    // Backward substitution, starting from the last (now single-unknown) row.
    x[n - 1] = x[n - 1] / b[n - 1];
    for (int i = n - 2; i >= 0; i--) {
        x[i] = (x[i] - c[i] * x[i + 1]) / b[i];
    }
}

// Print a message and terminate if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Write one length-n vector as a line: comma-separated on stdout, space-separated in fp.
static void dump_vector(FILE *fp, const double *v, int n) {
    for (int k = 0; k < n; k++) {
        if (k == n - 1) {
            printf("%.0f ", v[k]);
        } else {
            printf("%.0f, ", v[k]);
        }
        fprintf(fp, "%.0f ", v[k]);
    }
    printf("\n");
    fprintf(fp, "\n");
}

/*
 * Accuracy-test driver: generates num_systems + 1 random systems per size,
 * solves each with thomas_kernel, and records inputs and solutions.
 *
 * "results.txt" layout: first line is the number of systems; then, per system,
 * five lines: a (written shifted: a[1..n-1] followed by a literal 0), b, c,
 * the right-hand side, and the computed solution. The same data is echoed to
 * stdout with comma separators.
 *
 * Returns -1 if the file cannot be opened; exits with EXIT_FAILURE on
 * allocation or CUDA errors.
 */
int main() {
    // Open the text file that receives the systems and their solutions
    FILE *fp = fopen("results.txt", "w");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open file for writing.\n");
        return -1;
    }

    int sizes[] = {100};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);  // Number of different system sizes

    int num_systems = 99;  // The loop below runs num_systems + 1 times
    fprintf(fp, "%d\n", num_systems+1);

    srand(time(NULL));  // Seed once for the whole run

    for (int idx = 0; idx < num_sizes; idx++) {
        int n = sizes[idx];  // System size for this iteration
        size_t bytes = (size_t) n * sizeof(double);

        // Buffers are allocated once per size and fully overwritten for each system.
        double *h_a = (double *) malloc(bytes);
        double *h_b = (double *) malloc(bytes);
        double *h_c = (double *) malloc(bytes);
        double *h_x = (double *) malloc(bytes);  // Right-hand side in, solution out
        if (h_a == NULL || h_b == NULL || h_c == NULL || h_x == NULL) {
            fprintf(stderr, "Out of host memory for system size %d.\n", n);
            exit(EXIT_FAILURE);
        }

        double *d_a, *d_b, *d_c, *d_x;
        check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
        check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
        check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
        check_cuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");

        for (int i = 0; i <= num_systems; i++) {
            // Generate the tridiagonal system
            generate_tridiagonal_system(n, h_a, h_b, h_c, h_x);

            // a is echoed in full on stdout, but the file omits a[0] and
            // appends a trailing 0 so the line still has n entries.
            for (int k = 0; k < n; k++) {
                if (k == 0) {
                    printf("%.0f, ", h_a[k]);
                } else if (k == n - 1) {
                    printf("%.0f ", h_a[k]);
                    fprintf(fp, "%.0f 0", h_a[k]);
                } else {
                    printf("%.0f, ", h_a[k]);
                    fprintf(fp, "%.0f ", h_a[k]);
                }
            }
            printf("\n");
            fprintf(fp, "\n");

            dump_vector(fp, h_b, n);
            dump_vector(fp, h_c, n);
            dump_vector(fp, h_x, n);

            check_cuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy a to device");
            check_cuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy b to device");
            check_cuda(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy c to device");
            check_cuda(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice), "copy x to device");

            // Launch kernel with a single block and a single thread. The
            // event timing that used to surround the launch was never
            // reported here and leaked two events per system, so it is gone.
            thomas_kernel<<<1, 1>>>(n, d_x, d_a, d_b, d_c);
            check_cuda(cudaGetLastError(), "thomas_kernel launch");

            // Blocking copy on the default stream: waits for the kernel and
            // reports any fault it raised.
            check_cuda(cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost), "copy x to host");

            printf("Solution vector x:\n");
            for (int k = 0; k < n; k++) {
                printf("%f, ", h_x[k]);
                fprintf(fp, "%f ", h_x[k]);
            }

            printf("\n\n");
            fprintf(fp, "\n");
        }

        check_cuda(cudaFree(d_a), "cudaFree(d_a)");
        check_cuda(cudaFree(d_b), "cudaFree(d_b)");
        check_cuda(cudaFree(d_c), "cudaFree(d_c)");
        check_cuda(cudaFree(d_x), "cudaFree(d_x)");
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_x);
    }

    fclose(fp);
    return 0;
}

0, 27, 1, 36, 56, 19, 61, 91, 88, 18, 91, 90, 13, 77, 81, 11, 59, 11, 28, 71, 91, 2, 40, 17, 74, 84, 80, 91, 38, 81, 21, 33, 40, 68, 59, 74, 37, 43, 7, 83, 90, 81, 57, 40, 66, 78, 94, 51, 21, 83, 62, 68, 65, 61, 49, 32, 27, 51, 88, 38, 25, 3, 79, 4, 41, 17, 61, 72, 38, 73, 51, 45, 91, 66, 53, 31, 30, 88, 100, 90, 54, 86, 34, 7, 99, 48, 62, 93, 27, 28, 89, 9, 57, 23, 86, 68, 92, 8, 34, 73 
150, 162, 209, 171, 165, 118, 226, 333, 160, 160, 235, 266, 204, 252, 201, 64, 194, 124, 138, 156, 244, 185, 108, 171, 193, 275, 286, 227, 190, 162, 121, 119, 252, 191, 228, 192, 226, 260, 121, 251, 331, 223, 156, 213, 208, 235, 200, 226, 215, 232, 171, 264, 284, 157, 185, 203, 175, 250, 226, 176, 254, 217, 253, 113, 249, 211, 245, 127, 271, 192, 207, 153, 241, 160, 208, 136, 148, 195, 255, 245, 249, 237, 174, 182, 284, 268, 219, 296, 165, 144, 185, 204, 278, 145, 267, 214, 195, 151, 137, 200 
91, 61, 98, 79, 23, 6, 65, 97, 5, 42, 11, 91, 72, 65, 63, 3, 70, 18, 29, 19, 38, 78, 15, 5, 67, 100, 73, 1, 3

In [None]:
%%writefile thomas_test.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>


/*
 * Solve one tridiagonal system in place with the Thomas algorithm.
 *
 * Row i of the system is
 *     a[i] * x[i-1] + b[i] * x[i] + c[i] * x[i+1] = rhs[i]
 * where a[0] and c[n-1] are never read.
 *
 *   n : number of unknowns; n <= 0 is a no-op
 *   x : on entry the right-hand side, on exit the solution (length n)
 *   a : coefficients of x[i-1], read only (length n)
 *   b : main diagonal, OVERWRITTEN with the eliminated pivots (length n)
 *   c : coefficients of x[i+1], read only (length n)
 *
 * All pointers must be device pointers. The recurrence is sequential, so exactly
 * one thread (thread 0 of block 0) does the work whatever the launch shape;
 * every other thread exits immediately. No pivoting is performed, so each pivot
 * b[i] must stay non-zero (guaranteed for diagonally dominant systems).
 */
__global__
void thomas_kernel(int n, double* x, double* a, double* b, double* c) {
    // Checking only threadIdx.x would let thread 0 of every block run the
    // in-place elimination concurrently on the same arrays.
    const bool is_solver_thread =
        blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
        threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
    if (!is_solver_thread) return;

    // An empty system would otherwise touch x[-1] and b[-1].
    if (n <= 0) return;

    // Forward elimination: remove the a[i] entry of row i using row i-1.
    for (int i = 1; i < n; i++) {
        double m = a[i] / b[i - 1];
        b[i] = b[i] - m * c[i - 1];
        x[i] = x[i] - m * x[i - 1];
    }

    // Backward substitution, starting from the last (now single-unknown) row.
    x[n - 1] = x[n - 1] / b[n - 1];
    for (int i = n - 2; i >= 0; i--) {
        x[i] = (x[i] - c[i] * x[i + 1]) / b[i];
    }
}


/*
 * Kernel: fill the three bands of a dense, row-major size x size matrix A and
 * the right-hand side d with uniform random floats from curand_uniform.
 *
 * Launch layout: 1-D, at least `size` threads in total. Thread idx writes
 *   A[idx][idx], A[idx][idx+1], A[idx+1][idx]   (the last two only for idx < size-1)
 *   d[idx]
 * Entries of A outside the three bands are NOT written; the caller must zero A
 * beforehand if they are to read as 0.
 *
 *   state : at least `size` curandState entries, already initialised with
 *           curand_init; each thread advances and stores back its own entry.
 */
__global__
void generate_tridiagonal_system(float *A, float *d, int size, curandState *state) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row own neither a row nor an RNG state; returning
    // here keeps the state[] accesses below inside the array.
    if (idx >= size) return;

    curandState localState = state[idx];  // Work on a private copy of this thread's RNG state

    // Main diagonal element
    A[idx * size + idx] = curand_uniform(&localState);

    if (idx < size - 1) {
        // Upper diagonal element of this row
        A[idx * size + idx + 1] = curand_uniform(&localState);
        // Lower diagonal element of the next row
        A[(idx + 1) * size + idx] = curand_uniform(&localState);
    }

    // Right-hand side element
    d[idx] = curand_uniform(&localState);

    // Save the advanced state back to global memory
    state[idx] = localState;
}

/*
 * Kernel: initialise `size` cuRAND states with seed 0, sequence number i and
 * offset 0 for state[i].
 *
 * Works for any 1-D launch shape: the threads split the indices between them
 * with a grid-stride loop, so each state is initialised exactly once. The
 * previous version had every launched thread initialise all `size` states,
 * which repeated the work per thread and made all threads write the same
 * entries concurrently.
 */
__global__
void setup_random_states(curandState *state, int size) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x;
    for (int i = first; i < size; i += step) {
        curand_init(0, i, 0, &state[i]);
    }
}

// Print a message and terminate if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: generate a random size x size tridiagonal matrix and a
 * right-hand side on the device, copy them to the host and print them.
 */
int main() {
    int size = 10;  // Initial size of the tridiagonal system

    // Device arrays
    float *d_A, *d_d;
    curandState *d_states;

    // Allocate memory on the device
    check_cuda(cudaMalloc(&d_A, size * size * sizeof(float)), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_d, size * sizeof(float)), "cudaMalloc(d_d)");
    check_cuda(cudaMalloc(&d_states, size * sizeof(curandState)), "cudaMalloc(d_states)");

    // The generator kernel writes only the three bands of A; cudaMalloc does
    // not clear memory, so zero the matrix to make the other entries 0.
    check_cuda(cudaMemset(d_A, 0, size * size * sizeof(float)), "cudaMemset(d_A)");

    // Initialize random number generator states
    setup_random_states<<<1, size>>>(d_states, size);
    check_cuda(cudaGetLastError(), "setup_random_states launch");

    // Launch kernel to generate tridiagonal system
    generate_tridiagonal_system<<<1, size>>>(d_A, d_d, size, d_states);
    check_cuda(cudaGetLastError(), "generate_tridiagonal_system launch");

    // Copy results back to host (for demonstration purposes)
    float *h_A = (float*)malloc(size * size * sizeof(float));
    float *h_d = (float*)malloc(size * sizeof(float));
    if (h_A == NULL || h_d == NULL) {
        fprintf(stderr, "Out of host memory.\n");
        exit(EXIT_FAILURE);
    }
    // Blocking copies: they wait for both kernels and report any fault raised inside them.
    check_cuda(cudaMemcpy(h_A, d_A, size * size * sizeof(float), cudaMemcpyDeviceToHost), "copy A to host");
    check_cuda(cudaMemcpy(h_d, d_d, size * sizeof(float), cudaMemcpyDeviceToHost), "copy d to host");

    // Output the generated tridiagonal system (for demonstration purposes)
    printf("Tridiagonal Matrix A:\n");
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            printf("%.4f ", h_A[i * size + j]);
        }
        printf("\n");
    }

    printf("\nRight-hand side vector d:\n");
    for (int i = 0; i < size; ++i) {
        printf("%.4f\n", h_d[i]);
    }

    // Free device and host memory
    check_cuda(cudaFree(d_A), "cudaFree(d_A)");
    check_cuda(cudaFree(d_d), "cudaFree(d_d)");
    check_cuda(cudaFree(d_states), "cudaFree(d_states)");
    free(h_A);
    free(h_d);

    return 0;
}



Writing thomas_test.cu


In [None]:
!nvcc -o thomas_test thomas_test.cu -lcurand

In [None]:
!./thomas_test

Tridiagonal Matrix A:
0.7402 0.4385 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
0.5170 0.9210 0.4604 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
0.0000 0.3335 0.0390 0.2502 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
0.0000 0.0000 0.1845 0.9690 0.4947 0.0000 0.0000 0.0000 0.0000 0.0000 
0.0000 0.0000 0.0000 0.6734 0.9251 0.0530 0.0000 0.0000 0.0000 0.0000 
0.0000 0.0000 0.0000 0.0000 0.1631 0.4464 0.3377 0.0000 0.0000 0.0000 
0.0000 0.0000 0.0000 0.0000 0.0000 0.2072 0.6673 0.3968 0.0000 0.0000 
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.1583 0.1099 0.8744 0.0000 
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9808 0.4702 0.4822 
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4279 0.5132 

Right-hand side vector d:
0.1570
0.3725
0.9624
0.5628
0.8897
0.9845
0.5423
0.9669
0.4417
0.0428


In [None]:
%%cuda_group_save -n thomas_test.cu -g default

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

/*
 * Kernel: fill the three bands of a dense, row-major size x size matrix A and
 * the right-hand side d with uniform random floats from curand_uniform.
 *
 * Launch layout: 1-D, at least `size` threads in total. Thread idx writes
 *   A[idx][idx], A[idx][idx+1], A[idx+1][idx]   (the last two only for idx < size-1)
 *   d[idx]
 * Entries of A outside the three bands are NOT written; the caller must zero A
 * beforehand if they are to read as 0.
 *
 *   state : at least `size` curandState entries, already initialised with
 *           curand_init; each thread advances and stores back its own entry.
 */
__global__
void generate_tridiagonal_system(float *A, float *d, int size, curandState *state) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row own neither a row nor an RNG state; returning
    // here keeps the state[] accesses below inside the array.
    if (idx >= size) return;

    curandState localState = state[idx];  // Work on a private copy of this thread's RNG state

    // Main diagonal element
    A[idx * size + idx] = curand_uniform(&localState);

    if (idx < size - 1) {
        // Upper diagonal element of this row
        A[idx * size + idx + 1] = curand_uniform(&localState);
        // Lower diagonal element of the next row
        A[(idx + 1) * size + idx] = curand_uniform(&localState);
    }

    // Right-hand side element
    d[idx] = curand_uniform(&localState);

    // Save the advanced state back to global memory
    state[idx] = localState;
}

/*
 * Kernel: initialise `size` cuRAND states with seed 0, sequence number i and
 * offset 0 for state[i].
 *
 * This must be a __global__ function: curand_init is a device function and the
 * caller in this cell launches setup_random_states with <<<...>>>, neither of
 * which is possible for a __host__ function. The grid-stride loop makes every
 * 1-D launch shape initialise each state exactly once.
 */
__global__
void setup_random_states(curandState *state, int size) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x;
    for (int i = first; i < size; i += step) {
        curand_init(0, i, 0, &state[i]);
    }
}

// Print a message and terminate if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Thomas-algorithm kernel used by main below. This cell launched
 * thomas_kernel without defining it, so the definition lives here; it is
 * `static` so it cannot clash with a definition in another file.
 *
 * Solves, in place, the system whose row i is
 *     a[i] * x[i-1] + b[i] * x[i] + c[i] * x[i+1] = rhs[i]
 * x holds the right-hand side on entry and the solution on exit; b is
 * overwritten. Only thread 0 of block 0 works; n <= 0 is a no-op. No pivoting.
 */
static __global__
void thomas_kernel(int n, double* x, double* a, double* b, double* c) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    if (n <= 0) return;

    // Forward elimination
    for (int i = 1; i < n; i++) {
        double m = a[i] / b[i - 1];
        b[i] = b[i] - m * c[i - 1];
        x[i] = x[i] - m * x[i - 1];
    }

    // Backward substitution
    x[n - 1] = x[n - 1] / b[n - 1];
    for (int i = n - 2; i >= 0; i--) {
        x[i] = (x[i] - c[i] * x[i + 1]) / b[i];
    }
}

/*
 * Demo driver with two parts:
 *   1. generate a random size x size tridiagonal matrix and right-hand side
 *      on the device and print them;
 *   2. solve a fixed 4 x 4 system with thomas_kernel, printing the kernel
 *      time and the solution.
 *
 * main is an ordinary host function: a __global__ function must return void
 * and cannot serve as the program entry point.
 */
int main() {
    int size = 5;  // Initial size of the tridiagonal system

    // Device arrays
    float *d_A, *d_d;
    curandState *d_states;

    // Allocate memory on the device
    check_cuda(cudaMalloc(&d_A, size * size * sizeof(float)), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_d, size * sizeof(float)), "cudaMalloc(d_d)");
    check_cuda(cudaMalloc(&d_states, size * sizeof(curandState)), "cudaMalloc(d_states)");

    // The generator kernel writes only the three bands of A; cudaMalloc does
    // not clear memory, so zero the matrix to make the other entries 0.
    check_cuda(cudaMemset(d_A, 0, size * size * sizeof(float)), "cudaMemset(d_A)");

    // Initialize random number generator states
    setup_random_states<<<1, size>>>(d_states, size);
    check_cuda(cudaGetLastError(), "setup_random_states launch");

    // Launch kernel to generate tridiagonal system
    generate_tridiagonal_system<<<1, size>>>(d_A, d_d, size, d_states);
    check_cuda(cudaGetLastError(), "generate_tridiagonal_system launch");

    // Copy results back to host (for demonstration purposes)
    float *h_A = (float*)malloc(size * size * sizeof(float));
    float *h_d = (float*)malloc(size * sizeof(float));
    if (h_A == NULL || h_d == NULL) {
        fprintf(stderr, "Out of host memory.\n");
        exit(EXIT_FAILURE);
    }
    check_cuda(cudaMemcpy(h_A, d_A, size * size * sizeof(float), cudaMemcpyDeviceToHost), "copy A to host");
    check_cuda(cudaMemcpy(h_d, d_d, size * sizeof(float), cudaMemcpyDeviceToHost), "copy d to host");

    // Output the generated tridiagonal system (for demonstration purposes)
    printf("Tridiagonal Matrix A:\n");
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            printf("%.4f ", h_A[i * size + j]);
        }
        printf("\n");
    }

    printf("\nRight-hand side vector d:\n");
    for (int i = 0; i < size; ++i) {
        printf("%.4f\n", h_d[i]);
    }

    // Free device and host memory of the generator demo
    check_cuda(cudaFree(d_A), "cudaFree(d_A)");
    check_cuda(cudaFree(d_d), "cudaFree(d_d)");
    check_cuda(cudaFree(d_states), "cudaFree(d_states)");
    free(h_A);
    free(h_d);


    // Fixed test system; h_x holds the right-hand side and receives the solution.
    const int n = 4;  // System size
    double h_a[] = {0, -1, -1, -1};
    double h_b[] = {2, 2, 2, 1};
    double h_c[] = {-1, -1, -1, 0};
    double h_x[] = {0, 0, 1, 0};

    double *d_a, *d_b, *d_c, *d_x;
    check_cuda(cudaMalloc(&d_a, n * sizeof(double)), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc(&d_b, n * sizeof(double)), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc(&d_c, n * sizeof(double)), "cudaMalloc(d_c)");
    check_cuda(cudaMalloc(&d_x, n * sizeof(double)), "cudaMalloc(d_x)");

    check_cuda(cudaMemcpy(d_a, h_a, n * sizeof(double), cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(d_b, h_b, n * sizeof(double), cudaMemcpyHostToDevice), "copy b to device");
    check_cuda(cudaMemcpy(d_c, h_c, n * sizeof(double), cudaMemcpyHostToDevice), "copy c to device");
    check_cuda(cudaMemcpy(d_x, h_x, n * sizeof(double), cudaMemcpyHostToDevice), "copy x to device");

    // Setting up timing
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Launch kernel with a single block and a single thread
    check_cuda(cudaEventRecord(start), "cudaEventRecord(start)");
    thomas_kernel<<<1, 1>>>(n, d_x, d_a, d_b, d_c);
    check_cuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check_cuda(cudaGetLastError(), "thomas_kernel launch");

    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    float milliseconds = 0;
    check_cuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    printf("Execution time: %f milliseconds\n", milliseconds);

    check_cuda(cudaMemcpy(h_x, d_x, n * sizeof(double), cudaMemcpyDeviceToHost), "copy x to host");

    printf("Result:\n");
    for (int i = 0; i < n; i++) {
        printf("%f ", h_x[i]);
    }
    printf("\n");

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    check_cuda(cudaFree(d_x), "cudaFree(d_x)");

    return 0;
}

In [None]:
!nvcc -o /content/thomas_test /content/thomas_test.cu -lcurand

[01m[Kcc1plus:[m[K [01;31m[Kfatal error: [m[K/content/thomas_test.cu: No such file or directory
compilation terminated.
