In [None]:
# make sure CUDA is installed
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# make sure you have a GPU runtime (if this fails go to runtime -> change runtime type)
!nvidia-smi

Tue Apr 30 03:15:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# CUDA in Jupyter helpers
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
# to learn about how to do more fancy things with CUDA using this API see:
# https://nvcc4jupyter.readthedocs.io/en/latest/index.html

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpdrd_fbzc".


In [None]:
#
# This is from Louis' PCR_CUDA
#

%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>


// Debug kernel: every thread prints the element of `in` at its global index.
//   nmax - number of valid elements in `in`; threads at or past it print nothing
//   in   - array to print (dereferenced in device code)
__global__ void list_print(int nmax, float * in) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // The launch may contain more threads than elements, so guard the read.
    if (i < nmax) {
        printf("Thread %i shows %f \n", i, in[i]);
    }
}

//PCR
// Parallel cyclic reduction for one tridiagonal system, one thread per row.
//
//   alist, blist, clist - per-row coefficients; at the current stride, alist[i]
//                         couples row i to row i-stride, blist[i] is the diagonal
//                         entry and clist[i] couples row i to row i+stride
//   dlist               - right-hand side
//   xlist               - output, one solution value per row
//   iter_max            - number of reduction passes to run; the final step only
//                         handles rows whose reduced system has one or two
//                         unknowns, so it must be large enough for every row to
//                         stop reducing
//   DMax                - number of rows
//
// alist/blist/clist/dlist are overwritten in place.
//
// Launch: 1-D grid with at least DMax threads. __syncthreads() only orders the
// threads of one block, so rows that exchange data must live in the same block;
// the result is only reliable when all DMax rows fit in a single block.
__global__ void Solve_Kernel(
    float * alist, float * blist, float * clist, float * dlist, float * xlist,
    int iter_max, int DMax) {

    const int idx_row = blockIdx.x*blockDim.x + threadIdx.x;
    const int row_max = DMax - 1;

    // Threads past the last row must not touch memory, but they still have to
    // reach every barrier below, so they are masked instead of returning early.
    const bool active = idx_row < DMax;

    int stride = 1;

    // Updated coefficients of this row, staged in registers until every thread
    // has finished reading the old values.
    float a1 = 0.0f, b1 = 0.0f, c1 = 0.0f, d1 = 0.0f;

    bool reducing = true;   // false once this row's reduced system is small enough
    int accum = 0;          // rows of this row's reduced system found in range

    for (int iter = 0; iter < iter_max; iter++) {

        if ( active && reducing ) {

            const int next_stride = stride<<1;

            // 1    contribution of the row 'stride' above (updates 'a')
            float c01 = 0.0f, d01 = 0.0f;
            a1 = 0.0f;
            if ((idx_row - stride) >= 0) {
                const float k01 = alist[idx_row]/blist[idx_row - stride];
                c01 = clist[idx_row - stride]*k01;
                d01 = dlist[idx_row - stride]*k01;
                // 'a' only survives if the row two strides above exists
                if ((idx_row - next_stride) >= 0) {
                    a1 = -alist[idx_row - stride]*k01;
                }
            }

            // 2    contribution of the row 'stride' below (updates 'c')
            float a21 = 0.0f, d21 = 0.0f;
            c1 = 0.0f;
            if ((idx_row + stride) <= row_max) {
                const float k21 = clist[idx_row]/blist[idx_row + stride];
                a21 = alist[idx_row + stride]*k21;
                d21 = dlist[idx_row + stride]*k21;
                // 'c' only survives if the row two strides below exists
                if ((idx_row + next_stride) <= row_max) {
                    c1 = -clist[idx_row + stride]*k21;
                }
            }

            // 3   for updating 'b'
            b1 = blist[idx_row] - c01 - a21;
            // 4   for updating 'd'
            d1 = dlist[idx_row] - d01 - d21;

            stride = next_stride;

            // Count how many of the five rows idx_row + {-2..2}*stride exist.
            // Fewer than three means this row's reduced system has at most two
            // unknowns, so this row stops reducing for good.
            accum = 0;
            for (int step = -2; step <= 2; step++) {
                const int pos = idx_row + step*stride;
                if (pos >= 0 && pos < DMax) accum++;
            }
            if (accum < 3) {
                reducing = false;
            }

        }

        // Every thread has read the old coefficients before any are replaced.
        __syncthreads();

        if ( active ) {
            alist[idx_row] = a1;
            blist[idx_row] = b1;
            clist[idx_row] = c1;
            dlist[idx_row] = d1;
        }

        // Every new coefficient is in place before the next pass (or the final
        // solve) reads a neighbouring row.
        __syncthreads();

    }

    if ( !active ) return;

    if ( accum==1 ) {
        // Row is fully decoupled.
        xlist[idx_row] = dlist[idx_row] / blist[idx_row];
    } else if ( (idx_row-stride)<0 ) {
        // Upper row of a two-row system: eliminate the partner below.
        int i = idx_row; int k = idx_row+stride;
        float f = clist[i]/blist[k];
        xlist[i] = (dlist[i]-dlist[k]*f)/(blist[i]-alist[k]*f);
    } else {
        // Lower row of a two-row system: eliminate the partner above.
        int i = idx_row - stride; int k = idx_row;
        float f = alist[k]/blist[i];
        xlist[k] = (dlist[k]-dlist[i]*f)/(blist[k]-clist[i]*f);
    }

}

// Host driver for the PCR kernel: uploads a fixed 6-row tridiagonal system,
// runs Solve_Kernel once with 10 reduction passes and prints the solution.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
__host__
int main() {

    // Abort with a readable message as soon as a CUDA runtime call fails;
    // otherwise a failed call silently leaves garbage in the printed result.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    /*
    Alternative 4-row test system:

    int diagonal_size = 4;

    float h_a[] = {0, -1, -1, -1};
    float h_b[] = {2, 2, 2, 1};
    float h_c[] = {-1, -1, -1, 0};
    float h_d[] = {0, 0, 1, 0};
    float h_x[diagonal_size];
    */

    // A compile-time constant so h_x is an ordinary array, not a VLA extension.
    const int diagonal_size = 6;
    const size_t bytes = diagonal_size * sizeof(float);

    // Host arrays
    float h_a[] = {0, 1, 1, 1, 1, 0};
    float h_b[] = {2, 2, 2, 2, 2, 2};
    float h_c[] = {1, 1, 1, 1, 1, 0};
    float h_d[] = {0, 0, 1, 2, 3, 0};
    float h_x[diagonal_size];

    // Device arrays
    float *d_a, *d_b, *d_c, *d_d, *d_x;

    // Allocate memory on the device
    check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
    check(cudaMalloc(&d_d, bytes), "cudaMalloc(d_d)");
    check(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");

    // Copy data from host to device
    check(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    check(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");
    check(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy of c to device");
    check(cudaMemcpy(d_d, h_d, bytes, cudaMemcpyHostToDevice), "copy of d to device");

    // Define kernel launch configuration (ceil-divide rows over blocks)
    int threadsPerBlock = 256;
    int blocks = (diagonal_size + threadsPerBlock - 1) / threadsPerBlock;

    // Setting up timing
    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Launch the kernel
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    Solve_Kernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, d_d, d_x, 10, diagonal_size);
    // A launch does not return a status; a bad configuration shows up here.
    check(cudaGetLastError(), "Solve_Kernel launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");

    // Errors raised while the kernel runs surface at this synchronisation.
    check(cudaEventSynchronize(stop), "Solve_Kernel execution");
    float milliseconds = 0;
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    // printf("Execution time: %f milliseconds\n", milliseconds);

    // Copy results back to host
    check(cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost), "copy of x to host");

    // Output the results
    printf("Solution vector x:\n");
    for (int i = 0; i < diagonal_size; i++) {
        printf("%f ", h_x[i]);
    }

    printf("\n");

    // Free device memory
    check(cudaFree(d_a), "cudaFree(d_a)");
    check(cudaFree(d_b), "cudaFree(d_b)");
    check(cudaFree(d_c), "cudaFree(d_c)");
    check(cudaFree(d_d), "cudaFree(d_d)");
    check(cudaFree(d_x), "cudaFree(d_x)");

    // Destroy the events
    check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    return 0;
}

Solution vector x:
0.333333 -0.666667 1.000000 -0.333333 1.666667 0.000000 



In [None]:
#
# The PCR-Thomas Algorithm
#

%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Thomas-style forward elimination and back substitution, with the rows dealt
// out to the threads of the calling block (thread t handles rows t,
// t + blockDim.x, ...).
//
//   a, b, c - sub-, main and super-diagonal coefficients (a[i] couples row i to
//             row i-1, c[i] couples it to row i+1); c is overwritten
//   d       - right-hand side; overwritten
//   x       - output solution
//   size    - number of rows
//
// Must be called by every thread of the block because it contains a barrier.
//
// NOTE(review): nothing orders row i against rows i-1 / i+1 inside either loop,
// so the recurrences are only well-defined when the coupling terms are zero
// (a[i] == 0 and c[i] == 0) - presumably the state the caller's reduction
// leaves behind; confirm before using this on a system that is still coupled.
__device__ void thomas(float *a, float *b, float *c, float *d, float *x, int size) {
    // Forward elimination
    for (int i = threadIdx.x; i < size; i += blockDim.x) {
        if (i == 0) {
            c[i] = c[i] / b[i];
            d[i] = d[i] / b[i];
        } else {
            float tmp = b[i] - a[i] * c[i - 1];
            c[i] = c[i] / tmp;
            d[i] = (d[i] - d[i - 1] * a[i]) / tmp;
        }
    }
    __syncthreads();  // Synchronize all threads before starting backward substitution

    // Backward substitution
    for (int i = size - 1 - threadIdx.x; i >= 0; i -= blockDim.x) {
        // With no coupling to the next row, skip the read of x[i + 1]: that
        // entry may not have been written yet, and 0 * NaN would turn an
        // otherwise exact result into NaN.
        if (i == size - 1 || c[i] == 0.0f) {
            x[i] = d[i];
        } else {
            x[i] = d[i] - c[i] * x[i + 1];
        }
    }
}


// PCR-Thomas
// Reduces a tridiagonal system with parallel cyclic reduction (one thread per
// row, stride doubling each pass until it reaches DMax) and then hands the
// reduced system to thomas() to produce the solution.
//
//   alist, blist, clist - sub-, main and super-diagonal coefficients
//   dlist               - right-hand side
//   xlist               - output solution
//   iter_max            - unused; kept so existing launches keep compiling
//   DMax                - number of rows
//
// alist/blist/clist/dlist are overwritten in place.
//
// Launch: 1-D grid with at least DMax threads. __syncthreads() only orders the
// threads of one block, and thomas() walks the rows by threadIdx.x, so the
// result is only reliable when all DMax rows fit in a single block.
__global__ void Solve_Kernel(
    float *alist, float *blist, float *clist, float *dlist, float *xlist,
    int iter_max, int DMax) {

    (void)iter_max;  // the number of passes is derived from DMax instead

    const int idx_row = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row must not touch memory, but a barrier has to be
    // reached by every thread of the block, so they are masked rather than
    // returned early.
    const bool active = idx_row < DMax;

    // New coefficients of this row, staged in registers until all threads have
    // finished reading the old ones.
    float a1 = 0.0f, b1 = 0.0f, c1 = 0.0f, d1 = 0.0f;

    //PCR to deduct
    // The trip count depends only on DMax, so all threads run the same number
    // of passes and meet at the same barriers.
    for (int stride = 1; stride < DMax; stride *= 2) {
        if (active) {
            const int left_idx = idx_row - stride;
            const int right_idx = idx_row + stride;

            float c01 = 0.0f, d01 = 0.0f;   // terms eliminated via the row above
            float a21 = 0.0f, d21 = 0.0f;   // terms eliminated via the row below
            a1 = 0.0f;
            c1 = 0.0f;

            if (left_idx >= 0) {
                const float k01 = alist[idx_row] / blist[left_idx];
                a1 = -alist[left_idx] * k01;
                c01 = clist[left_idx] * k01;
                d01 = dlist[left_idx] * k01;
            }

            if (right_idx < DMax) {
                const float k21 = clist[idx_row] / blist[right_idx];
                c1 = -clist[right_idx] * k21;
                a21 = alist[right_idx] * k21;
                d21 = dlist[right_idx] * k21;
            }

            b1 = blist[idx_row] - c01 - a21;
            d1 = dlist[idx_row] - d01 - d21;
        }

        __syncthreads();  // Ensure all calculations complete before updating

        if (active) {
            alist[idx_row] = a1;
            blist[idx_row] = b1;
            clist[idx_row] = c1;
            dlist[idx_row] = d1;
        }

        __syncthreads();  // Ensure all updates are completed before next iteration
    }

    // Once reduced, apply the Thomas solver. Called by every thread of the
    // block because thomas() contains a barrier; threads whose threadIdx.x is
    // not below DMax find no row to process inside it.
    thomas(alist, blist, clist, dlist, xlist, DMax);
}

// Fills a, b, c and d with a random n-row tridiagonal system.
//   a - off-diagonal that is zero in the first row; entries are 1..100 elsewhere
//   c - off-diagonal that is zero in the last row; entries are 1..100 elsewhere
//   b - main diagonal, a[i] + c[i] plus a random 50..149, so every row is
//       strictly diagonally dominant
//   d - right-hand side, entries 1..100
// Uses rand(); seeding is left to the caller.
void generate_tridiagonal_system(int n, float a[], float b[], float c[], float d[]) {
    const int last_row = n - 1;

    int row = 0;
    while (row < n) {
        // rand() is consumed in the order a, c, b, d for every row.
        float below = 0;
        if (row > 0) {
            below = rand() % 100 + 1;
        }

        float above = 0;
        if (row < last_row) {
            above = rand() % 100 + 1;
        }

        a[row] = below;
        c[row] = above;
        b[row] = below + above + rand() % 100 + 50;
        d[row] = rand() % 100 + 1;

        ++row;
    }
}

// Benchmark driver: for each system size, solves num_systems + 1 random
// tridiagonal systems with Solve_Kernel, times each launch with CUDA events and
// writes the mean kernel time per size to stdout and to execution_times.csv.
// The first run of every size is excluded from the mean (presumably a warm-up).
// Returns 0 on success, -1 if the CSV cannot be opened; exits with EXIT_FAILURE
// if an allocation or CUDA call fails.
__host__
int main() {
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    // Host allocation that fails loudly instead of handing back NULL.
    auto host_alloc = [](size_t bytes) {
        float *buffer = (float *)malloc(bytes);
        if (buffer == NULL) {
            fprintf(stderr, "Failed to allocate %zu bytes of host memory.\n", bytes);
            exit(EXIT_FAILURE);
        }
        return buffer;
    };

    // Array of system sizes to test
    int sizes[] = {2, 5, 10, 100, 1000, 2000, 5000, 10000};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);  // Number of different system sizes

    int num_systems = 1000;

    // Open a CSV file to save the execution times
    FILE *fp = fopen("execution_times.csv", "w");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open file for writing.\n");
        return -1;
    }
    fprintf(fp, "System Size,Execution Time (ms)\n"); // Write the CSV header

    const int threadsPerBlock = 256;

    // The timing events are reused for every launch.
    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    for (int idx = 0; idx < num_sizes; idx++) {
        int n = sizes[idx];  // System size for this iteration
        const size_t bytes = n * sizeof(float);
        printf("System size: %d\n", n);
        srand(time(NULL));
        float totaltime = 0;

        // Buffers depend only on n, so allocate them once per size instead of
        // once per system; every run below overwrites their contents.
        float *h_a = host_alloc(bytes);
        float *h_b = host_alloc(bytes);
        float *h_c = host_alloc(bytes);
        float *h_d = host_alloc(bytes);
        float *h_x = host_alloc(bytes);

        float *d_a, *d_b, *d_c, *d_d, *d_x;
        check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
        check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
        check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
        check(cudaMalloc(&d_d, bytes), "cudaMalloc(d_d)");
        check(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");

        // Kernel launch configuration (ceil-divide rows over blocks)
        int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

        // Run 0 is executed but not counted, so num_systems runs are averaged.
        for(int i = 0; i <= num_systems; i++) {
            // Generate the tridiagonal system
            generate_tridiagonal_system(n, h_a, h_b, h_c, h_d);

            // Copy data to the device (the kernel overwrites all four arrays)
            check(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
            check(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");
            check(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy of c to device");
            check(cudaMemcpy(d_d, h_d, bytes, cudaMemcpyHostToDevice), "copy of d to device");

            // Launch the kernel and record execution time
            check(cudaEventRecord(start), "cudaEventRecord(start)");
            Solve_Kernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, d_d, d_x, 20, n);
            // A launch does not return a status; a bad configuration shows up here.
            check(cudaGetLastError(), "Solve_Kernel launch");
            check(cudaEventRecord(stop), "cudaEventRecord(stop)");

            // Wait for GPU to finish before accessing on host
            check(cudaEventSynchronize(stop), "Solve_Kernel execution");
            float milliseconds = 0;
            check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
            // printf("Execution time: %f milliseconds\n", milliseconds);
            if(i != 0) {
                totaltime += milliseconds;
            }

            // Copy result back to host
            check(cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost), "copy of x to host");
        }

        // Free device memory
        check(cudaFree(d_a), "cudaFree(d_a)");
        check(cudaFree(d_b), "cudaFree(d_b)");
        check(cudaFree(d_c), "cudaFree(d_c)");
        check(cudaFree(d_d), "cudaFree(d_d)");
        check(cudaFree(d_x), "cudaFree(d_x)");

        // Free host memory
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_d);
        free(h_x);

        printf("Average runtime: %f\n", totaltime/num_systems);
        fprintf(fp,"%d,%f\n", n,totaltime/num_systems);
    }

    // Destroy CUDA events
    check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Close the file
    fclose(fp);

    return 0;
}


System size: 2
Average runtime: 0.013595
System size: 5
Average runtime: 0.016972
System size: 10
Average runtime: 0.017514
System size: 100
Average runtime: 0.020763
System size: 1000
Average runtime: 0.037469
System size: 2000
Average runtime: 0.054261
System size: 5000
Average runtime: 0.075767
System size: 10000
Average runtime: 0.135672



## Accuracy Testing

Write each generated system and its computed solution to `results.txt` so the solver's accuracy can be checked separately.

In [None]:
#
# The PCR-Thomas Algorithm
#

%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Thomas-style forward elimination and back substitution, with the rows dealt
// out to the threads of the calling block (thread t handles rows t,
// t + blockDim.x, ...).
//
//   a, b, c - sub-, main and super-diagonal coefficients (a[i] couples row i to
//             row i-1, c[i] couples it to row i+1); c is overwritten
//   d       - right-hand side; overwritten
//   x       - output solution
//   size    - number of rows
//
// Must be called by every thread of the block because it contains a barrier.
//
// NOTE(review): nothing orders row i against rows i-1 / i+1 inside either loop,
// so the recurrences are only well-defined when the coupling terms are zero
// (a[i] == 0 and c[i] == 0) - presumably the state the caller's reduction
// leaves behind; confirm before using this on a system that is still coupled.
__device__ void thomas(float *a, float *b, float *c, float *d, float *x, int size) {
    // Forward elimination
    for (int i = threadIdx.x; i < size; i += blockDim.x) {
        if (i == 0) {
            c[i] = c[i] / b[i];
            d[i] = d[i] / b[i];
        } else {
            float tmp = b[i] - a[i] * c[i - 1];
            c[i] = c[i] / tmp;
            d[i] = (d[i] - d[i - 1] * a[i]) / tmp;
        }
    }
    __syncthreads();  // Synchronize all threads before starting backward substitution

    // Backward substitution
    for (int i = size - 1 - threadIdx.x; i >= 0; i -= blockDim.x) {
        // With no coupling to the next row, skip the read of x[i + 1]: that
        // entry may not have been written yet, and 0 * NaN would turn an
        // otherwise exact result into NaN.
        if (i == size - 1 || c[i] == 0.0f) {
            x[i] = d[i];
        } else {
            x[i] = d[i] - c[i] * x[i + 1];
        }
    }
}


// PCR-Thomas
// Reduces a tridiagonal system with parallel cyclic reduction (one thread per
// row, stride doubling each pass until it reaches DMax) and then hands the
// reduced system to thomas() to produce the solution.
//
//   alist, blist, clist - sub-, main and super-diagonal coefficients
//   dlist               - right-hand side
//   xlist               - output solution
//   iter_max            - unused; kept so existing launches keep compiling
//   DMax                - number of rows
//
// alist/blist/clist/dlist are overwritten in place.
//
// Launch: 1-D grid with at least DMax threads. __syncthreads() only orders the
// threads of one block, and thomas() walks the rows by threadIdx.x, so the
// result is only reliable when all DMax rows fit in a single block.
__global__ void Solve_Kernel(
    float *alist, float *blist, float *clist, float *dlist, float *xlist,
    int iter_max, int DMax) {

    (void)iter_max;  // the number of passes is derived from DMax instead

    const int idx_row = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row must not touch memory, but a barrier has to be
    // reached by every thread of the block, so they are masked rather than
    // returned early.
    const bool active = idx_row < DMax;

    // New coefficients of this row, staged in registers until all threads have
    // finished reading the old ones.
    float a1 = 0.0f, b1 = 0.0f, c1 = 0.0f, d1 = 0.0f;

    //PCR to deduct
    // The trip count depends only on DMax, so all threads run the same number
    // of passes and meet at the same barriers.
    for (int stride = 1; stride < DMax; stride *= 2) {
        if (active) {
            const int left_idx = idx_row - stride;
            const int right_idx = idx_row + stride;

            float c01 = 0.0f, d01 = 0.0f;   // terms eliminated via the row above
            float a21 = 0.0f, d21 = 0.0f;   // terms eliminated via the row below
            a1 = 0.0f;
            c1 = 0.0f;

            if (left_idx >= 0) {
                const float k01 = alist[idx_row] / blist[left_idx];
                a1 = -alist[left_idx] * k01;
                c01 = clist[left_idx] * k01;
                d01 = dlist[left_idx] * k01;
            }

            if (right_idx < DMax) {
                const float k21 = clist[idx_row] / blist[right_idx];
                c1 = -clist[right_idx] * k21;
                a21 = alist[right_idx] * k21;
                d21 = dlist[right_idx] * k21;
            }

            b1 = blist[idx_row] - c01 - a21;
            d1 = dlist[idx_row] - d01 - d21;
        }

        __syncthreads();  // Ensure all calculations complete before updating

        if (active) {
            alist[idx_row] = a1;
            blist[idx_row] = b1;
            clist[idx_row] = c1;
            dlist[idx_row] = d1;
        }

        __syncthreads();  // Ensure all updates are completed before next iteration
    }

    // Once reduced, apply the Thomas solver. Called by every thread of the
    // block because thomas() contains a barrier; threads whose threadIdx.x is
    // not below DMax find no row to process inside it.
    thomas(alist, blist, clist, dlist, xlist, DMax);
}

// Fills a, b, c and d with a random n-row tridiagonal system.
//   a - off-diagonal that is zero in the first row; entries are 1..100 elsewhere
//   c - off-diagonal that is zero in the last row; entries are 1..100 elsewhere
//   b - main diagonal, a[i] + c[i] plus a random 50..149, so every row is
//       strictly diagonally dominant
//   d - right-hand side, entries 1..100
// Uses rand(); seeding is left to the caller.
void generate_tridiagonal_system(int n, float a[], float b[], float c[], float d[]) {
    const int last_row = n - 1;

    int row = 0;
    while (row < n) {
        // rand() is consumed in the order a, c, b, d for every row.
        float below = 0;
        if (row > 0) {
            below = rand() % 100 + 1;
        }

        float above = 0;
        if (row < last_row) {
            above = rand() % 100 + 1;
        }

        a[row] = below;
        c[row] = above;
        b[row] = below + above + rand() % 100 + 50;
        d[row] = rand() % 100 + 1;

        ++row;
    }
}

// Accuracy-dump driver: generates num_systems + 1 random tridiagonal systems of
// each listed size, solves them with Solve_Kernel, and writes every system and
// its computed solution to stdout and to results.txt.
//
// results.txt layout: the first line is the number of systems; then, per
// system, one line each for a, b, c, d and x. The a line omits a[0] and ends
// with a literal 0, i.e. it is written shifted by one entry (presumably the
// layout the external checker expects - confirm against the reader).
//
// Returns 0 on success, -1 if results.txt cannot be opened; exits with
// EXIT_FAILURE if an allocation or CUDA call fails.
__host__
int main() {
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    // Host allocation that fails loudly instead of handing back NULL.
    auto host_alloc = [](size_t bytes) {
        float *buffer = (float *)malloc(bytes);
        if (buffer == NULL) {
            fprintf(stderr, "Failed to allocate %zu bytes of host memory.\n", bytes);
            exit(EXIT_FAILURE);
        }
        return buffer;
    };

    // Open the text file that receives the systems and their solutions
    FILE *fp = fopen("results.txt", "w");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open file for writing.\n");
        return -1;
    }

    // Prints one coefficient row: comma-separated on stdout (no comma after the
    // last entry), space-separated in the file, each followed by a newline.
    auto dump_row = [fp](const float *row, int count) {
        for (int k = 0; k < count; k++) {
            if (k == count - 1) {
                printf("%.0f ", row[k]);
            } else {
                printf("%.0f, ", row[k]);
            }
            fprintf(fp, "%.0f ", row[k]);
        }
        printf("\n");
        fprintf(fp, "\n");
    };

    // Array of system sizes to test
    int sizes[] = {100};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);  // Number of different system sizes

    int num_systems = 99;
    fprintf(fp, "%d\n", num_systems+1);

    const int threadsPerBlock = 256;

    for (int idx = 0; idx < num_sizes; idx++) {
        int n = sizes[idx];  // System size for this iteration
        const size_t bytes = n * sizeof(float);
        srand(time(NULL));

        // Buffers depend only on n, so allocate them once per size instead of
        // once per system; every run below overwrites their contents.
        float *h_a = host_alloc(bytes);
        float *h_b = host_alloc(bytes);
        float *h_c = host_alloc(bytes);
        float *h_d = host_alloc(bytes);
        float *h_x = host_alloc(bytes);

        float *d_a, *d_b, *d_c, *d_d, *d_x;
        check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
        check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
        check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
        check(cudaMalloc(&d_d, bytes), "cudaMalloc(d_d)");
        check(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");

        // Kernel launch configuration (ceil-divide rows over blocks)
        int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

        for(int sys = 0; sys <= num_systems; sys++) {
            // Generate the tridiagonal system
            generate_tridiagonal_system(n, h_a, h_b, h_c, h_d);

            // a row: stdout gets every entry; the file skips a[0] and appends
            // a trailing 0 after a[n-1].
            for (int k = 0; k < n; k++) {
                if(k == 0) {
                    printf("%.0f, ", h_a[k]);
                }
                else if(k == n-1) {
                    printf("%.0f ", h_a[k]);
                    fprintf(fp, "%.0f 0", h_a[k]);
                } else {
                    printf("%.0f, ", h_a[k]);
                    fprintf(fp, "%.0f ", h_a[k]);
                }
            }
            printf("\n");
            fprintf(fp, "\n");

            dump_row(h_b, n);
            dump_row(h_c, n);
            dump_row(h_d, n);

            // Copy data to the device (the kernel overwrites all four arrays)
            check(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
            check(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");
            check(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy of c to device");
            check(cudaMemcpy(d_d, h_d, bytes, cudaMemcpyHostToDevice), "copy of d to device");

            // Launch the kernel
            Solve_Kernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, d_d, d_x, 20, n);
            // A launch does not return a status; a bad configuration shows up here.
            check(cudaGetLastError(), "Solve_Kernel launch");

            // Copy result back to host. This blocking copy waits for the
            // kernel and reports any error raised while it ran.
            check(cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost), "copy of x to host");

            printf("Solution vector x:\n");
            for (int k = 0; k < n; k++) {
                printf("%f, ", h_x[k]);
                fprintf(fp, "%f ", h_x[k]);
            }

            printf("\n\n");
            fprintf(fp, "\n");
        }

        // Free device memory
        check(cudaFree(d_a), "cudaFree(d_a)");
        check(cudaFree(d_b), "cudaFree(d_b)");
        check(cudaFree(d_c), "cudaFree(d_c)");
        check(cudaFree(d_d), "cudaFree(d_d)");
        check(cudaFree(d_x), "cudaFree(d_x)");

        // Free host memory
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_d);
        free(h_x);
    }

    // Close the file
    fclose(fp);

    return 0;
}


0, 1, 23, 92, 29, 60, 25, 13, 18, 65, 59, 14, 68, 82, 58, 9, 54, 84, 22, 67, 50, 42, 28, 49, 93, 13, 6, 21, 45, 17, 72, 46, 4, 78, 14, 27, 39, 50, 7, 26, 65, 3, 24, 53, 70, 27, 31, 5, 93, 83, 25, 83, 63, 19, 6, 97, 26, 33, 75, 60, 80, 5, 4, 48, 53, 34, 83, 88, 13, 17, 23, 17, 45, 89, 31, 34, 8, 26, 59, 37, 1, 3, 9, 56, 3, 12, 60, 18, 96, 44, 48, 86, 53, 50, 98, 25, 32, 71, 98, 76 
200, 155, 167, 208, 106, 228, 128, 163, 202, 146, 144, 253, 180, 216, 182, 138, 258, 194, 197, 296, 203, 225, 230, 146, 173, 115, 99, 81, 167, 194, 261, 214, 167, 221, 196, 150, 155, 294, 179, 187, 222, 179, 162, 187, 298, 169, 257, 173, 187, 237, 205, 185, 258, 114, 127, 271, 208, 104, 287, 227, 228, 243, 184, 248, 217, 231, 231, 229, 108, 217, 154, 127, 234, 180, 101, 202, 131, 172, 200, 152, 115, 65, 190, 243, 154, 179, 305, 138, 198, 201, 286, 248, 183, 283, 189, 201, 202, 208, 253, 219 
88, 77, 35, 45, 26, 79, 8, 79, 37, 25, 18, 91, 52, 51, 31, 18, 60, 53, 77, 99, 92, 97, 80, 34, 8, 30, 23, 2, 21, 40, 60