In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

In [None]:
#@title deviceQuery
# Build and run the deviceQuery sample from the CUDA samples directory.
# NOTE(review): the path is hard-coded to a CUDA 10.1 install; it presumably has
# to be adjusted when the runtime ships a different toolkit version — confirm.
%cd /usr/local/cuda-10.1/samples/1_Utilities/deviceQuery/
# Compile the sample with its own Makefile.
!make
# Run the freshly built binary.
!./deviceQuery

In [None]:
%%cu
/****** calculate pi *******/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#define BLOCKSPERGRID 8
#define NUMTHREADS 1024
#define ITERATIONS 2e09

/*****************************************************************************
 * kernel
 *****************************************************************************/


/*
 * Number of thread blocks that have already published their partial sums
 * during the current calculatePi launch. The block that draws the last ticket
 * performs the final reduction and resets the counter to 0 so the kernel can
 * be launched again.
 */
__device__ unsigned int calculatePiBlocksDone = 0;

/*
 * Approximates pi with the Leibniz series
 *     pi = 4 * sum_{k >= 0} (-1)^k / (2k + 1)
 * using the first `iterations` terms, split across `totalThreads` threads.
 *
 * Parameters:
 *   piTotal      - device array with at least `totalThreads` doubles. On exit
 *                  piTotal[i] (i > 0) holds the partial sum of thread i and
 *                  piTotal[0] holds the sum of all partials, i.e. the result.
 *   iterations   - total number of series terms to evaluate.
 *   totalThreads - number of threads that share the work.
 *
 * Launch requirements:
 *   - 1D grid of 1D blocks (only the .x components are used).
 *   - At least `totalThreads` threads must be launched; surplus threads do no
 *     work. Slots of threads that were never launched are read as-is by the
 *     final reduction.
 *   - No shared memory is used.
 *
 * Work split: thread t handles the contiguous term range
 * [t * (iterations / totalThreads), (t + 1) * (iterations / totalThreads));
 * the last thread additionally takes the remainder so no term is dropped.
 * The sign of every term is derived from the parity of its index, so a range
 * that starts on an odd index is still summed with the correct signs.
 *
 * Final reduction: __syncthreads() only synchronizes one block, so it cannot
 * be used to wait for the partials of other blocks. Instead every block
 * publishes its partials, fences, and takes a ticket from a global counter;
 * the block that observes the last ticket knows all partials are visible and
 * adds them up.
 */
__global__ void calculatePi(double *piTotal, long int iterations, int totalThreads)
{
    // 64-bit arithmetic keeps 2*k+1 from overflowing even where long is 32 bits.
    const long long index = (long long)blockDim.x * blockIdx.x + threadIdx.x;

    if (totalThreads > 0 && index < totalThreads) {
        const long long termCount = iterations;
        const long long perThread = termCount / totalThreads;
        const long long first     = perThread * index;
        // Half-open range; the last thread also covers termCount % totalThreads.
        const long long last      = (index == totalThreads - 1) ? termCount
                                                                : first + perThread;

        // Even term indices are added, odd ones subtracted.
        double signedFour = (first & 1) ? -4.0 : 4.0;
        double piPartial  = 0.0;

        for (long long k = first; k < last; ++k) {
            piPartial += signedFour / (double)(2 * k + 1);
            signedFour = -signedFour;
        }

        piTotal[index] = piPartial;

        // Make this thread's partial visible device-wide before the block
        // announces completion.
        __threadfence();
    }

    // Reached by every thread of the block: all partials of this block are
    // written (and fenced) once the barrier is passed.
    __syncthreads();

    if (threadIdx.x == 0) {
        // atomicInc returns the previous value; with limit gridDim.x the
        // tickets handed out are 0 .. gridDim.x - 1.
        const unsigned int ticket = atomicInc(&calculatePiBlocksDone, gridDim.x);

        if (ticket == gridDim.x - 1) {
            // Last block to finish: every other block has already published.
            __threadfence();

            if (totalThreads > 0) {
                // volatile forces the loads to go to memory instead of
                // reusing a possibly stale cached value.
                volatile double *partials = piTotal;
                double sum = partials[0];

                for (int t = 1; t < totalThreads; t++)
                    sum += partials[t];

                piTotal[0] = sum;
            }

            // Re-arm the counter for the next launch.
            calculatePiBlocksDone = 0;
        }
    }
}


/******************************************************************************/


/*
 * Terminates the program when a CUDA runtime call failed.
 * Prints "<what> (error code <reason>)!" to stderr and exits with
 * EXIT_FAILURE; does nothing when err is cudaSuccess.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: allocates one double per thread, launches calculatePi with
 * BLOCKSPERGRID blocks of NUMTHREADS / BLOCKSPERGRID threads, times the kernel
 * with both a host wall clock and CUDA events, and prints the value the kernel
 * leaves in element 0 of the result array.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * runtime error.
 */
int main()
{
    const int blocksPerGrid   = BLOCKSPERGRID;
    const int threadsPerBlock = NUMTHREADS / blocksPerGrid;
    const int totalThreads    = blocksPerGrid * threadsPerBlock;
    const long int iterations = (long int)ITERATIONS;
    const size_t size         = sizeof(double) * NUMTHREADS;

    double *h_pitotal = NULL;
    double *d_pitotal = NULL;
    int i;

    struct timeval tval_before, tval_after, tval_result;
    cudaEvent_t start, stop;
    float milliseconds = 0;

    // Host buffer for the per-thread partial sums, zero-initialized.
    h_pitotal = (double *)malloc(size);
    if (h_pitotal == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < NUMTHREADS; i++)
        h_pitotal[i] = 0.0;

    // Device buffer, initialized from the zeroed host buffer.
    checkCuda(cudaMalloc((void **)&d_pitotal, size),
              "Failed to allocate device vector C");
    checkCuda(cudaMemcpy(d_pitotal, h_pitotal, size, cudaMemcpyHostToDevice),
              "Failed to copy vector C from host to device");

    checkCuda(cudaEventCreate(&start), "Failed to create start event");
    checkCuda(cudaEventCreate(&stop), "Failed to create stop event");

    // Printed before the timers start so console I/O is not part of the measurement.
    printf("CUDA kernel launch with %d blocks of %d threads Total: %i       ", blocksPerGrid, threadsPerBlock, totalThreads  );

    gettimeofday(&tval_before, NULL);
    checkCuda(cudaEventRecord(start), "Failed to record start event");

    // Launch the kernel
    calculatePi<<<blocksPerGrid, threadsPerBlock>>>(d_pitotal, iterations, totalThreads);
    checkCuda(cudaGetLastError(), "Failed to launch calculatePi kernel");

    checkCuda(cudaEventRecord(stop), "Failed to record stop event");

    // Kernel launches are asynchronous: wait for the stop event (and thus the
    // kernel) before reading the host clock, otherwise only the launch
    // overhead would be measured.
    checkCuda(cudaEventSynchronize(stop), "Failed to synchronize on stop event");
    gettimeofday(&tval_after, NULL);

    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop),
              "Failed to read elapsed event time");

    checkCuda(cudaMemcpy(h_pitotal, d_pitotal, size, cudaMemcpyDeviceToHost),
              "Failed to copy vector C from device to host");

    checkCuda(cudaFree(d_pitotal), "Failed to free device vector C");
    checkCuda(cudaEventDestroy(start), "Failed to destroy start event");
    checkCuda(cudaEventDestroy(stop), "Failed to destroy stop event");

    timersub(&tval_after, &tval_before, &tval_result);

    printf("\nTime elapsed: %ld.%06ld seconds", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);
    printf("\nCuda time: %.16f seconds", milliseconds / 1000.0);
    // Element 0 holds the total accumulated by the kernel.
    printf("\nPi: %.12f", *h_pitotal);

    // Free host memory
    free(h_pitotal);

    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}