forked from cangumeli/mvapich-tests
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simpleCUDAMPI.cu
123 lines (105 loc) · 3.33 KB
/
simpleCUDAMPI.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* Simple example demonstrating how to use MPI with CUDA
 *
 * Initialize an array with a constant value on one node.
 * Dispatch it to all nodes.
 * Add a scalar to the values on each node's GPU.
 * Combine the results using MPI.
 *
 * simpleCUDAMPI.cu: GPU part, compiled with nvcc
 */
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "simpleMPI.h"
// Error handling macro: run a CUDA runtime call, report and abort on failure.
// Fixes two defects in the original:
//  - the body is now wrapped in do { } while (0) so the macro is safe inside
//    an unbraced if/else;
//  - the call's own return value is captured and reported, instead of
//    cudaGetLastError(), which can return an unrelated earlier sticky error.
#define CUDA_CHECK(call) \
do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr,"CUDA error calling \""#call"\", code is %d\n",err); \
        my_abort(err); \
    } \
} while (0)
// Device code
// Very simple GPU kernel that adds the scalar `add` to every element of
// `buffer` (despite the file header, no square root is computed here).
// NOTE(review): there is no bounds check, so the launch configuration must
// cover exactly the buffer length: gridDim.x * blockDim.x == element count.
__global__ void simpleMPIKernel(float * buffer, float add) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
buffer[tid] += add;
}
// Print name, memory clock, bus width and derived peak bandwidth for every
// CUDA device visible to this process.
// Fixes: removed a stray unterminated debug printf ("here...") and added
// checks on the device-query return codes (previously ignored).
void listDevices() {
    int nDevices = 0;
    cudaError_t err = cudaGetDeviceCount(&nDevices);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return;
    }
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
            continue;  // skip devices we cannot query
        printf("Device Number: %d\n", i);
        printf(" Device name: %s\n", prop.name);
        printf(" Memory Clock Rate (KHz): %d\n",
        prop.memoryClockRate);
        printf(" Memory Bus Width (bits): %d\n",
        prop.memoryBusWidth);
        // peak BW = 2 (DDR) * clock (kHz) * bus width (bytes) / 1e6 -> GB/s
        printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
        2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}
// Fill every element of `data` with the constant `scalar`.
// (The original comment claimed random data between 0 and 1; no randomness
// is involved -- this is a plain constant fill.)
void initData(float * data, int dataSize, float scalar) {
    int idx = 0;
    while (idx < dataSize) {
        data[idx] = scalar;
        ++idx;
    }
}
// Grid-stride fill helper for initDataGPU: writes `value` into data[0..n),
// correct for any grid size and any n (bounds-checked by the loop).
static __global__ void fillValueKernel(float *data, int n, float value)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        data[i] = value;
}
// Set every float of a device buffer to initScalar.
// Bug fix: the original used cudaMemset, which writes a repeated BYTE
// pattern -- that only yields a correct float value for 0.0f. Any non-zero
// initScalar produced garbage. A fill kernel writes the actual float value.
void initDataGPU(float *data, int dataSize, float initScalar)
{
    if (dataSize <= 0) return;  // nothing to do; avoids a 0-block launch
    const int threads = 256;
    const int blocks = (dataSize + threads - 1) / threads;  // ceil-div
    fillValueKernel<<<blocks, threads>>>(data, dataSize, initScalar);
}
// Bind the calling MPI rank to a CUDA device.
// Improvement: selects rank % deviceCount so ranks beyond the local device
// count wrap around instead of failing with an invalid-device error;
// behavior is unchanged whenever rank < deviceCount.
// (Dead commented-out allocation experiment removed.)
void setDevice(int rank) {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess || deviceCount <= 0) {
        printf("Error %s \n@ rank %d\n", cudaGetErrorString(err), rank);
        return;
    }
    err = cudaSetDevice(rank % deviceCount);
    if (err != cudaSuccess) {
        printf("Error %s \n@ rank %d\n", cudaGetErrorString(err), rank);
    }
}
// Print free and total memory (in bytes) of this rank's current device.
// Fix: %zu is the portable printf conversion for size_t; %lu is wrong on
// platforms where size_t is not unsigned long (e.g. 64-bit Windows).
void getInfo(int rank) {
    size_t freeBytes = 0, totalBytes = 0;
    cudaError_t err = cudaMemGetInfo(&freeBytes, &totalBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemGetInfo failed: %s\n", cudaGetErrorString(err));
        return;
    }
    printf("Free bytes %zu, total bytes %zu, rank: %d\n", freeBytes, totalBytes, rank);
}
// Allocate `size` floats on the current device; *buffer receives the pointer.
// Fix: the ignored cudaMalloc return code was a silent failure path -- on
// failure *buffer stayed uninitialized and every later copy/kernel faulted
// mysteriously. Now the failure is reported and *buffer is nulled.
void mallocGPU(float **buffer, size_t size) {
    cudaError_t err = cudaMalloc(buffer, size * sizeof(float));
    if (err != cudaSuccess) {
        *buffer = NULL;  // make the failure visible to callers
        fprintf(stderr, "cudaMalloc of %zu floats failed: %s\n",
                size, cudaGetErrorString(err));
    }
}
// Blocking copy of `size` floats, device -> host.
// Fix: the cudaMemcpy return code was ignored; a failed copy silently left
// the host buffer stale. The error is now reported.
void copyGPU2HOST(float *host_buffer, float *gpu_buffer, size_t size) {
    cudaError_t err = cudaMemcpy(host_buffer, gpu_buffer, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        fprintf(stderr, "copyGPU2HOST failed: %s\n", cudaGetErrorString(err));
}
// Blocking copy of `size` floats, host -> device.
// Fix: the cudaMemcpy return code was ignored; a failed copy silently left
// the device buffer stale. The error is now reported.
void copyHOST2GPU(float *gpu_buffer, float *host_buffer, size_t size) {
    cudaError_t err = cudaMemcpy(gpu_buffer, host_buffer, size * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        fprintf(stderr, "copyHOST2GPU failed: %s\n", cudaGetErrorString(err));
}
// Launch simpleMPIKernel to add `scalar` to each element of gpu_buffer.
// Bug fix: the launch-configuration arguments were swapped -- the FIRST
// <<<>>> argument is the grid (blocks), the SECOND the block (threads).
// The old call ran `gridSize` threads in each of `blockSize` blocks, which
// fails outright whenever gridSize exceeds the 1024-threads-per-block limit.
// Precondition (unchanged): blockSize * gridSize must equal the buffer
// length, since the kernel has no bounds check.
void addScalarGPU(float *gpu_buffer, float scalar, int blockSize, int gridSize) {
    dim3 threadsPerBlock(blockSize, 1, 1);
    dim3 blocksPerGrid(gridSize, 1, 1);
    simpleMPIKernel<<<blocksPerGrid, threadsPerBlock>>>(gpu_buffer, scalar);
}
// Block the host until all previously issued device work has completed.
// NOTE(review): the return code of cudaDeviceSynchronize (which would carry
// any asynchronous kernel-execution error) is discarded here; callers rely
// on printLastError() to surface such errors.
void syncGPU() {
cudaDeviceSynchronize();
}
// Report (to stdout) the most recent CUDA runtime error, if any.
// Note that cudaGetLastError also clears the sticky error state.
void printLastError()
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;  // nothing pending
    printf("Last error is %s\n", cudaGetErrorString(status));
}
// Return the sum of the first `size` floats in `data`.
// Accumulates left-to-right, matching the original's float addition order
// exactly (float addition is not associative).
float sum(float * data, int size) {
    float total = 0.f;
    const float *end = data + size;
    for (const float *p = data; p != end; ++p) {
        total += *p;
    }
    return total;
}
// Release a device allocation obtained from mallocGPU.
// cudaFree(NULL) is documented as a no-op, so a null buffer is safe here.
void freeGPU(float *buffer) {
cudaFree(buffer);
}