-
Notifications
You must be signed in to change notification settings - Fork 293
/
saxpy.hip
62 lines (50 loc) · 1.5 KB
/
saxpy.hip
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <hip/hip_runtime.h>
// Number of float elements in each vector (1024 * 500 = 512000). Used both as
// the host/device buffer length in main() and as the bounds check in saxpy().
#define N (1024 * 500)
// SAXPY kernel: y[i] = a * x[i] + y[i] for every index i < N.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The kernel has
// no stride loop, so the launch must supply at least N threads in total;
// surplus threads in the last block are masked off by the bounds check.
//
// a: scalar multiplier.
// x: device-accessible input vector holding at least N floats (read only).
// y: device-accessible vector holding at least N floats; updated in place.
__global__ void saxpy(float a, float* x, float* y) {
  // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit, so their
  // product would wrap in 32-bit arithmetic before being stored in size_t.
  const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tid < N) {
    y[tid] = a * x[tid] + y[tid];
  }
}
int main() {
const float a = 100.0f;
float* x = (float*)malloc(N * sizeof(float));
float* y = (float*)malloc(N * sizeof(float));
// Initialize the input data.
for (size_t i = 0; i < N; ++i) {
x[i] = static_cast<float>(i);
y[i] = static_cast<float>(i * 2);
}
// Make a copy for the GPU implementation.
float* d_x;
float* d_y;
hipMalloc((void**)&d_x, N * sizeof(float));
hipMalloc((void**)&d_y, N * sizeof(float));
hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice);
// CPU implementation of saxpy.
for (int i = 0; i < N; i++) {
y[i] = a * x[i] + y[i];
}
// Launch a GPU kernel to compute the saxpy.
saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y);
// Copy the device results to host.
float* h_y = (float*)malloc(N * sizeof(float));
hipDeviceSynchronize();
hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost);
// Verify the results match CPU.
int errors = 0;
for (int i = 0; i < N; i++) {
if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f))
errors++;
}
if (errors != 0)
std::cout << errors << " errors" << std::endl;
else
std::cout << "PASSED!" << std::endl;
free(h_y);
free(x);
free(y);
hipFree(d_x);
hipFree(d_y);
return errors;
}