/
cubradixsort.cu
178 lines (166 loc) · 7 KB
/
cubradixsort.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/******************************************************************************
* Simple example of DeviceRadixSort::SortPairs().
*
* Sorts an array of float keys paired with a corresponding array of int values.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3
*
******************************************************************************/
// - I is include path
//compile with:
// nvcc -m64 -arch=sm_35 cubradixsort.cu -Icub-1.8.0/ -lcudart -O3 -o cubradixsort
// nvcc cubradixsort.cu -Icub-1.8.0/ -o cubradixsort
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <algorithm>
// #include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <cub/device/device_radix_sort.cuh>
#include "test/test_util.h"
#define SIZE 16 << 20
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
struct Pair
{
float key;
int value;
bool operator<(const Pair &b) const
{
if (key < b.key)
return true;
if (key > b.key)
return false;
// Return true if key is negative zero and b.key is positive zero
unsigned int key_bits = *reinterpret_cast<unsigned*>(const_cast<float*>(&key));
unsigned int b_key_bits = *reinterpret_cast<unsigned*>(const_cast<float*>(&b.key));
unsigned int HIGH_BIT = 1u << 31;
return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
}
};
void Initialize(
float *h_keys,
int *h_values,
float *h_reference_keys,
int *h_reference_values,
int num_items)
{
Pair *h_pairs = new Pair[num_items];
for (int i = 0; i < num_items; ++i)
{
RandomBits(h_keys[i]);
RandomBits(h_values[i]);
h_pairs[i].key = h_keys[i];
h_pairs[i].value = h_values[i];
}
if (g_verbose)
{
printf("Input keys:\n");
DisplayResults(h_keys, num_items);
printf("\n\n");
printf("Input values:\n");
DisplayResults(h_values, num_items);
printf("\n\n");
}
std::stable_sort(h_pairs, h_pairs + num_items);
for (int i = 0; i < num_items; ++i)
{
h_reference_keys[i] = h_pairs[i].key;
h_reference_values[i] = h_pairs[i].value;
}
delete[] h_pairs;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
int main(int argc, char** argv)
{
const uint N = SIZE;
cudaSetDevice (0);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int num_items = SIZE;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--device=<device-id>] "
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n",
num_items, int(sizeof(float)), int(sizeof(int)));
fflush(stdout);
// Allocate host arrays
float *h_keys = new float[num_items];
float *h_reference_keys = new float[num_items];
int *h_values = new int[num_items];
int *h_reference_values = new int[num_items];
// Initialize problem and solution on host
Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items);
// Allocate device arrays
DoubleBuffer<float> d_keys;
DoubleBuffer<int> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Initialize device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
float elapsedTime;
cudaDeviceSynchronize();
cudaEventRecord(start, 0);
// Run
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_keys) delete[] h_keys;
if (h_reference_keys) delete[] h_reference_keys;
if (h_values) delete[] h_values;
if (h_reference_values) delete[] h_reference_values;
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
printf("Processing time: %f (ms)\n", elapsedTime);
double dTimeSecs = 1.0e-3 * elapsedTime ;
printf("sortingNetworks-cub, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u\n",
(1.0e-6 * (double)N/dTimeSecs), dTimeSecs , N, 1);
return 0;
}