-
Notifications
You must be signed in to change notification settings - Fork 2
/
inspector-topo.cu
134 lines (109 loc) · 4.04 KB
/
inspector-topo.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// Project headers.
#include "Options.hpp"
#include "LoopbackFlow.hpp"
#include "probe_gpu_bandwidth.hpp"
#include "probe_latency.hpp"

// System headers.
#include <numa.h>

// C++ standard library.
#include <cstdlib>
#include <iostream>  // std::cout / std::cerr were previously reached only via transitive include
#include <limits>
#include <memory>
#include <tuple>
#include <vector>
/// Probes system topology: finds the NUMA node nearest the NIC, then
/// measures GPU-pair bandwidth while the NIC runs a loopback flow to
/// identify (a) the best GPU pair and (b) the pair that appears to share
/// a PCIe path with the NIC (the pair with the lowest bandwidth).
///
/// @return 0 on success, 1 if NUMA information is unavailable.
int main(int argc, char * argv[]) {
  // Initialize the global options object from the command line.
  Options::options = std::make_unique<Options>(&argc, &argv);

  if (-1 == numa_available()) {
    std::cerr << "NUMA not available. Cannot probe topology." << std::endl;
    return 1;
  }

  // Measure NIC latency from each NUMA node; remember the lowest. The
  // node with the lowest latency is assumed to be nearest the NIC.
  double min_latency = std::numeric_limits<double>::max();
  int min_numa_node = 0;
  for (int node = 0; node <= numa_max_node(); ++node) {
    double latency = probe_latency_from_numa_node(node);
    if (latency < min_latency) {
      min_latency = latency;
      min_numa_node = node;
    }
  }
  std::cout << "NIC appears to be nearest to NUMA node " << min_numa_node << std::endl;

  // Now probe bandwidth between all ordered pairs of distinct GPUs while
  // the NIC is processing a loopback flow.
  //
  // Track the pair with the highest aggregate bandwidth.
  // BUG FIX: this was initialized with numeric_limits<double>::min(),
  // which is the smallest *positive* double, not the most negative one.
  // lowest() is the correct identity for a running maximum.
  double max_bw = std::numeric_limits<double>::lowest();
  int max_gpuA = 0;
  int max_gpuB = 1;
  std::tuple<int, int, bool, int, int, bool> max_args;

  // Track the pair with the lowest aggregate bandwidth. This should be
  // the one sharing a PCIe path with the NIC.
  double min_bw = std::numeric_limits<double>::max();
  int min_gpuA = 0;
  int min_gpuB = 1;
  std::tuple<int, int, bool, int, int, bool> min_args;

  // Set once any pair has actually been measured, so we never report the
  // default sentinel values on machines with fewer than two GPUs.
  bool measured_any_pair = false;

  { // Start loopback flow between NIC and the DRAM of the CPU it is nearest.
    LoopbackFlow flow(min_numa_node);

    // TODO: set CUDA_VISIBLE_DEVICES to match our detected GPU count in a way that works
    //int retval = setenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7", 1);

    // Scaffolding for trying both mapped and write-combined allocation;
    // currently restricted to wc == false only.
    for (int wc = false; wc <= false; ++wc) {
      GPUBuffers buffers(wc);

      // Scaffolding for trying both HtoD (true) and DtoH (false)
      // directions; currently pinned to dirA == false, dirB == true.
      for (int dirA = false; dirA <= false; ++dirA) {
        for (int dirB = true; dirB <= true; ++dirB) {
          // Both endpoints use the NIC-adjacent NUMA node. The full
          // node x node sweep is intentionally disabled for now:
          // for (int nodeA = 0; nodeA < buffers.get_cpu_count(); ++nodeA)
          //   for (int nodeB = 0; nodeB < buffers.get_cpu_count(); ++nodeB)
          const int nodeA = min_numa_node;
          const int nodeB = min_numa_node;

          // TODO: probably don't need the full (ordered) GPU matrix,
          // but run it for now.
          for (int gpuA = 0; gpuA < buffers.get_gpu_count(); ++gpuA) {
            for (int gpuB = 0; gpuB < buffers.get_gpu_count(); ++gpuB) {
              if (gpuA == gpuB)
                continue;

              double bw = buffers.double_memcpy_probe(nodeA, gpuA, dirA, nodeB, gpuB, dirB);
              measured_any_pair = true;

              // Update best pair.
              if (bw > max_bw) {
                max_bw = bw;
                max_gpuA = gpuA;
                max_gpuB = gpuB;
                max_args = std::make_tuple(nodeA, gpuA, dirA, nodeB, gpuB, dirB);
              }

              // Update worst pair.
              if (bw < min_bw) {
                min_bw = bw;
                min_gpuA = gpuA;
                min_gpuB = gpuB;
                min_args = std::make_tuple(nodeA, gpuA, dirA, nodeB, gpuB, dirB);
              }
            }
          }
        }
      }
    }

    if (measured_any_pair) {
      std::cout << "GPU pair shared with NIC appears to be " << min_gpuA << " and " << min_gpuB
                << " with a bandwidth of " << min_bw
                << std::endl;
      std::cout << "Best GPU pair was " << max_gpuA << " and " << max_gpuB
                << " with a bandwidth of " << max_bw
                << std::endl;
    } else {
      // Fewer than two GPUs visible: the min/max trackers still hold
      // their sentinel values, so there is nothing meaningful to report.
      std::cerr << "Fewer than two GPUs detected; skipping GPU pair report." << std::endl;
    }
  }

  std::cout << "Done." << std::endl;
  return 0;
}