diff --git a/rccl_samples/npkit_launcher.sh b/rccl_samples/npkit_launcher.sh index e060b13..2e04c46 100644 --- a/rccl_samples/npkit_launcher.sh +++ b/rccl_samples/npkit_launcher.sh @@ -33,37 +33,38 @@ export RCCL_NUM_WARMUPS="0" # Number of rccl-tests iterations. export RCCL_NUM_ITERS="10" -NPKIT_FLAGS_CPU_PREFIX="-DENABLE_NPKIT" -NPKIT_FLAGS_GPU_PREFIX="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU" - -export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" - -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" - -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" - -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" - -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" -# export NPKIT_FLAGS=${NPKIT_FLAGS_GPU_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" - -# export NPKIT_FLAGS=${NPKIT_FLAGS_CPU_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" +NPKIT_FLAGS_PREFIX="-DENABLE_NPKIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_ENTRY -DENABLE_NPKIT_EVENT_RECV_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_ENTRY -DENABLE_NPKIT_EVENT_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT" +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT" + +# export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" + +export NPKIT_FLAGS=${NPKIT_FLAGS_PREFIX}" -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY -DENABLE_NPKIT_EVENT_NET_SEND_EXIT -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY -DENABLE_NPKIT_EVENT_NET_RECV_EXIT" bash npkit_runner.sh diff --git a/rccl_samples/npkit_runner.sh b/rccl_samples/npkit_runner.sh index e974d4e..927d41e 100644 --- a/rccl_samples/npkit_runner.sh +++ b/rccl_samples/npkit_runner.sh @@ -36,7 +36,7 @@ cd ${RCCL_SRC_DIR} rm -rf build mkdir -p build cd build -CXX=/opt/rocm/bin/hipcc cmake -DNPKIT_FLAGS="${NPKIT_FLAGS}" .. +CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DNPKIT_FLAGS="${NPKIT_FLAGS}" .. make -j # Clean existing results diff --git a/rccl_samples/npkit_trace_generator.py b/rccl_samples/npkit_trace_generator.py index 2427cd8..e1ebee6 100644 --- a/rccl_samples/npkit_trace_generator.py +++ b/rccl_samples/npkit_trace_generator.py @@ -35,6 +35,11 @@ def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): den = float(f.read()) return den / num / 1e6 +def parse_clock_calibration_info(clock_calibration_file_path): + with open(clock_calibration_file_path, 'r') as f: + num = float(f.read()) + return num + def parse_gpu_event(event_bytes): return { 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), @@ -51,11 +56,11 @@ def parse_cpu_event(event_bytes): 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) } -def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale): +def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu): gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) raw_event_size = 16 - curr_cpu_base_time = None - curr_gpu_base_time = None + cpu_base_time = gpu_time_cpu / cpu_clock_scale + gpu_base_time = gpu_time_gpu / gpu_clock_scale gpu_events = [] event_type_to_seq = {} with open(gpu_event_file_path, 'rb') as f: @@ -64,46 +69,37 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo raw_content_idx = 0 while raw_content_idx < raw_content_size: parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) - if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': - curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale - curr_gpu_base_time = None - elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': - if curr_gpu_base_time is None: - curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale - else: - if curr_gpu_base_time is None: - curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale - event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] - phase = 'B' if event_type.endswith('_ENTRY') else 'E' - gpu_events.append({ - 'ph': phase, - 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, - 'pid': rank, - 'tid': buf_idx + 1 + event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] + phase = 'B' if event_type.endswith('_ENTRY') else 'E' + gpu_events.append({ + 'ph': phase, + 'ts': cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - gpu_base_time, + 'pid': rank, + 'tid': buf_idx + 1 + }) + if phase == 'B': + if event_type not in event_type_to_seq: + event_type_to_seq[event_type] = 0 + gpu_events[-1].update({ + 'name': event_type, + 'cat': 'GPU', + 'args': { + 'rank': rank, + 'buf_idx': buf_idx, + 'seq': event_type_to_seq[event_type], + 'rsvd_0': parsed_gpu_event['rsvd'], + 'size_0': parsed_gpu_event['size'] + } }) - if phase == 'B': - if event_type not in event_type_to_seq: - event_type_to_seq[event_type] = 0 - gpu_events[-1].update({ - 'name': event_type, - 'cat': 'GPU', - 'args': { - 'rank': rank, - 'buf_idx': buf_idx, - 'seq': event_type_to_seq[event_type], - 'rsvd_0': parsed_gpu_event['rsvd'], - 'size_0': parsed_gpu_event['size'] - } - }) - event_type_to_seq[event_type] += 1 - else: - gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} - delta_time = gpu_events[-1]['ts'] - gpu_events[-2]['ts'] - gpu_events[-1]['args']['bw (GB/s)'] = gpu_events[-1]['args']['size'] / delta_time / 1e3 + event_type_to_seq[event_type] += 1 + else: + gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} + delta_time = max(0.001, gpu_events[-1]['ts'] - gpu_events[-2]['ts']) + gpu_events[-1]['args']['bw (GB/s)'] = gpu_events[-1]['args']['size'] / delta_time / 1e3 raw_content_idx += raw_event_size return gpu_events -def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): +def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local): cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) raw_event_size = 16 cpu_events = [] @@ -124,7 +120,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo phase = 'B' if event_type.endswith('_ENTRY') else 'E' cpu_events.append({ 'ph': phase, - 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, + 'ts': (cpu_time_global + (parsed_cpu_event['timestamp'] - cpu_time_local)) / cpu_clock_scale, 'pid': rank }) slot = parsed_cpu_event['slot'] @@ -192,12 +188,17 @@ def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def): gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) + cpu_time_global = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_global_rank_%d' % rank)) + cpu_time_local = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_local_rank_%d' % rank)) + gpu_time_cpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_cpu_rank_%d' % rank)) + gpu_time_gpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_gpu_rank_%d' % rank)) + for buf_idx in buf_indices: - gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale) + gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu) trace['traceEvents'].extend(gpu_events) for channel in channels: - cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) + cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local) trace['traceEvents'].extend(cpu_events) trace['traceEvents'].sort(key=lambda x : x['ts'])