@@ -626,9 +626,9 @@ struct AMDGPUSignalTy {
626
626
}
627
627
628
628
// / Wait until the signal gets a zero value.
629
- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
629
+ Error wait (const uint64_t ActiveTimeout = 0 ,
630
630
GenericDeviceTy *Device = nullptr ) const {
631
- if (ActiveTimeout && !RPCServer ) {
631
+ if (ActiveTimeout) {
632
632
hsa_signal_value_t Got = 1 ;
633
633
Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
634
634
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -637,14 +637,11 @@ struct AMDGPUSignalTy {
637
637
}
638
638
639
639
// If there is an RPC device attached to this stream we run it as a server.
640
- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
641
- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
640
+ uint64_t Timeout = UINT64_MAX;
641
+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
642
642
while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
643
- Timeout, WaitState) != 0 ) {
644
- if (RPCServer && Device)
645
- if (auto Err = RPCServer->runServer (*Device))
646
- return Err;
647
- }
643
+ Timeout, WaitState) != 0 )
644
+ ;
648
645
return Plugin::success ();
649
646
}
650
647
@@ -1052,11 +1049,6 @@ struct AMDGPUStreamTy {
1052
1049
// / operation that was already finalized in a previous stream synchronize.
1053
1050
uint32_t SyncCycle;
1054
1051
1055
- // / A pointer associated with an RPC server running on the given device. If
1056
- // / RPC is not being used this will be a null pointer. Otherwise, this
1057
- // / indicates that an RPC server is expected to be run on this stream.
1058
- RPCServerTy *RPCServer;
1059
-
1060
1052
// / Mutex to protect stream's management.
1061
1053
mutable std::mutex Mutex;
1062
1054
@@ -1236,9 +1228,6 @@ struct AMDGPUStreamTy {
1236
1228
// / Deinitialize the stream's signals.
1237
1229
Error deinit () { return Plugin::success (); }
1238
1230
1239
- // / Attach an RPC server to this stream.
1240
- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
1241
-
1242
1231
// / Push an asynchronous kernel to the stream. The kernel arguments must be
1243
1232
// / placed in a special allocation for kernel args and must keep alive until
1244
1233
// / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1266,10 +1255,30 @@ struct AMDGPUStreamTy {
1266
1255
if (auto Err = Slots[Curr].schedReleaseBuffer (KernelArgs, MemoryManager))
1267
1256
return Err;
1268
1257
1258
+ // If we are running an RPC server we want to wake up the server thread
1259
+ // whenever there is a kernel running and let it sleep otherwise.
1260
+ if (Device.getRPCServer ())
1261
+ Device.Plugin .getRPCServer ().Thread ->notify ();
1262
+
1269
1263
// Push the kernel with the output signal and an input signal (optional)
1270
- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1271
- GroupSize, StackSize, OutputSignal,
1272
- InputSignal);
1264
+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
1265
+ NumBlocks, GroupSize, StackSize,
1266
+ OutputSignal, InputSignal))
1267
+ return Err;
1268
+
1269
+ // Register a callback to indicate when the kernel is complete.
1270
+ if (Device.getRPCServer ()) {
1271
+ if (auto Err = Slots[Curr].schedCallback (
1272
+ [](void *Data) -> llvm::Error {
1273
+ GenericPluginTy &Plugin =
1274
+ *reinterpret_cast <GenericPluginTy *>(Data);
1275
+ Plugin.getRPCServer ().Thread ->finish ();
1276
+ return Error::success ();
1277
+ },
1278
+ &Device.Plugin ))
1279
+ return Err;
1280
+ }
1281
+ return Plugin::success ();
1273
1282
}
1274
1283
1275
1284
// / Push an asynchronous memory copy between pinned memory buffers.
@@ -1479,8 +1488,8 @@ struct AMDGPUStreamTy {
1479
1488
return Plugin::success ();
1480
1489
1481
1490
// Wait until all previous operations on the stream have completed.
1482
- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
1483
- RPCServer , &Device))
1491
+ if (auto Err =
1492
+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
1484
1493
return Err;
1485
1494
1486
1495
// Reset the stream and perform all pending post actions.
@@ -3024,7 +3033,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
3024
3033
: Agent(Device.getAgent()), Queue(nullptr ),
3025
3034
SignalManager (Device.getSignalManager()), Device(Device),
3026
3035
// Initialize the std::deque with some empty positions.
3027
- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
3036
+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
3028
3037
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
3029
3038
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
3030
3039
@@ -3377,10 +3386,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3377
3386
if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
3378
3387
return Err;
3379
3388
3380
- // If this kernel requires an RPC server we attach its pointer to the stream.
3381
- if (GenericDevice.getRPCServer ())
3382
- Stream->setRPCServer (GenericDevice.getRPCServer ());
3383
-
3384
3389
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
3385
3390
if (ImplArgs &&
3386
3391
getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments