Skip to content

Commit 300bfe0

Browse files
demoing stall detector
spinning a sidecar domain which monitors runtime events to detect stalls (which happen if one of the promise doesn't cooperate enough)
1 parent 8cd79b8 commit 300bfe0

File tree

2 files changed

+66
-0
lines changed

2 files changed

+66
-0
lines changed

test/stall_detection/dune

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
(executable
2+
(name selfdetector)
3+
(libraries unix lwt lwt.unix runtime_events))
4+
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
let () = Runtime_events.start ()
2+
3+
let rec stall d =
4+
let open Lwt.Syntax in
5+
Unix.sleepf d;
6+
let* () = Lwt.pause () in
7+
stall (d+.0.01)
8+
9+
let last_lap = ref 0L
10+
let started = ref false
11+
12+
13+
let detect () =
14+
let cursor = Runtime_events.create_cursor None in
15+
let cb_stall =
16+
Runtime_events.Callbacks.create ()
17+
|> Runtime_events.Callbacks.add_user_event
18+
Runtime_events.Type.unit
19+
(fun _ t e () ->
20+
match Runtime_events.User.tag e with
21+
| Lwt_main.Scheduler_lap ->
22+
let delta = Int64.sub (Runtime_events.Timestamp.to_int64 t) !last_lap in
23+
if delta > 1_000_000_000L (* 1 second *) then begin
24+
Printf.printf "ALARM: stall 1s+ CRASHING\n"; flush stdout;
25+
exit 0
26+
end;
27+
if delta > 500_000_000L (* 0.5 second *) then begin
28+
Printf.printf "ALARM: stall .5s+\n"; flush stdout
29+
end
30+
| _ -> ())
31+
in
32+
let rec detect_stall () =
33+
Unix.sleepf 0.01;
34+
let _ : int = Runtime_events.read_poll cursor cb_stall None in
35+
detect_stall ()
36+
in
37+
let cb_pre_start =
38+
Runtime_events.Callbacks.create ()
39+
|> Runtime_events.Callbacks.add_user_event
40+
Runtime_events.Type.span
41+
(fun _ t e ev ->
42+
match Runtime_events.User.tag e, ev with
43+
| Lwt_main.Scheduler_call, Runtime_events.Type.Begin ->
44+
last_lap := Runtime_events.Timestamp.to_int64 t;
45+
started := true
46+
| Lwt_main.Scheduler_call, Runtime_events.Type.End ->
47+
failwith "NO"
48+
| _ -> failwith "NOPE")
49+
in
50+
let rec detect_start () =
51+
if not !started then begin
52+
Unix.sleepf 0.001;
53+
let _ : int = Runtime_events.read_poll cursor cb_pre_start None in
54+
detect_start ()
55+
end else
56+
detect_stall ()
57+
in
58+
detect_start ()
59+
60+
let _ = Domain.spawn (fun () -> detect ())
61+
62+
let () = Lwt_main.run (stall 0.)

0 commit comments

Comments
 (0)