Skip to content

Commit 8c66f74

Browse files
stall detection: demo of attaching to existing lwt program
1 parent f462038 commit 8c66f74

File tree

9 files changed

+104
-41
lines changed

9 files changed

+104
-41
lines changed

dune-project

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ a single thread by default. This reduces the need for locks or other
7676
synchronization primitives. Code can be run in parallel on an opt-in basis.
7777
")
7878
(depends
79-
(ocaml (>= 5.1.0))
79+
(ocaml (>= 5.4))
8080
domain_shims
8181
(cppo (and :build (>= 1.1)))
8282
(ocamlfind (and :dev (>= 1.7.3-1)))

lwt.opam

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ doc: "https://ocsigen.org/lwt"
2222
bug-reports: "https://github.com/ocsigen/lwt/issues"
2323
depends: [
2424
"dune" {>= "3.15"}
25-
"ocaml" {>= "5.1.0"}
25+
"ocaml" {>= "5.4"}
2626
"domain_shims"
2727
"cppo" {build & >= "1.1"}
2828
"ocamlfind" {dev & >= "1.7.3-1"}

src/unix/lwt_main.mli

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,8 @@ module Exit_hooks :
135135
val at_exit : (unit -> unit Lwt.t) -> unit
136136
(** [Lwt_main.at_exit hook] is the same as
137137
[ignore (Lwt_main.Exit_hooks.add_first hook)]. *)
138+
139+
val sch_call : Runtime_events.Type.span Runtime_events.User.t
140+
type Runtime_events.User.tag += Scheduler_call
141+
val sch_lap : unit Runtime_events.User.t
142+
type Runtime_events.User.tag += Scheduler_lap

stall-detect.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
PROJECT_ROOT="$(git rev-parse --show-toplevel)"
5+
dune build "$PROJECT_ROOT/_build/default/test/stall_detection/staller.exe"
6+
dune build "$PROJECT_ROOT/_build/default/test/stall_detection/detector.exe"
7+
8+
RING_DIR=$(mktemp -d -t staller-detector.XXXXXX)
9+
10+
OCAML_RUNTIME_EVENTS_DIR="$RING_DIR" "$PROJECT_ROOT"/_build/default/test/stall_detection/staller.exe &
11+
STALLER_PID=$!
12+
13+
echo "staller started"
14+
15+
"$PROJECT_ROOT/_build/default/test/stall_detection/detector.exe" "$RING_DIR" "$STALLER_PID"
16+
17+
echo "detector started"
18+
19+
# Optional: wait for both processes to finish
20+
wait

test/stall_detection/detector.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
let ringopt = (Sys.argv.(1), int_of_string Sys.argv.(2))
2+
let () = Stallerlib.detect ~ringopt ()

test/stall_detection/dune

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1+
(executable
2+
(name staller)
3+
(libraries lwt lwt.unix))
4+
5+
(executable
6+
(name detector)
7+
(libraries lwt lwt.unix runtime_events))
8+
19
(executable
210
(name selfdetector)
311
(libraries unix lwt lwt.unix runtime_events))
4-
Lines changed: 2 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,5 @@
1-
let () = Runtime_events.start ()
2-
3-
let rec stall d =
4-
let open Lwt.Syntax in
5-
Printf.printf "stalling for %dns\n" (int_of_float (d *. 1_000_000_000.));
6-
flush stdout;
7-
Unix.sleepf d;
8-
let* () = Lwt.pause () in
9-
stall (1.5*.(d+.0.01))
10-
11-
(* set to maxint to avoid the first hit being a false positive *)
12-
let last_lap = ref Int64.max_int
13-
14-
let cursor = Runtime_events.create_cursor None
15-
let cb =
16-
Runtime_events.Callbacks.create ()
17-
|> Runtime_events.Callbacks.add_user_event
18-
Runtime_events.Type.unit
19-
(fun _ t e () ->
20-
match Runtime_events.User.tag e with
21-
| Lwt_main.Scheduler_lap ->
22-
let delta = Int64.sub (Runtime_events.Timestamp.to_int64 t) !last_lap in
23-
last_lap := Runtime_events.Timestamp.to_int64 t;
24-
if delta > 1_000_000_000L (* 1 second *) then begin
25-
Printf.printf "ALARM: stall 1s+ CRASHING\n"; flush stdout;
26-
exit 0
27-
end;
28-
if delta > 500_000_000L (* 0.5 second *) then begin
29-
Printf.printf "ALARM: stall .5s+\n"; flush stdout
30-
end
31-
| _ -> ())
32-
let rec detect () =
33-
Unix.sleepf 0.01;
34-
let _ : int = Runtime_events.read_poll cursor cb None in
35-
detect ()
36-
37-
let _ : _ Domain.t = Domain.spawn detect
1+
let _ : _ Domain.t = Domain.spawn (fun () -> Stallerlib.detect ())
382

393
let () =
404
Printf.printf "start\n"; flush stdout;
41-
Lwt_main.run (stall 0.)
5+
Lwt_main.run (Stallerlib.stall 0.)

test/stall_detection/staller.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
let () = Unix.sleep 2
2+
let () = Lwt_main.run (Stallerlib.stall 0.)

test/stall_detection/stallerlib.ml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
(* STALLING *)
2+
3+
let () = Random.self_init ()
4+
5+
let to_s_ns f =
6+
let ns = int_of_float (f *. 1_000_000_000.) in
7+
(ns / 1_000_000_000 , ns mod 1_000_000_000)
8+
9+
let rec stall d =
10+
let open Lwt.Syntax in
11+
let s, ns = to_s_ns d in
12+
Printf.printf "stalling for %d.%09d\n" s ns;
13+
flush stdout;
14+
Unix.sleepf d;
15+
let* () = Lwt.pause () in
16+
if d > 3. then exit 2 else stall (1.2*.(d+. Random.float 0.05))
17+
18+
19+
(* DETECTING *)
20+
21+
let () = Runtime_events.start ()
22+
23+
(* set to maxint to avoid the first hit being a false positive *)
24+
let last_lap = ref Int64.max_int
25+
let alarmed = ref false
26+
27+
let detect ?ringopt () =
28+
begin match ringopt with
29+
| None -> Printf.printf "starting detection on self (%d)\n" (Unix.getpid ())
30+
| Some (path, pid) -> Printf.printf "starting detection on %s/%d.events\n" path pid
31+
end;
32+
let cursor = Runtime_events.create_cursor ringopt in
33+
let is_stall t =
34+
let delta = Int64.sub (Runtime_events.Timestamp.to_int64 t) !last_lap in
35+
if delta > 1_000_000_000L (* 1 second *) then begin
36+
Printf.printf "ALARM: stall 1s+ CRASHING\n"; flush stdout;
37+
(match ringopt with
38+
| None -> exit 1
39+
| Some (_, pid) -> Unix.kill pid Sys.sigkill)
40+
end;
41+
if not !alarmed && delta > 500_000_000L (* 0.5 second *) then begin
42+
alarmed := true;
43+
Printf.printf "ALARM: stall .5s+\n"; flush stdout
44+
end
45+
in
46+
let cb =
47+
Runtime_events.Callbacks.create ()
48+
|> Runtime_events.Callbacks.add_user_event
49+
Runtime_events.Type.unit
50+
(fun _ t e () ->
51+
match Runtime_events.User.tag e with
52+
| Lwt_main.Scheduler_lap ->
53+
alarmed := false;
54+
last_lap := Runtime_events.Timestamp.to_int64 t
55+
| _ -> ())
56+
in
57+
let rec detect () =
58+
Unix.sleepf 0.01;
59+
let _ : int = Runtime_events.read_poll cursor cb None in
60+
is_stall (Runtime_events.Timestamp.get_current ());
61+
detect ()
62+
in
63+
detect ()

0 commit comments

Comments
 (0)