ocaml · gasche · Jan 30, 2018 · Jan 26, 2018 · Jan 26, 2018 · Jan 26, 2018
diff --git a/Changes b/Changes
@@ -135,6 +135,9 @@ Working version
   structures and signatures (e.g. "include struct … include(struct … end) … end")
   (Florian Angeletti, review by Gabriel Scherer, report by Christophe Raffalli)
 
+- GPR#1585: optimize output of "ocamllex -ml"
+  (Alain Frisch, review by Frédéric Bour and Gabriel Scherer)
+
 ### Manual and documentation:
 
 - PR#7647, GPR#1384: emphasize ocaml.org website and forum in README

diff --git a/experimental/frisch/Makefile b/experimental/frisch/Makefile
@@ -1,3 +1,5 @@
+include ../../config/Makefile
+
 ROOT=../..
 OCAMLC=$(ROOT)/boot/ocamlrun $(ROOT)/ocamlc -I $(ROOT)/stdlib -I $(ROOT)/parsing -I $(ROOT)/utils -I $(ROOT)/tools -I $(ROOT)/typing -I $(ROOT)/driver -I $(ROOT)/toplevel -w A-4-9-42
 COMMON=$(ROOT)/compilerlibs/ocamlcommon.cma
@@ -77,3 +79,48 @@ nomli:
 matches:
 	$(OCAMLC) -linkall -o ppx_matches.exe $(COMMON) ppx_matches.ml
 	$(OCAMLC) -c -dsource -ppx ./ppx_matches.exe test_matches.ml
+
+
+## Benchmark ocamllex
+#
+# `make bench_ocamllex` rebuilds ocamllex, runs it on a copy of the benchmark
+# lexer with and without the -ml flag, and for each generated my_lexer.ml
+# builds and runs bench.exe (see bench_ocamllex2).
+#
+# BENCH_LEXER selects the lexer definition to benchmark.  It defaults to the
+# simplified lexer; to benchmark the OCaml lexer instead, run
+#   make bench_ocamllex BENCH_LEXER=../../parsing/lexer.mll
+
+.PHONY: bench_ocamllex bench_ocamllex2
+BENCH_LEXER=my_lexer2.mll
+# Compiler arguments shared by the bytecode and native builds of bench.exe;
+# the native builds substitute .cmxa for .cma.
+ARGS=-o bench.exe -nostdlib \
+     -I $(ROOT)/otherlibs/$(UNIXLIB) -I $(ROOT)/stdlib -I $(ROOT)/parsing -I $(ROOT)/utils \
+     $(ROOT)/compilerlibs/ocamlcommon.cma unix.cma \
+     my_lexer.ml bench.ml
+bench_ocamllex:
+	cp $(BENCH_LEXER) my_lexer.mll
+	@(cd $(ROOT)/lex && $(MAKE) ocamllex)
+	@$(ROOT)/boot/ocamlrun $(ROOT)/lex/ocamllex -ml my_lexer.mll
+	@echo WITH -ml flag:
+	@$(MAKE) -s bench_ocamllex2
+	@$(ROOT)/boot/ocamlrun $(ROOT)/lex/ocamllex -q my_lexer.mll
+	@echo WITHOUT -ml flag:
+	@$(MAKE) -s bench_ocamllex2
+
+# Build bench.exe three ways (native with two inlining levels, then bytecode)
+# and run it after each build; printf keeps the label and the result printed
+# by bench.exe on the same line.
+bench_ocamllex2:
+	@printf '%s' "  NATIVE, -inline 1000: "
+	@$(ROOT)/boot/ocamlrun $(ROOT)/ocamlopt -inline 1000 $(ARGS:.cma=.cmxa)
+	@./bench.exe
+
+	@printf '%s' "  NATIVE, -inline 10  : "
+	@$(ROOT)/boot/ocamlrun $(ROOT)/ocamlopt -inline 10 $(ARGS:.cma=.cmxa)
+	@./bench.exe
+
+	@printf '%s' "  BYTECODE            : "
+	@$(ROOT)/boot/ocamlrun $(ROOT)/ocamlc -custom $(ARGS)
+	@./bench.exe
diff --git a/experimental/frisch/bench.ml b/experimental/frisch/bench.ml
@@ -0,0 +1,53 @@
+(* Benchmark driver for ocamllex-generated lexers: tokenizes an in-memory
+   copy of typing/typecore.ml with My_lexer.token, 100 times over, and
+   prints throughput and allocation figures. *)
+
+(* [lexing s] runs My_lexer.token over the string [s] until it returns
+   Parser.EOF, discarding every other token. *)
+let lexing s =
+  let lexbuf = Lexing.from_string s in
+  (* NOTE(review): presumably this selects the dummy_pos mode measured in
+     bench_ocamllex_optims.md -- confirm which position field the generated
+     code actually tests. *)
+  lexbuf.Lexing.lex_curr_p <- Lexing.dummy_pos;
+  let rec loop () =
+    match My_lexer.token lexbuf with
+    | Parser.EOF -> ()
+    | _ -> loop ()
+  in
+  loop ()
+
+(* Benchmark input, read once at start-up.  The file is read line by line
+   and every line is written back with a trailing '\n'.  The path is
+   relative to the current working directory. *)
+let s =
+  let ic = open_in "../../typing/typecore.ml" in
+  let b = Buffer.create 16 in
+  begin
+    try while true do
+        Buffer.add_string b (input_line ic);
+        Buffer.add_char b '\n'
+      done
+    with End_of_file -> ()
+  end;
+  close_in ic;
+  Buffer.contents b
+
+(* Lex [s] [n] times, then print on one line the throughput (Mb/s), its
+   inverse (ms/Mb) and the number of bytes allocated per input byte. *)
+let () =
+  let alloc0 = Gc.allocated_bytes () in
+  let t0 = Unix.gettimeofday () in
+  let n = 100 in
+  for _ = 1 to n do
+    lexing s
+  done;
+  let time = Unix.gettimeofday () -. t0 in
+  let alloc = Gc.allocated_bytes () -. alloc0 in
+
+  let len = float (String.length s) *. float n in
+  let mb = len /. 1024. /. 1024. in
+  Printf.printf " % 8.02f Mb/s   % 8.02f ms/Mb   alloc x % 8.02f \n%!"
+    (mb /. time)
+    (time *. 1000. /. mb)
+    (alloc /. len)
diff --git a/experimental/frisch/bench_ocamllex_optims.md b/experimental/frisch/bench_ocamllex_optims.md
@@ -0,0 +1,110 @@
+Some benchmarks to evaluate the speedup of `ocamllex -ml`.
+
+In all tests, we tokenize the `typecore.ml` file (first loaded in
+memory) using either:
+
+  - the OCaml lexer, or
+
+  - a simpler lexer with trivial actions (to eliminate the cost of
+actions themselves, which is not under the control of ocamllex).
+
+We run the output of:
+
+   - `ocamllex` without the -ml flag, i.e. using tables interpreted at
+runtime by the C support code
+
+   - `ocamllex -ml`, i.e. the automaton is translated to OCaml code;
+this is measured both before and after the optimizations.
+
+In addition, since it turned out that the automatic update of lex_start_p by
+the generated code is quite costly, there is now logic so that
+the generated code does not update this field and lex_curr_p when
+lex_start_p is initially physically equal to Lexing.dummy_pos.  This is also
+tested (only for the simpler lexer, since the OCaml one updates the field in
+its actions).
+
+For each case, we compile the benchmark with:
+
+   - `ocamlc`
+
+   - `ocamlopt -inline 10`
+
+   - `ocamlopt -inline 1000`
+
+(flambda disabled).
+
+The tables below show:
+
+   - the throughput (Mb of source code tokenized
+per second) -- higher is better;
+
+   - its inverse (number of milliseconds to parse one Mb) -- lower is better;
+
+   - the allocation ratio (number of bytes allocated by the GC for each byte of source code)
+
+
+Conclusions:
+
+  - In native code, the "-ml" mode is slightly slower than the table
+    mode before the optimizations, but it becomes significantly faster
+    after the optimizations, obviously even more so when the
+    lexer actions are trivial (throughput 58.44 -> 98.30).
+
+  - In bytecode, the "-ml" mode is always much slower than the table
+    mode, but the optimizations reduce the gap a little bit.
+
+  - Not tested here, but it is likely that the optimizations produce
+    code which would be more friendly to JavaScript backends
+    (js_of_ocaml and BuckleScript), as they reduce quite a bit
+    the number of function calls and mutations.
+
+Note:
+
+  - The "refill handler" mode has been lightly tested only.
+
+
+OCaml lexer:
+
+````
+WITHOUT -ml flag:
+  NATIVE, -inline 1000:     38.07 Mb/s      26.27 ms/Mb   alloc x    36.79
+  NATIVE, -inline 10  :     35.42 Mb/s      28.23 ms/Mb   alloc x    36.79
+  BYTECODE            :      7.84 Mb/s     127.54 ms/Mb   alloc x    35.48
+
+
+WITH -ml flag, TRUNK:
+  NATIVE, -inline 1000:     34.36 Mb/s      29.11 ms/Mb   alloc x    36.79
+  NATIVE, -inline 10  :     34.12 Mb/s      29.31 ms/Mb   alloc x    36.79
+  BYTECODE            :      4.08 Mb/s     244.93 ms/Mb   alloc x    35.48
+
+
+WITH -ml flag, BRANCH:
+  NATIVE, -inline 1000:     45.56 Mb/s      21.95 ms/Mb   alloc x    36.79
+  NATIVE, -inline 10  :     43.19 Mb/s      23.15 ms/Mb   alloc x    36.79
+  BYTECODE            :      4.35 Mb/s     229.91 ms/Mb   alloc x    35.48
+````
+
+
+Simpler lexer (trivial actions):
+
+````
+WITHOUT -ml flag:
+  NATIVE, -inline 1000:     58.44 Mb/s      17.11 ms/Mb   alloc x    21.94
+  NATIVE, -inline 10  :     58.24 Mb/s      17.17 ms/Mb   alloc x    21.94
+  BYTECODE            :     12.63 Mb/s      79.21 ms/Mb   alloc x    21.93
+
+WITH -ml flag, TRUNK:
+  NATIVE, -inline 1000:     55.14 Mb/s      18.13 ms/Mb   alloc x    21.94
+  NATIVE, -inline 10  :     50.76 Mb/s      19.70 ms/Mb   alloc x    21.94
+  BYTECODE            :      5.74 Mb/s     174.22 ms/Mb   alloc x    21.93
+
+WITH -ml flag, BRANCH:
+  NATIVE, -inline 1000:     98.30 Mb/s      10.17 ms/Mb   alloc x    21.94
+  NATIVE, -inline 10  :     87.16 Mb/s      11.47 ms/Mb   alloc x    21.94
+  BYTECODE            :      6.48 Mb/s     154.43 ms/Mb   alloc x    21.93
+
+WITH -ml flag, BRANCH, dummy_pos:
+  NATIVE, -inline 1000:    152.68 Mb/s       6.55 ms/Mb   alloc x     1.00
+  NATIVE, -inline 10  :    133.97 Mb/s       7.46 ms/Mb   alloc x     1.00
+  BYTECODE            :      7.42 Mb/s     134.81 ms/Mb   alloc x     1.00
+````
diff --git a/experimental/frisch/my_lexer2.mll b/experimental/frisch/my_lexer2.mll
@@ -0,0 +1,89 @@
+{ (* Benchmark lexer with trivial actions; the Makefile copies it to my_lexer.mll. *)
+open Parser (* the actions below return token constructors from Parser *)
+}
+let newline = ('\013'* '\010') (* NOTE: of the named regexps below, the token rule only uses lowercase and identchar *)
+let blank = [' ' '\009' '\012']
+let lowercase = ['a'-'z' '_']
+let uppercase = ['A'-'Z']
+let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
+let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
+let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222']
+let identchar_latin1 =
+  ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
+let symbolchar =
+  ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
+let dotsymbolchar =
+  ['!' '$' '%' '&' '*' '+' '-' '/' ':' '=' '>' '?' '@' '^' '|' '~']
+let decimal_literal =
+  ['0'-'9'] ['0'-'9' '_']*
+let hex_digit =
+  ['0'-'9' 'A'-'F' 'a'-'f']
+let hex_literal =
+  '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
+let oct_literal =
+  '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
+let bin_literal =
+  '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
+let int_literal =
+  decimal_literal | hex_literal | oct_literal | bin_literal
+let float_literal =
+  ['0'-'9'] ['0'-'9' '_']*
+  ('.' ['0'-'9' '_']* )?
+  (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']* )?
+let hex_float_literal =
+  '0' ['x' 'X']
+  ['0'-'9' 'A'-'F' 'a'-'f'] ['0'-'9' 'A'-'F' 'a'-'f' '_']*
+  ('.' ['0'-'9' 'A'-'F' 'a'-'f' '_']* )?
+  (['p' 'P'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']* )?
+let literal_modifier = ['G'-'Z' 'g'-'z']
+
+rule token = parse (* ocamllex picks the longest match, then the earliest case *)
+ | eof { EOF }
+ | lowercase identchar * { TILDE } (* every lowercase-initial identifier, keywords included, yields TILDE *)
+  | "&"  { AMPERSAND }
+  | "&&" { AMPERAMPER }
+  | "`"  { BACKQUOTE }
+  | "\'" { QUOTE }
+  | "("  { LPAREN }
+  | ")"  { RPAREN }
+  | "*"  { STAR }
+  | ","  { COMMA }
+  | "->" { MINUSGREATER }
+  | "."  { DOT }
+  | ".." { DOTDOT }
+  | ":"  { COLON }
+  | "::" { COLONCOLON }
+  | ":=" { COLONEQUAL }
+  | ":>" { COLONGREATER }
+  | ";"  { SEMI }
+  | ";;" { SEMISEMI }
+  | "<"  { LESS }
+  | "<-" { LESSMINUS }
+  | "="  { EQUAL }
+  | "["  { LBRACKET }
+  | "[|" { LBRACKETBAR }
+  | "[<" { LBRACKETLESS }
+  | "[>" { LBRACKETGREATER }
+  | "]"  { RBRACKET }
+  | "{"  { LBRACE }
+  | "{<" { LBRACELESS }
+  | "|"  { BAR }
+  | "||" { BARBAR }
+  | "|]" { BARRBRACKET }
+  | ">"  { GREATER }
+  | ">]" { GREATERRBRACKET }
+  | "}"  { RBRACE }
+  | ">}" { GREATERRBRACE }
+  | "[@" { LBRACKETAT }
+  | "[@@"  { LBRACKETATAT }
+  | "[@@@" { LBRACKETATATAT }
+  | "[%"   { LBRACKETPERCENT }
+  | "[%%"  { LBRACKETPERCENTPERCENT }
+  | "!"  { BANG }
+  | "!=" { INFIXOP0 "!=" }
+  | "+"  { PLUS }
+  | "+." { PLUSDOT }
+  | "+=" { PLUSEQ }
+  | "-"  { MINUS }
+  | "-." { MINUSDOT }
+ | _ { EOL } (* any single character not matched above *)