ocaml · May 31, 2018 · Jan 30, 2018 · May 31, 2018 · May 31, 2018 · May 31, 2018
diff --git a/Changes b/Changes
@@ -12,6 +12,11 @@ Working version
 
 ### Standard library:
 
+- GPR#1590: ocamllex-generated lexers can be instructed not to update
+  their lex_curr_p/lex_start_p fields, resulting in a significant
+  performance gain when those fields are not required
+  (Alain Frisch, Jérémie Dimino)
+
 - MPR#7795, GPR#1782: Fix off-by-one error in Weak.create
   (KC Sivaramakrishnan)
 
@@ -90,7 +95,7 @@ Working version
   (steinuil, review by Marcello Seri, Gabriel Scherer and Florian Angeletti)
 
 - GPR#1797: remove the deprecated Makefile.nt files
-  (Sébastien Hinderer, review by Nicolas Ojeda Bar)  
+  (Sébastien Hinderer, review by Nicolas Ojeda Bar)
 
 ### Internal/compiler-libs changes:
 

diff --git a/stdlib/lexing.ml b/stdlib/lexing.ml
@@ -63,7 +63,7 @@ external c_new_engine : lex_tables -> int -> lexbuf -> int
 
 let engine tbl state buf =
   let result = c_engine tbl state buf in
-  if result >= 0 then begin
+  if result >= 0 && buf.lex_curr_p != dummy_pos then begin
     buf.lex_start_p <- buf.lex_curr_p;
     buf.lex_curr_p <- {buf.lex_curr_p
                        with pos_cnum = buf.lex_abs_pos + buf.lex_curr_pos};
@@ -73,7 +73,7 @@ let engine tbl state buf =
 
 let new_engine tbl state buf =
   let result = c_new_engine tbl state buf in
-  if result >= 0 then begin
+  if result >= 0 && buf.lex_curr_p != dummy_pos then begin
     buf.lex_start_p <- buf.lex_curr_p;
     buf.lex_curr_p <- {buf.lex_curr_p
                        with pos_cnum = buf.lex_abs_pos + buf.lex_curr_pos};
@@ -145,7 +145,7 @@ let zero_pos = {
   pos_cnum = 0;
 }
 
-let from_function f =
+let from_function ?(with_positions = true) f =
   { refill_buff = lex_refill f (Bytes.create 512);
     lex_buffer = Bytes.create 1024;
     lex_buffer_len = 0;
@@ -156,14 +156,14 @@ let from_function f =
     lex_last_action = 0;
     lex_mem = [||];
     lex_eof_reached = false;
-    lex_start_p = zero_pos;
-    lex_curr_p = zero_pos;
+    lex_start_p = if with_positions then zero_pos else dummy_pos;
+    lex_curr_p = if with_positions then zero_pos else dummy_pos;
   }
 
-let from_channel ic =
-  from_function (fun buf n -> input ic buf 0 n)
+let from_channel ?with_positions ic =
+  from_function ?with_positions (fun buf n -> input ic buf 0 n)
 
-let from_string s =
+let from_string ?(with_positions = true) s =
   { refill_buff = (fun lexbuf -> lexbuf.lex_eof_reached <- true);
     lex_buffer = Bytes.of_string s; (* have to make a copy for compatibility
                                        with unsafe-string mode *)
@@ -175,10 +175,12 @@ let from_string s =
     lex_last_action = 0;
     lex_mem = [||];
     lex_eof_reached = true;
-    lex_start_p = zero_pos;
-    lex_curr_p = zero_pos;
+    lex_start_p = if with_positions then zero_pos else dummy_pos;
+    lex_curr_p = if with_positions then zero_pos else dummy_pos;
   }
 
+let with_positions lexbuf = lexbuf.lex_curr_p != dummy_pos
+
 let lexeme lexbuf =
   let len = lexbuf.lex_curr_pos - lexbuf.lex_start_pos in
   Bytes.sub_string lexbuf.lex_buffer lexbuf.lex_start_pos len
@@ -215,10 +217,12 @@ let lexeme_end_p lexbuf = lexbuf.lex_curr_p
 
 let new_line lexbuf =
   let lcp = lexbuf.lex_curr_p in
-  lexbuf.lex_curr_p <- { lcp with
-    pos_lnum = lcp.pos_lnum + 1;
-    pos_bol = lcp.pos_cnum;
-  }
+  if lcp != dummy_pos then
+    lexbuf.lex_curr_p <-
+      { lcp with
+        pos_lnum = lcp.pos_lnum + 1;
+        pos_bol = lcp.pos_cnum;
+      }
 
 
 
@@ -227,5 +231,6 @@ let new_line lexbuf =
 let flush_input lb =
   lb.lex_curr_pos <- 0;
   lb.lex_abs_pos <- 0;
-  lb.lex_curr_p <- {lb.lex_curr_p with pos_cnum = 0};
+  let lcp = lb.lex_curr_p in
+  if lcp != dummy_pos then lb.lex_curr_p <- {lcp with pos_cnum = 0};
   lb.lex_buffer_len <- 0;
diff --git a/stdlib/lexing.mli b/stdlib/lexing.mli
@@ -65,35 +65,60 @@ type lexbuf =
    The lexer buffer holds the current state of the scanner, plus
    a function to refill the buffer from the input.
 
-   At each token, the lexing engine will copy [lex_curr_p] to
-   [lex_start_p], then change the [pos_cnum] field
-   of [lex_curr_p] by updating it with the number of characters read
-   since the start of the [lexbuf].  The other fields are left
-   unchanged by the lexing engine.  In order to keep them
+   Lexers can optionally maintain the [lex_curr_p] and [lex_start_p]
+   position fields.  This "position tracking" mode is the default, and
+   it corresponds to passing [~with_positions:true] to functions that
+   create lexer buffers. In this mode, the lexing engine and lexer
+   actions are co-responsible for properly updating the position
+   fields, as described in the next paragraph.  When the mode is
+   explicitly disabled (with [~with_positions:false]), the lexing
+   engine will not touch the position fields and the lexer actions
+   should be careful not to do it either; the [lex_curr_p] and
+   [lex_start_p] fields will then always hold the [dummy_pos] invalid
+   position.  Not tracking positions avoids allocations and memory
+   writes and can significantly improve the performance of the lexer
+   in contexts where [lex_start_p] and [lex_curr_p] are not needed.
+
+   Position tracking mode works as follows.  At each token, the lexing
+   engine will copy [lex_curr_p] to [lex_start_p], then change the
+   [pos_cnum] field of [lex_curr_p] by updating it with the number of
+   characters read since the start of the [lexbuf].  The other fields
+   are left unchanged by the lexing engine.  In order to keep them
    accurate, they must be initialised before the first use of the
-   lexbuf, and updated by the relevant lexer actions (i.e. at each
-   end of line -- see also [new_line]).
- *)
+   lexbuf, and updated by the relevant lexer actions (i.e. at each end
+   of line -- see also [new_line]).
+*)
 
-val from_channel : in_channel -> lexbuf
+val from_channel : ?with_positions:bool -> in_channel -> lexbuf
 (** Create a lexer buffer on the given input channel.
    [Lexing.from_channel inchan] returns a lexer buffer which reads
    from the input channel [inchan], at the current reading position. *)
 
-val from_string : string -> lexbuf
+val from_string : ?with_positions:bool -> string -> lexbuf
 (** Create a lexer buffer which reads from
    the given string. Reading starts from the first character in
    the string. An end-of-input condition is generated when the
    end of the string is reached. *)
 
-val from_function : (bytes -> int -> int) -> lexbuf
+val from_function : ?with_positions:bool -> (bytes -> int -> int) -> lexbuf
 (** Create a lexer buffer with the given function as its reading method.
    When the scanner needs more characters, it will call the given
    function, giving it a byte sequence [s] and a byte
    count [n]. The function should put [n] bytes or fewer in [s],
    starting at index 0, and return the number of bytes
    provided. A return value of 0 means end of input. *)
 
+val with_positions : lexbuf -> bool
+(** Tell whether the lexer buffer keeps track of position fields
+    [lex_curr_p] / [lex_start_p], as determined by the corresponding
+    optional argument for functions that create lexer buffers
+    (whose default value is [true]).
+
+    When [with_positions] is [false], lexer actions should not
+    modify position fields.  Doing so nevertheless could
+    re-enable the [with_positions] mode and degrade performance.
+*)
+
 
 (** {1 Functions for lexer semantic actions} *)
 
@@ -127,16 +152,19 @@ val lexeme_end : lexbuf -> int
 
 val lexeme_start_p : lexbuf -> position
 (** Like [lexeme_start], but return a complete [position] instead
-    of an offset. *)
+    of an offset.  When position tracking is disabled, the function
+    returns [dummy_pos]. *)
 
 val lexeme_end_p : lexbuf -> position
 (** Like [lexeme_end], but return a complete [position] instead
-    of an offset. *)
+    of an offset.  When position tracking is disabled, the function
+    returns [dummy_pos]. *)
 
 val new_line : lexbuf -> unit
 (** Update the [lex_curr_p] field of the lexbuf to reflect the start
     of a new line.  You can call this function in the semantic action
-    of the rule that matches the end-of-line character.
+    of the rule that matches the end-of-line character.  The function
+    does nothing when position tracking is disabled.
     @since 3.11.0
 *)