Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make updating lex_curr_p/lex_start_p optional #1590

Merged
4 commits merged into from
May 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Changes
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ Working version

### Standard library:

- GPR#1590: ocamllex-generated lexers can be instructed not to update
their lex_curr_p/lex_start_p fields, resulting in a significant
performance gain when those fields are not required
(Alain Frisch, Jérémie Dimino)

- MPR#7795, GPR#1782: Fix off-by-one error in Weak.create
(KC Sivaramakrishnan)

Expand Down Expand Up @@ -90,7 +95,7 @@ Working version
(steinuil, review by Marcello Seri, Gabriel Scherer and Florian Angeletti)

- GPR#1797: remove the deprecated Makefile.nt files
(Sébastien Hinderer, review by Nicolas Ojeda Bar)
(Sébastien Hinderer, review by Nicolas Ojeda Bar)

### Internal/compiler-libs changes:

Expand Down
35 changes: 20 additions & 15 deletions stdlib/lexing.ml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ external c_new_engine : lex_tables -> int -> lexbuf -> int

let engine tbl state buf =
let result = c_engine tbl state buf in
if result >= 0 then begin
if result >= 0 && buf.lex_curr_p != dummy_pos then begin
buf.lex_start_p <- buf.lex_curr_p;
buf.lex_curr_p <- {buf.lex_curr_p
with pos_cnum = buf.lex_abs_pos + buf.lex_curr_pos};
Expand All @@ -73,7 +73,7 @@ let engine tbl state buf =

let new_engine tbl state buf =
let result = c_new_engine tbl state buf in
if result >= 0 then begin
if result >= 0 && buf.lex_curr_p != dummy_pos then begin
buf.lex_start_p <- buf.lex_curr_p;
buf.lex_curr_p <- {buf.lex_curr_p
with pos_cnum = buf.lex_abs_pos + buf.lex_curr_pos};
Expand Down Expand Up @@ -145,7 +145,7 @@ let zero_pos = {
pos_cnum = 0;
}

let from_function f =
let from_function ?(with_positions = true) f =
{ refill_buff = lex_refill f (Bytes.create 512);
lex_buffer = Bytes.create 1024;
lex_buffer_len = 0;
Expand All @@ -156,14 +156,14 @@ let from_function f =
lex_last_action = 0;
lex_mem = [||];
lex_eof_reached = false;
lex_start_p = zero_pos;
lex_curr_p = zero_pos;
lex_start_p = if with_positions then zero_pos else dummy_pos;
lex_curr_p = if with_positions then zero_pos else dummy_pos;
}

let from_channel ic =
from_function (fun buf n -> input ic buf 0 n)
let from_channel ?with_positions ic =
from_function ?with_positions (fun buf n -> input ic buf 0 n)

let from_string s =
let from_string ?(with_positions = true) s =
{ refill_buff = (fun lexbuf -> lexbuf.lex_eof_reached <- true);
lex_buffer = Bytes.of_string s; (* have to make a copy for compatibility
with unsafe-string mode *)
Expand All @@ -175,10 +175,12 @@ let from_string s =
lex_last_action = 0;
lex_mem = [||];
lex_eof_reached = true;
lex_start_p = zero_pos;
lex_curr_p = zero_pos;
lex_start_p = if with_positions then zero_pos else dummy_pos;
lex_curr_p = if with_positions then zero_pos else dummy_pos;
}

(* [with_positions lexbuf] is [false] exactly when the buffer's [lex_curr_p]
   field is physically the [dummy_pos] value, and [true] otherwise. *)
let with_positions lexbuf = not (lexbuf.lex_curr_p == dummy_pos)

let lexeme lexbuf =
let len = lexbuf.lex_curr_pos - lexbuf.lex_start_pos in
Bytes.sub_string lexbuf.lex_buffer lexbuf.lex_start_pos len
Expand Down Expand Up @@ -215,10 +217,12 @@ let lexeme_end_p lexbuf = lexbuf.lex_curr_p

let new_line lexbuf =
let lcp = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- { lcp with
pos_lnum = lcp.pos_lnum + 1;
pos_bol = lcp.pos_cnum;
}
if lcp != dummy_pos then
lexbuf.lex_curr_p <-
{ lcp with
pos_lnum = lcp.pos_lnum + 1;
pos_bol = lcp.pos_cnum;
}



Expand All @@ -227,5 +231,6 @@ let new_line lexbuf =
let flush_input lb =
lb.lex_curr_pos <- 0;
lb.lex_abs_pos <- 0;
lb.lex_curr_p <- {lb.lex_curr_p with pos_cnum = 0};
let lcp = lb.lex_curr_p in
if lcp != dummy_pos then lb.lex_curr_p <- {lcp with pos_cnum = 0};
lb.lex_buffer_len <- 0;
56 changes: 42 additions & 14 deletions stdlib/lexing.mli
Original file line number Diff line number Diff line change
Expand Up @@ -65,35 +65,60 @@ type lexbuf =
The lexer buffer holds the current state of the scanner, plus
a function to refill the buffer from the input.

At each token, the lexing engine will copy [lex_curr_p] to
[lex_start_p], then change the [pos_cnum] field
of [lex_curr_p] by updating it with the number of characters read
since the start of the [lexbuf]. The other fields are left
unchanged by the lexing engine. In order to keep them
Lexers can optionally maintain the [lex_curr_p] and [lex_start_p]
position fields. This "position tracking" mode is the default, and
it corresponds to passing [~with_positions:true] to functions that
create lexer buffers. In this mode, the lexing engine and lexer
actions are co-responsible for properly updating the position
fields, as described in the next paragraph. When the mode is
explicitly disabled (with [~with_positions:false]), the lexing
engine will not touch the position fields and the lexer actions
should be careful not to do it either; the [lex_curr_p] and
[lex_start_p] fields will then always hold the [dummy_pos] invalid
position. Not tracking positions avoids allocations and memory
writes and can significantly improve the performance of the lexer
in contexts where [lex_start_p] and [lex_curr_p] are not needed.

Position tracking mode works as follows. At each token, the lexing
engine will copy [lex_curr_p] to [lex_start_p], then change the
[pos_cnum] field of [lex_curr_p] by updating it with the number of
characters read since the start of the [lexbuf]. The other fields
are left unchanged by the lexing engine. In order to keep them
accurate, they must be initialised before the first use of the
lexbuf, and updated by the relevant lexer actions (i.e. at each
end of line -- see also [new_line]).
*)
lexbuf, and updated by the relevant lexer actions (i.e. at each end
of line -- see also [new_line]).
*)

val from_channel : in_channel -> lexbuf
val from_channel : ?with_positions:bool -> in_channel -> lexbuf
(** Create a lexer buffer on the given input channel.
[Lexing.from_channel inchan] returns a lexer buffer which reads
from the input channel [inchan], at the current reading position. *)

val from_string : string -> lexbuf
val from_string : ?with_positions:bool -> string -> lexbuf
(** Create a lexer buffer which reads from
the given string. Reading starts from the first character in
the string. An end-of-input condition is generated when the
end of the string is reached. *)

val from_function : (bytes -> int -> int) -> lexbuf
val from_function : ?with_positions:bool -> (bytes -> int -> int) -> lexbuf
(** Create a lexer buffer with the given function as its reading method.
When the scanner needs more characters, it will call the given
function, giving it a byte sequence [s] and a byte
count [n]. The function should put [n] bytes or fewer in [s],
starting at index 0, and return the number of bytes
provided. A return value of 0 means end of input. *)

val with_positions : lexbuf -> bool
(** Tell whether the lexer buffer keeps track of position fields
[lex_curr_p] / [lex_start_p], as determined by the corresponding
optional argument for functions that create lexer buffers
(whose default value is [true]).

When [with_positions] is [false], lexer actions should not
modify position fields. Doing it nevertheless could
re-enable the [with_positions] mode and degrade performance.
*)


(** {1 Functions for lexer semantic actions} *)

Expand Down Expand Up @@ -127,16 +152,19 @@ val lexeme_end : lexbuf -> int

val lexeme_start_p : lexbuf -> position
(** Like [lexeme_start], but return a complete [position] instead
of an offset. *)
of an offset. When position tracking is disabled, the function
returns [dummy_pos]. *)

val lexeme_end_p : lexbuf -> position
(** Like [lexeme_end], but return a complete [position] instead
of an offset. *)
of an offset. When position tracking is disabled, the function
returns [dummy_pos]. *)

val new_line : lexbuf -> unit
(** Update the [lex_curr_p] field of the lexbuf to reflect the start
of a new line. You can call this function in the semantic action
of the rule that matches the end-of-line character.
of the rule that matches the end-of-line character. The function
does nothing when position tracking is disabled.
@since 3.11.0
*)

Expand Down