forked from colinbenner/ocaml-llvm
/
lexgen.ml
241 lines (208 loc) · 6.8 KB
/
lexgen.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1996 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed *)
(* under the terms of the Q Public License version 1.0. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* Compiling a lexer definition *)
open Syntax
(* Deep abstract syntax for regular expressions *)
type regexp =
Empty
| Chars of int
| Action of int
| Seq of regexp * regexp
| Alt of regexp * regexp
| Star of regexp
type ('args,'action) lexer_entry =
{ lex_name: string;
lex_args: 'args;
lex_regexp: regexp;
lex_actions: (int * 'action) list }
(* Representation of automata *)
type automata =
Perform of int
| Shift of automata_trans * automata_move array
and automata_trans =
No_remember
| Remember of int
and automata_move =
Backtrack
| Goto of int
(* Representation of entry points *)
type ('args,'action) automata_entry =
{ auto_name: string;
auto_args: 'args;
auto_initial_state: int;
auto_actions: (int * 'action) list }
(* From shallow to deep syntax *)
let chars = ref ([] : int list list)
let chars_count = ref 0
let rec encode_regexp = function
Epsilon -> Empty
| Characters cl ->
let n = !chars_count in
chars := cl :: !chars;
incr chars_count;
Chars(n)
| Sequence(r1,r2) ->
Seq(encode_regexp r1, encode_regexp r2)
| Alternative(r1,r2) ->
Alt(encode_regexp r1, encode_regexp r2)
| Repetition r ->
Star (encode_regexp r)
let encode_casedef casedef =
List.fold_left
(fun (reg,actions,count) (expr, act) ->
Alt(reg, Seq(encode_regexp expr, Action count)),
(count,act) :: actions,
(succ count)
)
(Empty, [], 0)
casedef
let encode_lexdef def =
chars := [];
chars_count := 0;
let entry_list =
List.map
(fun ((entry_name,args), casedef) ->
let (re,actions,_) = encode_casedef casedef in
{ lex_name = entry_name;
lex_args = args;
lex_regexp = re;
lex_actions = List.rev actions })
def in
let chr = Array.of_list (List.rev !chars) in
chars := [];
(chr, entry_list)
(* To generate directly a NFA from a regular expression.
Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
type transition =
OnChars of int
| ToAction of int
module TransSet =
Set.Make(struct type t = transition let compare = compare end)
let rec nullable = function
Empty -> true
| Chars _ -> false
| Action _ -> false
| Seq(r1,r2) -> nullable r1 && nullable r2
| Alt(r1,r2) -> nullable r1 || nullable r2
| Star r -> true
let rec firstpos = function
Empty -> TransSet.empty
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
| Action act -> TransSet.add (ToAction act) TransSet.empty
| Seq(r1,r2) -> if nullable r1
then TransSet.union (firstpos r1) (firstpos r2)
else firstpos r1
| Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
| Star r -> firstpos r
let rec lastpos = function
Empty -> TransSet.empty
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
| Action act -> TransSet.add (ToAction act) TransSet.empty
| Seq(r1,r2) -> if nullable r2
then TransSet.union (lastpos r1) (lastpos r2)
else lastpos r2
| Alt(r1,r2) -> TransSet.union (lastpos r1) (lastpos r2)
| Star r -> lastpos r
let followpos size entry_list =
let v = Array.create size TransSet.empty in
let fill_pos first = function
OnChars pos -> v.(pos) <- TransSet.union first v.(pos)
| ToAction _ -> () in
let rec fill = function
Seq(r1,r2) ->
fill r1; fill r2;
TransSet.iter (fill_pos (firstpos r2)) (lastpos r1)
| Alt(r1,r2) ->
fill r1; fill r2
| Star r ->
fill r;
TransSet.iter (fill_pos (firstpos r)) (lastpos r)
| _ -> () in
List.iter (fun entry -> fill entry.lex_regexp) entry_list;
v
let no_action = max_int
let split_trans_set trans_set =
TransSet.fold
(fun trans (act, pos_set as act_pos_set) ->
match trans with
OnChars pos -> (act, pos :: pos_set)
| ToAction act1 -> if act1 < act then (act1, pos_set) else act_pos_set)
trans_set
(no_action, [])
module StateMap =
Map.Make(struct type t = TransSet.t let compare = TransSet.compare end)
let state_map = ref (StateMap.empty : int StateMap.t)
let todo = (Stack.create() : (TransSet.t * int) Stack.t)
let next_state_num = ref 0
let reset_state_mem () =
state_map := StateMap.empty;
Stack.clear todo;
next_state_num := 0
let get_state st =
try
StateMap.find st !state_map
with Not_found ->
let num = !next_state_num in
incr next_state_num;
state_map := StateMap.add st num !state_map;
Stack.push (st, num) todo;
num
let map_on_all_states f =
let res = ref [] in
begin try
while true do
let (st, i) = Stack.pop todo in
let r = f st in
res := (r, i) :: !res
done
with Stack.Empty -> ()
end;
!res
let goto_state st =
if TransSet.is_empty st then Backtrack else Goto (get_state st)
let transition_from chars follow pos_set =
let tr = Array.create 257 TransSet.empty in
let shift = Array.create 257 Backtrack in
List.iter
(fun pos ->
List.iter
(fun c ->
tr.(c) <- TransSet.union tr.(c) follow.(pos))
chars.(pos))
pos_set;
for i = 0 to 256 do
shift.(i) <- goto_state tr.(i)
done;
shift
let translate_state chars follow state =
match split_trans_set state with
(n, []) -> Perform n
| (n, ps) -> Shift((if n = no_action then No_remember else Remember n),
transition_from chars follow ps)
let make_dfa lexdef =
let (chars, entry_list) = encode_lexdef lexdef in
let follow = followpos (Array.length chars) entry_list in
reset_state_mem();
let initial_states =
List.map
(fun le ->
{ auto_name = le.lex_name;
auto_args = le.lex_args;
auto_initial_state = get_state(firstpos le.lex_regexp);
auto_actions = le.lex_actions })
entry_list in
let states = map_on_all_states (translate_state chars follow) in
let actions = Array.create !next_state_num (Perform 0) in
List.iter (fun (act, i) -> actions.(i) <- act) states;
reset_state_mem();
(initial_states, actions)