Permalink
Browse files

Full cleanup

  • Loading branch information...
1 parent b69ea68 commit 2cb9b162ed27a702e0f6f0dedcde52cfc99dea8e @mthvedt committed Jun 10, 2012
Showing with 89 additions and 53 deletions.
  1. +2 −3 .gitignore
  2. +11 −3 TODO.md
  3. +38 −32 src/clearley/core.clj
  4. +1 −3 test/clearley/examples/calculator.clj
  5. +30 −11 test/clearley/test/core.clj
  6. +7 −1 test/clearley/test/utils.clj
View
@@ -1,6 +1,5 @@
lib/*
+classes/*
target/*
+pom.xml
.lein*
-*.swp
-*.iml
-.idea/*
View
14 TODO.md
@@ -7,11 +7,19 @@ Alpha:
Beta:
+* Grammar composition.
* JSON example.
* Clean up the software internals.
-* Work on defrule syntax. Eventually I want to make Clearley self-hosting, as a POC.
+* Work on defrule syntax. Eventually I want to make Clearley self-hosting--
+use a parser to define defrule, instead of about 90 lines of macros.
+Would make a convincing POC for the 'parse any input' thesis!
-Release:
+More:
-* Parser NDFA.
+* EarleyItem protocol and polymorphism. Leverage the JVM for max speed and flexibility.
+* Expose EarleyItem protocol. Allow extension of the rule mechanism.
+Infinitely-generated context-free grammars in O(n^3) time. Noam Chomsky can eat it!
+The EarleyItem protocol must be defined with building a LR-NDFA in mind, leading to...
+* Parser NDFA. This will be a huge performance win. See 'Practical Earley Parsing',
+Aycock & Horspool 2002, which shows that its performance compares favorably to standard automata.
* Also test reliable disambiguation (LR vs LL in particular) when the above is implemented.
View
@@ -4,8 +4,7 @@
Emphasis is on ease of use, versatility, and dynamic/exploratory programming."
(:require (clojure string))
(:use clearley.utils))
-; TODO: test compositability/extensibility
-; TODO: expose grammar
+; TODO: empty rule?
(defprotocol ^:private PStrable
(^:private pstr [obj] "pstr stands for \"pretty-string\".
@@ -18,18 +17,17 @@
after matching. A rule has a required vector of clauses,
a head (optional, since Rules can also be embedded in other rules),
and an optional action (the default action bundles the args into a list).
- A clause can be a rule head referring to one or more rules,
- or a seq of one or more rules (anonymous rules)."
+ A clause can be a rule, a symbol referring to one or more rules,
+ or a vector or seq of one or more rules (anonymous rules)."
([clauses] (rule nil clauses nil))
([head clauses] (rule head clauses nil))
([head clauses action] (Rule. head (vec clauses) action)))
(defn pstr-rule [rule]
(str (:head rule) " -> " (separate-str (:clauses rule) " ")))
-; TODO: better token fns
(defn token
- "A rule that matches a single object (the token). Its action by default
+ "Returns a rule that matches a single object (the token). Its action by default
returns the token but can also return some specified value."
([a-token] (rule nil [a-token] (fn [_] a-token)))
([a-token value] (rule nil [a-token] (fn [_] value))))
@@ -39,7 +37,6 @@
The scanner function is used by the parser to match tokens. If this rule is invoked
on a token, and the scanner returns logical true, the rule matches the token."
[scanner-fn action]
- ; TODO: test scanners
; TODO: a hack here: the below clause is highly unlikely to match anything
(assoc (Rule. nil [(str "Scanner<" scanner-fn ">")] action) :scanner scanner-fn))
@@ -68,14 +65,11 @@
clause
(get grammar clause [])))
-; TODO: polymorphism. EarleyItem can be much faster.
-; But the parser automaton should come first, since parser automata are big perf win
-; and whatever polymorphism EarleyItme has should be tailored to that.
(defprotocol ^:private EarleyItem
- (predict [self index])
- (escan [self input-token])
- (is-complete? [self])
- (advance [self]))
+ (^:private predict [self index])
+ (^:private escan [self input-token])
+ (^:private is-complete? [self])
+ (^:private advance [self]))
(defrecord ^:private REarleyItem [rule dot index grammar]
EarleyItem
@@ -106,9 +100,9 @@
" ")))
(defprotocol ^:private ChartItem
- (cpredict [self pos])
- (cscan [self input-token input])
- (emerge [self other-item]))
+ (^:private cpredict [self pos])
+ (^:private cscan [self input-token input])
+ (^:private emerge [self other-item]))
; Builds a rule match from the output stack and pushes the match to the top
; (think of a Forth operator reducing the top of a stack)
@@ -145,13 +139,14 @@
(defn- chart-item [rule grammar]
(RChartItem. (REarleyItem. rule 0 0 grammar) (atom {}) '()))
-; TODO: nuke this protocol
+; TODO: nuke this protocol, have data object chart
+; data object charts can also serve as prototypes of parsing NDFA states
(defprotocol Chart
- (add [self item])
- (cfirst [self])
- (crest [self])
- (reset [self])
- (chart-seq [self]))
+ (^:private add [self item])
+ (^:private cfirst [self])
+ (^:private crest [self])
+ (^:private reset [self])
+ (^:private chart-seq [self]))
(defrecord RChart [chartvec chartmap dot]
Chart
@@ -176,7 +171,7 @@
(apply str (update-in (vec (map #(str (pstr %) "\n") chartvec))
[dot] #(str "* " %))))))
-(def new-chart (RChart. [] {} 0))
+(def ^:private new-chart (RChart. [] {} 0))
; scans an input character, seeding a new chart
(defn- scan-chart [chart input-token input]
@@ -207,7 +202,8 @@
; end step
(conj charts (parse-chart current-chart (inc pos))))))
-(defprotocol Parser
+; Don't need to expose parser protocol... only 'parse' fn
+(defprotocol ^:private Parser
(parse [parser input] "Parse the given input with the given parser,
yielding a match tree (a tree of the form
[rule leaves] where leaves is a seq).")
@@ -222,8 +218,9 @@
(defn earley-parser
"Constructs an Earley parser, provided with a seq of rules and a predefined
goal symbol. The parser will attempt to match the given input to the goal symbol,
- given the rules provided. The tokenizer should be a fn that maps input objects
- to the input tokens used in your grammar."
+ given the rules provided. The optional tokenizer can be used to map inputs
+ to the terminal rules of your grammar (the parse tree will contain inputs
+ as its leaves, not the terminal symbols)."
([goal rules]
(earley-parser goal identity rules))
([goal tokenizer rules]
@@ -249,8 +246,10 @@
match)))
(defn print-charts
- "For a givne parser and input, prints a string representation of its charts to
- *out*."
+ "For a given parser and input, prints a multi-line representation of its charts to
+ *out*. The representation might change in the future. For more about
+ parse charts, see http://www.wikipedia.org/wiki/Earley_parser. Primarily
+ useful for debugging."
[parser input]
(dorun (for [chart (charts parser input)]
(println (pstr chart)))))
@@ -377,9 +376,16 @@
(not (symbol? current-head)) (recur (rest stack) rgrammar)
true ; rule is a symbol--look it up
(if-let [resolved (ns-resolve thens theenv current-head)]
- (recur (concat (mapcat :clauses @resolved) stack)
- (assoc rgrammar current-head
- (map #(assoc % :head current-head) @resolved)))
+ (let [resolved @resolved]
+ (if (or (vector? resolved) (seq? resolved))
+ ; assume it is a seq of rules
+ (recur (concat (mapcat :clauses resolved) stack)
+ (assoc rgrammar current-head
+ (map #(assoc % :head current-head) resolved)))
+ ; assume it is a rule
+ (recur (cons resolved stack)
+ (assoc rgrammar current-head
+ [(assoc resolved :head current-head)]))))
(TIAE "Cannot resolve rule for head: " current-head))))))
(defn- build-grammar-in-env
@@ -28,9 +28,7 @@
; TODO: error when it's numexpr not number?
([number digit] (+ (* 10 number) digit))
([digit] digit))
-; todo: 'or-token' or something instead
-(defrule digit [(a-digit (map (comp token char) (range (int \0) (inc (int \9)))))]
- (- (int a-digit) (int \0)))
+(def digit (token-range \0 \9 (fn [c] (- (int c) (int \0)))))
(def my-calculator (build-parser sum))
@@ -1,10 +1,8 @@
(ns clearley.test.core
(:use clearley.core clearley.test.utils lazytest.deftest))
+; Some basic tests
(defn rulefn
- "Creates a context-free grammar rule that matches the first given symbol
- (the head symbol) to a sequence of subsymbols (the clauses).
- Any object may be a symbol."
[head & clauses]
(rule head clauses))
@@ -26,19 +24,20 @@
(deftest simple-parser-test
(with-parser simple-parser
- (is (parses? "1+2"))
- (is (parses? "1+2*3+4"))
- (is (parses? "1*2+3*4"))
- (is (parses? "1+55*3+2*55"))
+ (is-parsing "1+2")
+ (is-parsing "1+2*3+4")
+ (is-parsing "1*2+3*4")
+ (is-parsing "1+55*3+2*55")
(is (not (parses? "44")))
(is (not (parses? "55*23")))
(is (not (parses? "1+2a")))
- (is (parses? "1+55*2*55+3+55*4"))
+ (is-parsing "1+55*2*55+3+55*4")
(is-ast [[[\1]]] "1")
(is-ast [[[[\2]]] \+ [[[\3]] \* [\4]]] "2+3*4")
(is-ast [[[[[\1]]] \+ [[[\2]] \* [\3]]] \+ [[[\4]] \* [\1]]] "1+2*3+4*1")
(is-ast [[[\5 \5]]] "55")))
+; Slightly less basic tests
(deftest simple-match-test
(with-parser simple-parser
(is-parse [sum2 [(rulefn :times :num) [num1 [\1]]]] "1")
@@ -48,10 +47,12 @@
[(rulefn :times :num) [(rulefn :num \2) [\2]]]
[\*] [(rulefn :num \3) [\3]]]]
[\+]
- [(rulefn :times :times \* :num) [(rulefn :times :num) [(rulefn :num \4) [\4]]]
+ [(rulefn :times :times \* :num) [(rulefn :times :num)
+ [(rulefn :num \4) [\4]]]
[\*] [(rulefn :num \5 \5) [\5] [\5]]]]
"1+2*3+4*55")))
+; Tokenizers
(defn letter-to-num [thechar]
(if (java.lang.Character/isLetter thechar)
(char (- (int thechar) 48))
@@ -65,6 +66,7 @@
(is-ast [[[[[\a]]] \+ [[[\2]] \* [\c]]] \+ [[[\d]] \* [\1]]] "a+2*c+d*1")
(is-parse [sum2 [(rulefn :times :num) [num1 [\a]]]] "a")))
+; Action tests
(def calculator-rules
[(rule :sum [:sum \+ :times] (fn [a _ b] (+ a b)))
(rule :sum [:times] identity)
@@ -81,6 +83,19 @@
(is-action 6 "2*3")
(is-action 19 "2*3+2*2+3*3")))
+; Rule embedding
+(def weird-rules
+ [(rule :a [\a [\b \c] (rule :d [\d])])])
+
+(def weird-rule-parser (earley-parser :a weird-rules))
+
+; Checks that clauses may embed rules directly: the vector clause [\b \c] is
+; expected to match either \b or \c (not both), and the inline (rule :d [\d])
+; is expected to match \d. Each check is wrapped in an assertion so a wrong
+; parse result actually fails the test instead of being silently discarded.
+(deftest rule-embedding-test
+ (with-parser weird-rule-parser
+ (is-parsing "abd")
+ (is-parsing "acd")
+ (is (not (parses? "abcd")))))
+
+; Test of defrule
(defrule sum
([sum \+ times] (+ sum times))
([times] times))
@@ -98,6 +113,7 @@
(is-action 6 "3+3")
(is-action 15 "3+3*3+3")))
+; Extending rules
(extend-rule digit [\4] 4)
(def parser3 (build-parser sum))
@@ -106,13 +122,15 @@
(is-action 7 "3+4")
(is-action 12 "3*4")))
+; Rule aliasing
(extend-rule sum [sum \- (foo times)] (- sum foo))
(def parser4 (build-parser sum))
(deftest rule-aliasing-test
(with-parser parser4
(is-action 0 "3-3")))
+; Rule literals in defrule
(def digits567 [(token \5 5) (token \6 6) (token \7 7)])
(extend-rule digit
([digits567] digits567)
@@ -131,7 +149,7 @@
(is (with-out-str
(print-charts parser5 "3*4+5-6+7"))))
-; TODO: single rule literal not in vector
+; Scanners
(add-rules digit (scanner #(= \0 %) (fn [_] 0)))
(def parser6 (build-parser sum))
@@ -140,8 +158,9 @@
(is-action 3 "0+3")
(is-action 1 "3+0*5*4+0+3-5")))
+; Token ranges
; should override digit
-(def digit [(token-range \0 \9 (fn [c] (- (int c) (int \0))))])
+(def digit (token-range \0 \9 (fn [c] (- (int c) (int \0)))))
(def parser7 (build-parser sum))
(deftest token-range-test
@@ -4,6 +4,9 @@
(defmacro is= [& forms]
`(is (= ~@forms)))
+(defmacro isnt [& forms]
+ `(is (not (= ~@forms))))
+
(def ^:dynamic local-parser)
(defmacro with-parser [parser & forms]
@@ -15,6 +18,9 @@
(defn parses? [input]
(not (nil? (parse local-parser input))))
+(defmacro is-parsing [input]
+ `(is (parses? ~input)))
+
; valued trees of the form (value & branches)
; necessary for comparing heterogeneous seqables (here, vec vs lazy-seq)
(defn tree-eq [tree1 tree2]
@@ -27,7 +33,7 @@
(reduce #(and % %2) true (map tree-eq (rest tree1) (rest tree2))))))
(deftest tree-eq-test
- ; only test for falsehoods here... avoid false positives in later testing
+ ; only need to test for falsehoods here... avoid false positives
(is (not (tree-eq [\1] [\2])))
(is (not (tree-eq [] [[]])))
(is (not (tree-eq [\1 []] [\1])))

0 comments on commit 2cb9b16

Please sign in to comment.