Permalink
Browse files

More cleanup, some API cleanup

  • Loading branch information...
1 parent b06f461 commit 696359175df12c3776ed736348d970181b921468 @mthvedt committed Jan 20, 2013
Showing with 90 additions and 87 deletions.
  1. +1 −1 project.clj
  2. +43 −51 src/clearley/core.clj
  3. +11 −11 src/clearley/earley.clj
  4. +20 −11 src/clearley/npda.clj
  5. +4 −3 src/clearley/rules.clj
  6. +0 −6 src/clearley/utils.clj
  7. +11 −4 test/clearley/test/core.clj
View
@@ -1,4 +1,4 @@
-(defproject clearley "0.1.1.ALPHA-SNAPSHOT"
+(defproject clearley "0.0.2.SNAPSHOT"
:description "Parsing for Earthlings"
:dependencies [[org.clojure/clojure "1.4.0"]
[org.clojure/math.numeric-tower "0.0.1" :scope "test"]
View
@@ -4,10 +4,8 @@
maps to arbitrary sequences of sub-rules.
Emphasis is on completeness, modularity, and ease of use.
A functional API and a macro DSL are provided.
-
- Please be sure to see the docs for a high-level overview.
- Clearley docs assume familiarity with the library's high level concepts,
- for succintness."
+
+ See the high-level docs for a further background and overview."
(require [clojure string]
[clojure.pprint])
(use [clearley utils rules earley]))
@@ -28,46 +26,46 @@
([a-token] (token a-token a-token))
([a-token value] (rule nil [a-token] (fn [_] value))))
-(defrecord OneOrMoreImpl [subrule dot]
+(defrecord OneOrMoreImpl [subrule done]
RuleKernel
(predict [self] [(rule nil [subrule] vector)
(rule nil [self subrule] conj)])
(rule-deps [self] [subrule])
(scan [_ _] [])
- (is-complete? [_] (= dot 1))
- (advance [self] (assoc self :dot 1))
- (rule-str [_] (if (not (zero? dot))
+ (is-complete? [_] done)
+ (advance [self] (assoc self :done true))
+ (rule-str [_] (if done
(str (clause-str subrule) " *")
(clause-str subrule))))
(defn one-or-more
- "Creates a rule that matches one or more of a subrule. Its action returns a vector
+ "Creates a rule that matches one or more of a clause. Its action returns a vector
of the matches."
- ([subrule]
- (one-or-more (str (clause-str subrule) "+") subrule))
- ([name subrule]
- (merge (OneOrMoreImpl. subrule 0) {:name name, :action identity})))
+ ([clause]
+ (one-or-more (str (clause-str clause) "+") clause))
+ ([name clause]
+ (merge (OneOrMoreImpl. clause false) {:name name, :action identity})))
-(defrecord Scanner [rulefn dot]
+(defrecord Scanner [rulefn scanned]
RuleKernel
(rule-deps [_] [])
(predict [self] [])
(scan [self input-token]
(if (and (not (is-complete? self)) (rulefn input-token))
[(advance self)]
[]))
- (is-complete? [_] (= dot 1))
- (advance [self] (assoc self :dot 1))
- (rule-str [_] (if (zero? dot)
- (clause-str rulefn)
- (str (clause-str rulefn) " *"))))
+ (is-complete? [_] scanned)
+ (advance [self] (assoc self :scanned true))
+ (rule-str [_] (if scanned
+ (str (clause-str rulefn) " *")
+ (clause-str rulefn))))
(defn scanner
"Creates a rule that accepts input tokens. For a token t, if (scanner-fn t)
is logical true, this rule matches that token. The default action returns the token."
([scanner-fn] (scanner scanner-fn identity))
([scanner-fn action]
- (wrap-kernel (Scanner. scanner-fn 0) nil action)))
+ (wrap-kernel (Scanner. scanner-fn false) nil action)))
(defn char-range
"Creates a rule that accepts any one character within a given range
@@ -85,45 +83,39 @@
(and (<= intx intmax) (>= intx intmin))))
action))))
-; Don't need to expose parser protocol... only 'parse' fn
-(defprotocol ^:private Parser
+(defprotocol Parser
(parse [parser input] "Parse the given input with the given parser,
- yielding a match tree.")
- ; charts is not yet usable by external users
- (charts [parser input]))
+ yielding a match tree."))
+
+(defprotocol ChartParser
+ (print-charts [parser input] "Prints this parser's charts to *out*.
+ Format is not fixed. A good explanation of parse charts
+ (for an Earley parser, but same idea) is at
+ http://www.wikipedia.org/wiki/Earley_parser."))
-; TODO rename
-(defn earley-parser
- "Constructs an Earley parser given a map of rules,
+(defn parser
+ "Constructs a parser given a map of rules,
a goal clause, and an optional tokenizer."
([goal rules]
- (earley-parser goal identity rules))
+ (parser goal identity rules))
([goal tokenizer rules]
- (reify Parser
+ (reify
+ Parser
(parse [parser input]
- ; For now, only return first match
- ; TODO don't return on failure? why does this work?
- (first (scan-goal (last (charts parser input)))))
- (charts [parser input]
- (parse-charts input rules tokenizer goal)))))
+ ; For now, only return first match. If failure, last chart will be empty
+ (-> (parse-charts input rules tokenizer goal) last scan-goal first))
+ ChartParser
+ (print-charts [parser input]
+ (pstr-charts (parse-charts input rules tokenizer goal))))))
-; TODO should be able to use rule-str instead of clause-str.
(defn print-match
"Prints a pretty shorthand tree to *out*."
[match]
((fn f [{:keys [rule submatches]} depth]
(println (apply str (repeat depth " ")) (clause-str rule))
- (doall (map #(f % (+ depth 2)) submatches)))
+ (domap #(f % (+ depth 2)) submatches))
match 0)
- nil) ; don't return a tree full of nils!
-
-(defn print-charts
- "For a given parser and input, prints a multi-line representation of its charts to
- *out*. The representation might change in the future. For more about
- parse charts, see http://www.wikipedia.org/wiki/Earley_parser. Useful for debugging."
- [parser input]
- (dorun (for [chart (charts parser input)]
- (println (pstr chart)))))
+ nil) ; don't return a tree full of nils
(defn take-action
"Executes the parse actions for a parser match."
@@ -263,7 +255,6 @@
(declare close-rule)
-; TODO map interface perhaps
(defrecord ^:private ClosedRule [rule grammar]
RuleKernel
(predict [self]
@@ -273,12 +264,13 @@
(scan [self input-token] (map #(assoc self :rule %) (scan rule input-token)))
(is-complete? [_] (is-complete? rule))
(advance [self] (assoc self :rule (advance rule)))
- (rule-str [_] (str "\\" (rule-str rule))))
+ (rule-str [_] (str "::" (rule-str rule))))
; The docstring must precede the parameter vector: placed after it, the string is
; only a discarded body expression and never becomes the var's :doc metadata.
(defn close-rule
"Creates a rule that closes over the given grammar. This rule
- can be used as a rule in other grammars, while being unaffected by that grammar."
- ; TODO closed rule name?
+ can be used as a rule in other grammars, while being unaffected by that grammar.
+
+ Closed rules are indicated in charts with a :: prefix."
[goal grammar]
(let [goal-rule (to-rule goal)]
(assoc goal-rule :kernel (ClosedRule. goal-rule grammar))))
@@ -293,4 +285,4 @@
(defn build-parser-with-ns
"Build a parser in a given ns from the given goal rule and tokenizer."
[goal tokenizer thens]
- (earley-parser goal tokenizer (build-grammar-with-ns goal thens)))
+ (parser goal tokenizer (build-grammar-with-ns goal thens)))
View
@@ -1,3 +1,4 @@
+; TODO rename
(ns clearley.earley
(require [clearley.collections.ordered-set :as os]
[clearley.collections.ordered-multimap :as omm]
@@ -16,8 +17,8 @@
; original: the original (unadvanced) rule, used to populate matches
; match-count: the number of times this rule has been scanned or advanced
(defrecord Item [name rule original match-count]
- PStrable
- (pstr [_]
+ npda/IPrinting
+ (npda/pstr [_]
(str name " -> " (rule-str rule))))
(defn new-item [name clause]
@@ -42,9 +43,9 @@
(defn pstr-item-set-item [item predictor-map]
(let [predictor-str
- (cutoff (separate-str ", " (map pstr (omm/get-vec predictor-map
- (:original item)))))]
- (str (pstr item) (if (seq predictor-str) (str " | " predictor-str)))))
+ (cutoff (separate-str ", " (map npda/pstr (omm/get-vec predictor-map
+ (:original item)))))]
+ (str (npda/pstr item) (if (seq predictor-str) (str " | " predictor-str)))))
(declare shift-item-set reduce-item-set item-set-reductions)
@@ -55,8 +56,8 @@
(npda/shift [self input] (shift-item-set self input))
(npda/reduce [self output] (reduce-item-set self output))
(npda/reductions [self] (item-set-reductions self))
- PStrable
- (pstr [self]
+ npda/IPrinting
+ (npda/pstr [self]
(with-out-str
(runmap println (map #(pstr-item-set-item % predictor-map) items)))))
@@ -121,14 +122,13 @@
; Folds the output stream with reduce-ostream-helper, seeded with an empty list,
; and returns the first element of the folded result.
(defn reduce-ostream [ostream]
  (->> ostream
       (reduce reduce-ostream-helper '())
       first))
-#_(defn parse [input-str grammar tokenizer goal]
- (npda/run-automaton-2 (new-item-set [(new-item ::goal goal)] grammar)
- input-str tokenizer))
-
; Builds the starting item set (a single ::goal item for the goal clause, over the
; given grammar) and runs the npda automaton on the input with the tokenizer.
; Returns whatever npda/run-automaton returns.
(defn parse-charts [input-str grammar tokenizer goal]
  (let [goal-item (new-item ::goal goal)
        start-set (new-item-set [goal-item] grammar)]
    (npda/run-automaton start-set input-str tokenizer)))
+(defn pstr-charts [charts]
+ (dorun (map-> charts npda/pstr println)))
+
; Searches states for completed parse of the goal rule, returning all matches
(defn scan-goal [chart]
(map (fn-> npda/popone npda/stream reduce-ostream)
View
@@ -5,7 +5,9 @@
[clojure.core :as core])
(use clearley.utils))
-;(defrecord Reduction [rule count])
+; For use in print-charts &c.
+(defprotocol IPrinting
+ (pstr [obj]))
(defprotocol Node
(shift [self input])
@@ -92,7 +94,7 @@
; doall prevents lazy my-prevs explosions with very large numbers of states
; for some reason
(AState. node my-rstream (doall (concat my-prevs op)))));)
- PStrable
+ IPrinting
(pstr [self]
(with-out-str
(println "State" (hexhash (state-key self)))
@@ -127,11 +129,13 @@
(add-state [_ new-state]
(AChart. (om/assoc my-states (state-key new-state) new-state)))
(states [_] (om/vals my-states))
- PStrable
+ IPrinting
(pstr [self]
(with-out-str
(println "===")
- (print (separate-str "---\n" (map pstr (states self))))
+ (if (seq (states self))
+ (print (separate-str "---\n" (map pstr (states self))))
+ (print "(empty)\n"))
(println "==="))))
; A chart constructed from om/empty, i.e. one holding no states.
(def empty-chart (AChart. om/empty))
@@ -170,19 +174,24 @@
; Advances a chart by one token: shift-chart on the token/input first, then
; reduce-chart on the shifted result.
(defn process-chart [chart token input]
  (-> chart
      (shift-chart token input)
      reduce-chart))
-; TODO test laziness
+; Laziness knocks the big-O down a notch
+; but doesn't get us to best-case O(n^2)--O(1) for CLR(k) grammars
+; because we store matches in the chart. Push parsing could be added in the future
+; to accomplish this.
(defn run-automaton-helper [input current-chart tokenizer]
- (lazy-seq
- (when-let [thechar (first input)]
- (let [next-chart (process-chart current-chart (tokenizer thechar) thechar)]
- (if (seq (states next-chart))
- (cons next-chart (run-automaton-helper (rest input) next-chart tokenizer))
- (list next-chart))))))
+ (cons current-chart
+ (lazy-seq
+ (when-let [thechar (first input)]
+ (let [next-chart (process-chart current-chart (tokenizer thechar) thechar)]
+ (if (seq (states next-chart))
+ (run-automaton-helper (rest input) next-chart tokenizer)
+ (list next-chart))))))) ; Puts an empty chart at the end
; Runs the automaton over input, starting from the initial chart built for
; initial-node. Returns the sequence of charts from run-automaton-helper.
(defn run-automaton [initial-node input tokenizer]
  (let [start-chart (initial-chart initial-node)]
    (run-automaton-helper input start-chart tokenizer)))
+; Saved for later
#_(defn fast-run-automaton [initial-node input tokenizer]
(loop [remaining-input input
current-chart (initial-chart initial-node)]
View
@@ -3,7 +3,7 @@
(use clearley.utils)
(require clojure.string))
-; TODO merge into core? simplify rule kernel(?)
+; TODO merge into core? simplify rule kernel?
; A node of the match tree: the matched rule plus the matches of its sub-parts.
(defrecord Match [rule submatches])
(defn match [rule submatches]
@@ -119,7 +119,6 @@
; Rules
; ===
-; TODO rule attrs/metadata?
(defrecord RuleImpl [kernel name action]
RuleKernel
(predict [self] (predict kernel))
@@ -155,4 +154,6 @@
(rule-str [_] (cfg-rule-str clauses dot)))
(defn context-free-rule [name clauses action]
- (wrap-kernel (CfgRule. (vec clauses) 0) name action))
+ (if (= (count clauses) 0)
+ (TIAE "Clauses cannot be empty")
+ (wrap-kernel (CfgRule. (vec clauses) 0) name action)))
View
@@ -1,12 +1,6 @@
(ns clearley.utils)
; Some utils used by Clearley.
-; TODO: get rid of this protocol?
-; or maybe multimethod
-(defprotocol PStrable
- (pstr [obj] "pstr stands for \"pretty-string\".
- Returns a shorthand str of this item."))
-
; Expands to a form that throws a new instance of extype, constructed with a
; single message argument: the str-concatenation of strs.
(defmacro thrownew [extype & strs]
`(throw (new ~extype (str ~@strs))))
@@ -16,7 +16,7 @@
:num [num1 (rulefn :num \2) (rulefn :num \3) (rulefn :num \4)
(rulefn :num \5 \5) (rule :num "777" nil)]})
-(def simple-parser (earley-parser :sum simple-parser-rules))
+(def simple-parser (parser :sum simple-parser-rules))
(def-parser-test basic-parser-test simple-parser
(is-parsing "1+2")
@@ -54,7 +54,7 @@
(char (- (int thechar) 48))
thechar))
-(def letter-to-num-parser (earley-parser :sum letter-to-num simple-parser-rules))
+(def letter-to-num-parser (parser :sum letter-to-num simple-parser-rules))
(def-parser-test basic-tokenizer-test letter-to-num-parser
(is-ast [[[\a]]] "a")
@@ -70,7 +70,7 @@
:num [(rule :num [\2] (fn [_] 2))
(rule :num [\3] (fn [_] 3))]})
-(def calculator-parser (earley-parser :sum calculator-rules))
+(def calculator-parser (parser :sum calculator-rules))
(def-parser-test calculator-test calculator-parser
(is-action 5 "2+3")
@@ -81,7 +81,7 @@
(def embedded-rules
{:a [(rule :a [\a [\b \c] (rule :d [\d] nil)] nil)]})
-(def embedded-rule-parser (earley-parser :a embedded-rules))
+(def embedded-rule-parser (parser :a embedded-rules))
(def-parser-test rule-embedding-test embedded-rule-parser
(is-parsing "abd")
@@ -186,3 +186,10 @@
(isnt (parses? "4+4"))
(is-action 19 "2*3+2*2+3*3")
(isnt (parses? "0+1*2+3*4+9"))))
+
+(def one-or-more-s (one-or-more \s))
+(def one-or-more-test-parser (build-parser one-or-more-s))
+(deftest special-rules-test
+ (with-parser one-or-more-test-parser
+ (is-parsing "sssss")
+ (isnt (parses? "sssst"))))

0 comments on commit 6963591

Please sign in to comment.