diff --git a/doc/lib.txt b/doc/lib.txt index 5ff6de7fd089..e24db97e0492 100644 --- a/doc/lib.txt +++ b/doc/lib.txt @@ -87,7 +87,7 @@ Collections and algorithms * `sequtils `_ This module implements operations for the built-in seq type which were inspired by functional programming languages. - + String handling --------------- @@ -100,6 +100,9 @@ String handling * `parseutils `_ This module contains helpers for parsing tokens, numbers, identifiers, etc. +* `strscans `_ + This module contains a ``scanf`` macro for convenient parsing of mini languages. + * `strtabs `_ The ``strtabs`` module implements an efficient hash table that is a mapping from strings to strings. Supports a case-sensitive, case-insensitive and diff --git a/lib/pure/parseutils.nim b/lib/pure/parseutils.nim index ffa234edddc2..fb7d72182d89 100644 --- a/lib/pure/parseutils.nim +++ b/lib/pure/parseutils.nim @@ -173,6 +173,22 @@ proc parseUntil*(s: string, token: var string, until: char, result = i-start token = substr(s, start, i-1) +proc parseUntil*(s: string, token: var string, until: string, + start = 0): int {.inline.} = + ## parses a token and stores it in ``token``. Returns + ## the number of the parsed characters or 0 in case of an error. A token + ## consists of any character that comes before the `until` token. + var i = start + while i < s.len: + if s[i] == until[0]: + var u = 1 + while i+u < s.len and u < until.len and s[i+u] == until[u]: + inc u + if u >= until.len: break + inc(i) + result = i-start + token = substr(s, start, i-1) + proc parseWhile*(s: string, token: var string, validChars: set[char], start = 0): int {.inline.} = ## parses a token and stores it in ``token``. 
Returns diff --git a/lib/pure/strscans.nim b/lib/pure/strscans.nim new file mode 100644 index 000000000000..72719e3020d8 --- /dev/null +++ b/lib/pure/strscans.nim @@ -0,0 +1,296 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2016 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +##[ +This module contains a `scanf`:idx: macro that can be used for extracting +substrings from an input string. This is often easier than regular expressions. +Some examples as an appetizer: + +.. code-block:: nim + # check if input string matches a triple of integers: + const input = "(1,2,4)" + var x, y, z: int + if scanf("($i,$i,$i)", input, x, y, z): + echo "matches and x is ", x, " y is ", y, " z is ", z + + # check if input string matches an ISO date followed by an identifier followed + # by whitespace and a floating point number: + var year, month, day: int + var identifier: string + var myfloat: float + if scanf("$i-$i-$i $w$s$f", input, year, month, day, identifier, myfloat): + echo "yes, we have a match!" + +As can be seen from the examples, strings are matched verbatim except for +substrings starting with ``$``. These constructions are available: + +================= ======================================================== +``$i`` Matches an integer. This uses ``parseutils.parseInt``. +``$f`` Matches a floating point number. Uses ``parseFloat``. +``$w`` Matches an ASCII identifier: ``[A-Za-z_][A-Za-z_0-9]*``. +``$s`` Skips optional whitespace. +``$$`` Matches a single dollar sign. +``$.`` Matches if the end of the input string has been reached. +``$*`` Matches until the token following the ``$*`` was found. + The match is allowed to be of 0 length. +``$+`` Matches until the token following the ``$+`` was found. + The match must consist of at least one char. +``${foo}`` User defined matcher. Uses the proc ``foo`` to perform + the match. See below for more details.
+``$[foo]`` Call user defined proc ``foo`` to **skip** some optional + parts in the input string. See below for more details. +================= ======================================================== + +Even though ``$*`` and ``$+`` look similar to the regular expressions ``.*`` +and ``.+`` they work quite differently, there is no non-deterministic +state machine involved and the matches are non-greedy. ``[$*]`` +matches ``[xyz]`` via ``parseutils.parseUntil``. + +Furthermore no backtracking is performed, if parsing fails after a value +has already been bound to a matched subexpression this value is not restored +to its original value. This rarely causes problems in practice and if it does +for you, it's easy enough to bind to a temporary variable first. + + +Startswith vs full match +======================== + +``scanf`` returns true if the input string **starts with** the specified +pattern. If instead it should only return true if there is also nothing +left in the input, append ``$.`` to your pattern. + + +User definable matchers +======================= + +One very nice advantage over regular expressions is that ``scanf`` is +extensible with ordinary Nim procs. The proc is either enclosed in ``${}`` +or in ``$[]``. ``${}`` matches and binds the result +to a variable (that was passed to the ``scanf`` macro) while ``$[]`` merely +skips optional tokens. + + +In this example, we define a helper proc ``someSep`` that skips some separators +which we then use in our scanf pattern to help us in the matching process: + +.. code-block:: nim + + proc someSep(input: string; start: int; seps: set[char] = {':','-','.'}): int = + # Note: The parameters and return value must match what ``scanf`` requires + result = 0 + while input[start+result] in seps: inc result + + if scanf("$w$[someSep]$w", input, key, value): + ... + +It is also possible to pass arguments to a user definable matcher: + +..
code-block:: nim + + proc ndigits(input: string; start: int; intVal: var int; n: int): int = + # matches exactly ``n`` digits. Matchers need to return 0 if nothing + # matched or otherwise the number of processed chars. + var x = 0 + var i = 0 + while i < n and i+start < input.len and input[i+start] in {'0'..'9'}: + x = x * 10 + input[i+start].ord - '0'.ord + inc i + # only overwrite if we had a match + if i == n: + result = n + intVal = x + + # match an ISO date extracting year, month, day at the same time. + # Also ensure the input ends after the ISO date: + var year, month, day: int + if scanf("${ndigits(4)}-${ndigits(2)}-${ndigits(2)}$.", "2013-01-03", year, month, day): + ... + +]## + + +import macros, parseutils + +proc conditionsToIfChain(n, idx, res: NimNode; start: int): NimNode = + assert n.kind == nnkStmtList + if start >= n.len: return newAssignment(res, newLit true) + var ifs: NimNode = nil + if n[start+1].kind == nnkEmpty: + ifs = conditionsToIfChain(n, idx, res, start+3) + else: + ifs = newIfStmt((n[start+1], + newTree(nnkStmtList, newCall(bindSym"inc", idx, n[start+2]), + conditionsToIfChain(n, idx, res, start+3)))) + result = newTree(nnkStmtList, n[start], ifs) + +proc notZero(x: NimNode): NimNode = newCall(bindSym"!=", x, newLit 0) + +proc buildUserCall(x: string; args: varargs[NimNode]): NimNode = + let y = parseExpr(x) + result = newTree(nnkCall) + if y.kind in nnkCallKinds: result.add y[0] + else: result.add y + for a in args: result.add a + if y.kind in nnkCallKinds: + for i in 1..=", idx, newCall(bindSym"len", input)) + else: + result.add res + +when isMainModule: + proc twoDigits(input: string; x: var int; start: int): int = + if input[start] == '0' and input[start+1] == '0': + result = 2 + x = 13 + else: + result = 0 + + proc someSep(input: string; start: int; seps: set[char] = {';',',','-','.'}): int = + result = 0 + while input[start+result] in seps: inc result + + var key, val: string + var intval: int + var floatval: float + doAssert 
scanf("$w$s::$s$w$s$i $f", "abc:: xyz 89 33.25", key, val, intval, floatVal) + doAssert key == "abc" + doAssert val == "xyz" + doAssert intval == 89 + doAssert floatVal == 33.25 + + let xx = scanf("$$$i", "$abc", intval) + doAssert xx == false + + + let xx2 = scanf("$$$i", "$1234", intval) + doAssert xx2 + + let yy = scanf("$[someSep]Breakpoint${twoDigits}$[someSep({';','.','-'})] [$+]$.", ";.--Breakpoint00 [output]", intVal, key) + doAssert yy + doAssert key == "output" + doAssert intVal == 13 diff --git a/web/news.txt b/web/news.txt index c9561c7a5bec..b6ce533c8d40 100644 --- a/web/news.txt +++ b/web/news.txt @@ -45,6 +45,9 @@ Library Additions - The rlocks module has been added providing reentrant lock synchronization primitive. - A generic "sink operator" written as ``&=`` has been added to the ``system`` and the ``net`` modules. +- Added ``strscans`` module that implements a ``scanf`` for easy input extraction. +- Added a version of ``parseutils.parseUntil`` that can deal with a string ``until`` token. The other + versions are for ``char`` and ``set[char]``. Compiler Additions @@ -62,6 +65,7 @@ Language Additions - Nim now supports ``partial`` object declarations to mitigate the problems that arise when types are mutually dependent and yet should be kept in different modules. +- ``include`` statements are not restricted to top level statements anymore. 2016-01-27 Nim in Action is now available! 
diff --git a/web/website.ini b/web/website.ini index d1f8a04bf3dd..2084d9240371 100644 --- a/web/website.ini +++ b/web/website.ini @@ -40,7 +40,7 @@ srcdoc2: "pure/concurrency/threadpool.nim;pure/concurrency/cpuinfo.nim" srcdoc: "system/threads.nim;system/channels.nim;js/dom" srcdoc2: "pure/os;pure/strutils;pure/math;pure/matchers;pure/algorithm" srcdoc2: "pure/stats;impure/nre;windows/winlean" -srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib" +srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib;pure/strscans" srcdoc2: "pure/parseopt;pure/parseopt2;pure/hashes;pure/strtabs;pure/lexbase" srcdoc2: "pure/parsecfg;pure/parsexml;pure/parsecsv;pure/parsesql" srcdoc2: "pure/streams;pure/terminal;pure/cgi;pure/unicode"