View
@@ -1,48 +1,21 @@
/*
Copyright 2014 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
* _fastrand.c -- Python extension module to generate random bit vectors
* quickly.
*
* IMPORTANT: This module does not use cryptographically strong randomness. It
* should ONLY be used to speed up the simulation. Don't use it in
* production.
*
* If an adversary can predict which random bits are flipped, then RAPPOR's
* privacy is compromised.
*
* Fast lexer using re2c.
*/
#include <stdarg.h> // va_list, etc.
#include <stdio.h> // printf
#include <fnmatch.h>
#include <glob.h>
#ifdef __FreeBSD__
#include <gnu/posix/regex.h>
#else
#include <regex.h>
#endif
#include <Python.h>
#include "id.h"
#include "osh-ast.h"
// this is generated C code, but we want a single translation unit
#include "osh-lex.h"
// TODO: Should this be shared among all extensions?
// Log messages to stderr.
void debug(const char* fmt, ...) {
#ifdef LIBC_VERBOSE
#if 1
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
@@ -52,260 +25,40 @@ void debug(const char* fmt, ...) {
}
// fnmatch(pattern, str) -> 1 on match, 0 on no match, -1 on error.
// NOTE(review): this region is a diff-merge artifact — the declarations of
// func_fnmatch are interleaved with the opening of lex_MatchToken (whose
// signature appears mid-body below and is never closed here).  Recover the
// two separate functions from version control; do not edit this text as-is.
static PyObject *
func_fnmatch(PyObject *self, PyObject *args) {
const char *pattern;
const char *str;
lex_MatchToken(PyObject *self, PyObject *args) {
int lex_mode;
const char* line;
if (!PyArg_ParseTuple(args, "ss", &pattern, &str)) {
return NULL;
}
int flags = 0;
int ret = fnmatch(pattern, str, flags);
// Map the libc fnmatch() result onto a Python integer.
switch (ret) {
case 0:
debug("matched: %s", str);
return PyLong_FromLong(1);
break;
case FNM_NOMATCH:
debug("no match: %s", str);
return PyLong_FromLong(0);
break;
default:
// fnmatch() reports other errors with a nonzero, non-FNM_NOMATCH value.
debug("other error: %s", str);
return PyLong_FromLong(-1);
break;
}
}
// Error callback passed to glob(): report the failing path and errno text on
// stderr, then return 0 so glob() continues scanning the remaining paths.
int globerr(const char *path, int eerrno) {
  const char *msg = strerror(eerrno);
  fprintf(stderr, "%s: %s\n", path, msg);
  return 0;  // a nonzero return would make glob() abort
}
// Global used by the lexer entry points; int rather than Py_ssize_t because
// PyArg_ParseTuple's "s#" length output did not work with the wider type.
int line_len;
// Doesn't work! signed/unsigned confused?
//Py_ssize_t line_len;
// glob(pattern) -> list of matching path strings.
// NOTE(review): diff-merge artifact — the second PyArg_ParseTuple call below
// (format "is#i" with &lex_mode, &line, &start_index) belongs to
// lex_MatchToken, not func_glob; those names are not in scope here.  Recover
// the clean version from version control before editing.
static PyObject *
func_glob(PyObject *self, PyObject *args) {
const char* pattern;
if (!PyArg_ParseTuple(args, "s", &pattern)) {
int start_index;
if (!PyArg_ParseTuple(args, "is#i",
&lex_mode, &line, &line_len, &start_index)) {
return NULL;
}
debug("lex_mode %d, line_len %d, start_index %d\n",
lex_mode, line_len, start_index);
glob_t results;
// Hm, it's weird that the first one can't be called with GLOB_APPEND. You
// get a segfault.
int flags = 0;
// int flags = GLOB_APPEND;
//flags |= GLOB_NOMAGIC;
int ret = glob(pattern, flags, globerr, &results);
// Translate glob()'s return code into a diagnostic string (NULL = no error).
const char *err_str = NULL;
switch (ret) {
case 0: // no error
break;
case GLOB_ABORTED:
err_str = "read error";
break;
case GLOB_NOMATCH:
// No error, because not matching isn't necessarily a problem.
// NOTE: This can be turned on to log overaggressive calls to glob().
//err_str = "nothing matched";
break;
case GLOB_NOSPACE:
err_str = "no dynamic memory";
break;
default:
err_str = "unknown problem";
break;
}
if (err_str) {
fprintf(stderr, "%s: %s\n", pattern, err_str);
}
// http://stackoverflow.com/questions/3512414/does-this-pylist-appendlist-py-buildvalue-leak
// Copy each C path string into a new Python list; PyList_SetItem steals the
// reference created by Py_BuildValue, so no extra DECREF is needed.
size_t n = results.gl_pathc;
PyObject* matches = PyList_New(n);
// Print array of results
size_t i;
for (i = 0; i < n; i++) {
//printf("%s\n", results.gl_pathv[i]);
PyObject* m = Py_BuildValue("s", results.gl_pathv[i]);
PyList_SetItem(matches, i, m);
}
globfree(&results);
return matches;
}
// regex_parse(pattern) -> True if the pattern compiles as a POSIX ERE,
// False otherwise (with a diagnostic on stderr).
// NOTE(review): the for-loop over line/line_len just below PyArg_ParseTuple
// is diff contamination from lex_MatchToken ('line' is not in scope here)
// and leaves the if-block unbalanced — restore from version control.
static PyObject *
func_regex_parse(PyObject *self, PyObject *args) {
const char* pattern;
if (!PyArg_ParseTuple(args, "s", &pattern)) {
return NULL;
for (int i = 0; i < line_len; ++i) {
printf("%d c: %c\n", i, line[i]);
}
}
regex_t pat;
// This is an extended regular expression rather than a basic one, i.e. we
// use 'a*' instead of 'a\*'.
// We only care whether compilation succeeds, so free the pattern
// immediately and inspect the return code.
int ret = regcomp(&pat, pattern, REG_EXTENDED);
regfree(&pat);
// Copied from man page
const char *err_str = NULL;
switch (ret) {
case 0: // success
break;
case REG_BADBR:
err_str = "Invalid use of back reference operator.";
break;
case REG_BADPAT:
err_str = "Invalid use of pattern operators such as group or list.";
break;
case REG_BADRPT:
err_str = "Invalid use of repetition operators such as using '*' as the first character.";
break;
case REG_EBRACE:
err_str = "Un-matched brace interval operators.";
break;
case REG_EBRACK:
err_str = "Un-matched bracket list operators.";
break;
case REG_ECOLLATE:
err_str = "Invalid collating element.";
break;
case REG_ECTYPE:
err_str = "Unknown character class name.";
break;
case REG_EESCAPE:
err_str = "Trailing backslash.";
break;
case REG_EPAREN:
err_str = "Un-matched parenthesis group operators.";
break;
case REG_ERANGE:
err_str = "Invalid use of the range operator, e.g., the ending point of the range occurs prior to the starting point.";
break;
case REG_ESPACE:
err_str = "The regex routines ran out of memory.";
break;
case REG_ESUBREG:
err_str = "Invalid back reference to a subexpression.";
break;
/* NOTE: These are not defined by musl libc on Alpine.
* TODO: If we can construct test cases for these, add them back.
* */
#if 0
case REG_EEND:
err_str = "Nonspecific error. This is not defined by POSIX.2.";
break;
case REG_ESIZE:
err_str = "Compiled regular expression requires a pattern buffer larger than 64Kb. This is not defined by POSIX.2.";
break;
#endif
default:
/* TODO: Add the integer to error message */
err_str = "Unknown error compiling regex";
}
if (err_str) {
// TODO: return a proper value?
fprintf(stderr, "Error compiling regex: %s\n", err_str);
Py_RETURN_FALSE;
} else {
Py_RETURN_TRUE;
}
}
// regex_match(pattern, str) -> 1 if the POSIX ERE pattern matches anywhere
// in str, 0 if it does not, -1 if the pattern fails to compile (the caller
// is expected to have validated it with regex_parse first).
static PyObject *
func_regex_match(PyObject *self, PyObject *args) {
const char* pattern;
const char* str;
if (!PyArg_ParseTuple(args, "ss", &pattern, &str)) {
return NULL;
}
regex_t pat;
regmatch_t m[2];
// Should have been checked by regex_parse
if (regcomp(&pat, pattern, REG_EXTENDED) != 0) {
fprintf(stderr, "Invalid regex at runtime\n");
return PyLong_FromLong(-1);
}
int ret;
// NOTE: regexec() matches anywhere in str; it is NOT anchored at pos 0.
if (regexec(&pat, str, 2, m, 0) == 0) {
debug("MATCH\n");
ret = 1;
} else {
debug("NO MATCH");
ret = 0;
}
// Free the compiled pattern before returning on either path.
regfree(&pat);
// TODO: Return a list for BASH_REMATCH.
// BUG FIX: the previous code returned Py_RETURN_FALSE on no-match followed
// by an unreachable PyLong_FromLong(0) (a merged-diff leftover), which was
// inconsistent with the integer results of the match (1) and error (-1)
// paths.  Always return an integer.
return PyLong_FromLong(ret);
}
// FindLongestMatch(...) -> (id, end_index) tuple from the generated lexer.
// NOTE(review): diff-merge artifact — the leading 'return NULL;' (the old
// stub body) makes everything after it unreachable, and lex_mode, line, and
// start_index are not in scope in this function (they belong to
// lex_MatchToken).  Restore the intended body from version control.
static PyObject *
func_FindLongestMatch(PyObject *self, PyObject *args) {
return NULL;
int id;
int end_index;
MatchToken(lex_mode, line, line_len, start_index, &id, &end_index);
return Py_BuildValue("(ii)", id, end_index);
}
// TODO: enlist core/id_kind_gen and osh/lex_gen.py
// Rename to TokenMatcher?
// LineLexer holds CharMatcher? or TokenMatcher?
// SlowTokenMatcher
// FastTokenMatcher
// Method table for the extension module.
// NOTE(review): this table merges entries from both sides of a diff — it
// names func_FindLongestMatch AND lex_MatchToken, whose definitions above
// are interleaved with each other; reconcile against version control.
PyMethodDef methods[] = {
{"FindLongestMatch", func_FindLongestMatch, METH_VARARGS,
"(lexer mode, line, pos) -> (end_index, tok_type, tok_val)."},
{"fnmatch", func_fnmatch, METH_VARARGS,
"Return whether a string matches a pattern."},
// Python's glob doesn't have char classes
{"glob", func_glob, METH_VARARGS,
"Return files that match a pattern."},
// https://docs.python.org/2/c-api/capsule.html#capsules
{"regex_parse", func_regex_parse, METH_VARARGS,
"Compile a regex in ERE syntax, returning whether it is valid"},
{"regex_match", func_regex_match, METH_VARARGS,
"Match regex against a string, returning a list of matches"},
{"MatchToken", lex_MatchToken, METH_VARARGS,
"(lexer mode, line, start_index) -> (id, end_index)."},
{NULL, NULL},  // sentinel terminating the table
};
View
@@ -11,13 +11,32 @@
import unittest
import lex # module under test
from core.id_kind import Id
from osh import ast_ as ast
import lex # module under test
lex_mode_e = ast.lex_mode_e
def MatchToken(lex_mode, line, s):
  """Call the C extension's MatchToken and wrap the raw token type in an Id.

  Args:
    lex_mode: a lex_mode_e enum value (its .enum_id is passed to C).
    line: the line of text to match against.
    s: start index within the line.

  Returns:
    (Id, end_index) tuple.
  """
  raw_type, end_index = lex.MatchToken(lex_mode.enum_id, line, s)
  token_id = Id(raw_type)
  return token_id, end_index
# NOTE(review): diff-merge artifact — the old testFnmatch header is fused
# with the new testMatchToken header (two consecutive 'def' lines), and the
# Python 2 print statements are mixed with print() calls.  Restore the clean
# test from version control.
class LexTest(unittest.TestCase):
def testFnmatch(self):
def testMatchToken(self):
print(dir(lex))
print lex_mode_e.COMMENT.enum_id
result = MatchToken(lex_mode_e.COMMENT, 'line', 3)
print result
# Need to be able to pass NUL bytes for EOF.
result = MatchToken(lex_mode_e.OUTER, 'end of file\0', 3)
# TODO: Need to turn Id back?
print result
if __name__ == '__main__':
View
@@ -1,30 +1,5 @@
/*
Copyright 2014 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
* _fastrand.c -- Python extension module to generate random bit vectors
* quickly.
*
* IMPORTANT: This module does not use cryptographically strong randomness. It
* should ONLY be used to speed up the simulation. Don't use it in
* production.
*
* If an adversary can predict which random bits are flipped, then RAPPOR's
* privacy is compromised.
*
* Python interface to libc functions.
*/
#include <stdarg.h> // va_list, etc.
View
@@ -121,23 +121,22 @@ def PrettyPrint(node, f=sys.stdout):
f.write('\n')
# NOTE(review): diff-merge artifact — the old _ParseAndMakeTypes(f, root) and
# the new LoadSchema(f) are interleaved below (duplicate asdl.parse calls,
# two 'if not asdl.check' lines, and both old and new module-level call
# sites).  The new shape appears to be: LoadSchema parses and checks the
# ASDL file and returns (asdl_module, app_types); the caller then runs
# py_meta.MakeTypes.  Restore from version control — TODO confirm.
def _ParseAndMakeTypes(f, root):
# TODO: A better syntax for this might be:
#
# id = external
#
# in osh.asdl. Then we can show an error if it's not provided.
def LoadSchema(f):
app_types = {'id': asdl.UserType(Id)}
module = asdl.parse(f)
asdl_module = asdl.parse(f)
# TODO: Need some metaprogramming here to add id and kind.
# Check for type errors
if not asdl.check(module, app_types):
# NOTE: This only checks for overlapping sum types, which will no longer be
# an error.
if not asdl.check(asdl_module, app_types):
raise AssertionError('ASDL file is invalid')
py_meta.MakeTypes(module, root, app_types)
return asdl_module, app_types
# Module-level side effect: load the schema and generate types into this
# module's namespace.
f = util.GetResourceLoader().open('osh/osh.asdl')
asdl_module, app_types = LoadSchema(f)
root = sys.modules[__name__]
_ParseAndMakeTypes(f, root)
py_meta.MakeTypes(asdl_module, root, app_types)
f.close()
View
@@ -0,0 +1,54 @@
#!/usr/bin/python
"""
ast_gen.py
"""
import sys
from core import util
from osh import ast_ as ast
from asdl import gen_cpp
lex_mode_e = ast.lex_mode_e
def main(argv):
  """Load the osh ASDL schema and print C enum definitions to stdout.

  TODO: Generate C files for lex_mode_e, id_e, etc.  It already works for
  lex_mode, except the output is C++.  C has no namespaces, so generated
  names may need a prefix like lex_mode_e__OUTER (or just lex_mode__NONE,
  and id__Lit_Chars for ids); the re2c code will switch on those values.
  All of this is generated, so it can change at any time.  Eventually code
  mapping Id -> Kind may be generated too, but the lexer only needs Id.
  """
  with open('osh/osh.asdl') as f:
    schema, _ = ast.LoadSchema(f)

  # Walk the schema and emit one C enum per ASDL sum type.
  # NOTE: MakeTypes does things in a certain order.
  visitor = gen_cpp.CEnumVisitor(sys.stdout)
  visitor.VisitModule(schema)
# NOTE: MakeTypes does things in a certain order.
# Entry point: treat RuntimeError as a fatal, user-facing error and exit
# non-zero.  (Python 2 print-to-stderr syntax.)
if __name__ == '__main__':
try:
main(sys.argv)
except RuntimeError as e:
print >>sys.stderr, 'FATAL: %s' % e
sys.exit(1)
View
@@ -0,0 +1,97 @@
#!/usr/bin/python
"""
lex_gen.py
"""
import sys
import sre_parse
from osh import lex
def PrintTree(re_tree, depth=2):
"""
Recursively pretty-print an sre_parse tree, one tab-indented node per line.

re_tree: List of children, i.e. (name, arg) pairs as produced by
sre_parse.parse().
depth: current indentation level in tabs.
"""
for child in re_tree:
name, arg = child
sys.stdout.write(depth * '\t')
sys.stdout.write(name)
sys.stdout.write(' ')
if name == 'in': # character class
print '{'
PrintTree(arg, depth=depth+1)
sys.stdout.write(depth * '\t')
print '}'
elif name == 'max_repeat': # repetition
min_, max_, children = arg
# min = 0 means *, min = 1 means +
assert min_ in (0, 1), min_
print min_, max_, '{'
PrintTree(children, depth=depth+1)
sys.stdout.write(depth * '\t')
# NOTE(review): this prints only a newline, unlike the 'in' branch which
# closes with '}'.  Presumably a missing closing brace — TODO confirm.
print
elif name == 'negate': # Oh this is a ^. It doesn't form a node.
assert arg is None
print
elif name == 'literal': # Quote \ and " in re2c syntax
print repr(chr(arg))
elif name == 'not_literal': # ditto
print repr(chr(arg))
elif name == 'range': # ascii range
begin, end = arg
print repr(chr(begin)), repr(chr(end))
elif name == 'any': # This is the '.' character
assert arg is None
print
else:
raise AssertionError(name)
# NOTE: negate and not_literal are sort of duplicated
# Placeholder for converting an sre_parse tree into re2c pattern syntax;
# currently it only pretty-prints the tree inside brackets and returns the
# tree unchanged.
def re2c_convert(re_tree):
print '\t\t['
PrintTree(re_tree)
print '\t\t]'
return re_tree
# note: use YYCURSOR and YYLIMIT
# limit should be the end of string
# line + line_len
def main(argv):
"""Emit C source for the generated lexer (currently a hard-coded stub)."""
# This becomes osh-lex.re2c.c. It is compiled to osh-lex.c and then
# included.
print """
inline void MatchToken(int lexer_mode, char* line, int line_len, int start_index,
int* id, int* end_index) {
*id = id__Lit_Chars;
//*id = id__Lit_Other;
*end_index = 3;
}
"""
# NOTE(review): the early return below deliberately skips the pattern-
# dumping loop while the generator is under development — the loop is dead
# code for now, apparently intentional scaffolding.
return
# Top level is a switch statement.
for state, pat_list in lex.LEXER_DEF.iteritems():
print state
# This level is re2c patterns.
for is_regex, pat, token_id in pat_list:
print '\t%r -> %r' % (pat, token_id)
if is_regex:
re_tree = sre_parse.parse(pat)
#print re_tree
out_pat = re2c_convert(re_tree)
#print out_pat
print
# Entry point: treat RuntimeError as a fatal, user-facing error and exit
# non-zero.  (Python 2 print-to-stderr syntax.)
if __name__ == '__main__':
try:
main(sys.argv)
except RuntimeError as e:
print >>sys.stderr, 'FATAL: %s' % e
sys.exit(1)