Navigation Menu

Skip to content

Commit

Permalink
updated SAX API
Browse files Browse the repository at this point in the history
  • Loading branch information
Peter Ohler committed Sep 21, 2011
1 parent d55223d commit 40870ff
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 116 deletions.
6 changes: 3 additions & 3 deletions README.md
Expand Up @@ -22,7 +22,7 @@ A fast XML parser and Object marshaller as a Ruby gem.

### Release 1.3.2

- SAX parser bug fix that missed element attributes, 7.5 times faster than Nokogiri and 2.6 times faster than LibXML
- Changed SAX parser API for element and instruction attributes

## <a name="description">Description</a>

Expand Down Expand Up @@ -56,8 +56,8 @@ files may result in slightly different times.
As an Object serializer Ox is up to 6 times faster than the standard Ruby
Marshal.dump() and up to 3 times faster than Marshal.load().

The SAX-like stream parser is over 7 times faster than Nokogiri and more than
2.5 times faster than LibXML when using a trivial Ruby side set of
The SAX-like stream parser is 40 times faster than Nokogiri and more than 13
times faster than LibXML when validating a file with minimal Ruby
callbacks. Unlike Nokogiri and LibXML, Ox can be tuned to use only the SAX
callbacks that are of interest to the caller. (See the perf_sax.rb file for an
example.)
Expand Down
2 changes: 2 additions & 0 deletions ext/ox/ox.c
Expand Up @@ -46,6 +46,7 @@ void Init_ox();
VALUE Ox = Qnil;

ID at_id;
ID attr_id;
ID attributes_id;
ID beg_id;
ID cdata_id;
Expand Down Expand Up @@ -616,6 +617,7 @@ void Init_ox() {

rb_require("time");
at_id = rb_intern("at");
attr_id = rb_intern("attr");
attributes_id = rb_intern("@attributes");
beg_id = rb_intern("@beg");
cdata_id = rb_intern("cdata");
Expand Down
1 change: 1 addition & 0 deletions ext/ox/ox.h
Expand Up @@ -206,6 +206,7 @@ extern void write_obj_to_file(VALUE obj, const char *path, Options copts);
extern VALUE Ox;

extern ID at_id;
extern ID attr_id;
extern ID attributes_id;
extern ID beg_id;
extern ID cdata_id;
Expand Down
64 changes: 31 additions & 33 deletions ext/ox/sax.c
Expand Up @@ -56,6 +56,7 @@ typedef struct _SaxDrive {
VALUE io;
};
int has_instruct;
int has_attr;
int has_doctype;
int has_comment;
int has_cdata;
Expand All @@ -80,7 +81,7 @@ static int read_cdata(SaxDrive dr);
static int read_comment(SaxDrive dr);
static int read_element(SaxDrive dr);
static int read_text(SaxDrive dr);
static int read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2, int gather);
static int read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml);
static char read_name_token(SaxDrive dr);
static int read_quoted_value(SaxDrive dr);

Expand Down Expand Up @@ -162,6 +163,7 @@ ox_sax_parse(VALUE handler, VALUE io) {
#if 0
printf("*** sax_parse with these flags\n");
printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false");
printf(" has_attr = %s\n", dr.has_attr ? "true" : "false");
printf(" has_doctype = %s\n", dr.has_doctype ? "true" : "false");
printf(" has_comment = %s\n", dr.has_comment ? "true" : "false");
printf(" has_cdata = %s\n", dr.has_cdata ? "true" : "false");
Expand Down Expand Up @@ -199,6 +201,7 @@ sax_drive_init(SaxDrive dr, VALUE handler, VALUE io) {
dr->col = 0;
dr->handler = handler;
dr->has_instruct = rb_respond_to(handler, instruct_id);
dr->has_attr = rb_respond_to(handler, attr_id);
dr->has_doctype = rb_respond_to(handler, doctype_id);
dr->has_comment = rb_respond_to(handler, comment_id);
dr->has_cdata = rb_respond_to(handler, cdata_id);
Expand Down Expand Up @@ -373,31 +376,25 @@ read_children(SaxDrive dr, int first) {
*/
static int
read_instruction(SaxDrive dr) {
VALUE target = Qnil;
VALUE attrs = Qnil;
char c;

if ('\0' == (c = read_name_token(dr))) {
return -1;
}
if (dr->has_instruct) {
target = rb_str_new2(dr->str);
VALUE args[1];

args[0] = rb_str_new2(dr->str);
rb_funcall2(dr->handler, instruct_id, 1, args);
}
if (0 != read_attrs(dr, &attrs, c, '?', '?', dr->has_instruct)) {
if (0 != read_attrs(dr, c, '?', '?', (0 == strcmp("xml", dr->str)))) {
return -1;
}
c = next_non_white(dr);
if ('>' != c) {
sax_drive_error(dr, "invalid format, instruction not terminated", 1);
return -1;
}
if (dr->has_instruct) {
VALUE args[2];

args[0] = target;
args[1] = attrs;
rb_funcall2(dr->handler, instruct_id, 2, args);
}
dr->str = 0;

return 0;
Expand Down Expand Up @@ -519,20 +516,25 @@ read_comment(SaxDrive dr) {
static int
read_element(SaxDrive dr) {
VALUE name = Qnil;
VALUE attrs = Qnil;
char c;
int closed;

if ('\0' == (c = read_name_token(dr))) {
return -1;
}
name = str2sym(dr->str);
if (dr->has_start_element) {
VALUE args[1];

args[0] = name;
rb_funcall2(dr->handler, start_element_id, 1, args);
}
if ('/' == c) {
closed = 1;
} else if ('>' == c) {
closed = 0;
} else {
if (0 != read_attrs(dr, &attrs, c, '/', '>', dr->has_start_element)) {
if (0 != read_attrs(dr, c, '/', '>', 0)) {
return -1;
}
closed = ('/' == *(dr->cur - 1));
Expand All @@ -544,17 +546,14 @@ read_element(SaxDrive dr) {
return -1;
}
}
if (dr->has_start_element) {
VALUE args[2];
if (closed) {
if (dr->has_end_element) {
VALUE args[1];

args[0] = name;
args[1] = attrs;
rb_funcall2(dr->handler, start_element_id, 2, args);
if (closed && dr->has_end_element) {
args[0] = name;
rb_funcall2(dr->handler, end_element_id, 1, args);
}
}
if (!closed) {
} else {
if (0 != read_children(dr, 0)) {
return -1;
}
Expand Down Expand Up @@ -601,7 +600,7 @@ read_text(SaxDrive dr) {
}

static int
read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2, int gather) {
read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml) {
VALUE name = Qnil;
int is_encoding = 0;

Expand All @@ -618,10 +617,10 @@ read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2, int gather
if ('\0' == (c = read_name_token(dr))) {
return -1;
}
if ('?' == termc && 0 == strcmp("encoding", dr->str)) {
if (is_xml && 0 == strcmp("encoding", dr->str)) {
is_encoding = 1;
}
if (gather) {
if (dr->has_attr) {
name = str2sym(dr->str);
}
if (is_white(c)) {
Expand All @@ -639,18 +638,17 @@ read_attrs(SaxDrive dr, VALUE *attrs, char c, char termc, char term2, int gather
dr->encoding = rb_enc_find(dr->str);
}
#endif
if (gather) {
VALUE rstr = rb_str_new2(dr->str);

if (Qnil == *attrs) {
*attrs = rb_hash_new();
}
if (dr->has_attr) {
VALUE args[2];

args[0] = name;
args[1] = rb_str_new2(dr->str);
#ifdef HAVE_RUBY_ENCODING_H
if (0 != dr->encoding) {
rb_enc_associate(rstr, dr->encoding);
rb_enc_associate(args[1], dr->encoding);
}
#endif
rb_hash_aset(*attrs, name, rstr);
rb_funcall2(dr->handler, attr_id, 2, args);
}
c = next_non_white(dr);
}
Expand Down
12 changes: 8 additions & 4 deletions lib/ox/sax.rb
Expand Up @@ -26,12 +26,13 @@ module Ox
# be made public in the subclasses. If the methods remain private they will
# not be called during parsing.
#
# def instruct(target, attrs); end
# def instruct(target); end
# def attr(name, value); end
# def doctype(value); end
# def comment(value); end
# def cdata(value); end
# def text(value); end
# def start_element(name, attrs); end
# def start_element(name); end
# def end_element(name); end
#
class Sax
Expand All @@ -44,7 +45,10 @@ def initialize()
# they will not be called during parsing.
private

def instruct(target, attrs)
def instruct(target)
end

def attr(name, value)
end

def doctype(value)
Expand All @@ -59,7 +63,7 @@ def cdata(value)
def text(value)
end

def start_element(name, attrs)
def start_element(name)
end

def end_element(name)
Expand Down
22 changes: 8 additions & 14 deletions test/perf_sax.rb
Expand Up @@ -28,7 +28,6 @@
$verbose = 0
$ox_only = false
$all_cbs = false
$type_cbs = false
$filename = nil # nil indicates that a new file named perf.xml will be created and used
$filesize = 1000 # KBytes
$iter = 100
Expand All @@ -37,7 +36,6 @@
opts.on("-v", "increase verbosity") { $verbose += 1 }
opts.on("-x", "ox only") { $ox_only = true }
opts.on("-a", "all callbacks") { $all_cbs = true }
opts.on("-t", "typical callbacks") { $typ_cbs = true }
opts.on("-f", "--file [String]", String, "filename") { |f| $filename = f }
opts.on("-i", "--iterations [Int]", Integer, "iterations") { |i| $iter = i }
opts.on("-s", "--size [Int]", Integer, "file size in KBytes") { |s| $filesize = s }
Expand Down Expand Up @@ -79,31 +77,27 @@ def create_file(filename, size)
end

class OxSax < ::Ox::Sax
def start_element(name, attrs); end
def error(message, line, column); puts message; end
end

class OxTypSax < OxSax
def end_element(name); end
def text(value); end
end

class OxAllSax < OxSax
def start_element(name); end
def attr(name, value); end
def end_element(name); end
def instruct(target, attrs); end
def text(value); end
def instruct(target); end
def doctype(value); end
def comment(value); end
def cdata(value); end
def text(value); end
end

unless defined?(::Nokogiri).nil?
class NoSax < Nokogiri::XML::SAX::Document
def start_element(name, attrs = []); end
def error(message); puts message; end
def warning(message); puts message; end
end
class NoAllSax < NoSax
def start_element(name, attrs = []); end
def characters(text); end
def cdata_block(string); end
def comment(string); end
Expand All @@ -117,9 +111,9 @@ def xmldecl(version, encoding, standalone); end
unless defined?(::LibXML).nil?
class LxSax
include LibXML::XML::SaxParser::Callbacks
def on_start_element(element, attributes); end
end
class LxAllSax < LxSax
def on_start_element(element, attributes); end
def on_cdata_block(cdata); end
def on_characters(chars); end
def on_comment(msg); end
Expand All @@ -141,7 +135,7 @@ def on_start_element_ns(name, attributes, prefix, uri, namespaces); end

def perf_stringio()
start = Time.now
handler = $all_cbs ? OxAllSax.new() : ($typ_cbs ? OxTypSax.new() : OxSax.new())
handler = $all_cbs ? OxAllSax.new() : OxSax.new()
$iter.times do
input = StringIO.new($xml_str)
Ox.sax_parse(handler, input)
Expand Down Expand Up @@ -188,7 +182,7 @@ def perf_fileio()
puts "A #{$filesize} KByte XML file was parsed #{$iter} times for this test."
puts "\n"
start = Time.now
handler = $all_cbs ? OxAllSax.new() : ($typ_cbs ? OxTypSax.new() : OxSax.new())
handler = $all_cbs ? OxAllSax.new() : OxSax.new()
$iter.times do
input = IO.open(IO.sysopen($filename))
Ox.sax_parse(handler, input)
Expand Down

0 comments on commit 40870ff

Please sign in to comment.